forked from zaclys/searxng
[fix] ddg-lite & ddg-extra: don't send empty vqd value
DDG's bot detection is sensitive to the vqd value. For some search terms (such as extremely long search terms that are often sent by bots), no vqd value can be determined. If SearXNG cannot determine a vqd value, then no request should go out to DDG (WEB): a request with a wrong vqd value leads to DDG temporarily putting SearXNG's IP on a block list. Requests from IPs in this block list run into timeouts. Not sure, but it seems the block list is a sliding window: to get my IP off the block list I had to let it cool down for 1h (send no requests from that IP to DDG). Since such issues can't be reproduced in a local instance, I tested this patch for 24h on my public SearXNG instance: there are still (rare) errors, but the reliability is still 100%. Related: - https://github.com/searxng/searxng/pull/2922 - https://github.com/searxng/searxng/pull/2923 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
dac63f7764
commit
d65753428b
|
@ -61,17 +61,7 @@ form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
|
||||||
|
|
||||||
|
|
||||||
def cache_vqd(query, value):
|
def cache_vqd(query, value):
|
||||||
"""Caches a ``vqd`` value from a query.
|
"""Caches a ``vqd`` value from a query."""
|
||||||
|
|
||||||
The vqd value depends on the query string and is needed for the follow up
|
|
||||||
pages or the images loaded by a XMLHttpRequest:
|
|
||||||
|
|
||||||
- DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...``
|
|
||||||
- DuckDuckGo Images: ``https://duckduckgo.com/i.js??q=...&vqd=...``
|
|
||||||
- DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...``
|
|
||||||
- DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...``
|
|
||||||
|
|
||||||
"""
|
|
||||||
c = redisdb.client()
|
c = redisdb.client()
|
||||||
if c:
|
if c:
|
||||||
logger.debug("cache vqd value: %s", value)
|
logger.debug("cache vqd value: %s", value)
|
||||||
|
@ -84,13 +74,43 @@ def get_vqd(query):
|
||||||
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
|
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
|
||||||
response.
|
response.
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
If an empty string is returned there are no results for the ``query`` and
|
||||||
|
therefore no ``vqd`` value.
|
||||||
|
|
||||||
|
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
|
||||||
|
(such as extremely long search terms that are often sent by bots), no ``vqd``
|
||||||
|
value can be determined.
|
||||||
|
|
||||||
|
If SearXNG cannot determine a ``vqd`` value, then no request should go out
|
||||||
|
to DDG:
|
||||||
|
|
||||||
|
A request with a wrong ``vqd`` value leads to DDG temporarily putting
|
||||||
|
SearXNG's IP on a block list.
|
||||||
|
|
||||||
|
Requests from IPs in this block list run into timeouts.
|
||||||
|
|
||||||
|
Not sure, but it seems the block list is a sliding window: to get my IP off
|
||||||
|
the block list I had to let it cool down for 1h (send no requests from
|
||||||
|
that IP to DDG).
|
||||||
|
|
||||||
|
TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
|
||||||
|
by all requests to DDG:
|
||||||
|
|
||||||
|
- DuckDuckGo Lite: ``https://lite.duckduckgo.com/lite`` (POST form data)
|
||||||
|
- DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...``
|
||||||
|
- DuckDuckGo Images: ``https://duckduckgo.com/i.js?q=...&vqd=...``
|
||||||
|
- DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
|
||||||
|
- DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``
|
||||||
|
|
||||||
"""
|
"""
|
||||||
value = None
|
value = ''
|
||||||
c = redisdb.client()
|
c = redisdb.client()
|
||||||
if c:
|
if c:
|
||||||
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
|
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
|
||||||
value = c.get(key)
|
value = c.get(key)
|
||||||
if value:
|
if value or value == b'':
|
||||||
value = value.decode('utf-8')
|
value = value.decode('utf-8')
|
||||||
logger.debug("re-use cached vqd value: %s", value)
|
logger.debug("re-use cached vqd value: %s", value)
|
||||||
return value
|
return value
|
||||||
|
@ -102,9 +122,11 @@ def get_vqd(query):
|
||||||
if value:
|
if value:
|
||||||
value = value[0]
|
value = value[0]
|
||||||
else:
|
else:
|
||||||
# some search terms do not have results and therefore no vqd value
|
# Some search terms do not have results and therefore no vqd value. If
|
||||||
|
# no vqd value can be determined for the search term, an empty string is
|
||||||
|
# cached.
|
||||||
value = ''
|
value = ''
|
||||||
logger.debug("new vqd value: %s", value)
|
logger.debug("new vqd value: '%s'", value)
|
||||||
cache_vqd(query, value)
|
cache_vqd(query, value)
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
@ -204,6 +226,13 @@ ddg_lang_map = {
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
|
|
||||||
|
# request needs a vqd argument
|
||||||
|
vqd = get_vqd(query)
|
||||||
|
if not vqd:
|
||||||
|
# some search terms do not have results and therefore no vqd value
|
||||||
|
params['url'] = None
|
||||||
|
return params
|
||||||
|
|
||||||
# quote ddg bangs
|
# quote ddg bangs
|
||||||
query_parts = []
|
query_parts = []
|
||||||
# for val in re.split(r'(\s+)', query):
|
# for val in re.split(r'(\s+)', query):
|
||||||
|
@ -227,6 +256,7 @@ def request(query, params):
|
||||||
# link again and again ..
|
# link again and again ..
|
||||||
|
|
||||||
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
|
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
|
||||||
|
params['data']['vqd'] = vqd
|
||||||
|
|
||||||
# initial page does not have an offset
|
# initial page does not have an offset
|
||||||
if params['pageno'] == 2:
|
if params['pageno'] == 2:
|
||||||
|
@ -248,9 +278,6 @@ def request(query, params):
|
||||||
params['data']['api'] = form_data.get('api', 'd.js')
|
params['data']['api'] = form_data.get('api', 'd.js')
|
||||||
params['data']['nextParams'] = form_data.get('nextParams', '')
|
params['data']['nextParams'] = form_data.get('nextParams', '')
|
||||||
params['data']['v'] = form_data.get('v', 'l')
|
params['data']['v'] = form_data.get('v', 'l')
|
||||||
|
|
||||||
# request needs a vqd argument
|
|
||||||
params['data']['vqd'] = get_vqd(query)
|
|
||||||
params['headers']['Referer'] = 'https://lite.duckduckgo.com/'
|
params['headers']['Referer'] = 'https://lite.duckduckgo.com/'
|
||||||
|
|
||||||
params['data']['kl'] = eng_region
|
params['data']['kl'] = eng_region
|
||||||
|
|
|
@ -48,6 +48,13 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
|
|
||||||
|
# request needs a vqd argument
|
||||||
|
vqd = get_vqd(query)
|
||||||
|
if not vqd:
|
||||||
|
# some search terms do not have results and therefore no vqd value
|
||||||
|
params['url'] = None
|
||||||
|
return params
|
||||||
|
|
||||||
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
|
||||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||||
|
|
||||||
|
@ -57,7 +64,7 @@ def request(query, params):
|
||||||
# 'u': 'bing',
|
# 'u': 'bing',
|
||||||
'l': eng_region,
|
'l': eng_region,
|
||||||
'f': ',,,,,',
|
'f': ',,,,,',
|
||||||
'vqd': get_vqd(query),
|
'vqd': vqd,
|
||||||
}
|
}
|
||||||
|
|
||||||
if params['pageno'] > 1:
|
if params['pageno'] > 1:
|
||||||
|
|
Loading…
Reference in New Issue