[fix] engine - duckduckgo vqd edge-case

This commit is contained in:
jazzzooo 2023-09-19 06:47:26 -07:00 committed by MatthieuBarbu
parent df71f4d7ca
commit 8bea311d5a
1 changed files with 40 additions and 27 deletions

View File

@ -57,13 +57,13 @@ url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l' # url_ping = 'https://duckduckgo.com/t/sl_l'
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
def cache_vqd(query, value): def cache_vqd(query, value):
"""Caches a ``vqd`` value from a query. """Caches a ``vqd`` token from a query, if token is None the cached value
is deleted.
The vqd value depends on the query string and is needed for the follow up The vqd token depends on the query string and is needed for the follow up
pages or the images loaded by a XMLHttpRequest: pages or the images loaded by a XMLHttpRequest:
- DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...` - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
@ -72,18 +72,27 @@ def cache_vqd(query, value):
""" """
c = redisdb.client() c = redisdb.client()
if c: if c:
logger.debug("cache vqd value: %s", value)
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
c.set(key, value, ex=600) if value is not None:
logger.debug("cache vqd value: %s", value)
c.set(key, value, ex=600)
else:
# remove from cache
c.delete(key)
def _get_vqd_value(query):
    """Request the DuckDuckGo Lite page for *query* and extract its ``vqd`` value.

    The query is URL-encoded as the ``q`` argument of a GET request to the
    lite front-end; the response body is parsed as HTML and the ``value``
    attribute of the first ``<input name="vqd">`` element is looked up.

    :param query: search string to send to DuckDuckGo Lite
    :returns: the result of ``eval_xpath_getindex`` for index ``0`` with
        ``None`` passed as its default -- presumably ``None`` when the page
        contains no ``vqd`` input (confirm against ``eval_xpath_getindex``).
    """
    lite_url = 'https://lite.duckduckgo.com/lite/?' + urlencode({'q': query})
    html = lxml.html.fromstring(get(lite_url).text)
    return eval_xpath_getindex(html, "//input[@name='vqd']/@value", 0, None)
def get_vqd(query): def get_vqd(query):
"""Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached """Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd token from the
response. response.
""" """
value = None
c = redisdb.client() c = redisdb.client()
if c: if c:
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query) key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
@ -93,12 +102,20 @@ def get_vqd(query):
logger.debug("re-use cached vqd value: %s", value) logger.debug("re-use cached vqd value: %s", value)
return value return value
query_url = 'https://lite.duckduckgo.com/lite/?{args}'.format(args=urlencode({'q': query})) value = _get_vqd_value(query)
res = get(query_url) if not value:
doc = lxml.html.fromstring(res.text) # seems we got a CAPTCHA for this query string, send a dummy request to
value = doc.xpath("//input[@name='vqd']/@value")[0] # release the captcha and then fetch the vqd value for the query string
logger.debug("new vqd value: %s", value) # again.
cache_vqd(query, value) logger.warning("vqd token will no longer work, trying to get a new one by sending another query")
_get_vqd_value(f'{query[:3]} duckduckgo')
value = _get_vqd_value(query)
if not value:
logger.error("was not able to fetch a valid vqd token from DDG")
else:
logger.debug("new vqd value: %s", value)
cache_vqd(query, value)
return value return value
@ -241,10 +258,10 @@ def request(query, params):
# initial page does not have additional data in the input form # initial page does not have additional data in the input form
if params['pageno'] > 1: if params['pageno'] > 1:
params['data']['o'] = form_data.get('o', 'json') params['data']['o'] = 'json'
params['data']['api'] = form_data.get('api', 'd.js') params['data']['api'] = 'd.js'
params['data']['nextParams'] = form_data.get('nextParams', '') params['data']['nextParams'] = ''
params['data']['v'] = form_data.get('v', 'l') params['data']['v'] = 'l'
params['data']['kl'] = eng_region params['data']['kl'] = eng_region
params['cookies']['kl'] = eng_region params['cookies']['kl'] = eng_region
@ -274,23 +291,19 @@ def response(resp):
# the layout of the HTML tables is different. # the layout of the HTML tables is different.
result_table = result_table[1] result_table = result_table[1]
elif not len(result_table) >= 3: elif not len(result_table) >= 3:
# no more results # no more results / if we have the vqd token in cache, it's no longer
# valid and has to be deleted
cache_vqd(resp.search_params['data']['q'], None)
return [] return []
else: else:
result_table = result_table[2] result_table = result_table[2]
# update form data from response # update form data from response
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..') form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
if len(form): if len(form):
value = eval_xpath_getindex(form[0], "//input[@name='vqd']/@value", 0, None)
form = form[0]
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
logger.debug('form_data: %s', form_data)
value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
query = resp.search_params['data']['q'] query = resp.search_params['data']['q']
cache_vqd(query, value) if value:
cache_vqd(query, value)
tr_rows = eval_xpath(result_table, './/tr') tr_rows = eval_xpath(result_table, './/tr')
# In the last <tr> is the form of the 'previous/next page' links # In the last <tr> is the form of the 'previous/next page' links