searxng/searx/autocomplete.py
Markus Heiser 744d96a16c [fix] startpage engine: language/region & time support & fix CAPTCHA
One reason for the frequently seen CAPTCHA on startpage requests is the
incomplete requests SearXNG sends to startpage.com.  To avoid the CAPTCHA we need
to send a well-formed HTTP POST request with a cookie, i.e. we need to form a
request that is identical to the request built by startpage.com itself:

- in the cookie the **region** is selected
- in the POST arguments the **language** is selected

Based on the *engine_properties* boilerplate, SearXNG's startpage engine now
implements a `_fetch_engine_properties()` function to fetch regions & languages
from startpage.com.

This patch is a completely new implementation of the request() function, reverse
engineered from the startpage.com page.  The new implementation adds

- time-range support
- safe-search support

to the startpage engine, both of which were missing in the past.

The locale code 'no_NO' from startpage does not exist and is mapped to nb-NO.
For reference see the language-subtag registry at IANA [1]; `no` is the macrolanguage::

     type: language
     Subtag: nb
     Description: Norwegian Bokmål
     Added: 2005-10-16
     Suppress-Script: Latn
     Macrolanguage: no

Additional hints:

- To fetch languages from startpage, this patch makes use of the
  EngineProperties implemented in 7bf0d46c

- To get Startpage's locale & language, the function get_engine_locale from
  9ae409a is used.

[1] https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
[2] https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag

Closes: https://github.com/searxng/searxng/issues/1081
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2022-08-29 19:27:50 +02:00

178 lines
4.5 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This module implements functions needed for the autocompleter.
"""
from json import loads
from urllib.parse import urlencode
from lxml import etree
from httpx import HTTPError
from searx import settings
from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException
from searx.engines import engines
# a _fetch_supported_properties() for XPath engines isn't available right now
# _brave = ENGINES_LANGUAGES['brave'].keys()
def get(*args, **kwargs):
    """Issue a GET request via ``searx.network`` with autocomplete defaults.

    All positional and keyword arguments are forwarded to the network layer's
    ``get``.  Two keyword arguments are managed here:

    - ``timeout``: when the caller does not pass one, the value of
      ``settings['outgoing']['request_timeout']`` is used.
    - ``raise_for_httperror``: always forced to ``True``, whatever the caller
      passed.

    Returns whatever the underlying ``get`` returns.
    """
    # the forced flag comes last so it overrides a caller supplied value
    request_options = {**kwargs, 'raise_for_httperror': True}
    if 'timeout' not in request_options:
        request_options['timeout'] = settings['outgoing']['request_timeout']
    return http_get(*args, **request_options)
def brave(query, _lang):
    """Return the suggestions of the Brave Search autocompleter for *query*.

    The language argument is ignored.  An empty list is returned when the
    response is not flagged as OK.
    """
    suggest_url = 'https://search.brave.com/api/suggest?' + urlencode({'q': query})

    # NOTE: selecting the country from the language is disabled in this module
    # (the lookup ``if lang in _brave: country = lang`` is commented out), the
    # country cookie is therefore always 'all'.
    resp = get(suggest_url, cookies={'country': 'all'})
    if not resp.ok:
        return []

    # the suggestions are the second element of the JSON answer
    return list(resp.json()[1])
def dbpedia(query, _lang):
    """Return the labels found by the DBpedia keyword lookup for *query*.

    The language argument is ignored.  An empty list is returned when the
    response is not flagged as OK.
    """
    lookup_url = 'https://lookup.dbpedia.org/api/search.asmx/KeywordSearch?' + urlencode({'QueryString': query})
    response = get(lookup_url)
    if not response.ok:
        return []

    # the answer is parsed as XML, the text of every Result/Label is a suggestion
    dom = etree.fromstring(response.content)
    return dom.xpath('//Result/Label//text()')
def duckduckgo(query, _lang):
    """Return the suggestions of the DuckDuckGo autocompleter for *query*.

    The language argument is ignored.  The JSON answer is expected to be a
    sequence whose second element holds the suggestions; a shorter answer
    yields an empty list.
    """
    url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
    response = get(url.format(urlencode({'q': query})))
    payload = loads(response.text)
    if len(payload) <= 1:
        return []
    return payload[1]
def google(query, lang):
    """Return the suggestions of the Google autocompleter for *query*.

    *lang* is sent as the ``hl`` URL argument.  An empty list is returned when
    the response is not flagged as OK.
    """
    url_args = urlencode({'hl': lang, 'q': query})
    response = get('https://suggestqueries.google.com/complete/search?client=toolbar&' + url_args)
    if not response.ok:
        return []

    # the answer is parsed as XML, suggestions are the data attributes of the
    # <suggestion> elements
    dom = etree.fromstring(response.text)
    return dom.xpath('//suggestion/@data')
def seznam(query, _lang):
    """Return the suggestions of the Seznam autocompleter for *query*.

    The language argument is ignored.  An empty list is returned when the
    response is not flagged as OK.
    """
    url = 'https://suggest.seznam.cz/fulltext/cs?{query}'
    url_args = {
        'phrase': query,
        'cursorPosition': len(query),
        'format': 'json-2',
        'highlight': '1',
        'count': '6',
    }
    resp = get(url.format(query=urlencode(url_args)))
    if not resp.ok:
        return []

    answer = resp.json()
    # Only items of type 'ItemType.TEXT' are suggestions.  The text of an item
    # is a list of parts, the parts are glued together to one string.
    return [
        ''.join(fragment.get('text', '') for fragment in entry.get('text', []))
        for entry in answer.get('result', [])
        if entry.get('itemType', None) == 'ItemType.TEXT'
    ]
def startpage(query, lang):
    """Return the suggestions of the Startpage autocompleter for *query*.

    *lang* is passed to ``get_engine_locale()`` of the ``startpage`` engine;
    the second of the three values it returns is sent as the ``lui`` URL
    argument.
    """
    _, engine_language, _ = engines['startpage'].get_engine_locale(lang)

    url = 'https://startpage.com/suggestions?{query}'
    url_args = urlencode({'q': query, 'segment': 'startpage.udog', 'lui': engine_language})
    answer = get(url.format(query=url_args)).json()

    # entries without a 'text' key are skipped
    return [entry['text'] for entry in answer.get('suggestions', []) if 'text' in entry]
def swisscows(query, _lang):
    """Return the suggestions of the Swisscows autocompleter for *query*.

    The language argument is ignored.  The decoded JSON answer is returned
    unchanged.
    """
    url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
    response = get(url.format(query=urlencode({'query': query})))
    return loads(response.text)
def qwant(query, lang):
    """Return the suggestions of the Qwant autocompleter for *query*.

    *lang* is sent as the ``lang`` URL argument.  An empty list is returned
    when the response is not flagged as OK or when the ``status`` of the
    answer is not ``success``.
    """
    # qwant autocompleter (additional parameter : lang=en_en&count=xxx )
    url = 'https://api.qwant.com/api/suggest?{query}'
    resp = get(url.format(query=urlencode({'q': query, 'lang': lang})))
    if not resp.ok:
        return []

    answer = loads(resp.text)
    if answer['status'] != 'success':
        return []
    return [entry['value'] for entry in answer['data']['items']]
def wikipedia(query, lang):
    """Return the suggestions of the Wikipedia opensearch API for *query*.

    *lang* is used as the subdomain of ``wikipedia.org``.  The JSON answer is
    expected to be a sequence whose second element holds the suggestions; a
    shorter answer yields an empty list.
    """
    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
    response = get(url.format(urlencode({'search': query})))
    payload = loads(response.text)
    return payload[1] if len(payload) > 1 else []
# Registry of the available autocompleters: maps the backend name looked up by
# search_autocomplete() to the function implementing it.  Each function is
# called as ``backend(query, lang)``.
backends = {
'dbpedia': dbpedia,
'duckduckgo': duckduckgo,
'google': google,
'seznam': seznam,
'startpage': startpage,
'swisscows': swisscows,
'qwant': qwant,
'wikipedia': wikipedia,
'brave': brave,
}
def search_autocomplete(backend_name, query, lang):
    """Return the suggestions of the autocompleter *backend_name* for *query*.

    The backend is looked up in ``backends`` and called with *query* and
    *lang*.  An empty list is returned when the backend is unknown or when the
    backend raises ``HTTPError`` or ``SearxEngineResponseException``; any other
    exception propagates to the caller.
    """
    completer = backends.get(backend_name)
    suggestions = []
    if completer is not None:
        try:
            suggestions = completer(query, lang)
        except (HTTPError, SearxEngineResponseException):
            # best effort: a failing backend simply yields no suggestions
            suggestions = []
    return suggestions