Merge remote-tracking branch 'origin/latesto'

Joseph Cheung 2023-02-19 11:16:56 +08:00
commit d008d78cd6
408 changed files with 96618 additions and 55272 deletions


@@ -3,6 +3,7 @@
"""This module implements functions needed for the autocompleter.
"""
# pylint: disable=use-dict-literal
from json import loads
from urllib.parse import urlencode
@@ -89,17 +90,24 @@ def seznam(query, _lang):
# seznam search autocompleter
url = 'https://suggest.seznam.cz/fulltext/cs?{query}'
resp = get(url.format(query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)))
resp = get(
url.format(
query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)
)
)
if not resp.ok:
return []
data = resp.json()
return [''.join(
[part.get('text', '') for part in item.get('text', [])]
) for item in data.get('result', []) if item.get('itemType', None) == 'ItemType.TEXT']
return [
''.join([part.get('text', '') for part in item.get('text', [])])
for item in data.get('result', [])
if item.get('itemType', None) == 'ItemType.TEXT'
]
def startpage(query, lang):
# startpage autocompleter
@@ -145,6 +153,16 @@ def wikipedia(query, lang):
return []
def yandex(query, _lang):
# yandex autocompleter
url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
resp = loads(get(url.format(urlencode(dict(part=query)))).text)
if len(resp) > 1:
return resp[1]
return []
backends = {
'dbpedia': dbpedia,
'duckduckgo': duckduckgo,
@@ -155,6 +173,7 @@ backends = {
'qwant': qwant,
'wikipedia': wikipedia,
'brave': brave,
'yandex': yandex,
}
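
Taken together, each autocomplete backend is just a callable that takes (query, lang) and returns a list of suggestion strings. A minimal usage sketch, assuming the module is importable as searx.autocomplete and the network is reachable:

    from searx.autocomplete import backends

    # each backend maps (query, lang) to a plain list of suggestion strings
    suggestions = backends['yandex']('searx', 'en')
    print(suggestions)  # e.g. ['searx', 'searxng', ...], depending on the live API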


@@ -7,8 +7,11 @@
# pylint: disable=C,R
__all__ = ('cached_property',)
try:
from functools import cached_property # pylint: disable=unused-import
from functools import cached_property # type: ignore
except ImportError:

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -1396,170 +1396,155 @@
"sv",
"zh"
],
"qwant": [
"bg-BG",
"ca-ES",
"cs-CZ",
"da-DK",
"de-AT",
"de-CH",
"de-DE",
"el-GR",
"en-AU",
"en-CA",
"en-GB",
"en-IE",
"en-MY",
"en-NZ",
"en-US",
"es-AR",
"es-CL",
"es-ES",
"es-MX",
"et-EE",
"fi-FI",
"fr-BE",
"fr-CA",
"fr-CH",
"fr-FR",
"hu-HU",
"it-CH",
"it-IT",
"ko-KR",
"nb-NO",
"nl-BE",
"nl-NL",
"pl-PL",
"pt-PT",
"ro-RO",
"sv-SE",
"th-TH",
"zh-CN",
"zh-HK"
],
"qwant images": [
"bg-BG",
"ca-ES",
"cs-CZ",
"da-DK",
"de-AT",
"de-CH",
"de-DE",
"el-GR",
"en-AU",
"en-CA",
"en-GB",
"en-IE",
"en-MY",
"en-NZ",
"en-US",
"es-AR",
"es-CL",
"es-ES",
"es-MX",
"et-EE",
"fi-FI",
"fr-BE",
"fr-CA",
"fr-CH",
"fr-FR",
"hu-HU",
"it-CH",
"it-IT",
"ko-KR",
"nb-NO",
"nl-BE",
"nl-NL",
"pl-PL",
"pt-PT",
"ro-RO",
"sv-SE",
"th-TH",
"zh-CN",
"zh-HK"
],
"qwant news": [
"bg-BG",
"ca-ES",
"cs-CZ",
"da-DK",
"de-AT",
"de-CH",
"de-DE",
"el-GR",
"en-AU",
"en-CA",
"en-GB",
"en-IE",
"en-MY",
"en-NZ",
"en-US",
"es-AR",
"es-CL",
"es-ES",
"es-MX",
"et-EE",
"fi-FI",
"fr-BE",
"fr-CA",
"fr-CH",
"fr-FR",
"hu-HU",
"it-CH",
"it-IT",
"ko-KR",
"nb-NO",
"nl-BE",
"nl-NL",
"pl-PL",
"pt-PT",
"ro-RO",
"sv-SE",
"th-TH",
"zh-CN",
"zh-HK"
],
"qwant videos": [
"bg-BG",
"ca-ES",
"cs-CZ",
"da-DK",
"de-AT",
"de-CH",
"de-DE",
"el-GR",
"en-AU",
"en-CA",
"en-GB",
"en-IE",
"en-MY",
"en-NZ",
"en-US",
"es-AR",
"es-CL",
"es-ES",
"es-MX",
"et-EE",
"fi-FI",
"fr-BE",
"fr-CA",
"fr-CH",
"fr-FR",
"hu-HU",
"it-CH",
"it-IT",
"ko-KR",
"nb-NO",
"nl-BE",
"nl-NL",
"pl-PL",
"pt-PT",
"ro-RO",
"sv-SE",
"th-TH",
"zh-CN",
"zh-HK"
],
"qwant": {
"bg-BG": "bg_BG",
"ca-ES": "ca_ES",
"cs-CZ": "cs_CZ",
"da-DK": "da_DK",
"de-AT": "de_AT",
"de-CH": "de_CH",
"de-DE": "de_DE",
"el-GR": "el_GR",
"en-AU": "en_AU",
"en-CA": "en_CA",
"en-GB": "en_GB",
"en-IE": "en_IE",
"en-MY": "en_MY",
"en-NZ": "en_NZ",
"en-US": "en_US",
"es-AR": "es_AR",
"es-CL": "es_CL",
"es-ES": "es_ES",
"es-MX": "es_MX",
"et-EE": "et_EE",
"fi-FI": "fi_FI",
"fr-BE": "fr_BE",
"fr-CA": "fr_CA",
"fr-CH": "fr_CH",
"fr-FR": "fr_FR",
"hu-HU": "hu_HU",
"it-CH": "it_CH",
"it-IT": "it_IT",
"ko-KR": "ko_KR",
"nb-NO": "nb_NO",
"nl-BE": "nl_BE",
"nl-NL": "nl_NL",
"pl-PL": "pl_PL",
"pt-PT": "pt_PT",
"ro-RO": "ro_RO",
"sv-SE": "sv_SE",
"th-TH": "th_TH",
"zh-CN": "zh_CN",
"zh-HK": "zh_HK"
},
"qwant images": {
"bg-BG": "bg_BG",
"ca-ES": "ca_ES",
"cs-CZ": "cs_CZ",
"da-DK": "da_DK",
"de-AT": "de_AT",
"de-CH": "de_CH",
"de-DE": "de_DE",
"el-GR": "el_GR",
"en-AU": "en_AU",
"en-CA": "en_CA",
"en-GB": "en_GB",
"en-IE": "en_IE",
"en-MY": "en_MY",
"en-NZ": "en_NZ",
"en-US": "en_US",
"es-AR": "es_AR",
"es-CL": "es_CL",
"es-ES": "es_ES",
"es-MX": "es_MX",
"et-EE": "et_EE",
"fi-FI": "fi_FI",
"fr-BE": "fr_BE",
"fr-CA": "fr_CA",
"fr-CH": "fr_CH",
"fr-FR": "fr_FR",
"hu-HU": "hu_HU",
"it-CH": "it_CH",
"it-IT": "it_IT",
"ko-KR": "ko_KR",
"nb-NO": "nb_NO",
"nl-BE": "nl_BE",
"nl-NL": "nl_NL",
"pl-PL": "pl_PL",
"pt-PT": "pt_PT",
"ro-RO": "ro_RO",
"sv-SE": "sv_SE",
"th-TH": "th_TH",
"zh-CN": "zh_CN",
"zh-HK": "zh_HK"
},
"qwant news": {
"ca-ES": "ca_ES",
"de-AT": "de_AT",
"de-CH": "de_CH",
"de-DE": "de_DE",
"en-AU": "en_AU",
"en-CA": "en_CA",
"en-GB": "en_GB",
"en-IE": "en_IE",
"en-MY": "en_MY",
"en-NZ": "en_NZ",
"en-US": "en_US",
"es-AR": "es_AR",
"es-CL": "es_CL",
"es-ES": "es_ES",
"es-MX": "es_MX",
"fr-BE": "fr_BE",
"fr-CA": "fr_CA",
"fr-CH": "fr_CH",
"fr-FR": "fr_FR",
"it-CH": "it_CH",
"it-IT": "it_IT",
"nl-BE": "nl_BE",
"nl-NL": "nl_NL",
"pt-PT": "pt_PT"
},
"qwant videos": {
"bg-BG": "bg_BG",
"ca-ES": "ca_ES",
"cs-CZ": "cs_CZ",
"da-DK": "da_DK",
"de-AT": "de_AT",
"de-CH": "de_CH",
"de-DE": "de_DE",
"el-GR": "el_GR",
"en-AU": "en_AU",
"en-CA": "en_CA",
"en-GB": "en_GB",
"en-IE": "en_IE",
"en-MY": "en_MY",
"en-NZ": "en_NZ",
"en-US": "en_US",
"es-AR": "es_AR",
"es-CL": "es_CL",
"es-ES": "es_ES",
"es-MX": "es_MX",
"et-EE": "et_EE",
"fi-FI": "fi_FI",
"fr-BE": "fr_BE",
"fr-CA": "fr_CA",
"fr-CH": "fr_CH",
"fr-FR": "fr_FR",
"hu-HU": "hu_HU",
"it-CH": "it_CH",
"it-IT": "it_IT",
"ko-KR": "ko_KR",
"nb-NO": "nb_NO",
"nl-BE": "nl_BE",
"nl-NL": "nl_NL",
"pl-PL": "pl_PL",
"pt-PT": "pt_PT",
"ro-RO": "ro_RO",
"sv-SE": "sv_SE",
"th-TH": "th_TH",
"zh-CN": "zh_CN",
"zh-HK": "zh_HK"
},
"startpage": {
"af": {
"alias": "afrikaans"
@@ -1952,6 +1937,10 @@
"english_name": "Banjar",
"name": "Bahasa Banjar"
},
"blk": {
"english_name": "Pa'O",
"name": "\u1015\u1021\u102d\u102f\u101d\u103a\u108f\u1018\u102c\u108f\u101e\u102c\u108f"
},
"bm": {
"english_name": "Bambara",
"name": "Bamanankan"
@@ -2352,6 +2341,10 @@
"english_name": "Kabiye",
"name": "Kab\u0269y\u025b"
},
"kcg": {
"english_name": "Tyap",
"name": "Tyap"
},
"kg": {
"english_name": "Kongo",
"name": "Kik\u00f4ngo"
@@ -2668,6 +2661,10 @@
"english_name": "Picard",
"name": "Picard"
},
"pcm": {
"english_name": "Nigerian Pidgin",
"name": "Naij\u00e1"
},
"pdc": {
"english_name": "Pennsylvania German",
"name": "Deitsch"
@@ -3214,6 +3211,10 @@
"english_name": "Banjar",
"name": "Bahasa Banjar"
},
"blk": {
"english_name": "Pa'O",
"name": "\u1015\u1021\u102d\u102f\u101d\u103a\u108f\u1018\u102c\u108f\u101e\u102c\u108f"
},
"bm": {
"english_name": "Bambara",
"name": "Bamanankan"
@@ -3614,6 +3615,10 @@
"english_name": "Kabiye",
"name": "Kab\u0269y\u025b"
},
"kcg": {
"english_name": "Tyap",
"name": "Tyap"
},
"kg": {
"english_name": "Kongo",
"name": "Kik\u00f4ngo"
@@ -3930,6 +3935,10 @@
"english_name": "Picard",
"name": "Picard"
},
"pcm": {
"english_name": "Nigerian Pidgin",
"name": "Naij\u00e1"
},
"pdc": {
"english_name": "Pennsylvania German",
"name": "Deitsch"

File diff suppressed because it is too large

searx/data/lid.176.ftz (binary executable file; content not shown)

File diff suppressed because it is too large


@@ -1,10 +1,7 @@
{
"versions": [
"99.0.1",
"99.0",
"98.0.2",
"98.0.1",
"98.0"
"109.0",
"108.0"
],
"os": [
"Windows NT 10.0; Win64; x64",


@@ -114,7 +114,6 @@
"Q106645257": "MN m",
"Q106645261": "kN m",
"Q106645290": "dN m",
"Q106647058": "u",
"Q1067722": "Fg",
"Q106777906": "μS/m",
"Q106777917": "S/cm",
@@ -154,7 +153,6 @@
"Q107164998": "cd mm²/m²",
"Q107210119": "g/s",
"Q107210344": "mg/s",
"Q107213614": "kJ/100g",
"Q107226391": "cm⁻¹",
"Q1072404": "K",
"Q107244316": "mm⁻¹",
@@ -209,16 +207,45 @@
"Q1091257": "tex",
"Q1092296": "a",
"Q110143852": "Ω cm",
"Q110143896": "cm³/g",
"Q1104069": "$",
"Q11061003": "μm²",
"Q11061005": "nm²",
"Q110742003": "dppx",
"Q1131660": "st",
"Q1137675": "cr",
"Q114002440": "𒄀",
"Q114002534": "𒃻",
"Q114002568": "𒂠",
"Q114002639": "𒈨𒊑",
"Q114002688": "𒋗𒋛",
"Q114002734": "𒊺",
"Q114002796": "𒂆",
"Q114002897": "𒊬",
"Q114002930": "𒀺",
"Q114002955": "𒀹𒃷",
"Q114002974": "𒃷",
"Q114002998": "𒁓",
"Q114018694": "𒄥",
"Q114018781": "𒁀𒌷𒂵",
"Q1140444": "Zb",
"Q1140577": "Yb",
"Q114589269": "A",
"Q1152074": "Pb",
"Q1152323": "Tb",
"Q115277430": "QB",
"Q115280832": "RB",
"Q115359862": "qg",
"Q115359863": "rg",
"Q115359865": "Rg",
"Q115359866": "Qg",
"Q115359910": "Rm",
"Q115533751": "rm",
"Q115533764": "qm",
"Q115533776": "Qm",
"Q116432446": "ᵐ",
"Q116432563": "ˢ",
"Q116443090": "ʰ",
"Q1165799": "mil",
"Q11776930": "Mg",
"Q11830636": "psf",
@@ -237,12 +264,14 @@
"Q12257695": "Eb/s",
"Q12257696": "EB/s",
"Q12261466": "kB/s",
"Q12263659": "mgal",
"Q12265780": "Pb/s",
"Q12265783": "PB/s",
"Q12269121": "Yb/s",
"Q12269122": "YB/s",
"Q12269308": "Zb/s",
"Q12269309": "ZB/s",
"Q1238720": "vols.",
"Q1247300": "cm H₂O",
"Q12714022": "cwt",
"Q12789864": "GeV",
@@ -283,7 +312,6 @@
"Q14914907": "th",
"Q14916719": "Gpc",
"Q14923662": "Pm³",
"Q1511773": "LSd",
"Q15120301": "l atm",
"Q1542309": "xu",
"Q1545979": "ft³",
@@ -305,7 +333,6 @@
"Q17255465": "v_P",
"Q173117": "R$",
"Q1741429": "kpm",
"Q174467": "Lm",
"Q174728": "cm",
"Q174789": "mm",
"Q175821": "μm",
@@ -329,13 +356,11 @@
"Q182429": "m/s",
"Q1826195": "dl",
"Q18413919": "cm/s",
"Q184172": "F",
"Q185078": "a",
"Q185153": "erg",
"Q185648": "Torr",
"Q185759": "span",
"Q1872619": "zs",
"Q189097": "₧",
"Q190095": "Gy",
"Q19017495": "mm²",
"Q190951": "S$",
@@ -351,6 +376,7 @@
"Q194339": "B$",
"Q1970718": "mam",
"Q1972579": "pdl",
"Q19877834": "cd-ft",
"Q199462": "LE",
"Q199471": "Afs",
"Q200323": "dm",
@@ -389,7 +415,7 @@
"Q211256": "mi/h",
"Q21154419": "PD",
"Q211580": "BTU (th)",
"Q212120": "A h",
"Q212120": "Ah",
"Q213005": "G$",
"Q2140397": "in³",
"Q214377": "ell",
@@ -429,7 +455,6 @@
"Q23931040": "dam²",
"Q23931103": "nmi²",
"Q240468": "syr£",
"Q2414435": "$b.",
"Q242988": "Lib$",
"Q2438073": "ag",
"Q2448803": "mV",
@@ -507,6 +532,7 @@
"Q3013059": "ka",
"Q304479": "tr",
"Q305896": "DPI",
"Q3095010": "γ",
"Q31889818": "ppq",
"Q3194304": "kb",
"Q3207456": "mW",
@@ -546,7 +572,7 @@
"Q3773454": "Mpc",
"Q3815076": "Kib",
"Q3833309": "£",
"Q3858002": "mA h",
"Q3858002": "mAh",
"Q3867152": "ft/s²",
"Q389062": "Tib",
"Q3902688": "pl",
@@ -607,6 +633,8 @@
"Q53393868": "GJ",
"Q53393886": "PJ",
"Q53393890": "EJ",
"Q53393893": "ZJ",
"Q53393898": "YJ",
"Q53448786": "yHz",
"Q53448790": "zHz",
"Q53448794": "fHz",
@@ -620,6 +648,7 @@
"Q53448826": "hHz",
"Q53448828": "yJ",
"Q53448832": "zJ",
"Q53448835": "fJ",
"Q53448842": "pJ",
"Q53448844": "nJ",
"Q53448847": "μJ",
@@ -682,6 +711,7 @@
"Q53951982": "Mt",
"Q53952048": "kt",
"Q54006645": "ZWb",
"Q54081354": "ZT",
"Q54081925": "ZSv",
"Q54082468": "ZS",
"Q54083144": "ZΩ",
@@ -706,8 +736,6 @@
"Q56157046": "nmol",
"Q56157048": "pmol",
"Q56160603": "fmol",
"Q56302633": "UM",
"Q56317116": "mgal",
"Q56317622": "Q_P",
"Q56318907": "kbar",
"Q56349362": "Bs.S",
@@ -1184,10 +1212,10 @@
"Q11570": "kg",
"Q11573": "m",
"Q11574": "s",
"Q11579": "K",
"Q11582": "L",
"Q12129": "pc",
"Q12438": "N",
"Q16068": "DM",
"Q1811": "AU",
"Q20764": "Ma",
"Q2101": "e",

searx/engines/9gag.py (new file, 77 lines)

@@ -0,0 +1,77 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=invalid-name
"""9GAG (social media)"""
from json import loads
from datetime import datetime
from urllib.parse import urlencode
about = {
"website": 'https://9gag.com/',
"wikidata_id": 'Q277421',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['social media']
paging = True
search_url = "https://9gag.com/v1/search-posts?{query}"
page_size = 10
def request(query, params):
query = urlencode({'query': query, 'c': (params['pageno'] - 1) * page_size})
params['url'] = search_url.format(query=query)
return params
def response(resp):
results = []
json_results = loads(resp.text)['data']
for result in json_results['posts']:
result_type = result['type']
# Use the uncropped version of the thumbnail when the image height is not too large
if result['images']['image700']['height'] > 400:
thumbnail = result['images']['imageFbThumbnail']['url']
else:
thumbnail = result['images']['image700']['url']
if result_type == 'Photo':
results.append(
{
'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.utcfromtimestamp(result['creationTs']),
'img_src': result['images']['image700']['url'],
'thumbnail_src': thumbnail,
}
)
elif result_type == 'Animated':
results.append(
{
'template': 'videos.html',
'url': result['url'],
'title': result['title'],
'content': result['description'],
'publishedDate': datetime.utcfromtimestamp(result['creationTs']),
'thumbnail': thumbnail,
'iframe_src': result['images'].get('image460sv', {}).get('url'),
}
)
if 'tags' in json_results:
for suggestion in json_results['tags']:
results.append({'suggestion': suggestion['key']})
return results
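
For reference, a short sketch of how the paginated request URL above is assembled; the offset parameter c advances by page_size for every page:

    from urllib.parse import urlencode

    page_size = 10
    pageno = 2  # second page of results
    query = urlencode({'query': 'cats', 'c': (pageno - 1) * page_size})
    url = "https://9gag.com/v1/search-posts?{query}".format(query=query)
    # -> https://9gag.com/v1/search-posts?query=cats&c=10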


@@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
"enable_http": False,
"using_tor_proxy": False,
"display_error_messages": True,
"send_accept_language_header": False,
"tokens": [],
"about": {},
}
@@ -80,6 +81,7 @@ engine_shortcuts = {}
engine_shortcuts[engine.shortcut] = engine.name
:meta hide-value:
"""
@@ -104,8 +106,12 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
- required attribute is not set :py:func:`is_missing_required_attributes`
"""
# pylint: disable=too-many-return-statements
engine_name = engine_data['name']
engine_name = engine_data.get('name')
if engine_name is None:
logger.error('An engine does not have a "name" field')
return None
if '_' in engine_name:
logger.error('Engine name contains underscore: "{}"'.format(engine_name))
return None
@@ -116,7 +122,10 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
engine_data['name'] = engine_name
# load_module
engine_module = engine_data['engine']
engine_module = engine_data.get('engine')
if engine_module is None:
logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name))
return None
try:
engine = load_module(engine_module + '.py', ENGINE_DIR)
except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
@@ -149,7 +158,11 @@ def set_loggers(engine, engine_name):
engine.logger = logger.getChild(engine_name)
# the engine may have loaded some other engines
# make sure the logger is initialized
for module_name, module in sys.modules.items():
# use sys.modules.copy() to avoid "RuntimeError: dictionary changed size during iteration"
# see https://github.com/python/cpython/issues/89516
# and https://docs.python.org/3.10/library/sys.html#sys.modules
modules = sys.modules.copy()
for module_name, module in modules.items():
if (
module_name.startswith("searx.engines")
and module_name != "searx.engines.__init__"
@@ -269,12 +282,12 @@ def is_engine_active(engine: Engine):
def register_engine(engine: Engine):
if engine.name in engines:
logger.error('Engine config error: ambigious name: {0}'.format(engine.name))
logger.error('Engine config error: ambiguous name: {0}'.format(engine.name))
sys.exit(1)
engines[engine.name] = engine
if engine.shortcut in engine_shortcuts:
logger.error('Engine config error: ambigious shortcut: {0}'.format(engine.shortcut))
logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
sys.exit(1)
engine_shortcuts[engine.shortcut] = engine.name
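
The sys.modules.copy() change above guards against a classic pitfall; a minimal sketch of why the snapshot matters:

    import sys

    # Iterating sys.modules directly is unsafe: any import performed inside
    # the loop body grows the dict and raises
    # "RuntimeError: dictionary changed size during iteration".
    # Walking a snapshot avoids that:
    for module_name, module in sys.modules.copy().items():
        if module_name.startswith("searx.engines"):
            pass  # imports triggered here can no longer invalidate the iterator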


@@ -0,0 +1,57 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Apple App Store
"""
from json import loads
from urllib.parse import urlencode
from dateutil.parser import parse
about = {
"website": 'https://www.apple.com/app-store/',
"wikidata_id": 'Q368215',
"official_api_documentation": (
'https://developer.apple.com/library/archive/documentation/AudioVideo/Conceptual/'
'iTuneSearchAPI/UnderstandingSearchResults.html#//apple_ref/doc/uid/TP40017632-CH8-SW1'
),
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['files', 'apps']
safesearch = True
search_url = 'https://itunes.apple.com/search?{query}'
def request(query, params):
explicit = "Yes"
if params['safesearch'] > 0:
explicit = "No"
params['url'] = search_url.format(query=urlencode({'term': query, 'media': 'software', 'explicit': explicit}))
return params
def response(resp):
results = []
json_result = loads(resp.text)
for result in json_result['results']:
results.append(
{
'url': result['trackViewUrl'],
'title': result['trackName'],
'content': result['description'],
'img_src': result['artworkUrl100'],
'publishedDate': parse(result['currentVersionReleaseDate']),
'author': result['sellerName'],
}
)
return results
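
A sketch of the URL produced by request() above for a safe search; 'podcast player' is only an illustrative query:

    from urllib.parse import urlencode

    query, safesearch = 'podcast player', 1
    explicit = "No" if safesearch > 0 else "Yes"  # same mapping as request()
    url = 'https://itunes.apple.com/search?' + urlencode(
        {'term': query, 'media': 'software', 'explicit': explicit}
    )
    # -> https://itunes.apple.com/search?term=podcast+player&media=software&explicit=No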

searx/engines/apple_maps.py (new file, 113 lines)

@@ -0,0 +1,113 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Apple Maps"""
from json import loads
from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
about = {
"website": 'https://www.apple.com/maps/',
"wikidata_id": 'Q276101',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
token = {'value': '', 'last_updated': None}
categories = ['map']
paging = False
search_url = "https://api.apple-mapkit.com/v1/search?{query}&mkjsVersion=5.72.53"
def obtain_token():
update_time = time() - (time() % 1800)
try:
# use duckduckgo's mapkit token
token_response = http_get('https://duckduckgo.com/local.js?get_mk_token=1', timeout=2.0)
actual_token = http_get(
'https://cdn.apple-mapkit.com/ma/bootstrap?apiVersion=2&mkjsVersion=5.72.53&poi=1',
timeout=2.0,
headers={'Authorization': 'Bearer ' + token_response.text},
)
token['value'] = loads(actual_token.text)['authInfo']['access_token']
token['last_updated'] = update_time
# pylint: disable=bare-except
except:
pass
return token
def request(query, params):
if time() - (token['last_updated'] or 0) > 1800:
obtain_token()
params['url'] = search_url.format(query=urlencode({'q': query, 'lang': params['language']}))
params['headers'] = {'Authorization': 'Bearer ' + token['value']}
return params
def response(resp):
results = []
resp_json = loads(resp.text)
user_language = resp.search_params['language']
for result in resp_json['results']:
boundingbox = None
if 'displayMapRegion' in result:
box = result['displayMapRegion']
boundingbox = [box['southLat'], box['northLat'], box['westLng'], box['eastLng']]
links = []
if 'telephone' in result:
telephone = result['telephone']
links.append(
{
'label': get_key_label('phone', user_language),
'url': 'tel:' + telephone,
'url_label': telephone,
}
)
if result.get('urls'):
url = result['urls'][0]
links.append(
{
'label': get_key_label('website', user_language),
'url': url,
'url_label': url,
}
)
results.append(
{
'template': 'map.html',
'type': result.get('poiCategory'),
'title': result['name'],
'links': links,
'latitude': result['center']['lat'],
'longitude': result['center']['lng'],
'url': result['placecardUrl'],
'boundingbox': boundingbox,
'geojson': {'type': 'Point', 'coordinates': [result['center']['lng'], result['center']['lat']]},
'address': {
'name': result['name'],
'house_number': result.get('subThoroughfare'),
'road': result.get('thoroughfare'),
'locality': result.get('locality'),
'postcode': result.get('postCode'),
'country': result.get('country'),
},
}
)
return results
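
The token cache above refreshes at most twice per hour: time() - (time() % 1800) floors the clock to the last half-hour boundary, so every request inside one window reuses the same token. A small worked sketch:

    def bucket(now):
        # floor the timestamp to the enclosing 1800-second window
        return now - (now % 1800)

    assert bucket(5400) == 5400  # exactly on a boundary (3 * 1800)
    assert bucket(7199) == 5400  # 1799 s later, still the same window
    assert bucket(7200) == 7200  # the next window begins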


@@ -3,9 +3,10 @@
ArXiV (Scientific preprints)
"""
from lxml import html
from lxml import etree
from lxml.etree import XPath
from datetime import datetime
from searx.utils import eval_xpath_list, eval_xpath_getindex
from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
@@ -17,7 +18,7 @@ about = {
"results": 'XML-RSS',
}
categories = ['science']
categories = ['science', 'scientific publications']
paging = True
base_url = (
@@ -27,6 +28,23 @@ base_url = (
# engine dependent config
number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params):
# basic search
@@ -41,30 +59,50 @@ def request(query, params):
def response(resp):
results = []
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
dom = html.fromstring(resp.content)
url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
for entry in eval_xpath_list(dom, '//entry'):
title = eval_xpath_getindex(entry, './/title', 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
url = eval_xpath_getindex(entry, './/id', 0).text
# doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
content_string = '{doi_content}{abstract_content}'
# pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
abstract = eval_xpath_getindex(entry, './/summary', 0).text
# journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
# If a doi is available, add it to the snippet
doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None)
doi_content = doi_element.text if doi_element is not None else ''
content = content_string.format(doi_content=doi_content, abstract_content=abstract)
# tags
tag_elements = eval_xpath(entry, xpath_category)
tags = [str(tag) for tag in tag_elements]
if len(content) > 300:
content = content[0:300] + "..."
# TODO: center snippet on query term
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ')
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict)
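
The precompiled XPath objects above bind the Atom and arXiv namespaces once instead of re-parsing the expression for every entry. A self-contained sketch of the same pattern on a toy feed:

    from lxml import etree
    from lxml.etree import XPath

    ns = {"atom": "http://www.w3.org/2005/Atom"}
    feed = etree.fromstring(
        b'<feed xmlns="http://www.w3.org/2005/Atom">'
        b'<entry><title>An Example Paper</title></entry></feed>'
    )
    xpath_title = XPath('//atom:entry/atom:title', namespaces=ns)
    print(xpath_title(feed)[0].text)  # An Example Paper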


@@ -4,11 +4,13 @@
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""
# pylint: disable=too-many-branches
import re
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
from searx.utils import eval_xpath, extract_text, match_language
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
from searx.network import multi_requests, Request
about = {
"website": 'https://www.bing.com',
@@ -24,6 +26,7 @@ categories = ['general', 'web']
paging = True
time_range_support = False
safesearch = False
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {}
@@ -67,42 +70,71 @@ def request(query, params):
logger.debug("headers.Referer --> %s", referer)
params['url'] = base_url + search_path
params['headers']['Accept-Language'] = "en-US,en;q=0.5"
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params
def response(resp):
results = []
result_len = 0
dom = html.fromstring(resp.text)
for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
# IMO //div[@class="sa_cc"] no longer matches
logger.debug('found //div[@class="sa_cc"] --> %s', result)
link = eval_xpath(result, './/h3/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url, 'title': title, 'content': content})
# parse results again if nothing is found yet
for result in eval_xpath(dom, '//li[@class="b_algo"]'):
link = eval_xpath(result, './/h2/a')[0]
url_to_resolve = []
url_to_resolve_index = []
i = 0
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath_getindex(result, './/h2/a', 0, None)
if link is None:
continue
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(eval_xpath(result, './/p'))
# Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
content = eval_xpath(result, '(.//p)[1]')
for p in content:
for e in p.xpath('.//a'):
e.getparent().remove(e)
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
e.getparent().remove(e)
content = extract_text(content)
# get the real URL either using the URL shown to user or following the Bing URL
if url.startswith('https://www.bing.com/ck/a?'):
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
# Bing can shorten the URL either at the end or in the middle of the string
if (
url_cite.startswith('https://')
and '…' not in url_cite
and '...' not in url_cite
and '›' not in url_cite
):
# no need for an additional HTTP request
url = url_cite
else:
# resolve the URL with an additional HTTP request
url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
url_to_resolve_index.append(i)
url = None # remove the result if the HTTP Bing redirect raise an exception
# append result
results.append({'url': url, 'title': title, 'content': content})
# increment result pointer for the next iteration in this loop
i += 1
# resolve all Bing redirections in parallel
request_list = [
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
]
response_list = multi_requests(request_list)
for i, redirect_response in enumerate(response_list):
if not isinstance(redirect_response, Exception):
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
# get number_of_results
try:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
if "-" in result_len_container:
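
The two parallel lists above (url_to_resolve and url_to_resolve_index) let the engine patch resolved redirects back into position once multi_requests() returns. A toy sketch of that bookkeeping, with hypothetical stand-in URLs:

    # results whose 'url' is None are still awaiting redirect resolution
    results = [{'url': None}, {'url': 'https://example.org/kept'}, {'url': None}]
    url_to_resolve_index = [0, 2]
    resolved = ['https://a.example/page', 'https://b.example/page']  # hypothetical

    for i, location in enumerate(resolved):
        results[url_to_resolve_index[i]]['url'] = location
    # results[0] and results[2] now carry their final target URLs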


@@ -31,6 +31,7 @@ categories = ['images', 'web']
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general'
number_of_results = 28


@@ -34,6 +34,7 @@ about = {
categories = ['news']
paging = True
time_range_support = True
send_accept_language_header = True
# search-url
base_url = 'https://www.bing.com/'


@@ -30,6 +30,7 @@ categories = ['videos', 'web']
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
number_of_results = 28
base_url = 'https://www.bing.com/'
@@ -70,10 +71,6 @@ def request(query, params):
if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
# bing videos did not like "older" versions < 70.0.1 when selecting other
# languages than 'en' .. very strange ?!?!
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
return params
@@ -83,7 +80,7 @@ def response(resp):
dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]'):
for result in dom.xpath('//div[@class="dg_u"]/div[contains(@class, "mc_vtvc")]'):
metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info)


@@ -4,7 +4,6 @@
"""
from json import loads
from datetime import datetime
from urllib.parse import urlencode
@@ -19,7 +18,7 @@ about = {
"results": 'JSON',
}
categories = ['science']
categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
@@ -42,39 +41,75 @@ def request(query, params):
)
params['url'] = base_url + search_path
logger.debug("query_url --> %s", params['url'])
return params
def response(resp):
results = []
json_data = loads(resp.text)
json_data = resp.json()
for result in json_data['data']:
source = result['_source']
url = None
if source.get('urls'):
url = source['urls'][0].replace('http://', 'https://', 1)
if url is None and source.get('doi'):
# use the DOI reference
url = 'https://doi.org/' + source['doi']
if url is None and source.get('downloadUrl'):
# use the downloadUrl
url = source['downloadUrl']
if url is None and source.get('identifiers'):
# try to find an ark id, see
# https://www.wikidata.org/wiki/Property:P8091
# and https://en.wikipedia.org/wiki/Archival_Resource_Key
arkids = [
identifier[5:] # 5 is the length of "ark:/"
for identifier in source.get('identifiers')
if isinstance(identifier, str) and identifier.startswith('ark:/')
]
if len(arkids) > 0:
url = 'https://n2t.net/' + arkids[0]
if url is None:
continue
publishedDate = None
time = source['publishedDate'] or source['depositedDate']
if time:
date = datetime.fromtimestamp(time / 1000)
else:
date = None
publishedDate = datetime.fromtimestamp(time / 1000)
metadata = []
if source['publisher'] and len(source['publisher']) > 3:
metadata.append(source['publisher'])
if source['topics']:
metadata.append(source['topics'][0])
if source['doi']:
metadata.append(source['doi'])
metadata = ' / '.join(metadata)
# sometimes the 'title' is None / filter None values
journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
publisher = source['publisher']
if publisher:
publisher = source['publisher'].strip("'")
results.append(
{
'url': source['urls'][0].replace('http://', 'https://', 1),
'template': 'paper.html',
'title': source['title'],
'content': source['description'],
'publishedDate': date,
'metadata': metadata,
'url': url,
'content': source['description'] or '',
# 'comments': '',
'tags': source['topics'],
'publishedDate': publishedDate,
'type': (source['types'] or [None])[0],
'authors': source['authors'],
'editor': ', '.join(source['contributors'] or []),
'publisher': publisher,
'journal': ', '.join(journals),
# 'volume': '',
# 'pages' : '',
# 'number': '',
'doi': source['doi'],
'issn': [x for x in [source.get('issn')] if x],
'isbn': [x for x in [source.get('isbn')] if x], # exists in the rawRecordXml
'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'),
}
)

searx/engines/crossref.py (new file, 60 lines)

@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""CrossRef (Science)
"""
# pylint: disable=use-dict-literal
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": 'https://www.crossref.org/',
"wikidata_id": 'Q5188229',
"official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
search_url = 'https://api.crossref.org/works'
def request(query, params):
params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
return params
def response(resp):
res = resp.json()
results = []
for record in res['message']['items']:
record_type = record['type']
if record_type == 'book-chapter':
title = record['container-title'][0]
if record['title'][0].lower().strip() != title.lower().strip():
title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')'
journal = None
else:
title = html_to_text(record['title'][0])
journal = record.get('container-title', [None])[0]
url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
results.append(
{
'template': 'paper.html',
'url': url,
'title': title,
'journal': journal,
'volume': record.get('volume'),
'type': record['type'],
'content': html_to_text(record.get('abstract', '')),
'publisher': record.get('publisher'),
'authors': authors,
'doi': record['DOI'],
'isbn': isbn,
}
)
return results
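
For reference, the query URL request() above produces; the offset arithmetic implies a fixed page size of 20:

    from urllib.parse import urlencode

    query, pageno = 'language models', 3
    url = 'https://api.crossref.org/works' + '?' + urlencode(
        dict(query=query, offset=20 * (pageno - 1))
    )
    # -> https://api.crossref.org/works?query=language+models&offset=40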


@@ -30,7 +30,7 @@ number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
@@ -58,7 +58,7 @@ search_url = (
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
).format(
fields=','.join(result_fields),
password_protected= 'false',
password_protected='false',
private='false',
sort='relevance',
limit=number_of_results,
@@ -93,7 +93,7 @@ def request(query, params):
query_args = {
'search': query,
'languages': language_iso639,
'page': params['pageno'],
'page': params['pageno'],
}
if locale.territory:
@@ -170,7 +170,4 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
response_json = resp.json()
return [
item['locale']
for item in response_json['list']
]
return [item['locale'] for item in response_json['list']]

searx/engines/deepl.py (new file, 62 lines)

@@ -0,0 +1,62 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Deepl translation engine"""
from json import loads
about = {
"website": 'https://deepl.com',
"wikidata_id": 'Q43968444',
"official_api_documentation": 'https://www.deepl.com/docs-api',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general']
url = 'https://api-free.deepl.com/v2/translate'
api_key = None
def request(_query, params):
'''pre-request callback
params<dict>:
- ``method`` : POST/GET
- ``headers``: {}
- ``data``: {} # if method == POST
- ``url``: ''
- ``category``: 'search category'
- ``pageno``: 1 # number of the requested page
'''
params['url'] = url
params['method'] = 'POST'
params['data'] = {'auth_key': api_key, 'text': params['query'], 'target_lang': params['to_lang'][1]}
return params
def response(resp):
results = []
result = loads(resp.text)
translations = result['translations']
infobox = "<dl>"
for translation in translations:
infobox += f"<dd>{translation['text']}</dd>"
infobox += "</dl>"
results.append(
{
'infobox': 'Deepl',
'content': infobox,
}
)
return results
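
A sketch of the form body request() above posts to the free endpoint; the key and the texts are placeholders, not real values:

    data = {
        'auth_key': 'xxxx-placeholder-key',  # configured via api_key in settings
        'text': 'hello world',               # the user's query
        'target_lang': 'DE',                 # taken from params['to_lang'][1]
    }
    # POSTing this to https://api-free.deepl.com/v2/translate returns JSON like
    # {"translations": [{"detected_source_language": "EN", "text": "hallo welt"}]},
    # which response() folds into the <dl> infobox.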


@@ -19,7 +19,8 @@ list in ``settings.yml``:
from json import loads
from urllib.parse import urlencode
engine_type = 'offline'
engine_type = 'online'
send_accept_language_header = True
categories = ['general']
disabled = True
timeout = 2.0


@@ -3,6 +3,7 @@
"""Docker Hub (IT)
"""
# pylint: disable=use-dict-literal
from json import loads
from urllib.parse import urlencode


@@ -18,7 +18,7 @@ from searx.network import get
# about
about = {
"website": 'https://lite.duckduckgo.com/lite',
"website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False,
@@ -31,6 +31,7 @@ categories = ['general', 'web']
paging = True
supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True
send_accept_language_header = True
language_aliases = {
'ar-SA': 'ar-XA',
@@ -45,7 +46,7 @@ language_aliases = {
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
# search-url
url = 'https://lite.duckduckgo.com/lite'
url = 'https://lite.duckduckgo.com/lite/'
url_ping = 'https://duckduckgo.com/t/sl_l'
# match query's language to a region code that duckduckgo will accept
@@ -72,6 +73,7 @@ def request(query, params):
# link again and again ..
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
params['headers']['Referer'] = 'https://google.com/'
# initial page does not have an offset
if params['pageno'] == 2:


@@ -27,6 +27,8 @@ about = {
"results": 'JSON',
}
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@@ -62,7 +64,6 @@ def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0]
params['headers']['Accept-Language'] = language
return params
@@ -78,7 +79,7 @@ def response(resp):
# * book / performing art / film / television / media franchise / concert tour / playwright
# * prepared food
# * website / software / os / programming language / file format / software engineer
# * compagny
# * company
content = ''
heading = search_res.get('Heading', '')


@@ -30,6 +30,7 @@ about = {
categories = ['images', 'web']
paging = True
safesearch = True
send_accept_language_header = True
# search-url
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'


@@ -0,0 +1,136 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo Weather"""
from json import loads
from urllib.parse import quote
from datetime import datetime
from flask_babel import gettext
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ["others"]
url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition):
res = ""
res += f"<tr><td><b>{gettext('Condition')}</b></td>" f"<td><b>{condition['summary']}</b></td></tr>"
res += (
f"<tr><td><b>{gettext('Temperature')}</b></td>"
f"<td><b>{f_to_c(condition['temperature'])}°C / {condition['temperature']}°F</b></td></tr>"
)
res += (
f"<tr><td>{gettext('Feels like')}</td><td>{f_to_c(condition['apparentTemperature'])}°C / "
f"{condition['apparentTemperature']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Wind')}</td><td>{condition['windBearing']}° — "
f"{(condition['windSpeed'] * 1.6093440006147):.2f} km/h / {condition['windSpeed']} mph</td></tr>"
)
res += f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} km</td></tr>"
res += f"<tr><td>{gettext('Humidity')}</td><td>{(condition['humidity'] * 100):.1f}%</td></tr>"
return res
def generate_day_table(day):
res = ""
res += (
f"<tr><td>{gettext('Min temp.')}</td><td>{f_to_c(day['temperatureLow'])}°C / "
f"{day['temperatureLow']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Max temp.')}</td><td>{f_to_c(day['temperatureHigh'])}°C / "
f"{day['temperatureHigh']}°F</td></tr>"
)
res += f"<tr><td>{gettext('UV index')}</td><td>{day['uvIndex']}</td></tr>"
res += (
f"<tr><td>{gettext('Sunrise')}</td><td>{datetime.fromtimestamp(day['sunriseTime']).strftime('%H:%M')}</td></tr>"
)
res += (
f"<tr><td>{gettext('Sunset')}</td><td>{datetime.fromtimestamp(day['sunsetTime']).strftime('%H:%M')}</td></tr>"
)
return res
def request(query, params):
params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0])
return params
def f_to_c(temperature):
return "%.2f" % ((temperature - 32) / 1.8)
def response(resp):
results = []
if resp.text.strip() == "ddg_spice_forecast();":
return []
result = loads(resp.text[resp.text.find('\n') + 1 : resp.text.rfind('\n') - 2])
current = result["currently"]
title = result['flags']['ddg-location']
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
infobox += generate_condition_table(current)
infobox += "</tbody></table>"
last_date = None
for time in result['hourly']['data']:
current_time = datetime.fromtimestamp(time['time'])
if last_date != current_time.date():
if last_date is not None:
infobox += "</tbody></table>"
infobox += f"<h3>{current_time.strftime('%Y-%m-%d')}</h3>"
infobox += "<table><tbody>"
for day in result['daily']['data']:
if datetime.fromtimestamp(day['time']).date() == current_time.date():
infobox += generate_day_table(day)
infobox += "</tbody></table><table><tbody>"
last_date = current_time.date()
infobox += f"<tr><td rowspan=\"7\"><b>{current_time.strftime('%H:%M')}</b></td></tr>"
infobox += generate_condition_table(time)
infobox += "</tbody></table>"
results.append(
{
"infobox": title,
"content": infobox,
}
)
return results
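
Two details above are easy to miss: the spice endpoint wraps its JSON in a ddg_spice_forecast(...) JavaScript call, which the slice in response() strips off, and all temperatures arrive in Fahrenheit. A worked check of the conversion:

    def f_to_c(temperature):
        # same formula as above: Fahrenheit to Celsius, two decimal places
        return "%.2f" % ((temperature - 32) / 1.8)

    assert f_to_c(32) == "0.00"     # freezing point of water
    assert f_to_c(212) == "100.00"  # boiling point of water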


@@ -7,6 +7,7 @@ import re
from urllib.parse import quote, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.network import raise_for_httperror
# about
about = {
@@ -47,6 +48,7 @@ def request(query, params):
# after the last page of results, spelling corrections are returned after a HTTP redirect
# whatever the page number is
params['soft_max_redirects'] = 1
params['raise_for_httperror'] = False
return params
@@ -56,6 +58,11 @@ def response(resp):
'''
results = []
if resp.status_code == 404:
return results
raise_for_httperror(resp)
dom = html.fromstring(resp.text)
number_of_results_element = eval_xpath_getindex(


@@ -0,0 +1,67 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Emojipedia
Emojipedia is an emoji reference website which documents the meaning and
common usage of emoji characters in the Unicode Standard. It has been owned by
Zedge since 2021. Emojipedia is a voting member of The Unicode Consortium.[1]
[1] https://en.wikipedia.org/wiki/Emojipedia
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
about = {
"website": 'https://emojipedia.org',
"wikidata_id": 'Q22908129',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = []
paging = False
time_range_support = False
base_url = 'https://emojipedia.org'
search_url = base_url + '/search/?{query}'
def request(query, params):
params['url'] = search_url.format(
query=urlencode({'q': query}),
)
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, "//ol[@class='search-results']/li"):
extracted_desc = extract_text(eval_xpath_getindex(result, './/p', 0))
if 'No results found.' in extracted_desc:
break
link = eval_xpath_getindex(result, './/h2/a', 0)
url = base_url + link.attrib.get('href')
title = extract_text(link)
content = extracted_desc
res = {'url': url, 'title': title, 'content': content}
results.append(res)
return results


@@ -3,7 +3,7 @@
"""
Gigablast (Web)
"""
# pylint: disable=invalid-name
# pylint: disable=invalid-name, use-dict-literal
import re
from time import time


@@ -40,7 +40,7 @@ def response(resp):
search_res = loads(resp.text)
# check if items are recieved
# check if items are received
if 'items' not in search_res:
return []


@@ -13,9 +13,9 @@ The google WEB engine itself has a special setup option:
- name: google
...
use_mobile_ui: true
use_mobile_ui: false
``use_mobile_ui``: (default: ``true``)
``use_mobile_ui``: (default: ``false``)
Enables using the *mobile endpoint* to bypass the google blocking (see
:issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More
results` is not affected by Google rate limiting and we can still do requests
@@ -45,6 +45,7 @@ categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
send_accept_language_header = True
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'
@@ -111,21 +112,14 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
# google results are grouped into <div class="jtfYYd ..." ../>
results_xpath = '//div[@class="jtfYYd"]'
results_xpath = './/div[@data-sokoban-container]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature=1]'
# google *sections* are not usual *results*, so we ignore them
g_section_with_header = './g-section-with-header'
# the title is a h3 tag relative to the result group
title_xpath = './/h3[1]'
# in the result group there is <div class="yuRUbf" ../>; its first child is a <a
# href=...>
href_xpath = './/div[@class="yuRUbf"]//a/@href'
# in the result group there is <div class="VwiC3b ..." ../> containing the *content*
content_xpath = './/div[contains(@class, "VwiC3b")]'
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links, not the links themselves.
@@ -241,16 +235,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
# language.
ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
# Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
ret_val['headers']['Accept-Language'] = ','.join(
[
lang_country,
language + ';q=0.8,',
'en;q=0.6',
'*;q=0.5',
]
)
return ret_val
@@ -270,7 +254,7 @@ def request(query, params):
if use_mobile_ui:
additional_parameters = {
'asearch': 'arc',
'async': 'use_ac:true,_fmt:pc',
'async': 'use_ac:true,_fmt:prog',
}
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
@@ -298,6 +282,7 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
if use_mobile_ui:
params['headers']['Accept'] = '*/*'
@@ -341,14 +326,14 @@ def response(resp):
# google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ingoring <g-section-with-header>")
logger.debug("ignoring <g-section-with-header>")
continue
try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
if title_tag is None:
# this not one of the common google results *section*
logger.debug('ingoring item from the result_xpath list: missing title')
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None)
@@ -356,7 +341,7 @@
continue
content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
if content is None:
logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title)
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
logger.debug('add link to results: %s', title)


@@ -1,28 +1,20 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google images engine.
"""This is the implementation of the google images engine using the google
internal API used by the Google Go Android app.
.. admonition:: Content-Security-Policy (CSP)
This internal API offers results in
This engine needs to allow images from the `data URLs`_ (prefixed with the
``data:`` scheme)::
- JSON (_fmt:json)
- Protobuf (_fmt:pb)
- Protobuf compressed? (_fmt:pc)
- HTML (_fmt:html)
- Protobuf encoded in JSON (_fmt:jspb).
Header set Content-Security-Policy "img-src 'self' data: ;"
.. _data URLs:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""
import re
from urllib.parse import urlencode, unquote
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from urllib.parse import urlencode
from json import loads
from searx.engines.google import (
get_lang_info,
@@ -42,90 +34,24 @@ about = {
"official_api_documentation": 'https://developers.google.com/custom-search',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"results": 'JSON',
}
# engine dependent config
categories = ['images', 'web']
paging = False
paging = True
use_locale_domain = True
time_range_support = True
safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
def scrap_out_thumbs(dom):
"""Scrap out thumbnail data from <script> tags."""
ret_val = {}
for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'):
_script = script.text
# _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....');
_thumb_no, _img_data = _script[len("_setImgSrc(") : -2].split(",", 1)
_thumb_no = _thumb_no.replace("'", "")
_img_data = _img_data.replace("'", "")
_img_data = _img_data.replace(r"\/", r"/")
ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=")
return ret_val
# [0, "-H96xjSoW5DsgM", ["https://encrypted-tbn0.gstatic.com/images?q...", 155, 324]
# , ["https://assets.cdn.moviepilot.de/files/d3bf..", 576, 1200],
_RE_JS_IMAGE_URL = re.compile(
r'"'
r'([^"]*)' # -H96xjSoW5DsgM
r'",\s*\["'
r'https://[^\.]*\.gstatic.com/images[^"]*' # https://encrypted-tbn0.gstatic.com/images?q...
r'[^\[]*\["'
r'(https?://[^"]*)' # https://assets.cdn.moviepilot.de/files/d3bf...
)
def parse_urls_img_from_js(dom):
# There are two HTML script tags starting with a JS function
# 'AF_initDataCallback(...)'
#
# <script nonce="zscm+Ab/JzBk1Qd4GY6wGQ">
# AF_initDataCallback({key: 'ds:0', hash: '1', data:[], sideChannel: {}});
# </script>
# <script nonce="zscm+Ab/JzBk1Qd4GY6wGQ">
# AF_initDataCallback({key: 'ds:1', hash: '2', data:[null,[[["online_chips",[["the big",
# ["https://encrypted-tbn0.gstatic.com/images?q...",null,null,true,[null,0],f
# ...
# </script>
#
# The second script contains the URLs of the images.
# The AF_initDataCallback(..) is called with a very large dictionary that
# looks like JSON but is not JSON, since it contains JS variables and
# constants like 'null' (so we can't use a JSON parser for it).
#
# The alternative is to parse the entire <script> and find all image URLs by
# a regular expression.
img_src_script = eval_xpath_getindex(dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
data_id_to_img_url = {}
for data_id, url in _RE_JS_IMAGE_URL.findall(img_src_script):
data_id_to_img_url[data_id] = url
return data_id_to_img_url
def get_img_url_by_data_id(data_id_to_img_url, img_node):
"""Get full image URL by @data-id from parent element."""
data_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
img_url = data_id_to_img_url.get(data_id, '')
img_url = unquote(img_url.replace(r'\u00', r'%'))
return img_url
def request(query, params):
"""Google-Video search request"""
"""Google-Image search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
query_url = (
'https://'
@@ -139,7 +65,8 @@ def request(query, params):
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
'num': 30,
'asearch': 'isch',
'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
}
)
)
@@ -151,7 +78,8 @@ def request(query, params):
params['url'] = query_url
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
params['headers']['Accept'] = '*/*'
return params
@@ -161,78 +89,34 @@ def response(resp):
detect_google_sorry(resp)
# convert the text to dom
dom = html.fromstring(resp.text)
img_bas64_map = scrap_out_thumbs(dom)
data_id_to_img_url = parse_urls_img_from_js(dom)
json_start = resp.text.find('{"ischj":')
json_data = loads(resp.text[json_start:])
# parse results
#
# root element::
# <div id="islmp" ..>
# result div per image::
# <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
# The data-id matches to a item in a json-data structure in::
# <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
# In this structure the link to the origin PNG, JPG or whatever is given
# first link per image-div contains an <img> with the data-iid for base64 encoded image data::
# <img class="rg_i Q4LuWd" data-iid="0"
# second link per image-div is the target link::
# <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
# the second link also contains two div tags with the *description* and *publisher*::
# <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
# <div class="fxgdke">en.wikipedia.org</div>
for item in json_data["ischj"]["metadata"]:
root = eval_xpath(dom, '//div[@id="islmp"]')
if not root:
logger.error("did not find root element id='islmp'")
return results
result_item = {
'url': item["result"]["referrer_url"],
'title': item["result"]["page_title"],
'content': item["text_in_grid"]["snippet"],
'source': item["result"]["site_title"],
'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
'img_src': item["original_image"]["url"],
'thumbnail_src': item["thumbnail"]["url"],
'template': 'images.html',
}
root = root[0]
for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
author = item["result"].get('iptc', {}).get('creator')
if author:
result_item['author'] = ', '.join(author)
img_alt = eval_xpath_getindex(img_node, '@alt', 0)
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
if copyright_notice:
result_item['source'] += ' / ' + copyright_notice
img_base64_id = eval_xpath(img_node, '@data-iid')
if img_base64_id:
img_base64_id = img_base64_id[0]
thumbnail_src = img_bas64_map[img_base64_id]
else:
thumbnail_src = eval_xpath(img_node, '@src')
if not thumbnail_src:
thumbnail_src = eval_xpath(img_node, '@data-src')
if thumbnail_src:
thumbnail_src = thumbnail_src[0]
else:
thumbnail_src = ''
file_size = item.get('gsa', {}).get('file_size')
if file_size:
result_item['source'] += ' (%s)' % file_size
link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
url = eval_xpath_getindex(link_node, '@href', 0, None)
if url is None:
logger.error("missing @href in node: %s", html.tostring(link_node))
continue
pub_nodes = eval_xpath(link_node, './div/div')
pub_descr = img_alt
pub_source = ''
if pub_nodes:
pub_descr = extract_text(pub_nodes[0])
pub_source = extract_text(pub_nodes[1])
src_url = get_img_url_by_data_id(data_id_to_img_url, img_node)
if not src_url:
src_url = thumbnail_src
results.append(
{
'url': url,
'title': img_alt,
'content': pub_descr,
'source': pub_source,
'img_src': src_url,
'thumbnail_src': thumbnail_src,
'template': 'images.html',
}
)
results.append(result_item)
return results
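
The new parser above no longer scrapes the DOM: the async endpoint returns text with a single embedded JSON object keyed "ischj". A minimal sketch of the extraction, using an illustrative payload:

    from json import loads

    text = 'leading-noise\n{"ischj": {"metadata": []}}'  # illustrative, not a real response
    json_data = loads(text[text.find('{"ischj":'):])
    for item in json_data["ischj"]["metadata"]:
        # each item carries result / original_image / thumbnail sub-objects
        pass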


@@ -14,7 +14,6 @@ ignores some parameters from the common :ref:`google API`:
# pylint: disable=invalid-name
import binascii
from datetime import datetime
import re
from urllib.parse import urlencode
from base64 import b64decode
@@ -71,13 +70,13 @@ time_range_support = True
#
# safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = False
send_accept_language_header = True
def request(query, params):
"""Google-News search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
# google news has only one domain
lang_info['subdomain'] = 'news.google.com'
@@ -98,22 +97,14 @@ def request(query, params):
+ lang_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
'gl': lang_info['country'],
}
)
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']})
+ ('&ceid=%s' % ceid)
) # ceid includes a ':' character which must not be urlencoded
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['headers']['Cookie'] = "CONSENT=YES+cb.%s-14-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
return params
@@ -150,7 +141,7 @@ def response(resp):
padding = (4 - (len(jslog) % 4)) * "="
jslog = b64decode(jslog + padding)
except binascii.Error:
# URL cant be read, skip this result
# URL can't be read, skip this result
continue
# now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
@ -159,24 +150,12 @@ def response(resp):
# the first <h3> tag in the <article> contains the title of the link
title = extract_text(eval_xpath(result, './article/h3[1]'))
# the first <div> tag in the <article> contains the content of the link
content = extract_text(eval_xpath(result, './article/div[1]'))
# The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate.
pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
# the second <div> tag contains origin publisher and the publishing date
pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))
pub_info = []
if pub_origin:
pub_info.append(pub_origin)
if pub_date:
# The pub_date is mostly a string like 'yesterday', not a real
# timezone date or time. Therefore we can't use publishedDate.
pub_info.append(pub_date)
pub_info = ', '.join(pub_info)
if pub_info:
content = pub_info + ': ' + content
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
# The image URL is located in a preceding sibling <img> tag, e.g.:
# "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"

View file

@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Google Play Apps
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import (
eval_xpath,
extract_url,
extract_text,
eval_xpath_list,
eval_xpath_getindex,
)
about = {
"website": "https://play.google.com/",
"wikidata_id": "Q79576",
"use_official_api": False,
"require_api_key": False,
"results": "HTML",
}
categories = ["files", "apps"]
send_accept_language_header = True
search_url = "https://play.google.com/store/search?{query}&c=apps"
def request(query, params):
params["url"] = search_url.format(query=urlencode({"q": query}))
params['cookies']['CONSENT'] = "YES+"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
if eval_xpath(dom, '//div[@class="v6DsQb"]'):
return []
spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
if spot is not None:
url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
results.append({"url": url, "title": title, "content": content, "img_src": img})
more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
for result in more:
url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
img = extract_text(
eval_xpath(
result,
'.//img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
)
)
results.append({"url": url, "title": title, "content": content, "img_src": img})
for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
return results

View file

@ -13,10 +13,12 @@ Definitions`_.
from urllib.parse import urlencode
from datetime import datetime
from typing import Optional
from lxml import html
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
@ -46,12 +48,13 @@ about = {
}
# engine dependent config
categories = ['science']
categories = ['science', 'scientific publications']
paging = True
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def time_range_url(params):
@ -75,7 +78,6 @@ def request(query, params):
offset = (params['pageno'] - 1) * 10
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
# subdomain is: scholar.google.xy
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
@ -85,20 +87,13 @@ def request(query, params):
+ lang_info['subdomain']
+ '/scholar'
+ "?"
+ urlencode(
{
'q': query,
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
'start': offset,
}
)
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset})
)
query_url += time_range_url(params)
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
@ -106,7 +101,43 @@ def request(query, params):
return params
def response(resp):
def parse_gs_a(text: Optional[str]):
"""Parse the text written in green.
Possible formats:
* "{authors} - {journal}, {year} - {publisher}"
* "{authors} - {year} - {publisher}"
* "{authors} - {publisher}"
"""
if text is None or text == "":
return None, None, None, None
s_text = text.split(' - ')
authors = s_text[0].split(', ')
publisher = s_text[-1]
if len(s_text) != 3:
return authors, None, publisher, None
# the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
# get journal and year
journal_year = s_text[1].split(', ')
# journal is optional and may contain some commas
if len(journal_year) > 1:
journal = ', '.join(journal_year[0:-1])
if journal == '':
journal = None
else:
journal = None
# year
year = journal_year[-1]
try:
publishedDate = datetime.strptime(year.strip(), '%Y')
except ValueError:
publishedDate = None
return authors, journal, publisher, publishedDate
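# A minimal sketch of the three formats listed in the docstring (the inputs
# below are made up for illustration):
#
#   parse_gs_a("A Author, B Author - Nature, 2020 - nature.com")
#   --> (['A Author', 'B Author'], 'Nature', 'nature.com', datetime(2020, 1, 1))
#   parse_gs_a("A Author - 2019 - example.org")
#   --> (['A Author'], None, 'example.org', datetime(2019, 1, 1))
#   parse_gs_a("A Author - example.org")
#   --> (['A Author'], None, 'example.org', None)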
def response(resp): # pylint: disable=too-many-locals
"""Get response from google's search request"""
results = []
@ -119,30 +150,53 @@ def response(resp):
dom = html.fromstring(resp.text)
# parse results
for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'):
for result in eval_xpath_list(dom, '//div[@data-cid]'):
title = extract_text(eval_xpath(result, './h3[1]//a'))
title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title:
# this is a [ZITATION] block
continue
url = eval_xpath(result, './h3[1]//a/@href')[0]
content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
if pub_info:
content += "[%s]" % pub_info
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
if pub_type:
title = title + " " + pub_type
pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher and publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append(
{
'template': 'paper.html',
'type': pub_type,
'url': url,
'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
}
)

View file

@ -60,6 +60,7 @@ language_support = True
use_locale_domain = True
time_range_support = True
safesearch = True
send_accept_language_header = True
RE_CACHE = {}
@ -111,22 +112,13 @@ def request(query, params):
"""Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
query_url = (
'https://'
+ lang_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "vid",
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
}
)
+ urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"})
)
if params['time_range'] in time_range_dict:
@ -135,6 +127,7 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params
@ -157,7 +150,7 @@ def response(resp):
# ignore google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ingoring <g-section-with-header>")
logger.debug("ignoring <g-section-with-header>")
continue
# ignore articles without an image id / e.g. news articles

View file

@ -53,19 +53,16 @@ def response(resp):
if 'reading' in title_raw:
title += ' (' + title_raw['reading'] + ')'
alt_forms.append(title)
#
result_url = urljoin(BASE_URL, page['slug'])
definitions = get_definitions(page)
# For results, we'll return the URL, all alternative forms (as title),
# and all definitions (as description) truncated to 300 characters.
content = " ".join(f"{engdef}." for _, engdef, _ in definitions)
results.append({
'url': result_url,
'title': ", ".join(alt_forms),
'content': content[:300] + (content[300:] and '...')
})
results.append(
{'url': result_url, 'title': ", ".join(alt_forms), 'content': content[:300] + (content[300:] and '...')}
)
# Like Wordnik, we'll return the first result in an infobox too.
if first_result:
@ -93,11 +90,13 @@ def get_definitions(page):
extra.append(', '.join(defn_raw['info']).capitalize() + '. ')
if defn_raw.get('restrictions'):
extra.append('Only applies to: ' + ', '.join(defn_raw['restrictions']) + '. ')
definitions.append((
', '.join(defn_raw['parts_of_speech']),
'; '.join(defn_raw['english_definitions']),
''.join(extra)[:-1],
))
definitions.append(
(
', '.join(defn_raw['parts_of_speech']),
'; '.join(defn_raw['english_definitions']),
''.join(extra)[:-1],
)
)
return definitions
@ -109,12 +108,14 @@ def get_infobox(alt_forms, result_url, definitions):
infobox_content.append(f'<p><i>Other forms:</i> {", ".join(alt_forms[1:])}</p>')
# definitions
infobox_content.append('''
infobox_content.append(
'''
<small><a href="https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project">JMdict</a>
and <a href="https://www.edrdg.org/enamdict/enamdict_doc.html">JMnedict</a>
by <a href="https://www.edrdg.org/edrdg/licence.html">EDRDG</a>, CC BY-SA 3.0.</small>
<ul>
''')
'''
)
for pos, engdef, extra in definitions:
if pos == 'Wikipedia definition':
infobox_content.append('</ul><small>Wikipedia, CC BY-SA 3.0.</small><ul>')
@ -132,5 +133,5 @@ def get_infobox(alt_forms, result_url, definitions):
'title': 'Jisho.org',
'url': result_url,
}
]
],
}

View file

@ -16,6 +16,11 @@ paging = False
suggestion_query = ''
results_query = ''
cookies = {}
headers = {}
'''Some engines might offer different results based on cookies or headers.
A possible use-case is to set a safesearch cookie or header to moderate.'''
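# A hedged sketch of how an engine entry in settings.yml might use these two
# options (the engine name and the cookie/header names below are made up for
# illustration, not taken from a real engine):
#
#   - name: example xpath engine
#     engine: xpath
#     cookies:
#       safesearch: 'moderate'
#     headers:
#       X-Safesearch: 'moderate'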
# parameters for engines with paging support
#
# number of results on each page
@ -88,6 +93,9 @@ def request(query, params):
if paging and search_url.find('{pageno}') >= 0:
fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
params['cookies'].update(cookies)
params['headers'].update(headers)
params['url'] = search_url.format(**fp)
params['query'] = query

searx/engines/lingva.py Normal file
View file

@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Lingva (alternative Google Translate frontend)"""
from json import loads
about = {
"website": 'https://lingva.ml',
"wikidata_id": None,
"official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
engine_type = 'online_dictionary'
categories = ['general']
url = "https://lingva.ml"
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
def request(_query, params):
params['url'] = search_url.format(
url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query']
)
return params
def response(resp):
results = []
result = loads(resp.text)
info = result["info"]
from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
if "typo" in info:
results.append({"suggestion": from_to_prefix + info["typo"]})
if 'definitions' in info: # pylint: disable=too-many-nested-blocks
for definition in info['definitions']:
if 'list' in definition:
for item in definition['list']:
if 'synonyms' in item:
for synonym in item['synonyms']:
results.append({"suggestion": from_to_prefix + synonym})
infobox = ""
for translation in info["extraTranslations"]:
infobox += f"<b>{translation['type']}</b>"
for word in translation["list"]:
infobox += f"<dl><dt>{word['word']}</dt>"
for meaning in word["meanings"]:
infobox += f"<dd>{meaning}</dd>"
infobox += "</dl>"
results.append(
{
'infobox': result["translation"],
'content': infobox,
}
)
return results

searx/engines/metacpan.py Normal file
View file

@ -0,0 +1,79 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""metacpan
"""
from urllib.parse import urlunparse
from json import dumps
# about
about = {
"website": 'https://metacpan.org/',
"wikidata_id": 'Q841507',
"official_api_documentation": 'https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
number_of_results = 20 # Don't put this over 5000
categories = ["it", "packages"]
disabled = True
shortcut = "cpan"
paging = True
query_data_template = {
'query': {
'multi_match': {
'type': 'most_fields',
'fields': ['documentation', 'documentation.*'],
'analyzer': 'camelcase',
}
},
'filter': {
'bool': {
'must': [
{'exists': {'field': 'documentation'}},
{'term': {'status': 'latest'}},
{'term': {'indexed': 1}},
{'term': {'authorized': 1}},
]
}
},
"sort": [
{"_score": {"order": "desc"}},
{"date": {"order": "desc"}},
],
'_source': ['documentation', "abstract"],
'size': number_of_results,
}
search_url = urlunparse(["https", "fastapi.metacpan.org", "/v1/file/_search", "", "", ""])
def request(query, params):
params["url"] = search_url
params["method"] = "POST"
query_data = query_data_template
query_data["query"]["multi_match"]["query"] = query
query_data["from"] = (params["pageno"] - 1) * number_of_results
params["data"] = dumps(query_data)
return params
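# Paging sketch: with number_of_results = 20, pageno 3 sets
# query_data["from"] = 40, i.e. Elasticsearch skips the first 40 hits.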
def response(resp):
results = []
search_results = resp.json()["hits"]["hits"]
for result in search_results:
fields = result["_source"]
module = fields["documentation"]
results.append(
{
"url": "https://metacpan.org/pod/" + module,
"title": module,
"content": fields.get("abstract", ""),
}
)
return results

View file

@ -5,7 +5,7 @@
"""
import re
from pymongo import MongoClient # pylint: disable=import-error
from pymongo import MongoClient # pyright: ignore # pylint: disable=import-error
engine_type = 'offline'

View file

@ -6,7 +6,7 @@
# import error is ignored because the admin has to install mysql manually to use
# the engine
import mysql.connector # pylint: disable=import-error
import mysql.connector # pyright: ignore # pylint: disable=import-error
engine_type = 'offline'
auth_plugin = 'caching_sha2_password'

View file

@ -29,6 +29,8 @@ about = {
# engine dependent config
categories = ['map']
paging = False
language_support = True
send_accept_language_header = True
# search-url
base_url = 'https://nominatim.openstreetmap.org/'
@ -141,6 +143,8 @@ def request(query, params):
params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
params['route'] = route_re.match(query)
params['headers']['User-Agent'] = searx_useragent()
if 'Accept-Language' not in params['headers']:
params['headers']['Accept-Language'] = 'en'
return params
@ -202,7 +206,7 @@ def get_wikipedia_image(raw_value):
return get_external_url('wikimedia_image', raw_value)
def fetch_wikidata(nominatim_json, user_langage):
def fetch_wikidata(nominatim_json, user_language):
"""Update nominatim_json using the result of an unique to wikidata
For result in nominatim_json:
@ -223,10 +227,10 @@ def fetch_wikidata(nominatim_json, user_langage):
wd_to_results.setdefault(wd_id, []).append(result)
if wikidata_ids:
user_langage = 'en' if user_langage == 'all' else user_langage
user_language = 'en' if user_language == 'all' else user_language.split('-')[0]
wikidata_ids_str = " ".join(wikidata_ids)
query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace(
'%LANGUAGE%', sparql_string_escape(user_langage)
'%LANGUAGE%', sparql_string_escape(user_language)
)
wikidata_json = send_wikidata_query(query)
for wd_result in wikidata_json.get('results', {}).get('bindings', {}):
@ -241,7 +245,7 @@ def fetch_wikidata(nominatim_json, user_langage):
# overwrite wikipedia link
wikipedia_name = wd_result.get('wikipediaName', {}).get('value')
if wikipedia_name:
result['extratags']['wikipedia'] = user_langage + ':' + wikipedia_name
result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name
# get website if not already defined
website = wd_result.get('website', {}).get('value')
if (

View file

@ -22,9 +22,7 @@ about = {
categories = ["videos"]
paging = True
base_url = "https://peer.tube"
supported_languages_url = (
'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/views/Search.vue'
)
supported_languages_url = 'https://peer.tube/api/v1/videos/languages'
# do search-request
@ -84,9 +82,6 @@ def response(resp):
def _fetch_supported_languages(resp):
import re
# https://docs.python.org/3/howto/regex.html#greedy-versus-non-greedy
videolanguages = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL)
peertube_languages = [m.group(1) for m in re.finditer(r"\{ id: '([a-z]+)', label:", videolanguages.group(1))]
videolanguages = resp.json()
peertube_languages = list(videolanguages.keys())
return peertube_languages

View file

@ -0,0 +1,94 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Petalsearch Images
"""
from json import loads
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
from searx.utils import extract_text
about = {
"website": 'https://petalsearch.com/',
"wikidata_id": 'Q104399280',
"official_api_documentation": False,
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
paging = True
time_range_support = False
safesearch = True
safesearch_table = {0: 'off', 1: 'moderate', 2: 'on'}
base_url = 'https://petalsearch.com/'
search_string = 'search?{query}&channel=image&ps=50&pn={page}&region={lang}&ss_mode={safesearch}&ss_type=normal'
def request(query, params):
search_path = search_string.format(
query=urlencode({'query': query}),
page=params['pageno'],
lang=params['language'].lower(),
safesearch=safesearch_table[params['safesearch']],
)
params['url'] = base_url + search_path
return params
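# Sketch of a resulting request URL (query 'cats', page 1, language 'en-us',
# safesearch 1):
#   https://petalsearch.com/search?query=cats&channel=image&ps=50&pn=1&region=en-us&ss_mode=moderate&ss_type=normal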
def response(resp):
results = []
tree = html.fromstring(resp.text)
root = tree.findall('.//script[3]')
# Extract the embedded JSON from the matched <script> node
json_content = extract_text(root)
# Parse the JSON
data = loads(json_content)
for result in data['newImages']:
url = result['url']
title = result['title']
thumbnail_src = result['image']
pic_dict = result.get('extrainfo')
date_from_api = pic_dict.get('publish_time')
width = pic_dict.get('width')
height = pic_dict.get('height')
img_src = pic_dict.get('real_url')
# Skip the result if img_src is missing
if not img_src:
continue
# Get and convert the published date, if any
publishedDate = None
if date_from_api is not None:
publishedDate = datetime.fromtimestamp(int(date_from_api))
# Append results
results.append(
{
'template': 'images.html',
'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'width': width,
'height': height,
'publishedDate': publishedDate,
}
)
return results

View file

@ -70,7 +70,7 @@ def response(resp):
elif properties.get('osm_type') == 'R':
osm_type = 'relation'
else:
# continue if invalide osm-type
# continue if invalid osm-type
continue
url = result_base_url.format(osm_type=osm_type, osm_id=properties.get('osm_id'))

View file

@ -6,7 +6,7 @@
# import error is ignored because the admin has to install postgresql manually to use
# the engine
import psycopg2 # pylint: disable=import-error
import psycopg2 # pyright: ignore # pylint: disable=import-error
engine_type = 'offline'
host = "127.0.0.1"

View file

@ -3,11 +3,15 @@
PubMed (Scholar publications)
"""
from flask_babel import gettext
from lxml import etree
from datetime import datetime
from urllib.parse import urlencode
from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about
about = {
@ -22,7 +26,7 @@ about = {
"results": 'XML',
}
categories = ['science']
categories = ['science', 'scientific publications']
base_url = (
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
@ -63,46 +67,61 @@ def response(resp):
retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
search_results_xml = get(retrieve_url_encoded).content
search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')
search_results_response = get(retrieve_url_encoded).content
search_results = etree.XML(search_results_response)
for entry in eval_xpath_list(search_results, '//PubmedArticle'):
medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
for entry in search_results:
title = entry.xpath('.//Article/ArticleTitle')[0].text
pmid = entry.xpath('.//PMID')[0].text
title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
pmid = eval_xpath_getindex(medline, './/PMID', 0).text
url = pubmed_url + pmid
content = extract_text(
eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
)
doi = extract_text(
eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
)
journal = extract_text(
eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
)
issn = extract_text(
eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
)
authors = []
for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
f = eval_xpath_getindex(author, './ForeName', 0, default=None)
l = eval_xpath_getindex(author, './LastName', 0, default=None)
f = '' if f is None else f.text
l = '' if l is None else l.text
authors.append((f + ' ' + l).strip())
try:
content = entry.xpath('.//Abstract/AbstractText')[0].text
except:
content = gettext('No abstract is available for this publication.')
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'content': content,
'journal': journal,
'issn': [issn],
'authors': authors,
'doi': doi,
}
# If a doi is available, add it to the snippet
try:
doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
except:
pass
if len(content) > 300:
content = content[0:300] + "..."
# TODO: center snippet on query term
res_dict = {'url': url, 'title': title, 'content': content}
try:
publishedDate = datetime.strptime(
entry.xpath('.//DateCreated/Year')[0].text
+ '-'
+ entry.xpath('.//DateCreated/Month')[0].text
+ '-'
+ entry.xpath('.//DateCreated/Day')[0].text,
'%Y-%m-%d',
)
res_dict['publishedDate'] = publishedDate
except:
pass
accepted_date = eval_xpath_getindex(
entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
)
if accepted_date is not None:
year = eval_xpath_getindex(accepted_date, './Year', 0)
month = eval_xpath_getindex(accepted_date, './Month', 0)
day = eval_xpath_getindex(accepted_date, './Day', 0)
try:
publishedDate = datetime.strptime(
year.text + '-' + month.text + '-' + day.text,
'%Y-%m-%d',
)
res_dict['publishedDate'] = publishedDate
except Exception as e:
print(e)
results.append(res_dict)
return results
return results

View file

@ -9,16 +9,16 @@ https://www.qwant.com/ queries.
This implementation is used by different qwant engines in the settings.yml::
- name: qwant
categories: general
qwant_categ: web
...
- name: qwant news
categories: news
qwant_categ: news
...
- name: qwant images
categories: images
qwant_categ: images
...
- name: qwant videos
categories: videos
qwant_categ: videos
...
"""
@ -30,11 +30,11 @@ from datetime import (
from json import loads
from urllib.parse import urlencode
from flask_babel import gettext
import babel
from searx.utils import match_language
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.locales import get_engine_locale
# about
about = {
@ -50,13 +50,20 @@ about = {
categories = []
paging = True
supported_languages_url = about['website']
qwant_categ = None  # web|news|images|videos
category_to_keyword = {
'general': 'web',
'news': 'news',
'images': 'images',
'videos': 'videos',
}
safesearch = True
safe_search_map = {0: '&safesearch=0', 1: '&safesearch=1', 2: '&safesearch=2'}
# fmt: off
qwant_news_locales = [
'ca_ad', 'ca_es', 'ca_fr', 'co_fr', 'de_at', 'de_ch', 'de_de', 'en_au',
'en_ca', 'en_gb', 'en_ie', 'en_my', 'en_nz', 'en_us', 'es_ad', 'es_ar',
'es_cl', 'es_co', 'es_es', 'es_mx', 'es_pe', 'eu_es', 'eu_fr', 'fc_ca',
'fr_ad', 'fr_be', 'fr_ca', 'fr_ch', 'fr_fr', 'it_ch', 'it_it', 'nl_be',
'nl_nl', 'pt_ad', 'pt_pt',
]
# fmt: on
# search-url
url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={offset}'
@ -64,10 +71,13 @@ url = 'https://api.qwant.com/v3/search/{keyword}?{query}&count={count}&offset={o
def request(query, params):
"""Qwant search request"""
keyword = category_to_keyword[categories[0]]
if not query:
return None
count = 10 # web: count must be equal to 10
if keyword == 'images':
if qwant_categ == 'images':
count = 50
offset = (params['pageno'] - 1) * count
# count + offset must be lower than 250
@ -78,22 +88,18 @@ def request(query, params):
offset = min(offset, 40)
params['url'] = url.format(
keyword=keyword,
keyword=qwant_categ,
query=urlencode({'q': query}),
offset=offset,
count=count,
)
# add language tag
if params['language'] == 'all':
params['url'] += '&locale=en_US'
else:
language = match_language(
params['language'],
supported_languages,
language_aliases,
)
params['url'] += '&locale=' + language.replace('-', '_')
# add qwant's locale
q_locale = get_engine_locale(params['language'], supported_languages, default='en_US')
params['url'] += '&locale=' + q_locale
# add safesearch option
params['url'] += safe_search_map.get(params['safesearch'], '')
params['raise_for_httperror'] = False
return params
@ -103,7 +109,6 @@ def response(resp):
"""Get response from Qwant's search request"""
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
keyword = category_to_keyword[categories[0]]
results = []
# load JSON result
@ -125,7 +130,7 @@ def response(resp):
# raise for other errors
raise_for_httperror(resp)
if keyword == 'web':
if qwant_categ == 'web':
# The WEB query contains a list named 'mainline'. This list can contain
# different result types (e.g. mainline[0]['type'] returns type of the
# result items in mainline[0]['items']
@ -136,7 +141,7 @@ def response(resp):
# result['items'].
mainline = data.get('result', {}).get('items', [])
mainline = [
{'type': keyword, 'items': mainline},
{'type': qwant_categ, 'items': mainline},
]
# return empty array if there are no results
@ -146,7 +151,7 @@ def response(resp):
for row in mainline:
mainline_type = row.get('type', 'web')
if mainline_type != keyword:
if mainline_type != qwant_categ:
continue
if mainline_type == 'ads':
@ -238,19 +243,43 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
# list of regions is embedded in page as a js object
response_text = resp.text
response_text = response_text[response_text.find('INITIAL_PROPS') :]
response_text = response_text[response_text.find('{') : response_text.find('</script>')]
regions_json = loads(response_text)
text = resp.text
text = text[text.find('INITIAL_PROPS') :]
text = text[text.find('{') : text.find('</script>')]
supported_languages = []
for country, langs in regions_json['locales'].items():
for lang in langs['langs']:
lang_code = "{lang}-{country}".format(lang=lang, country=country)
supported_languages.append(lang_code)
q_initial_props = loads(text)
q_locales = q_initial_props.get('locales')
q_valid_locales = []
for country, v in q_locales.items():
for lang in v['langs']:
_locale = "{lang}_{country}".format(lang=lang, country=country)
if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales:
# qwant-news does not support all locales from qwant-web:
continue
q_valid_locales.append(_locale)
supported_languages = {}
for q_locale in q_valid_locales:
try:
locale = babel.Locale.parse(q_locale, sep='_')
except babel.core.UnknownLocaleError:
print("ERROR: can't determine babel locale of quant's locale %s" % q_locale)
continue
# note: supported_languages (dict)
#
# dict's key is a string built up from a babel.Locale object / the
# notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and
# language) notation and dict's values are the locale strings used by
# the engine.
searxng_locale = locale.language + '-' + locale.territory # --> params['language']
supported_languages[searxng_locale] = q_locale
return supported_languages
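# Sketch of the resulting mapping (the actual entries depend on Qwant's
# locale list):
#   supported_languages['en-US'] == 'en_US'
#   supported_languages['fr-FR'] == 'fr_FR'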

View file

@ -6,6 +6,8 @@
from json import dumps, loads
from datetime import datetime
from flask_babel import gettext
about = {
"website": 'https://www.semanticscholar.org/',
"wikidata_id": 'Q22908627',
@ -15,6 +17,7 @@ about = {
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper'
@ -45,11 +48,7 @@ def request(query, params):
def response(resp):
res = loads(resp.text)
results = []
for result in res['results']:
item = {}
metadata = []
url = result.get('primaryPaperLink', {}).get('url')
if not url and result.get('links'):
url = result.get('links')[0]
@ -60,22 +59,47 @@ def response(resp):
if not url:
url = paper_url + '/%s' % result['id']
item['url'] = url
# publishedDate
if 'pubDate' in result:
publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
else:
publishedDate = None
item['title'] = result['title']['text']
item['content'] = result['paperAbstract']['text']
# authors
authors = [author[0]['name'] for author in result.get('authors', [])]
metadata = result.get('fieldsOfStudy') or []
venue = result.get('venue', {}).get('text')
if venue:
metadata.append(venue)
if metadata:
item['metadata'] = ', '.join(metadata)
# pick the first alternate link, but not from the crawler
pdf_url = None
for doc in result.get('alternatePaperLinks', []):
if doc['linkType'] not in ('crawler', 'doi'):
pdf_url = doc['url']
break
pubDate = result.get('pubDate')
if pubDate:
item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d")
# comments
comments = None
if 'citationStats' in result:
comments = gettext(
'{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
).format(
numCitations=result['citationStats']['numCitations'],
firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
)
results.append(item)
results.append(
{
'template': 'paper.html',
'url': url,
'title': result['title']['text'],
'content': result['paperAbstract']['text'],
'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
'doi': result.get('doiInfo', {}).get('doi'),
'tags': result.get('fieldsOfStudy'),
'authors': authors,
'pdf_url': pdf_url,
'publishedDate': publishedDate,
'comments': comments,
}
)
return results

View file

@ -1,6 +1,8 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Słownik Języka Polskiego (general)
# lint: pylint
"""Słownik Języka Polskiego
Dictionary of the polish language from PWN (sjp.pwn)
"""
from lxml.html import fromstring

View file

@ -19,7 +19,7 @@ about = {
"results": 'JSON',
}
categories = ['science']
categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
api_key = 'unset'
@ -41,32 +41,32 @@ def response(resp):
json_data = loads(resp.text)
for record in json_data['records']:
content = record['abstract'][0:500]
if len(record['abstract']) > len(content):
content += "..."
published = datetime.strptime(record['publicationDate'], '%Y-%m-%d')
metadata = [
record[x]
for x in [
'publicationName',
'identifier',
'contentType',
]
if record.get(x) is not None
]
metadata = ' / '.join(metadata)
if record.get('startingPage') and record.get('endingPage') is not None:
metadata += " (%(startingPage)s-%(endingPage)s)" % record
authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
tags = record.get('genre')
if isinstance(tags, str):
tags = [tags]
results.append(
{
'title': record['title'],
'template': 'paper.html',
'url': record['url'][0]['value'].replace('http://', 'https://', 1),
'content': content,
'title': record['title'],
'content': record['abstract'],
'comments': record['publicationName'],
'tags': tags,
'publishedDate': published,
'metadata': metadata,
'type': record.get('contentType'),
'authors': authors,
# 'editor': '',
'publisher': record.get('publisher'),
'journal': record.get('publicationName'),
'volume': record.get('volume') or None,
'pages': '-'.join([x for x in [record.get('startingPage'), record.get('endingPage')] if x]),
'number': record.get('number') or None,
'doi': record.get('doi'),
'issn': [x for x in [record.get('issn')] if x],
'isbn': [x for x in [record.get('isbn')] if x],
# 'pdf_url' : ''
}
)
return results

View file

@ -62,8 +62,7 @@ sc_code = ''
def raise_captcha(resp):
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
# suspend CAPTCHA for 7 days
raise SearxEngineCaptchaException(suspended_time=7 * 24 * 3600)
raise SearxEngineCaptchaException()
def get_sc_code(headers):
@ -89,15 +88,14 @@ def get_sc_code(headers):
dom = html.fromstring(resp.text)
try:
# href --> '/?sc=adrKJMgF8xwp20'
href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href')
# <input type="hidden" name="sc" value="...">
sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
except IndexError as exc:
# suspend startpage API --> https://github.com/searxng/searxng/pull/695
raise SearxEngineResponseException(
suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
) from exc
sc_code = href[5:]
sc_code_ts = time()
logger.debug("new value is: %s", sc_code)
@ -209,7 +207,7 @@ def _fetch_supported_languages(resp):
# native name, the English name of the writing script used by the language,
# or occasionally something else entirely.
# this cases are so special they need to be hardcoded, a couple of them are mispellings
# this cases are so special they need to be hardcoded, a couple of them are misspellings
language_names = {
'english_uk': 'en-GB',
'fantizhengwen': ['zh-TW', 'zh-HK'],

View file

@ -17,6 +17,7 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
from urllib.parse import urlencode
from datetime import datetime
from flask_babel import gettext
about = {
"website": 'https://tineye.com',
@ -28,20 +29,41 @@ about = {
}
engine_type = 'online_url_search'
""":py:obj:`searx.search.processors.online_url_search`"""
categories = ['general']
paging = True
safesearch = False
base_url = 'https://tineye.com'
search_string = '/result_json/?page={page}&{query}'
FORMAT_NOT_SUPPORTED = gettext(
"Could not read that image url. This may be due to an unsupported file"
" format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
)
"""TinEye error message"""
NO_SIGNATURE_ERROR = gettext(
"The image is too simple to find matches. TinEye requires a basic level of"
" visual detail to successfully identify matches."
)
"""TinEye error message"""
DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
"""TinEye error message"""
def request(query, params):
"""Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`."""
params['raise_for_httperror'] = False
if params['search_urls']['data:image']:
query = params['search_urls']['data:image']
elif params['search_urls']['http']:
query = params['search_urls']['http']
logger.debug("query URL: %s", query)
query = urlencode({'url': query})
# see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
@ -59,45 +81,145 @@ def request(query, params):
return params
def parse_tineye_match(match_json):
"""Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
object.
Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__
- `image_url`, link to the result image.
- `domain`, domain this result was found on.
- `score`, a number (0 to 100) that indicates how closely the images match.
- `width`, image width in pixels.
- `height`, image height in pixels.
- `size`, image area in pixels.
- `format`, image format.
- `filesize`, image size in bytes.
- `overlay`, overlay URL.
- `tags`, whether this match belongs to a collection or stock domain.
- `backlinks`, a list of Backlink objects pointing to the original websites
and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
<https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):
- `url`, the image URL to the image.
- `backlink`, the original website URL.
- `crawl_date`, the date the image was crawled.
"""
# HINT: there exists an alternative backlink dict in the domains list / e.g.::
#
# match_json['domains'][0]['backlinks']
backlinks = []
if "backlinks" in match_json:
for backlink_json in match_json["backlinks"]:
if not isinstance(backlink_json, dict):
continue
crawl_date = backlink_json.get("crawl_date")
if crawl_date:
crawl_date = datetime.fromisoformat(crawl_date[:-3])
else:
crawl_date = datetime.min
backlinks.append(
{
'url': backlink_json.get("url"),
'backlink': backlink_json.get("backlink"),
'crawl_date': crawl_date,
'image_name': backlink_json.get("image_name"),
}
)
return {
'image_url': match_json.get("image_url"),
'domain': match_json.get("domain"),
'score': match_json.get("score"),
'width': match_json.get("width"),
'height': match_json.get("height"),
'size': match_json.get("size"),
'image_format': match_json.get("format"),
'filesize': match_json.get("filesize"),
'overlay': match_json.get("overlay"),
'tags': match_json.get("tags"),
'backlinks': backlinks,
}
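# Usage sketch with a minimal, made-up API payload:
#   m = parse_tineye_match({'format': 'JPEG', 'backlinks': []})
#   m['image_format'] --> 'JPEG'; m['backlinks'] --> []; every other
#   key defaults to None via .get()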
def response(resp):
"""Parse HTTP response from TinEye."""
results = []
# Define wanted results
json_data = resp.json()
number_of_results = json_data['num_matches']
try:
json_data = resp.json()
except Exception as exc: # pylint: disable=broad-except
msg = "can't parse JSON response // %s" % exc
logger.error(msg)
json_data = {'error': msg}
for i in json_data['matches']:
image_format = i['format']
width = i['width']
height = i['height']
thumbnail_src = i['image_url']
backlink = i['domains'][0]['backlinks'][0]
url = backlink['backlink']
source = backlink['url']
title = backlink['image_name']
img_src = backlink['url']
# handle error codes from Tineye
# Get and convert published date
api_date = backlink['crawl_date'][:-3]
publishedDate = datetime.fromisoformat(api_date)
if resp.is_error:
if resp.status_code in (400, 422):
# Append results
message = 'HTTP status: %s' % resp.status_code
error = json_data.get('error')
s_key = json_data.get('suggestions', {}).get('key', '')
if error and s_key:
message = "%s (%s)" % (error, s_key)
elif error:
message = error
if s_key == "Invalid image URL":
# test https://docs.searxng.org/_static/searxng-wordmark.svg
message = FORMAT_NOT_SUPPORTED
elif s_key == 'NO_SIGNATURE_ERROR':
# test https://pngimg.com/uploads/dot/dot_PNG4.png
message = NO_SIGNATURE_ERROR
elif s_key == 'Download Error':
# test https://notexists
message = DOWNLOAD_ERROR
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
# results.append({'answer': message})
logger.error(message)
return results
resp.raise_for_status()
# append results from matches
for match_json in json_data['matches']:
tineye_match = parse_tineye_match(match_json)
if not tineye_match['backlinks']:
continue
backlink = tineye_match['backlinks'][0]
results.append(
{
'template': 'images.html',
'url': url,
'thumbnail_src': thumbnail_src,
'source': source,
'title': title,
'img_src': img_src,
'format': image_format,
'width': width,
'height': height,
'publishedDate': publishedDate,
'url': backlink['backlink'],
'thumbnail_src': tineye_match['image_url'],
'source': backlink['url'],
'title': backlink['image_name'],
'img_src': backlink['url'],
'format': tineye_match['image_format'],
'width': tineye_match['width'],
'height': tineye_match['height'],
'publishedDate': backlink['crawl_date'],
}
)
# Append number of results
results.append({'number_of_results': number_of_results})
# append number of results
number_of_results = json_data.get('num_matches')
if number_of_results:
results.append({'number_of_results': number_of_results})
return results

searx/engines/twitter.py Normal file
View file

@ -0,0 +1,75 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Twitter (microblogging platform)"""
from json import loads
from urllib.parse import urlencode
from datetime import datetime
about = {
"website": 'https://twitter.com',
"wikidata_id": None,
"official_api_documentation": 'https://developer.twitter.com/en/docs/twitter-api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['social media']
url = "https://api.twitter.com"
search_url = (
"{url}/2/search/adaptive.json?{query}&tweet_mode=extended&query_source=typed_query&pc=1&spelling_corrections=1"
)
def request(query, params):
params['url'] = search_url.format(url=url, query=urlencode({'q': query}))
params['headers'] = {
# This token is used in the Twitter web interface (twitter.com). Without this header, the API doesn't work.
# The value of the token has never changed (or maybe once a long time ago).
# https://github.com/zedeus/nitter/blob/5f31e86e0e8578377fa7d5aeb9631bbb2d35ef1e/src/consts.nim#L5
'Authorization': (
"Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKb"
"T3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"
)
}
return params
def response(resp):
results = []
json_res = loads(resp.text)['globalObjects']
for tweet in json_res['tweets'].values():
text = tweet['full_text']
display = tweet['display_text_range']
img_src = tweet.get('extended_entities', {}).get('media', [{}])[0].get('media_url_https')
if img_src:
img_src += "?name=thumb"
results.append(
{
'url': 'https://twitter.com/i/web/status/' + tweet['id_str'],
'title': (text[:40] + '...') if len(text) > 40 else text,
'content': text[display[0] : display[1]],
'img_src': img_src,
'publishedDate': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y'),
}
)
for user in json_res['users'].values():
results.append(
{
'title': user['name'],
'content': user['description'],
'url': 'https://twitter.com/' + user['screen_name'],
'img_src': user['profile_image_url_https'],
}
)
return results

View file

@ -50,7 +50,7 @@ WIKIDATA_PROPERTIES = {
# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE
# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates
# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model
# optmization:
# optimization:
# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization
# * https://github.com/blazegraph/database/wiki/QueryHints
QUERY_TEMPLATE = """
@ -65,6 +65,7 @@ WHERE
mwapi:language "%LANGUAGE%".
?item wikibase:apiOutputItem mwapi:item.
}
hint:Prior hint:runFirst "true".
%WHERE%
@ -93,6 +94,12 @@ WHERE {
}
"""
# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata)
# hard coded here to avoid to an additional SPARQL request when the server starts
DUMMY_ENTITY_URLS = set(
"http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
)
# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
@ -177,7 +184,7 @@ def response(resp):
for result in jsonresponse.get('results', {}).get('bindings', []):
attribute_result = {key: value['value'] for key, value in result.items()}
entity_url = attribute_result['item']
if entity_url not in seen_entities:
if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS:
seen_entities.add(entity_url)
results += get_results(attribute_result, attributes, language)
else:
@ -379,7 +386,7 @@ def get_attributes(language):
add_amount('P2046') # area
add_amount('P281') # postal code
add_label('P38') # currency
add_amount('P2048') # heigth (building)
add_amount('P2048') # height (building)
# Media
for p in [
@ -464,7 +471,6 @@ def get_attributes(language):
class WDAttribute:
# pylint: disable=no-self-use
__slots__ = ('name',)
def __init__(self, name):
@ -626,7 +632,6 @@ class WDImageAttribute(WDURLAttribute):
class WDDateAttribute(WDAttribute):
# pylint: disable=no-self-use
def get_select(self):
return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name)

View file

@ -19,6 +19,9 @@ about = {
"results": 'JSON',
}
send_accept_language_header = True
# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@ -41,9 +44,6 @@ def request(query, params):
language = url_lang(params['language'])
params['url'] = search_url.format(title=quote(query), language=language)
if params['language'].lower() in language_variants.get(language, []):
params['headers']['Accept-Language'] = params['language'].lower()
params['headers']['User-Agent'] = searx_useragent()
params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2
@ -106,9 +106,9 @@ def _fetch_supported_languages(resp):
for tr in trs:
td = tr.xpath('./td')
code = td[3].xpath('./a')[0].text
name = td[2].xpath('./a')[0].text
name = td[1].xpath('./a')[0].text
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
# exclude languages with too few articles
if articles >= 100:
supported_languages[code] = {"name": name, "english_name": english_name}

View file

@ -50,7 +50,7 @@ def request(query, params):
# replace private user area characters to make text legible
def replace_pua_chars(text):
pua_chars = {
'\uf522': '\u2192', # rigth arrow
'\uf522': '\u2192', # right arrow
'\uf7b1': '\u2115', # set of natural numbers
'\uf7b4': '\u211a', # set of rational numbers
'\uf7b5': '\u211d', # set of real numbers

searx/engines/wttr.py Normal file
View file

@ -0,0 +1,136 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""wttr.in (weather forecast service)"""
from json import loads
from urllib.parse import quote
from flask_babel import gettext
about = {
"website": "https://wttr.in",
"wikidata_id": "Q107586666",
"official_api_documentation": "https://github.com/chubin/wttr.in#json-output",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ["others"]
url = "https://wttr.in/{query}?format=j1&lang={lang}"
def get_weather_condition_key(lang):
if lang == "en":
return "weatherDesc"
return "lang_" + lang.lower()
def generate_day_table(day):
res = ""
res += f"<tr><td>{gettext('Average temp.')}</td><td>{day['avgtempC']}°C / {day['avgtempF']}°F</td></tr>"
res += f"<tr><td>{gettext('Min temp.')}</td><td>{day['mintempC']}°C / {day['mintempF']}°F</td></tr>"
res += f"<tr><td>{gettext('Max temp.')}</td><td>{day['maxtempC']}°C / {day['maxtempF']}°F</td></tr>"
res += f"<tr><td>{gettext('UV index')}</td><td>{day['uvIndex']}</td></tr>"
res += f"<tr><td>{gettext('Sunrise')}</td><td>{day['astronomy'][0]['sunrise']}</td></tr>"
res += f"<tr><td>{gettext('Sunset')}</td><td>{day['astronomy'][0]['sunset']}</td></tr>"
return res
def generate_condition_table(condition, lang, current=False):
res = ""
if current:
key = "temp_"
else:
key = "temp"
res += (
f"<tr><td><b>{gettext('Condition')}</b></td>"
f"<td><b>{condition[get_weather_condition_key(lang)][0]['value']}</b></td></tr>"
)
res += (
f"<tr><td><b>{gettext('Temperature')}</b></td>"
f"<td><b>{condition[key+'C']}°C / {condition[key+'F']}°F</b></td></tr>"
)
res += (
f"<tr><td>{gettext('Feels like')}</td><td>{condition['FeelsLikeC']}°C / {condition['FeelsLikeF']}°F</td></tr>"
)
res += (
f"<tr><td>{gettext('Wind')}</td><td>{condition['winddir16Point']}"
f"{condition['windspeedKmph']} km/h / {condition['windspeedMiles']} mph</td></tr>"
)
res += (
f"<tr><td>{gettext('Visibility')}</td><td>{condition['visibility']} km / {condition['visibilityMiles']} mi</td>"
)
res += f"<tr><td>{gettext('Humidity')}</td><td>{condition['humidity']}%</td></tr>"
return res
def request(query, params):
if query.replace('/', '') in [":help", ":bash.function", ":translation"]:
return None
if params["language"] == "all":
params["language"] = "en"
else:
params["language"] = params["language"].split("-")[0]
params["url"] = url.format(query=quote(query), lang=params["language"])
params["raise_for_httperror"] = False
return params
def response(resp):
results = []
if resp.status_code == 404:
return []
result = loads(resp.text)
current = result["current_condition"][0]
location = result['nearest_area'][0]
forecast_indices = {3: gettext('Morning'), 4: gettext('Noon'), 6: gettext('Evening'), 7: gettext('Night')}
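# wttr.in's j1 format lists eight 'hourly' entries in 3-hour steps, so the
# indices above correspond to 09:00, 12:00, 18:00 and 21:00 local time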
title = f"{location['areaName'][0]['value']}, {location['region'][0]['value']}"
infobox = f"<h3>{gettext('Current condition')}</h3><table><tbody>"
infobox += generate_condition_table(current, resp.search_params['language'], True)
infobox += "</tbody></table>"
for day in result["weather"]:
infobox += f"<h3>{day['date']}</h3>"
infobox += "<table><tbody>"
infobox += generate_day_table(day)
infobox += "</tbody></table>"
infobox += "<table><tbody>"
for time in forecast_indices.items():
infobox += f"<tr><td rowspan=\"7\"><b>{time[1]}</b></td></tr>"
infobox += generate_condition_table(day['hourly'][time[0]], resp.search_params['language'])
infobox += "</tbody></table>"
results.append(
{
"infobox": title,
"content": infobox,
}
)
return results

View file

@ -22,6 +22,7 @@ from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
from searx.network import raise_for_httperror
search_url = None
"""
@ -52,7 +53,7 @@ Replacements are:
0: none, 1: moderate, 2:strict
If not supported, the URL paramter is an empty string.
If not supported, the URL parameter is an empty string.
"""
@ -61,6 +62,14 @@ lang_all = 'en'
selected.
'''
no_result_for_http_status = []
'''Return empty result for these HTTP status codes instead of throwing an error.
.. code:: yaml
no_result_for_http_status: []
'''
soft_max_redirects = 0
'''Maximum redirects, soft limit. Record an error but don't stop the engine'''
@ -105,7 +114,7 @@ time_range_support = False
time_range_url = '&hours={time_range_val}'
'''Time range URL parameter in the in :py:obj:`search_url`. If no time range is
requested by the user, the URL paramter is an empty string. The
requested by the user, the URL parameter is an empty string. The
``{time_range_val}`` replacement is taken from the :py:obj:`time_range_map`.
.. code:: yaml
@ -177,11 +186,18 @@ def request(query, params):
params['url'] = search_url.format(**fargs)
params['soft_max_redirects'] = soft_max_redirects
params['raise_for_httperror'] = False
return params
def response(resp):
def response(resp): # pylint: disable=too-many-branches
'''Scrape *results* from the response (see :ref:`engine results`).'''
if no_result_for_http_status and resp.status_code in no_result_for_http_status:
return []
raise_for_httperror(resp)
results = []
dom = html.fromstring(resp.text)
is_onion = 'onions' in categories

View file

@ -3,7 +3,6 @@
Youtube (Videos)
"""
from datetime import datetime
from functools import reduce
from json import loads, dumps
from urllib.parse import quote_plus
@ -37,6 +36,7 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
# do search-request
def request(query, params):
params['cookies']['CONSENT'] = "YES+"
if not params['engine_data'].get('next_page_token'):
params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
if params['time_range'] in time_range_dict:
@ -52,7 +52,6 @@ def request(query, params):
)
params['headers']['Content-Type'] = 'application/json'
params['headers']['Cookie'] = "CONSENT=YES+cb.%s-17-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
return params

View file

@ -39,7 +39,7 @@ def init(engine_settings=None):
resp = http_get('https://z-lib.org', timeout=5.0)
if resp.ok:
dom = html.fromstring(resp.text)
base_url = "https:" + extract_text(
base_url = extract_text(
eval_xpath(dom, './/a[contains(@class, "domain-check-link") and @data-mode="books"]/@href')
)
logger.debug("using base_url: %s" % base_url)

View file

@ -1,29 +1,19 @@
'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2017- by Alexandre Flament, <alex@al-f.net>
'''
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Exception types raised by SearXNG modules.
"""
from typing import Optional, Union
class SearxException(Exception):
pass
"""Base SearXNG exception."""
class SearxParameterException(SearxException):
"""Raised when query miss a required paramater"""
def __init__(self, name, value):
if value == '' or value is None:
message = 'Empty ' + name + ' parameter'
@ -69,19 +59,38 @@ class SearxEngineAPIException(SearxEngineResponseException):
class SearxEngineAccessDeniedException(SearxEngineResponseException):
"""The website is blocking the access"""
def __init__(self, suspended_time=24 * 3600, message='Access denied'):
SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineAccessDenied"
"""This settings contains the default suspended time (default 86400 sec / 1
day)."""
def __init__(self, suspended_time: int = None, message: str = 'Access denied'):
"""Generic exception to raise when an engine denies access to the results.
:param suspended_time: How long the engine is going to be suspended in
second. Defaults to None.
:type suspended_time: int, None
:param message: Internal message. Defaults to ``Access denied``
:type message: str
"""
suspended_time = suspended_time or self._get_default_suspended_time()
super().__init__(message + ', suspended_time=' + str(suspended_time))
self.suspended_time = suspended_time
self.message = message
def _get_default_suspended_time(self):
from searx import get_setting # pylint: disable=C0415
return get_setting(self.SUSPEND_TIME_SETTING)
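# A usage sketch (the surrounding engine code is hypothetical):
#
#   if resp.status_code == 403:
#       raise SearxEngineAccessDeniedException(message='Access denied')
#
# With suspended_time left as None, the duration is read from
# 'search.suspended_times.SearxEngineAccessDenied' in settings.yml.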
class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
"""The website has returned a CAPTCHA
"""The website has returned a CAPTCHA."""
By default, searx stops sending requests to this engine for 1 day.
"""
SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineCaptcha"
"""This settings contains the default suspended time (default 86400 sec / 1
day)."""
def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'):
def __init__(self, suspended_time=None, message='CAPTCHA'):
super().__init__(message=message, suspended_time=suspended_time)
@ -91,7 +100,11 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
By default, searx stops sending requests to this engine for 1 hour.
"""
def __init__(self, suspended_time=3600, message='Too many request'):
SUSPEND_TIME_SETTING = "search.suspended_times.SearxEngineTooManyRequests"
"""This settings contains the default suspended time (default 3660 sec / 1
hour)."""
def __init__(self, suspended_time=None, message='Too many request'):
super().__init__(message=message, suspended_time=suspended_time)
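
The new fallback can be exercised without passing an explicit suspended time; a minimal sketch (hypothetical call site, not part of this diff), assuming the ``search.suspended_times`` keys named above are configured:

    # raising without an explicit suspended_time makes the constructor fall
    # back to get_setting("search.suspended_times.SearxEngineTooManyRequests")
    from searx.exceptions import SearxEngineTooManyRequestsException

    try:
        raise SearxEngineTooManyRequestsException()
    except SearxEngineTooManyRequestsException as exc:
        # exc.suspended_time carries the configured default (3600 sec)
        print(exc.message, exc.suspended_time)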

View file

@ -1,5 +1,6 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from urllib.parse import quote_plus
from searx.data import EXTERNAL_BANGS
LEAF_KEY = chr(16)
@ -39,7 +40,7 @@ def get_bang_definition_and_ac(external_bangs_db, bang):
def resolve_bang_definition(bang_definition, query):
url, rank = bang_definition.split(chr(1))
url = url.replace(chr(2), query)
url = url.replace(chr(2), quote_plus(query))
if url.startswith('//'):
url = 'https:' + url
rank = int(rank) if len(rank) > 0 else 0
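
To make the control characters concrete: a bang definition is a single string in which chr(1) separates the URL template from the rank, and chr(2) marks the spot where the (now percent-encoded) query is substituted. A small sketch with a made-up definition:

    from urllib.parse import quote_plus

    bang_definition = '//example.org/?q=' + chr(2) + chr(1) + '42'  # hypothetical entry
    url, rank = bang_definition.split(chr(1))
    url = url.replace(chr(2), quote_plus('crème brûlée'))
    if url.startswith('//'):
        url = 'https:' + url
    rank = int(rank) if len(rank) > 0 else 0
    # url  -> 'https://example.org/?q=cr%C3%A8me+br%C3%BBl%C3%A9e'
    # rank -> 42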

View file

@ -30,7 +30,7 @@ def get_external_url(url_id, item_id, alternative="default"):
"""Return an external URL or None if url_id is not found.
url_id can take value from data/external_urls.json
The "imdb_id" value is automaticaly converted according to the item_id value.
The "imdb_id" value is automatically converted according to the item_id value.
If item_id is None, the raw URL with the $1 is returned.
"""

View file

@ -77,13 +77,11 @@ class InfoPage:
.. _markdown-it-py: https://github.com/executablebooks/markdown-it-py
"""
return MarkdownIt(
"commonmark", {"typographer": True}
).enable(
["replacements", "smartquotes"]
).render(self.content)
return (
MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(self.content)
)
def get_ctx(self): # pylint: disable=no-self-use
def get_ctx(self):
"""Jinja context to render :py:obj:`InfoPage.content`"""
def _md_link(name, url):
@ -136,6 +134,7 @@ class InfoPageSet: # pylint: disable=too-few-public-methods
self.toc: typing.List[str] = [
'search-syntax',
'about',
'donate',
]
"""list of articles in the online documentation"""
@ -158,10 +157,9 @@ class InfoPageSet: # pylint: disable=too-few-public-methods
return None
cache_key = (pagename, locale)
page = self.CACHE.get(cache_key)
if page is not None:
return page
if cache_key in self.CACHE:
return self.CACHE[cache_key]
# not yet instantiated
@ -184,4 +182,6 @@ class InfoPageSet: # pylint: disable=too-few-public-methods
if fallback_to_default and page is None:
page_locale = self.locale_default
page = self.get_page(page_name, self.locale_default)
yield page_name, page_locale, page
if page is not None:
# page is None if the page was deleted by the administrator
yield page_name, page_locale, page
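
The switch from ``CACHE.get(cache_key)`` to an explicit ``in`` test matters because ``None`` is a legitimate cached value here (a page deleted by the administrator); a small illustration of the difference:

    # hypothetical cache state: the page was looked up before and is absent
    CACHE = {('about', 'en'): None}

    page = CACHE.get(('about', 'en'))
    print(page is not None)          # False -> the old code would look it up again

    print(('about', 'en') in CACHE)  # True  -> the new code returns the cached None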

View file

@ -0,0 +1,88 @@
# About SearXNG
SearXNG is a [metasearch engine] that aggregates the results of other
{{link('search engines', 'preferences')}} without collecting information about
its users or passing it on to other search engines.
The SearXNG project is driven by an open community; if you have questions or
just want to chat about SearXNG, visit us on Matrix at: [#searxng:matrix.org]
Become part of the project and support SearXNG:
- You can add to or correct the SearXNG translations: [Weblate]
- or follow the development, send contributions and report bugs:
[SearXNG sources]
- More information can be found in the [SearXNG documentation].
## Why should I use SearXNG?
- SearXNG may not offer you results as personalized as Google, but it also
does not build a profile about you.
- SearXNG does not care what you search for, never shares anything with
third parties, and cannot be used to compromise you.
- SearXNG is free software, the code is 100% open, and everyone is
welcome to improve it.
If privacy matters to you, if you are a conscious user and if you believe in
digital freedom, you should make SearXNG your default search engine or run a
SearXNG instance on your own server.
## How do I set SearXNG as the default search engine?
SearXNG supports [OpenSearch]. For more information about changing your
default search engine, see the documentation of your [web browser]:
- [Firefox]
- [Microsoft Edge] - Behind the link you will also find useful hints for
Chrome and Safari.
- [Chromium]-based browsers only add websites that the user navigates to
without a path.
When adding a search engine, there must be no duplicates with the same
name. If you run into a problem where you cannot add the search engine, you
can either:
- remove the duplicate (default name: SearXNG) or
- contact the owner so that the instance gets a name other than the
default one.
## How does SearXNG work?
SearXNG is a fork of the well-known [searx] [metasearch engine], which was
inspired by the [Seeks project] (neither of these two projects is actively
developed any more). SearXNG provides basic privacy protection by mixing its
users' queries with searches on other platforms without storing any search
data. SearXNG can be added to the [web browser] as an additional or default
search engine.
The {{link('engine statistics', 'stats')}} contain some useful statistics
about the engines used.
## How can I run my own SearXNG server?
Anyone who is familiar with running web servers can set up their own
instance; the software can be obtained from the [SearXNG sources]. More
information about installation and operation can be found in the
[SearXNG documentation].
Add your instance to the [list of public
instances]({{get_setting('brand.public_instances')}}) to help other people
reclaim their privacy and make the internet freer. The more decentralized the
internet is, the more freedom we have!
[SearXNG sources]: {{GIT_URL}}
[#searxng:matrix.org]: https://matrix.to/#/#searxng:matrix.org
[SearXNG documentation]: {{get_setting('brand.docs_url')}}
[searx]: https://github.com/searx/searx
[metasearch engine]: https://de.wikipedia.org/wiki/Metasuchmaschine
[Weblate]: https://translate.codeberg.org/projects/searxng/
[Seeks project]: https://beniz.github.io/seeks/
[OpenSearch]: https://github.com/dewitt/opensearch/blob/master/opensearch-1-1-draft-6.md
[Firefox]: https://support.mozilla.org/en-US/kb/add-or-remove-search-engine-firefox
[Microsoft Edge]: https://support.microsoft.com/en-us/help/4028574/microsoft-edge-change-the-default-search-engine
[Chromium]: https://www.chromium.org/tab-to-search
[web browser]: https://de.wikipedia.org/wiki/Webbrowser

View file

@ -0,0 +1,77 @@
# Search syntax
SearXNG comes with a syntax by which the categories, engines, languages and
more of a search query can be changed. The categories, engines and
languages that are available can be found in the
{{link('preferences', 'preferences')}}.
## `!` select engine and category
The `!` prefix is used to set categories and/or engines. To give
a few examples:
- search Wikipedia for the term **paris**
- {{search('!wp paris')}}
- {{search('!wikipedia paris')}}
- search in the category **map** for the term **paris**:
- {{search('!map paris')}}
- search in the category **images**
- {{search('!images Wau Holland')}}
Abbreviations of the engines and categories are also possible and can
be combined. E.g. {{search('!map !ddg !wp paris')}} searches for the term
**paris** in the category **map** as well as with the engines DuckDuckGo
and Wikipedia.
## `:` select language
To select a language filter, use the `:` prefix. To give a
simple example:
- search Wikipedia with a custom language
- {{search(':de !wp Wau Holland')}}
## `!!` external bangs
SearXNG supports the _external bangs_ from [ddg]. The `!!` prefix can be
used to jump directly to an external search page. To give an
example:
- search Wikipedia with a custom language
- {{search('!!wde Wau Holland')}}
Please note: the search is performed directly in the external search engine.
SearXNG can only protect the user's privacy to a limited extent in this case,
yet some users find this feature very useful.
[ddg]: https://duckduckgo.com/bang
## Special queries
In the {{link('preferences', 'preferences')}} you will find keywords for
_special queries_. To give a few examples:
- random generator for a UUID
- {{search('random uuid')}}
- computing the average
- {{search('avg 123 548 2.04 24.2')}}
- show the _user agent_ of your web browser (must be enabled)
- {{search('user-agent')}}
- convert strings into various hash digests (must be enabled)
- {{search('md5 lorem ipsum')}}
- {{search('sha512 lorem ipsum')}}
View file

@ -7,7 +7,7 @@ via the search query.
To set category and/or engine names use a `!` prefix. To give a few examples:
- search in wikipedia for **paris**
- search in Wikipedia for **paris**
- {{search('!wp paris')}}
- {{search('!wikipedia paris')}}
@ -22,29 +22,29 @@ To set category and/or engine names use a `!` prefix. To give a few examples:
Abbreviations of the engines and languages are also accepted. Engine/category
modifiers are chainable and inclusive. E.g. with {{search('!map !ddg !wp
paris')}} search in map category and duckduckgo and wikipedia for **paris**.
paris')}} search in map category and DuckDuckGo and Wikipedia for **paris**.
## `:` select language
To select language filter use a `:` prefix. To give an example:
- search wikipedia by a custom language
- search Wikipedia by a custom language
- {{search(':fr !wp Wau Holland')}}
## `!!` external bangs
SearXNG supports the external bangs from [ddg]. To directly jump to a external
search page use the `!!` prefix. To give an example:
SearXNG supports the external bangs from [DuckDuckGo]. To directly jump to a
external search page use the `!!` prefix. To give an example:
- search wikipedia by a custom language
- search Wikipedia by a custom language
- {{search('!!wfr Wau Holland')}}
Please note, your search will be performed directly in the external search
engine, SearXNG cannot protect your privacy on this.
[ddg]: https://duckduckgo.com/bang
[DuckDuckGo]: https://duckduckgo.com/bang
## Special Queries

View file

@ -0,0 +1,82 @@
# About SearXNG
SearXNG is a [metasearch engine], which gets its results from other
{{link('search engines', 'preferences')}} while not tracking its users.
The SearXNG project is driven by an open community; join us on Matrix if you
have questions or just want to chat about SearXNG at
[#searxng:matrix.org]
Make SearXNG better.
- You can make the SearXNG translations better at [Weblate], or...
- Track the development, send contributions, and report issues at the
[SearXNG sources].
- For further information, visit the SearXNG project documentation at the
[SearXNG documentation].
## Why use SearXNG?
- SearXNG may not offer you results as personalized as Google, but it does
not build a profile about you.
- SearXNG does not care what you search for, never shares anything with
third parties, and cannot be used to compromise you.
- SearXNG is free software, the code is 100% open, and everyone is
welcome to make it better.
If you care about privacy, want to be a conscious user, or believe in
digital freedom, make SearXNG your default search engine or run it on your
own server!
## How do I make SearXNG the default search engine?
SearXNG supports [OpenSearch]. For more information about changing your
default search engine, see your browser's documentation:
- [Firefox]
- [Microsoft Edge] - Behind the link, you will also find some useful
instructions for Chrome and Safari.
- [Chromium]-based browsers only add websites that the user navigates to
without a path.
When adding a search engine, there must be no duplicates with the same
name. If you encounter a problem where you cannot add a search engine, you
can:
- remove the duplicate (default name: SearXNG) or
- contact the owner to give the instance a name other than the default.
## How does SearXNG work?
SearXNG is a *fork* of the well-known [searx] [metasearch engine], which was
inspired by the [Seeks project]. SearXNG provides basic privacy by mixing
your queries with searches on other *platforms* without storing search data.
SearXNG can be added to your browser's search bar; moreover, it can be set
as the default search engine.
The {{link('stats page', 'stats')}} contains some useful anonymous usage
statistics about the engines used.
## How do I make SearXNG my own?
SearXNG appreciates your concerns about logs, so take the code from the
[SearXNG sources] and run it yourself!
Add your instance to this [list of public
instances]({{get_setting('brand.public_instances')}}) to help other people
reclaim their privacy and make the internet freer. The more decentralized
the internet is, the more freedom we have!
[SearXNG sources]: {{GIT_URL}}
[#searxng:matrix.org]: https://matrix.to/#/#searxng:matrix.org
[SearXNG documentation]: {{get_setting('brand.docs_url')}}
[searx]: https://github.com/searx/searx
[metasearch engine]: https://id.wikipedia.org/wiki/Mesin_pencari_web#Mesin_Pencari_dan_Mesin_Pencari-meta
[Weblate]: https://translate.codeberg.org/projects/searxng/
[Seeks project]: https://beniz.github.io/seeks/
[OpenSearch]: https://github.com/dewitt/opensearch/blob/master/opensearch-1-1-draft-6.md
[Firefox]: https://support.mozilla.org/id/kb/add-or-remove-search-engine-firefox
[Microsoft Edge]: https://support.microsoft.com/id-id/microsoft-edge/ubah-mesin-pencarian-default-anda-f863c519-5994-a8ed-6859-00fbc123b782
[Chromium]: https://www.chromium.org/tab-to-search
View file

@ -0,0 +1,73 @@
# Search syntax
SearXNG has a search syntax that allows you to change the categories,
engines, languages and more. See the {{link('preferences', 'preferences')}}
for the list of engines, categories and languages.
## `!` select engine and category
To set category and/or engine names use the `!` prefix.
For example:
- search Wikipedia for **Jakarta**
- {{search('!wp Jakarta')}}
- {{search('!wikipedia Jakarta')}}
- search in the **map** category for **Jakarta**
- {{search('!map Jakarta')}}
- image search
- {{search('!images kucing')}}
Abbreviations of the engines and languages are also accepted. Engine/category
modifiers are chainable and inclusive. E.g. the search {{search('!map !ddg !wp
Jakarta')}} searches in the map category and with DuckDuckGo and Wikipedia for
**Jakarta**.
## `:` select language
To select a language filter use the `:` prefix. For example:
- search Wikipedia in another language
- {{search(':en !wp Jakarta')}}
## `!!` external search engines (*bangs*)
SearXNG supports external search engines (*bangs*) from [DuckDuckGo]. To
jump directly to an external search page use the `!!` prefix.
For example:
- search Wikipedia in another language
- {{search('!!wen cat')}}
Remember, your search will be performed directly in the external search
engine; SearXNG cannot protect your privacy there.
[DuckDuckGo]: https://duckduckgo.com/bang
## Special queries
In the {{link('preferences', 'preferences')}} page you will find keywords
for _special queries_. For example:
- generate a random UUID
- {{search('random uuid')}}
- find the average
- {{search('avg 123 548 2.04 24.2')}}
- show the _user agent_ of your browser (must be enabled)
- {{search('user-agent')}}
- convert a _string_ (text) to different *hash* digests (must be enabled)
- {{search('md5 kucing sphynx')}}
- {{search('sha512 kucing sphynx')}}

View file

@ -9,20 +9,112 @@ import os
import pathlib
from babel import Locale
from babel.support import Translations
import babel.languages
import babel.core
import flask_babel
import flask
from flask.ctx import has_request_context
from searx import logger
LOCALE_NAMES = {
"oc": "Occitan",
"nl-BE": "Vlaams (Dutch, Belgium)",
}
"""Mapping of locales and their description. Locales e.g. 'fr' or 'pt-BR'
(delimiter is *underline* '-')"""
logger = logger.getChild('locales')
# save flask_babel.get_translations before monkey patching it
_flask_babel_get_translations = flask_babel.get_translations
LOCALE_NAMES = {}
"""Mapping of locales and their description. Locales e.g. 'fr' or 'pt-BR' (see
:py:obj:`locales_initialize`).
:meta hide-value:
"""
RTL_LOCALES: Set[str] = set()
"""List of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (delimiter is
*underline* '-')"""
"""List of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (see
:py:obj:`locales_initialize`)."""
ADDITIONAL_TRANSLATIONS = {
"dv": "ދިވެހި (Dhivehi)",
"oc": "Occitan",
"szl": "Ślōnski (Silesian)",
"pap": "Papiamento",
}
"""Additional languages SearXNG has translations for but not supported by
python-babel (see :py:obj:`locales_initialize`)."""
LOCALE_BEST_MATCH = {
"dv": "si",
"oc": 'fr-FR',
"szl": "pl",
"nl-BE": "nl",
"zh-HK": "zh-Hant-TW",
"pap": "pt-BR",
}
"""Map a locale we do not have a translations for to a locale we have a
translation for. By example: use Taiwan version of the translation for Hong
Kong."""
def _get_name(locale, language_code):
def localeselector():
locale = 'en'
if has_request_context():
value = flask.request.preferences.get_value('locale')
if value:
locale = value
# first, set the language that is not supported by babel
if locale in ADDITIONAL_TRANSLATIONS:
flask.request.form['use-translation'] = locale
# second, map locale to a value python-babel supports
locale = LOCALE_BEST_MATCH.get(locale, locale)
if locale == '':
# if there is an error loading the preferences
# the locale is going to be ''
locale = 'en'
# babel uses underscore instead of hyphen.
locale = locale.replace('-', '_')
return locale
def get_translations():
"""Monkey patch of :py:obj:`flask_babel.get_translations`"""
if has_request_context():
use_translation = flask.request.form.get('use-translation')
if use_translation in ADDITIONAL_TRANSLATIONS:
babel_ext = flask_babel.current_app.extensions['babel']
return Translations.load(babel_ext.translation_directories[0], use_translation)
return _flask_babel_get_translations()
def get_locale_descr(locale, locale_name):
"""Get locale name e.g. 'Français - fr' or 'Português (Brasil) - pt-BR'
:param locale: instance of :py:class:`Locale`
:param locale_name: name e.g. 'fr' or 'pt_BR' (delimiter is *underscore*)
"""
native_language, native_territory = _get_locale_descr(locale, locale_name)
english_language, english_territory = _get_locale_descr(locale, 'en')
if native_territory == english_territory:
english_territory = None
if not native_territory and not english_territory:
if native_language == english_language:
return native_language
return native_language + ' (' + english_language + ')'
result = native_language + ', ' + native_territory + ' (' + english_language
if english_territory:
return result + ', ' + english_territory + ')'
return result + ')'
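
For illustration, some outputs the rules above should produce (hypothetical, depending on babel's CLDR data):

    from babel import Locale

    get_locale_descr(Locale.parse('fr_FR'), 'fr_FR')
    # -> 'Français, France (French)' - territories match, the English one is dropped
    get_locale_descr(Locale.parse('pt_BR'), 'pt_BR')
    # -> 'Português, Brasil (Portuguese, Brazil)'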
def _get_locale_descr(locale, language_code):
language_name = locale.get_language_name(language_code).capitalize()
if language_name and ('a' <= language_name[0] <= 'z'):
language_name = language_name.capitalize()
@ -30,39 +122,184 @@ def _get_name(locale, language_code):
return language_name, terrirtory_name
def _get_locale_name(locale, locale_name):
"""Get locale name e.g. 'Français - fr' or 'Português (Brasil) - pt-BR'
def locales_initialize(directory=None):
"""Initialize locales environment of the SearXNG session.
:param locale: instance of :py:class:`Locale`
:param locale_name: name e.g. 'fr' or 'pt_BR' (delimiter is *underscore*)
- monkey patch :py:obj:`flask_babel.get_translations` by :py:obj:`get_translations`
- init global names :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`
"""
native_language, native_territory = _get_name(locale, locale_name)
english_language, english_territory = _get_name(locale, 'en')
if native_territory == english_territory:
english_territory = None
if not native_territory and not english_territory:
if native_language == english_language:
return native_language
return native_language + ' (' + english_language + ')'
result = native_language + ', ' + native_territory + ' (' + english_language
if english_territory:
return result + ', ' + english_territory + ')'
return result + ')'
directory = directory or pathlib.Path(__file__).parent / 'translations'
logger.debug("locales_initialize: %s", directory)
flask_babel.get_translations = get_translations
for tag, descr in ADDITIONAL_TRANSLATIONS.items():
locale = Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
LOCALE_NAMES[tag] = descr
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
for tag in LOCALE_BEST_MATCH:
descr = LOCALE_NAMES.get(tag)
if not descr:
locale = Locale.parse(tag, sep='-')
LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
def initialize_locales(directory):
"""Initialize global names :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`."""
for dirname in sorted(os.listdir(directory)):
# Based on https://flask-babel.tkte.ch/_modules/flask_babel.html#Babel.list_translations
if not os.path.isdir(os.path.join(directory, dirname, 'LC_MESSAGES')):
continue
locale_name = dirname.replace('_', '-')
info = LOCALE_NAMES.get(locale_name)
if not info:
tag = dirname.replace('_', '-')
descr = LOCALE_NAMES.get(tag)
if not descr:
locale = Locale.parse(dirname)
LOCALE_NAMES[locale_name] = _get_locale_name(locale, dirname)
LOCALE_NAMES[tag] = get_locale_descr(locale, dirname)
if locale.text_direction == 'rtl':
RTL_LOCALES.add(locale_name)
RTL_LOCALES.add(tag)
initialize_locales(pathlib.Path(__file__).parent / 'translations')
def get_engine_locale(searxng_locale, engine_locales, default=None):
"""Return engine's language (aka locale) string that best fits to argument
``searxng_locale``.
Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
corresponding *engine locales*::
<engine>: {
# SearXNG string : engine-string
'ca-ES' : 'ca_ES',
'fr-BE' : 'fr_BE',
'fr-CA' : 'fr_CA',
'fr-CH' : 'fr_CH',
'fr' : 'fr_FR',
...
'pl-PL' : 'pl_PL',
'pt-PT' : 'pt_PT'
}
.. hint::
The *SearXNG locale* string has to be known by babel!
If there is no direct 1:1 mapping, this function tries to narrow down the
engine's language (locale). If no value can be determined by these
approximation attempts, the ``default`` value is returned.
Assumptions:
A. When the user selects a language, the results should be optimized
according to the selected language.
B. When the user selects a language and a territory, the results should be
optimized with first priority on territory and second on language.
First approximation rule (*by territory*):
When the user selects a locale with a territory (and a language), the
territory has priority over the language. If any of the official languages
in the territory is supported by the engine (``engine_locales``), it will
be used.
Second approximation rule (*by language*):
If the "First approximation rule" brings no result, or the user selects only
a language without a territory, check in which territories the language
has an official status and whether one of these territories is supported by
the engine.
"""
# pylint: disable=too-many-branches
engine_locale = engine_locales.get(searxng_locale)
if engine_locale is not None:
# There was a 1:1 mapping (e.g. "fr-BE --> fr_BE" or "fr --> fr_FR"), no
# need to narrow language nor territory.
return engine_locale
try:
locale = babel.Locale.parse(searxng_locale, sep='-')
except babel.core.UnknownLocaleError:
try:
locale = babel.Locale.parse(searxng_locale.split('-')[0])
except babel.core.UnknownLocaleError:
return default
# SearXNG's selected locale is not supported by the engine ..
if locale.territory:
# Try to narrow by *official* languages in the territory (??-XX).
for official_language in babel.languages.get_official_languages(locale.territory, de_facto=True):
searxng_locale = official_language + '-' + locale.territory
engine_locale = engine_locales.get(searxng_locale)
if engine_locale is not None:
return engine_locale
# Engine does not support one of the official languages in the territory or
# there is only a language selected without a territory.
# Now let's have a look if the searxng_lang (the language selected by the
# user) is an official language in other territories. If so, check if the
# engine supports the searxng_lang in this other territory.
if locale.language:
searxng_lang = locale.language
if locale.script:
searxng_lang += '_' + locale.script
terr_lang_dict = {}
for territory, langs in babel.core.get_global("territory_languages").items():
if not langs.get(searxng_lang, {}).get('official_status'):
continue
terr_lang_dict[territory] = langs.get(searxng_lang)
# first: check fr-FR, de-DE .. is supported by the engine
# exception: 'en' --> 'en-US'
territory = locale.language.upper()
if territory == 'EN':
territory = 'US'
if terr_lang_dict.get(territory):
searxng_locale = locale.language + '-' + territory
engine_locale = engine_locales.get(searxng_locale)
if engine_locale is not None:
return engine_locale
# second: sort by population_percent and take first match
# drawback of "population percent": if there is a terrirtory with a
# small number of people (e.g 100) but the majority speaks the
# language, then the percentage migth be 100% (--> 100 people) but in
# a different terrirtory with more people (e.g. 10.000) where only 10%
# speak the language the total amount of speaker is higher (--> 200
# people).
#
# By example: The population of Saint-Martin is 33.000, of which 100%
# speak French, but this is less than the 30% of the approximately 2.5
# million Belgian citizens
#
# - 'fr-MF', 'population_percent': 100.0, 'official_status': 'official'
# - 'fr-BE', 'population_percent': 38.0, 'official_status': 'official'
terr_lang_list = []
for k, v in terr_lang_dict.items():
terr_lang_list.append((k, v))
for territory, _lang in sorted(terr_lang_list, key=lambda item: item[1]['population_percent'], reverse=True):
searxng_locale = locale.language + '-' + territory
engine_locale = engine_locales.get(searxng_locale)
if engine_locale is not None:
return engine_locale
# No luck: narrow by "language from territory" and "territory from language"
# does not fit to a locale supported by the engine.
if engine_locale is None:
engine_locale = default
return default
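
A usage sketch of the narrowing rules (the mapping is the one from the docstring; the expected results are hypothetical walk-throughs of the rules above):

    engine_locales = {
        'ca-ES': 'ca_ES',
        'fr-BE': 'fr_BE', 'fr-CA': 'fr_CA', 'fr-CH': 'fr_CH', 'fr': 'fr_FR',
        'pl-PL': 'pl_PL', 'pt-PT': 'pt_PT',
    }
    get_engine_locale('fr-BE', engine_locales)             # 'fr_BE', direct 1:1 hit
    get_engine_locale('de-CH', engine_locales)             # 'fr_CH': no German for this
    # engine, but French is an official language in CH (first approximation rule)
    get_engine_locale('xx', engine_locales, default='en')  # 'en', nothing matches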

View file

@ -160,6 +160,7 @@ def get_reliabilities(engline_name_list, checker_results):
# even if there is no exception
reliablity = 0
else:
# pylint: disable=consider-using-generator
reliablity = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')])
reliabilities[engine_name] = {

View file

@ -8,12 +8,13 @@ import concurrent.futures
from queue import SimpleQueue
from types import MethodType
from timeit import default_timer
from typing import Iterable, Tuple
from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
from contextlib import contextmanager
import httpx
import anyio
from .network import get_network, initialize, check_network_configuration
from .network import get_network, initialize, check_network_configuration # pylint:disable=cyclic-import
from .client import get_loop
from .raise_for_httperror import raise_for_httperror
@ -48,9 +49,23 @@ def get_context_network():
return THREADLOCAL.__dict__.get('network') or get_network()
def request(method, url, **kwargs):
"""same as requests/requests/api.py request(...)"""
@contextmanager
def _record_http_time():
# pylint: disable=too-many-branches
time_before_request = default_timer()
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
try:
yield start_time
finally:
# update total_time.
# See get_time_for_thread() and reset_time_for_thread()
if hasattr(THREADLOCAL, 'total_time'):
time_after_request = default_timer()
THREADLOCAL.total_time += time_after_request - time_before_request
def _get_timeout(start_time, kwargs):
# pylint: disable=too-many-branches
# timeout (httpx)
if 'timeout' in kwargs:
@ -65,45 +80,84 @@ def request(method, url, **kwargs):
# adjust actual timeout
timeout += 0.2 # overhead
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
if start_time:
timeout -= default_timer() - start_time
# raise_for_error
check_for_httperror = True
if 'raise_for_httperror' in kwargs:
check_for_httperror = kwargs['raise_for_httperror']
del kwargs['raise_for_httperror']
return timeout
# requests compatibility
if isinstance(url, bytes):
url = url.decode()
# network
network = get_context_network()
def request(method, url, **kwargs):
"""same as requests/requests/api.py request(...)"""
with _record_http_time() as start_time:
network = get_context_network()
timeout = _get_timeout(start_time, kwargs)
future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
try:
return future.result(timeout)
except concurrent.futures.TimeoutError as e:
raise httpx.TimeoutException('Timeout', request=None) from e
# do request
future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
try:
response = future.result(timeout)
except concurrent.futures.TimeoutError as e:
raise httpx.TimeoutException('Timeout', request=None) from e
# requests compatibility
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
response.ok = not response.is_error
def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
"""send multiple HTTP requests in parallel. Wait for all requests to finish."""
with _record_http_time() as start_time:
# send the requests
network = get_context_network()
loop = get_loop()
future_list = []
for request_desc in request_list:
timeout = _get_timeout(start_time, request_desc.kwargs)
future = asyncio.run_coroutine_threadsafe(
network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
)
future_list.append((future, timeout))
# update total_time.
# See get_time_for_thread() and reset_time_for_thread()
if hasattr(THREADLOCAL, 'total_time'):
time_after_request = default_timer()
THREADLOCAL.total_time += time_after_request - time_before_request
# read the responses
responses = []
for future, timeout in future_list:
try:
responses.append(future.result(timeout))
except concurrent.futures.TimeoutError:
responses.append(httpx.TimeoutException('Timeout', request=None))
except Exception as e: # pylint: disable=broad-except
responses.append(e)
return responses
# raise an exception
if check_for_httperror:
raise_for_httperror(response)
return response
class Request(NamedTuple):
"""Request description for the multi_requests function"""
method: str
url: str
kwargs: Dict[str, str] = {}
@staticmethod
def get(url, **kwargs):
return Request('GET', url, kwargs)
@staticmethod
def options(url, **kwargs):
return Request('OPTIONS', url, kwargs)
@staticmethod
def head(url, **kwargs):
return Request('HEAD', url, kwargs)
@staticmethod
def post(url, **kwargs):
return Request('POST', url, kwargs)
@staticmethod
def put(url, **kwargs):
return Request('PUT', url, kwargs)
@staticmethod
def patch(url, **kwargs):
return Request('PATCH', url, kwargs)
@staticmethod
def delete(url, **kwargs):
return Request('DELETE', url, kwargs)
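
A sketch of how the new helper is meant to be called from an engine (URLs and payload are placeholders):

    from searx.network import multi_requests, Request

    # fire both requests in parallel and wait for all of them
    responses = multi_requests([
        Request.get('https://example.org/api?q=foo'),
        Request.post('https://example.org/token', data={'grant': 'anonymous'}),
    ])
    for resp in responses:
        if isinstance(resp, Exception):
            continue  # timeouts and transport errors come back as exception objects
        print(resp.status_code, resp.ok)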
def get(url, **kwargs):

View file

@ -26,9 +26,6 @@ else:
logger = logger.getChild('searx.network.client')
LOOP = None
SSLCONTEXTS: Dict[Any, SSLContext] = {}
TRANSPORT_KWARGS = {
'trust_env': False,
}
def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False):
@ -74,7 +71,7 @@ def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit
rdns = True
proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url)
verify = get_sslcontexts(proxy_url, None, True, False, http2) if verify is True else verify
verify = get_sslcontexts(proxy_url, None, verify, True, http2) if verify is True else verify
return AsyncProxyTransportFixed(
proxy_type=proxy_type,
proxy_host=proxy_host,
@ -88,12 +85,11 @@ def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit
local_address=local_address,
limits=limit,
retries=retries,
**TRANSPORT_KWARGS,
)
def get_transport(verify, http2, local_address, proxy_url, limit, retries):
verify = get_sslcontexts(None, None, True, False, http2) if verify is True else verify
verify = get_sslcontexts(None, None, verify, True, http2) if verify is True else verify
return httpx.AsyncHTTPTransport(
# pylint: disable=protected-access
verify=verify,
@ -102,7 +98,6 @@ def get_transport(verify, http2, local_address, proxy_url, limit, retries):
proxy=httpx._config.Proxy(proxy_url) if proxy_url else None,
local_address=local_address,
retries=retries,
**TRANSPORT_KWARGS,
)

View file

@ -13,6 +13,7 @@ import httpx
from searx import logger, searx_debug
from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
from .raise_for_httperror import raise_for_httperror
logger = logger.getChild('network')
@ -172,7 +173,7 @@ class Network:
):
continue
return False
response = await client.get("https://check.torproject.org/api/ip", timeout=10)
response = await client.get("https://check.torproject.org/api/ip", timeout=60)
if not response.json()["IsTor"]:
result = False
Network._TOR_CHECK_RESULT[proxies] = result
@ -226,6 +227,27 @@ class Network:
kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
return kwargs_clients
@staticmethod
def extract_do_raise_for_httperror(kwargs):
do_raise_for_httperror = True
if 'raise_for_httperror' in kwargs:
do_raise_for_httperror = kwargs['raise_for_httperror']
del kwargs['raise_for_httperror']
return do_raise_for_httperror
@staticmethod
def patch_response(response, do_raise_for_httperror):
if isinstance(response, httpx.Response):
# requests compatibility (response is not streamed)
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
response.ok = not response.is_error
# raise an exception
if do_raise_for_httperror:
raise_for_httperror(response)
return response
def is_valid_response(self, response):
# pylint: disable=too-many-boolean-expressions
if (
@ -239,6 +261,7 @@ class Network:
async def call_client(self, stream, method, url, **kwargs):
retries = self.retries
was_disconnected = False
do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
kwargs_clients = Network.extract_kwargs_clients(kwargs)
while retries >= 0: # pragma: no cover
client = await self.get_client(**kwargs_clients)
@ -248,7 +271,7 @@ class Network:
else:
response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0:
return response
return Network.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e:
if not was_disconnected:
# the server has closed the connection:
@ -311,7 +334,7 @@ def initialize(settings_engines=None, settings_outgoing=None):
# see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # pylint: disable=line-too-long
default_params = {
'enable_http': False,
'verify': True,
'verify': settings_outgoing['verify'],
'enable_http2': settings_outgoing['enable_http2'],
'max_connections': settings_outgoing['pool_connections'],
'max_keepalive_connections': settings_outgoing['pool_maxsize'],

View file

@ -9,6 +9,7 @@ from searx.exceptions import (
SearxEngineTooManyRequestsException,
SearxEngineAccessDeniedException,
)
from searx import get_setting
def is_cloudflare_challenge(resp):
@ -33,15 +34,22 @@ def raise_for_cloudflare_captcha(resp):
if is_cloudflare_challenge(resp):
# https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
# suspend for 2 weeks
raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15)
raise SearxEngineCaptchaException(
message='Cloudflare CAPTCHA', suspended_time=get_setting('search.suspended_times.cf_SearxEngineCaptcha')
)
if is_cloudflare_firewall(resp):
raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24)
raise SearxEngineAccessDeniedException(
message='Cloudflare Firewall',
suspended_time=get_setting('search.suspended_times.cf_SearxEngineAccessDenied'),
)
def raise_for_recaptcha(resp):
if resp.status_code == 503 and '"https://www.google.com/recaptcha/' in resp.text:
raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7)
raise SearxEngineCaptchaException(
message='ReCAPTCHA', suspended_time=get_setting('search.suspended_times.recaptcha_SearxEngineCaptcha')
)
def raise_for_captcha(resp):
@ -64,9 +72,7 @@ def raise_for_httperror(resp):
if resp.status_code and resp.status_code >= 400:
raise_for_captcha(resp)
if resp.status_code in (402, 403):
raise SearxEngineAccessDeniedException(
message='HTTP error ' + str(resp.status_code), suspended_time=3600 * 24
)
raise SearxEngineAccessDeniedException(message='HTTP error ' + str(resp.status_code))
if resp.status_code == 429:
raise SearxEngineTooManyRequestsException()
resp.raise_for_status()

View file

@ -198,7 +198,6 @@ class PluginStore:
self.plugins.append(plugin)
def call(self, ordered_plugin_list, plugin_type, *args, **kwargs):
# pylint: disable=no-self-use
ret = True
for plugin in ordered_plugin_list:
if hasattr(plugin, plugin_type):

View file

@ -0,0 +1,97 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Plugin to detect the search language from the search query.
The language detection is done by using the fastText_ library (`python
fasttext`_). fastText_ distributes the `language identification model`_, for
reference:
- `FastText.zip: Compressing text classification models`_
- `Bag of Tricks for Efficient Text Classification`_
The `language identification model`_ supports the language codes (ISO-639-3)::
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
vi vls vo wa war wuu xal xmf yi yo yue zh
The `language identification model`_ is harmonized with SearXNG's language
(locale) model. General conditions of SearXNG's locale model are:
a. SearXNG's locale of a query is passed to
:py:obj:`searx.locales.get_engine_locale` to get a language and/or region
code that is used by an engine.
b. SearXNG and most of the engines do not support all the languages from the
language model, and there might also be a discrepancy in the ISO-639-3 and
ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`). Furthermore,
in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to
``zh_Hant`` (``zh_Hans``).
Conclusion: This plugin only auto-detects the languages a user can select in
the language menu (:py:obj:`supported_langs`).
SearXNG's locale of a query comes from (*highest wins*):
1. The ``Accept-Language`` header from user's HTTP client.
2. The user selects a locale in the preferences.
3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``)
4. This plugin is activated in the preferences and the locale (only the language
code, no region code) comes from fastText's language detection.
Conclusion: There is a conflict between the language selected by the user and
the language from language detection of this plugin. For example, the user
explicitly selects the German locale via the search syntax to search for a term
that is identified as an English term (try ``:de-DE thermomix``, for example).
.. hint::
To SearXNG maintainers; please take into account: under some circumstances
the auto-detection of the language of this plugin could be detrimental to
users' expectations. It's not recommended to activate this plugin by
default. It should always be the user's decision whether to activate this
plugin or not.
.. _fastText: https://fasttext.cc/
.. _python fasttext: https://pypi.org/project/fasttext/
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
"""
from flask_babel import gettext
import babel
from searx.utils import detect_language
from searx.languages import language_codes
name = gettext('Autodetect search language')
description = gettext('Automatically detect the query search language and switch to it.')
preference_section = 'general'
default_on = False
supported_langs = set()
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
def pre_search(request, search): # pylint: disable=unused-argument
lang = detect_language(search.search_query.query, min_probability=0)
if lang in supported_langs:
search.search_query.lang = lang
try:
search.search_query.locale = babel.Locale.parse(lang)
except babel.core.UnknownLocaleError:
pass
return True
def init(app, settings): # pylint: disable=unused-argument
for searxng_locale in language_codes:
supported_langs.add(searxng_locale[0].split('-')[0])
return True
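
For illustration, what the pre_search hook does with a raw query (the min_probability=0 call mirrors the hook above; the detected code is only applied when it is in supported_langs):

    from searx.utils import detect_language

    # fastText classifies the raw query text and returns an ISO-639 code
    lang = detect_language('salut, comment allez-vous ?', min_probability=0)
    print(lang)  # expected: 'fr'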

View file

@ -3,7 +3,7 @@
# pyright: basic
"""Some bot protection / rate limitation
To monitore rate limits and protect privacy the IP addresses are getting stored
To monitor rate limits and protect privacy the IP addresses are getting stored
with a hash so the limiter plugin knows who to block. A redis database is
needed to store the hash values.
@ -13,11 +13,11 @@ Enable the plugin in ``settings.yml``:
- ``redis.url: ...`` check the value, see :ref:`settings redis`
"""
import hmac
import re
from flask import request
from searx.shared import redisdb
from searx import redisdb
from searx.redislib import incr_sliding_window
name = "Request limiter"
description = "Limit the number of request"
@ -36,8 +36,9 @@ re_bot = re.compile(
)
def is_accepted_request(inc_get_counter) -> bool:
def is_accepted_request() -> bool:
# pylint: disable=too-many-return-statements
redis_client = redisdb.client()
user_agent = request.headers.get('User-Agent', '')
x_forwarded_for = request.headers.get('X-Forwarded-For', '')
@ -47,83 +48,54 @@ def is_accepted_request(inc_get_counter) -> bool:
return True
if request.path == '/search':
c_burst = inc_get_counter(interval=20, keys=[b'IP limit, burst', x_forwarded_for])
c_10min = inc_get_counter(interval=600, keys=[b'IP limit, 10 minutes', x_forwarded_for])
c_burst = incr_sliding_window(redis_client, 'IP limit, burst' + x_forwarded_for, 20)
c_10min = incr_sliding_window(redis_client, 'IP limit, 10 minutes' + x_forwarded_for, 600)
if c_burst > 15 or c_10min > 150:
logger.debug("to many request") # pylint: disable=undefined-variable
return False
if re_bot.match(user_agent):
logger.debug("detected bot") # pylint: disable=undefined-variable
return False
if len(request.headers.get('Accept-Language', '').strip()) == 0:
logger.debug("missing Accept-Language") # pylint: disable=undefined-variable
return False
if request.headers.get('Connection') == 'close':
logger.debug("got Connection=close") # pylint: disable=undefined-variable
return False
accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if 'gzip' not in accept_encoding_list or 'deflate' not in accept_encoding_list:
if 'gzip' not in accept_encoding_list and 'deflate' not in accept_encoding_list:
logger.debug("suspicious Accept-Encoding") # pylint: disable=undefined-variable
return False
if 'text/html' not in request.accept_mimetypes:
logger.debug("Accept-Encoding misses text/html") # pylint: disable=undefined-variable
return False
if request.args.get('format', 'html') != 'html':
c = inc_get_counter(interval=3600, keys=[b'API limit', x_forwarded_for])
c = incr_sliding_window(redis_client, 'API limit' + x_forwarded_for, 3600)
if c > 4:
logger.debug("API limit exceeded") # pylint: disable=undefined-variable
return False
return True
def create_inc_get_counter(redis_client, secret_key_bytes):
lua_script = """
local slidingWindow = KEYS[1]
local key = KEYS[2]
local now = tonumber(redis.call('TIME')[1])
local id = redis.call('INCR', 'counter')
if (id > 2^46)
then
redis.call('SET', 'count', 0)
end
redis.call('ZREMRANGEBYSCORE', key, 0, now - slidingWindow)
redis.call('ZADD', key, now, id)
local result = redis.call('ZCOUNT', key, 0, now+1)
redis.call('EXPIRE', key, slidingWindow)
return result
"""
script_sha = redis_client.script_load(lua_script)
def inc_get_counter(interval, keys):
m = hmac.new(secret_key_bytes, digestmod='sha256')
for k in keys:
m.update(bytes(str(k), encoding='utf-8') or b'')
m.update(b"\0")
key = m.digest()
return redis_client.evalsha(script_sha, 2, interval, key)
return inc_get_counter
def create_pre_request(get_aggregation_count):
def pre_request():
if not is_accepted_request(get_aggregation_count):
return '', 429
return None
return pre_request
def pre_request():
if not is_accepted_request():
return 'Too Many Requests', 429
return None
def init(app, settings):
if not settings['server']['limiter']:
return False
logger.debug("init limiter DB") # pylint: disable=undefined-variable
if not redisdb.init():
logger.error("init limiter DB failed!!!") # pylint: disable=undefined-variable
if not redisdb.client():
logger.error("The limiter requires Redis") # pylint: disable=undefined-variable
return False
redis_client = redisdb.client()
secret_key_bytes = bytes(settings['server']['secret_key'], encoding='utf-8')
inc_get_counter = create_inc_get_counter(redis_client, secret_key_bytes)
app.before_request(create_pre_request(inc_get_counter))
app.before_request(pre_request)
return True
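
The counters above rest on searx.redislib.incr_sliding_window (added later in this diff); a sketch of the call pattern, assuming a reachable Redis:

    from searx import redisdb
    from searx.redislib import incr_sliding_window

    redisdb.initialize()
    client = redisdb.client()

    # count the requests of one client over a 600 second sliding window
    c_10min = incr_sliding_window(client, 'IP limit, 10 minutes' + '203.0.113.7', 600)
    if c_10min > 150:
        print('would answer: Too Many Requests (429)')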

View file

@ -42,4 +42,6 @@ def on_result(request, search, result):
doi = doi[: -len(suffix)]
result['url'] = get_doi_resolver(request.preferences) + doi
result['parsed_url'] = urlparse(result['url'])
if 'doi' not in result:
result['doi'] = doi
return True

View file

@ -18,9 +18,7 @@ from flask_babel import gettext
name = gettext('Search on category select')
description = gettext(
'Perform search immediately if a category selected. ' 'Disable to select multiple categories. (JavaScript required)'
'Perform search immediately if a category selected. Disable to select multiple categories. (JavaScript required)'
)
default_on = True
preference_section = 'ui'
js_dependencies = ('plugins/js/search_on_category_select.js',)

View file

@ -17,7 +17,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
from flask_babel import gettext
import re
name = gettext('Self Informations')
name = gettext('Self Information')
description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".')
default_on = True
preference_section = 'query'

View file

@ -0,0 +1,92 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""A plugin to check if the ip address of the request is a Tor exit-node if the
user searches for ``tor-check``. It fetches the tor exit node list from
https://check.torproject.org/exit-addresses and parses all the IPs into a list,
then checks if the user's IP address is in it.
Enable in ``settings.yml``:
.. code:: yaml
enabled_plugins:
..
- 'Tor check plugin'
"""
import re
from flask_babel import gettext
from httpx import HTTPError
from searx.network import get
default_on = False
name = gettext("Tor check plugin")
'''Translated name of the plugin'''
description = gettext(
"This plugin checks if the address of the request is a Tor exit-node, and"
" informs the user if it is; like check.torproject.org, but from SearXNG."
)
'''Translated description of the plugin.'''
preference_section = 'query'
'''The preference section where the plugin is shown.'''
query_keywords = ['tor-check']
'''Query keywords shown in the preferences.'''
query_examples = ''
'''Query examples shown in the preferences.'''
# Regex for exit node addresses in the list.
reg = re.compile(r"(?<=ExitAddress )\S+")
def post_search(request, search):
if search.search_query.pageno > 1:
return True
if search.search_query.query.lower() == "tor-check":
# Request the list of tor exit nodes.
try:
resp = get("https://check.torproject.org/exit-addresses")
node_list = re.findall(reg, resp.text)
except HTTPError:
# No answer, return error
search.result_container.answers["tor"] = {
"answer": gettext(
"Could not download the list of Tor exit-nodes from: https://check.torproject.org/exit-addresses"
)
}
return True
x_forwarded_for = request.headers.getlist("X-Forwarded-For")
if x_forwarded_for:
ip_address = x_forwarded_for[0]
else:
ip_address = request.remote_addr
if ip_address in node_list:
search.result_container.answers["tor"] = {
"answer": gettext(
"You are using Tor and it looks like you have this external IP address: {ip_address}".format(
ip_address=ip_address
)
)
}
else:
search.result_container.answers["tor"] = {
"answer": gettext(
"You are not using Tor and you have this external IP address: {ip_address}".format(
ip_address=ip_address
)
)
}
return True

View file

@ -8,6 +8,3 @@ description = gettext(
)
default_on = False
preference_section = 'ui'
js_dependencies = ('plugins/js/vim_hotkeys.js',)
css_dependencies = ('plugins/css/vim_hotkeys.css',)

View file

@ -52,7 +52,7 @@ class Setting:
return self.value
def save(self, name: str, resp: flask.Response):
"""Save cookie ``name`` in the HTTP reponse obect
"""Save cookie ``name`` in the HTTP response object
If needed, its overwritten in the inheritance."""
resp.set_cookie(name, self.value, max_age=COOKIE_MAX_AGE)
@ -113,7 +113,7 @@ class MultipleChoiceSetting(Setting):
self.value.append(choice)
def save(self, name: str, resp: flask.Response):
"""Save cookie ``name`` in the HTTP reponse obect"""
"""Save cookie ``name`` in the HTTP response object"""
resp.set_cookie(name, ','.join(self.value), max_age=COOKIE_MAX_AGE)
@ -146,7 +146,7 @@ class SetSetting(Setting):
self.values = set(elements)
def save(self, name: str, resp: flask.Response):
"""Save cookie ``name`` in the HTTP reponse obect"""
"""Save cookie ``name`` in the HTTP response object"""
resp.set_cookie(name, ','.join(self.values), max_age=COOKIE_MAX_AGE)
@ -193,7 +193,7 @@ class MapSetting(Setting):
self.key = data # pylint: disable=attribute-defined-outside-init
def save(self, name: str, resp: flask.Response):
"""Save cookie ``name`` in the HTTP reponse obect"""
"""Save cookie ``name`` in the HTTP response object"""
if hasattr(self, 'key'):
resp.set_cookie(name, self.key, max_age=COOKIE_MAX_AGE)
@ -208,11 +208,9 @@ class BooleanChoices:
self.default_choices = dict(choices)
def transform_form_items(self, items):
# pylint: disable=no-self-use
return items
def transform_values(self, values):
# pylint: disable=no-self-use
return values
def parse_cookie(self, data_disabled: str, data_enabled: str):
@ -241,7 +239,7 @@ class BooleanChoices:
return (k for k, v in self.choices.items() if not v)
def save(self, resp: flask.Response):
"""Save cookie in the HTTP reponse obect"""
"""Save cookie in the HTTP response object"""
disabled_changed = (k for k in self.disabled if self.default_choices[k])
enabled_changed = (k for k in self.enabled if not self.default_choices[k])
resp.set_cookie('disabled_{0}'.format(self.name), ','.join(disabled_changed), max_age=COOKIE_MAX_AGE)
@ -367,6 +365,16 @@ class Preferences:
locked=is_locked('simple_style'),
choices=['', 'auto', 'light', 'dark']
),
'center_alignment': MapSetting(
settings['ui']['center_alignment'],
locked=is_locked('center_alignment'),
map={
'0': False,
'1': True,
'False': False,
'True': True
}
),
'advanced_search': MapSetting(
settings['ui']['advanced_search'],
locked=is_locked('advanced_search'),
@ -433,7 +441,7 @@ class Preferences:
"""parse (base64) preferences from request (``flask.request.form['preferences']``)"""
bin_data = decompress(urlsafe_b64decode(input_data))
dict_data = {}
for x, y in parse_qs(bin_data.decode('ascii')).items():
for x, y in parse_qs(bin_data.decode('ascii'), keep_blank_values=True).items():
dict_data[x] = y[0]
self.parse_dict(dict_data)
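
The keep_blank_values flag matters because an empty preference value (e.g. a cleared setting) would otherwise be dropped from the round-trip; a quick illustration:

    from urllib.parse import parse_qs

    qs = 'theme=simple&tokens='
    parse_qs(qs)                          # {'theme': ['simple']} - 'tokens' vanishes
    parse_qs(qs, keep_blank_values=True)  # {'theme': ['simple'], 'tokens': ['']}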
@ -488,7 +496,7 @@ class Preferences:
return ret_val
def save(self, resp: flask.Response):
"""Save cookie in the HTTP reponse obect"""
"""Save cookie in the HTTP response object"""
for user_setting_name, user_setting in self.key_value_settings.items():
# pylint: disable=unnecessary-dict-index-lookup
if self.key_value_settings[user_setting_name].locked:

View file

@ -198,10 +198,10 @@ class BangParser(QueryPartParser):
self.raw_text_query.enginerefs.append(EngineRef(value, 'none'))
return True
# check if prefix is equal with categorie name
# check if prefix is equal with category name
if value in categories:
# using all engines for that search, which
# are declared under that categorie name
# are declared under that category name
self.raw_text_query.enginerefs.extend(
EngineRef(engine.name, value)
for engine in categories[value]
@ -219,7 +219,7 @@ class BangParser(QueryPartParser):
self._add_autocomplete(first_char + suggestion)
return
# check if query starts with categorie name
# check if query starts with category name
for category in categories:
if category.startswith(value):
self._add_autocomplete(first_char + category.replace(' ', '_'))
@ -311,7 +311,7 @@ class RawTextQuery:
def getFullQuery(self):
"""
get full querry including whitespaces
get full query including whitespaces
"""
return '{0} {1}'.format(' '.join(self.query_parts), self.getQuery()).strip()

70
searx/redisdb.py Normal file
View file

@ -0,0 +1,70 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Implementation of the redis client (redis-py_).
.. _redis-py: https://github.com/redis/redis-py
This implementation uses the :ref:`settings redis` setup from ``settings.yml``.
A redis DB connection can be tested by::
>>> from searx import redisdb
>>> redisdb.initialize()
True
>>> db = redisdb.client()
>>> db.set("foo", "bar")
True
>>> db.get("foo")
b'bar'
>>>
"""
import os
import pwd
import logging
import redis
from searx import get_setting
OLD_REDIS_URL_DEFAULT_URL = 'unix:///usr/local/searxng-redis/run/redis.sock?db=0'
"""This was the default Redis URL in settings.yml."""
_CLIENT = None
logger = logging.getLogger(__name__)
def client() -> redis.Redis:
return _CLIENT
def initialize():
global _CLIENT # pylint: disable=global-statement
redis_url = get_setting('redis.url')
if not redis_url:
return False
try:
# create a client, but no connection is done
_CLIENT = redis.Redis.from_url(redis_url)
# log the parameters as seen by the redis lib, without the password
kwargs = _CLIENT.get_connection_kwargs().copy()
kwargs.pop('password', None)
kwargs = ' '.join([f'{k}={v!r}' for k, v in kwargs.items()])
logger.info("connecting to Redis %s", kwargs)
# check the connection
_CLIENT.ping()
# no error: the redis connection is working
logger.info("connected to Redis")
return True
except redis.exceptions.RedisError as e:
_CLIENT = None
_pw = pwd.getpwuid(os.getuid())
logger.exception("[%s (%s)] can't connect redis DB ...", _pw.pw_name, _pw.pw_uid)
if redis_url == OLD_REDIS_URL_DEFAULT_URL and isinstance(e, redis.exceptions.ConnectionError):
logger.info(
"You can safely ignore the above Redis error if you don't use Redis. "
"You can remove this error by setting redis.url to false in your settings.yml."
)
return False

241
searx/redislib.py Normal file
View file

@ -0,0 +1,241 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""A collection of convenient functions and redis/lua scripts.
This code was partially inspired by the `Bullet-Proofing Lua Scripts in RedisPy`_
article.
.. _Bullet-Proofing Lua Scripts in RedisPy:
https://redis.com/blog/bullet-proofing-lua-scripts-in-redispy/
"""
import hmac
from searx import get_setting
LUA_SCRIPT_STORAGE = {}
"""A global dictionary to cache client's ``Script`` objects, used by
:py:obj:`lua_script_storage`"""
def lua_script_storage(client, script):
"""Returns a redis :py:obj:`Script
<redis.commands.core.CoreCommands.register_script>` instance.
For performance reasons the ``Script`` object is instantiated only once
per client (``client.register_script(..)``) and is cached in
:py:obj:`LUA_SCRIPT_STORAGE`.
"""
# the redis connection can be closed, so let's use the id() of the redis
# connector as the key in the script storage:
client_id = id(client)
if LUA_SCRIPT_STORAGE.get(client_id) is None:
LUA_SCRIPT_STORAGE[client_id] = {}
if LUA_SCRIPT_STORAGE[client_id].get(script) is None:
LUA_SCRIPT_STORAGE[client_id][script] = client.register_script(script)
return LUA_SCRIPT_STORAGE[client_id][script]
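A minimal sketch of the cache in use, assuming ``client`` is a connected redis client; the trivial script is illustrative::
    PING_SCRIPT = "return 1"
    script = lua_script_storage(client, PING_SCRIPT)  # registered once
    script()  # later calls reuse the cached Script object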
PURGE_BY_PREFIX = """
local prefix = tostring(ARGV[1])
for i, name in ipairs(redis.call('KEYS', prefix .. '*')) do
redis.call('EXPIRE', name, 0)
end
"""
def purge_by_prefix(client, prefix: str = "SearXNG_"):
"""Purge all keys with ``prefix`` from database.
Queries all keys in the database by the given prefix and sets their expire time
to zero. The default prefix drops all keys that have been set by SearXNG
(drops the SearXNG schema entirely from the database).
The implementation is the lua script from string :py:obj:`PURGE_BY_PREFIX`.
The lua script uses EXPIRE_ instead of DEL_: if there are a lot of keys to
delete and/or their values are big, `DEL` could take more time and block
the command loop, while `EXPIRE` returns immediately.
:param prefix: prefix of the key to delete (default: ``SearXNG_``)
:type prefix: str
.. _EXPIRE: https://redis.io/commands/expire/
.. _DEL: https://redis.io/commands/del/
"""
script = lua_script_storage(client, PURGE_BY_PREFIX)
script(args=[prefix])
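For illustration, a hedged doctest-style sketch (the key names are made up)::
    >>> client.mset({'SearXNG_a': 1, 'SearXNG_b': 2, 'other': 3})
    True
    >>> purge_by_prefix(client)       # SearXNG_a and SearXNG_b expire
    >>> client.get('other')           # unrelated keys survive
    b'3'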
def secret_hash(name: str):
"""Creates a hash of the ``name``.
Combines argument ``name`` with the ``secret_key`` from :ref:`settings
server`. This function can be used to get a more anonymised name of a Redis
KEY.
:param name: the name to create a secret hash for
:type name: str
"""
m = hmac.new(bytes(name, encoding='utf-8'), digestmod='sha256')
m.update(bytes(get_setting('server.secret_key'), encoding='utf-8'))
return m.hexdigest()
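A short sketch of the intended use: the raw ``name`` never appears in Redis, only its stable digest (the example name is illustrative)::
    >>> key = 'SearXNG_counter_' + secret_hash('limiter:192.0.2.1')
    >>> secret_hash('limiter:192.0.2.1') == secret_hash('limiter:192.0.2.1')
    True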
INCR_COUNTER = """
local limit = tonumber(ARGV[1])
local expire = tonumber(ARGV[2])
local c_name = KEYS[1]
local c = redis.call('GET', c_name)
if not c then
c = redis.call('INCR', c_name)
if expire > 0 then
redis.call('EXPIRE', c_name, expire)
end
else
c = tonumber(c)
if limit == 0 or c < limit then
c = redis.call('INCR', c_name)
end
end
return c
"""
def incr_counter(client, name: str, limit: int = 0, expire: int = 0):
"""Increment a counter and return the new value.
If a counter with the redis key ``SearXNG_counter_<name>`` does not exist, it is
created with an initial value of 1, which is returned. The replacement ``<name>`` is a
*secret hash* of the value from argument ``name`` (see
:py:func:`secret_hash`).
The implementation of the redis counter is the lua script from string
:py:obj:`INCR_COUNTER`.
:param name: name of the counter
:type name: str
:param expire: live-time of the counter in seconds (default ``0`` means
infinite).
:type expire: int / see EXPIRE_
:param limit: limit at which the counter stops incrementing (default ``0``
means no limit)
:type limit: int / max value is 2^64, see INCR_
:return: value of the incremented counter
:type return: int
.. _EXPIRE: https://redis.io/commands/expire/
.. _INCR: https://redis.io/commands/incr/
A simple demo of a counter with expire time and limit::
>>> for i in range(6):
... i, incr_counter(client, "foo", 3, 5) # max 3, duration 5 sec
... time.sleep(1) # from the third call on max has been reached
...
(0, 1)
(1, 2)
(2, 3)
(3, 3)
(4, 3)
(5, 1)
"""
script = lua_script_storage(client, INCR_COUNTER)
name = "SearXNG_counter_" + secret_hash(name)
c = script(args=[limit, expire], keys=[name])
return c
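A hypothetical rate-limit helper built on this counter (the function and key names are illustrative, not part of this commit); the cap is set one above the limit so the capped value stays distinguishable from the last allowed one::
    def allow_request(client, ip: str, max_per_minute: int = 60) -> bool:
        # the key expires 60 seconds after its first increment (fixed window)
        c = incr_counter(client, 'ratelimit:' + ip,
                         limit=max_per_minute + 1, expire=60)
        return c <= max_per_minute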
def drop_counter(client, name):
"""Drop counter with redis key ``SearXNG_counter_<name>``
The replacement ``<name>`` is a *secret hash* of the value from argument
``name`` (see :py:func:`incr_counter` and :py:func:`incr_sliding_window`).
"""
name = "SearXNG_counter_" + secret_hash(name)
client.delete(name)
INCR_SLIDING_WINDOW = """
local expire = tonumber(ARGV[1])
local name = KEYS[1]
local current_time = redis.call('TIME')
redis.call('ZREMRANGEBYSCORE', name, 0, current_time[1] - expire)
redis.call('ZADD', name, current_time[1], current_time[1] .. current_time[2])
local result = redis.call('ZCOUNT', name, 0, current_time[1] + 1)
redis.call('EXPIRE', name, expire)
return result
"""
def incr_sliding_window(client, name: str, duration: int):
"""Increment a sliding-window counter and return the new value.
If a counter with the redis key ``SearXNG_counter_<name>`` does not exist, it is
created with an initial value of 1, which is returned. The replacement ``<name>`` is a
*secret hash* of the value from argument ``name`` (see
:py:func:`secret_hash`).
:param name: name of the counter
:type name: str
:param duration: live-time of the sliding window in seconds
:type duration: int
:return: value of the incremented counter
:type return: int
The implementation of the redis counter is the lua script from string
:py:obj:`INCR_SLIDING_WINDOW`. The lua script uses `sorted sets in Redis`_
to implement a sliding window for the redis key ``SearXNG_counter_<name>``
(ZADD_). The current TIME_ is used to score the items in the sorted set and
the time window is moved by removing items with a score lower than the current
time minus the *duration* (ZREMRANGEBYSCORE_).
The EXPIRE_ time (the duration of the sliding window) is refreshed on each
call (increment); if there is no call within this duration, the sorted
set expires from the redis DB.
The return value is the number of items in the sorted set (ZCOUNT_), i.e. the
number of calls within the sliding window.
.. _Sorted sets in Redis:
https://redis.com/ebook/part-1-getting-started/chapter-1-getting-to-know-redis/1-2-what-redis-data-structures-look-like/1-2-5-sorted-sets-in-redis/
.. _TIME: https://redis.io/commands/time/
.. _ZADD: https://redis.io/commands/zadd/
.. _EXPIRE: https://redis.io/commands/expire/
.. _ZREMRANGEBYSCORE: https://redis.io/commands/zremrangebyscore/
.. _ZCOUNT: https://redis.io/commands/zcount/
A simple demo of the sliding window::
>>> for i in range(5):
... incr_sliding_window(client, "foo", 3) # duration 3 sec
... time.sleep(1) # from the third call (second) on the window is moved
...
1
2
3
3
3
>>> time.sleep(3) # wait until expire
>>> incr_sliding_window(client, "foo", 3)
1
"""
script = lua_script_storage(client, INCR_SLIDING_WINDOW)
name = "SearXNG_counter_" + secret_hash(name)
c = script(args=[duration], keys=[name])
return c
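The sliding-window variant lends itself to the same kind of hedged sketch (the names are illustrative)::
    def too_many_requests(client, ip: str, limit: int = 10, window: int = 60) -> bool:
        # number of calls recorded within the last `window` seconds
        return incr_sliding_window(client, 'sw:' + ip, window) > limit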

View file

@ -134,9 +134,9 @@ def result_score(result):
if hasattr(engines[result_engine], 'weight'):
weight *= float(engines[result_engine].weight)
occurences = len(result['positions'])
occurrences = len(result['positions'])
return sum((occurences * weight) / position for position in result['positions'])
return sum((occurrences * weight) / position for position in result['positions'])
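A worked example of the corrected formula: a result reported at positions 1 and 3 with weight 1.0 has occurrences = 2, so its score is 2/1 + 2/3 ≈ 2.67; earlier positions and more occurrences both raise the score.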
class Timing(NamedTuple):
@ -286,7 +286,7 @@ class ResultContainer:
if 'template' not in result:
result['template'] = 'default.html'
# strip multiple spaces and cariage returns from content
# strip multiple spaces and carriage returns from content
if result.get('content'):
result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
@ -315,7 +315,7 @@ class ResultContainer:
return merged_result
else:
# it's an image
# it's a duplicate if the parsed_url, template and img_src are differents
# it's a duplicate if the parsed_url, template and img_src are different
if result.get('img_src', '') == merged_result.get('img_src', ''):
return merged_result
return None

View file

@ -2,11 +2,12 @@
# lint: pylint
# pylint: disable=missing-module-docstring, too-few-public-methods
import typing
import threading
from timeit import default_timer
from uuid import uuid4
import flask
from searx import settings
from searx.answerers import ask
from searx.external_bang import get_bang_url
@ -133,7 +134,7 @@ class Search:
def search_multiple_requests(self, requests):
# pylint: disable=protected-access
search_id = uuid4().__str__()
search_id = str(uuid4())
for engine_name, query, request_params in requests:
th = threading.Thread( # pylint: disable=invalid-name
@ -181,7 +182,7 @@ class SearchWithPlugins(Search):
__slots__ = 'ordered_plugin_list', 'request'
def __init__(self, search_query: SearchQuery, ordered_plugin_list, request: "flask.Request"):
def __init__(self, search_query: SearchQuery, ordered_plugin_list, request: flask.Request):
super().__init__(search_query)
self.ordered_plugin_list = ordered_plugin_list
self.result_container.on_result = self._on_result

View file

@ -2,3 +2,5 @@
from .impl import Checker
from .background import initialize, get_result
__all__ = ('Checker', 'initialize', 'get_result')

View file

@ -70,7 +70,7 @@ def run(engine_name_list, verbose):
stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
checker = searx.search.checker.Checker(processor)
checker.run()
if checker.test_results.succesfull:
if checker.test_results.successful:
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n')
if verbose:
stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n')

View file

@ -1,26 +1,28 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-module-docstring
# pyright: strict
# pyright: basic
import json
import random
import time
import threading
import os
import signal
from typing import Dict, Union, List, Any, Tuple
from typing import Dict, Union, List, Any, Tuple, Optional
from typing_extensions import TypedDict, Literal
import redis.exceptions
from searx import logger, settings, searx_debug
from searx.redisdb import client as get_redis_client
from searx.exceptions import SearxSettingsException
from searx.search.processors import PROCESSORS
from searx.search.checker import Checker
from searx.shared import schedule, storage
from searx.search.checker.scheduler import scheduler_function
CHECKER_RESULT = 'CHECKER_RESULT'
running = threading.Lock()
REDIS_RESULT_KEY = 'SearXNG_checker_result'
REDIS_LOCK_KEY = 'SearXNG_checker_lock'
CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther']
@ -77,20 +79,24 @@ def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]:
return (every[0], every[1])
def _get_every():
every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800))
return _get_interval(every, 'checker.scheduling.every is not a int or list')
def get_result() -> CheckerResult:
serialized_result = storage.get_str(CHECKER_RESULT)
if serialized_result is not None:
return json.loads(serialized_result)
return {'status': 'unknown'}
client = get_redis_client()
if client is None:
# without Redis, the checker is disabled
return {'status': 'disabled'}
serialized_result: Optional[bytes] = client.get(REDIS_RESULT_KEY)
if serialized_result is None:
# the Redis key does not exist
return {'status': 'unknown'}
return json.loads(serialized_result)
def _set_result(result: CheckerResult):
storage.set_str(CHECKER_RESULT, json.dumps(result))
client = get_redis_client()
if client is None:
# without Redis, the function does nothing
return
client.set(REDIS_RESULT_KEY, json.dumps(result))
def _timestamp():
@ -98,41 +104,29 @@ def _timestamp():
def run():
if not running.acquire(blocking=False): # pylint: disable=consider-using-with
return
try:
logger.info('Starting checker')
result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()}
for name, processor in PROCESSORS.items():
logger.debug('Checking %s engine', name)
checker = Checker(processor)
checker.run()
if checker.test_results.succesfull:
result['engines'][name] = {'success': True}
else:
result['engines'][name] = {'success': False, 'errors': checker.test_results.errors}
# use a Redis lock to make sure there is no checker running at the same time
# (this should not happen, this is a safety measure)
with get_redis_client().lock(REDIS_LOCK_KEY, blocking_timeout=60, timeout=3600):
logger.info('Starting checker')
result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()}
for name, processor in PROCESSORS.items():
logger.debug('Checking %s engine', name)
checker = Checker(processor)
checker.run()
if checker.test_results.successful:
result['engines'][name] = {'success': True}
else:
result['engines'][name] = {'success': False, 'errors': checker.test_results.errors}
_set_result(result)
logger.info('Check done')
_set_result(result)
logger.info('Check done')
except redis.exceptions.LockError:
_set_result({'status': 'error', 'timestamp': _timestamp()})
logger.exception('Error while running the checker')
except Exception: # pylint: disable=broad-except
_set_result({'status': 'error', 'timestamp': _timestamp()})
logger.exception('Error while running the checker')
finally:
running.release()
def _run_with_delay():
every = _get_every()
delay = random.randint(0, every[1] - every[0])
logger.debug('Start checker in %i seconds', delay)
time.sleep(delay)
run()
def _start_scheduling():
every = _get_every()
if schedule(every[0], _run_with_delay):
run()
def _signal_handler(_signum: int, _frame: Any):
@ -147,27 +141,31 @@ def initialize():
logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
signal.signal(signal.SIGUSR1, _signal_handler)
# disabled by default
_set_result({'status': 'disabled'})
# special case when debug is activate
if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
if searx_debug and settings['checker']['off_when_debug']:
logger.info('debug mode: checker is disabled')
return
# check value of checker.scheduling.every now
scheduling = settings.get('checker', {}).get('scheduling', None)
scheduling = settings['checker']['scheduling']
if scheduling is None or not scheduling:
logger.info('Checker scheduler is disabled')
return
#
_set_result({'status': 'unknown'})
# make sure there is a Redis connection
if get_redis_client() is None:
logger.error('The checker requires Redis')
return
start_after = scheduling.get('start_after', (300, 1800))
start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list')
delay = random.randint(start_after[0], start_after[1])
logger.info('Start checker in %i seconds', delay)
t = threading.Timer(delay, _start_scheduling)
# start the background scheduler
every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not a int or list')
start_after_range = _get_interval(
scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not a int or list'
)
t = threading.Thread(
target=scheduler_function,
args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run),
name='checker_scheduler',
)
t.daemon = True
t.start()
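For callers, a minimal sketch of polling the stored status (the status values are taken from the ``CheckerResult`` variants above)::
    from searx.search.checker.background import get_result

    result = get_result()
    print(result['status'])   # 'ok', 'error', 'unknown' or 'disabled'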

View file

@ -10,12 +10,10 @@ from timeit import default_timer
from urllib.parse import urlparse
import re
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
import httpx
from searx import network, logger
from searx.utils import gen_useragent
from searx.utils import gen_useragent, detect_language
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
@ -174,7 +172,7 @@ class TestResults:
self.languages.add(language)
@property
def succesfull(self):
def successful(self):
return len(self.errors) == 0
def __iter__(self):
@ -208,14 +206,10 @@ class ResultContainerTests:
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
def _add_language(self, text: str) -> typing.Optional[str]:
try:
r = detect_langs(str(text)) # pylint: disable=E1101
except LangDetectException:
return None
if len(r) > 0 and r[0].prob > 0.95:
self.languages.add(r[0].lang)
self.test_results.add_language(r[0].lang)
langStr = detect_language(text)
if langStr:
self.languages.add(langStr)
self.test_results.add_language(langStr)
return None
def _check_result(self, result):
@ -317,7 +311,7 @@ class ResultContainerTests:
self._record_error('No result')
def one_title_contains(self, title: str):
"""Check one of the title contains `title` (case insensitive comparaison)"""
"""Check one of the title contains `title` (case insensitive comparison)"""
title = title.lower()
for result in self.result_container.get_ordered_results():
if title in result['title'].lower():
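For context on the ``detect_language`` call introduced above, a hedged sketch of a helper of this shape; it simply mirrors the inline logic removed in this hunk, while the real implementation in ``searx/utils.py`` is not shown in this diff and may differ::
    from typing import Optional
    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException

    def detect_language(text: str, threshold: float = 0.95) -> Optional[str]:
        # best guess only above a confidence threshold, None otherwise
        try:
            candidates = detect_langs(str(text))
        except LangDetectException:
            return None
        if candidates and candidates[0].prob > threshold:
            return candidates[0].lang
        return None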

View file

@ -0,0 +1,36 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- This script is not a string in scheduler.py, so editors can provide syntax highlighting.
-- The Redis KEY is defined here and not in Python on purpose:
-- only this LUA script can read and update this key to avoid lock and concurrency issues.
local redis_key = 'SearXNG_checker_next_call_ts'
local now = redis.call('TIME')[1]
local start_after_from = ARGV[1]
local start_after_to = ARGV[2]
local every_from = ARGV[3]
local every_to = ARGV[4]
local next_call_ts = redis.call('GET', redis_key)
if (next_call_ts == false or next_call_ts == nil) then
-- the scheduler has never run on this Redis instance, so:
-- 1/ the scheduler does not run now
-- 2/ the next call is a random time between start_after_from and start_after_to
local initial_delay = math.random(start_after_from, start_after_to)
redis.call('SET', redis_key, now + initial_delay)
return { false, initial_delay }
end
-- next_call_ts is defined
-- --> if now is lower than next_call_ts then we don't run the embedded checker
-- --> if now is higher then we update next_call_ts and ask to run the embedded checker now.
local call_now = next_call_ts <= now
if call_now then
-- the checker runs now, define the timestamp of the next call:
-- this is a random delay between every_from and every_to
local periodic_delay = math.random(every_from, every_to)
next_call_ts = redis.call('INCRBY', redis_key, periodic_delay)
end
return { call_now, next_call_ts - now }
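A hedged sketch of the consuming side (the real ``scheduler_function`` lives in ``searx/search/checker/scheduler.py`` and is not shown in this diff; the names here are illustrative): the script returns a pair ``{call_now, delay}``, so a loop can run the checker when asked and sleep for the returned delay::
    import time
    from searx.redisdb import client as get_redis_client
    from searx.redislib import lua_script_storage

    def scheduler_loop(lua_source, start_after, every, callback):
        script = lua_script_storage(get_redis_client(), lua_source)
        while True:
            call_now, delay = script(args=[start_after[0], start_after[1],
                                           every[0], every[1]])
            if call_now:      # Lua true arrives as 1, false as None
                callback()
            time.sleep(delay)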

Some files were not shown because too many files have changed in this diff.