Merge remote-tracking branch 'upstream/master' into pulling

kvan7 2024-01-22 03:23:01 +00:00
commit 72e8b5c354
223 changed files with 24674 additions and 20896 deletions


@@ -45,6 +45,7 @@ ENGINE_DEFAULT_ARGS = {
"using_tor_proxy": False,
"send_accept_language_header": False,
"tokens": [],
"max_page": 0,
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'

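The new ``max_page`` default (``0`` meaning no limit) lets each engine declare how many result pages it can actually serve. A minimal sketch of how a caller could honour the attribute — the helper name and the check are assumptions for illustration, not SearXNG's actual implementation:

    def page_is_supported(engine, pageno: int) -> bool:
        # a max_page of 0 is treated as "unlimited", matching the default above
        max_page = getattr(engine, 'max_page', 0)
        return max_page == 0 or pageno <= max_page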

@@ -59,6 +59,9 @@ about = {
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 200
"""200 pages maximum (``&first=1991``)"""
time_range_support = True
safesearch = True
"""Bing results are always SFW. To get NSFW links from bing some age

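The ``&first=1991`` figure in the Bing docstring above is consistent with paging by ten results, where ``first`` points at the first result of the requested page. A quick check of that arithmetic, assuming ``first = (pageno - 1) * 10 + 1`` (the page size is inferred from the numbers, not taken from the diff):

    pageno = 200
    first = (pageno - 1) * 10 + 1
    assert first == 1991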
searx/engines/bpb.py (new file, 68 lines)

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""BPB refers to ``Bundeszentrale für politische Bildung``, which is a German
governmental institution aiming to reduce misinformation by providing resources
about politics and history.
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': "https://www.bpb.de",
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
'language': 'de',
}
paging = True
categories = ['general']
base_url = "https://www.bpb.de"
def request(query, params):
args = {
'query[term]': query,
'page': params['pageno'] - 1,
'sort[direction]': 'descending',
'payload[nid]': 65350,
}
params['url'] = f"{base_url}/bpbapi/filter/search?{urlencode(args)}"
return params
def response(resp):
results = []
json_resp = resp.json()
for result in json_resp['teaser']:
img_src = None
if result['teaser']['image']:
img_src = base_url + result['teaser']['image']['sources'][-1]['url']
metadata = result['extension']['overline']
authors = ', '.join(author['name'] for author in result['extension'].get('authors', []))
if authors:
metadata += f" | {authors}"
publishedDate = None
if result['extension'].get('publishingDate'):
publishedDate = datetime.utcfromtimestamp(result['extension']['publishingDate'])
results.append(
{
'url': base_url + result['teaser']['link']['url'],
'title': result['teaser']['title'],
'content': result['teaser']['text'],
'img_src': img_src,
'publishedDate': publishedDate,
'metadata': metadata,
}
)
return results


@@ -152,6 +152,10 @@ send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``); to be safe, max is set to 10. Trying
to fetch more won't return any results and you will most likely be flagged as a bot.
"""
safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off

searx/engines/destatis.py (new file, 68 lines)

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DeStatis
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.destatis.de',
'official_api_documentation': 'https://destatis.api.bund.dev/',
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
'language': 'de',
}
categories = []
paging = True
base_url = "https://www.destatis.de"
search_url = f"{base_url}/SiteGlobals/Forms/Suche/Expertensuche_Formular.html"
# pylint: disable-next=line-too-long
results_xpath = '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"
url_xpath = './/a/@href'
title_xpath = './/a/text()'
date_xpath = './/a/span[contains(@class, "c-result__date")]'
content_xpath = './/div[contains(@class, "column")]/p/text()'
doctype_xpath = './/div[contains(@class, "c-result__doctype")]/p'
def request(query, params):
args = {
'templateQueryString': query,
'gtp': f"474_list%3D{params['pageno']}",
}
params['url'] = f"{search_url}?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# filter out suggested results on further pages because they are the same on each page
extra_xpath = results_xpath_filter_recommended if resp.search_params['pageno'] > 1 else ''
res_xpath = results_xpath.format(extra=extra_xpath)
for result in eval_xpath_list(dom, res_xpath):
doctype = extract_text(eval_xpath(result, doctype_xpath))
date = extract_text(eval_xpath(result, date_xpath))
metadata = [meta for meta in (doctype, date) if meta != ""]
results.append(
{
'url': base_url + "/" + extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': extract_text(eval_xpath(result, content_xpath)),
'metadata': ', '.join(metadata),
}
)
return results

searx/engines/fyyd.py (new file, 50 lines)

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Fyyd (podcasts)
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': 'https://fyyd.de',
'official_api_documentation': 'https://github.com/eazyliving/fyyd-api',
'use_official_api': True,
'require_api_key': False,
'results': 'JSON',
}
categories = []
paging = True
base_url = "https://api.fyyd.de"
page_size = 10
def request(query, params):
args = {
'term': query,
'count': page_size,
'page': params['pageno'] - 1,
}
params['url'] = f"{base_url}/0.2/search/podcast?{urlencode(args)}"
return params
def response(resp):
results = []
json_results = resp.json()['data']
for result in json_results:
results.append(
{
'url': result['htmlURL'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['smallImageURL'],
'publishedDate': datetime.strptime(result['status_since'], '%Y-%m-%d %H:%M:%S'),
'metadata': f"Rank: {result['rank']} || {result['episode_count']} episodes",
}
)
return results


@@ -48,6 +48,7 @@ about = {
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 50
time_range_support = True
safesearch = True
@@ -429,14 +430,13 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
if not resp.ok: # type: ignore
raise RuntimeError("Response from Google's preferences is not OK.")
- dom = html.fromstring(resp.text) # type: ignore
+ dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
# supported language codes
lang_map = {'no': 'nb'}
- for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
-     eng_lang = x.get("value").split('_')[-1]
+ for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
+     eng_lang = x.get("value")
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
@@ -456,7 +456,7 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
# supported region codes
- for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
+ for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
eng_country = x.get("value")
if eng_country in skip_countries:


@@ -47,6 +47,7 @@ about = {
# engine dependent config
categories = ['images', 'web']
paging = True
max_page = 50
time_range_support = True
safesearch = True
send_accept_language_header = True


@@ -51,6 +51,7 @@ about = {
# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = False


@@ -57,6 +57,7 @@ about = {
categories = ['videos', 'web']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = True
@@ -86,7 +87,7 @@ def request(query, params):
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
- if params['safesearch']:
+ if 'safesearch' in params:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url


@@ -8,6 +8,7 @@ from searx.utils import to_string, html_to_text
search_url = None
url_query = None
url_prefix = ""
content_query = None
title_query = None
content_html_to_text = False
@@ -129,7 +130,7 @@ def response(resp):
content = ""
results.append(
{
- 'url': to_string(url),
+ 'url': url_prefix + to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}
@@ -138,7 +139,7 @@ def response(resp):
for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)):
results.append(
{
- 'url': to_string(url),
+ 'url': url_prefix + to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}

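The ``url_prefix`` option added to the JSON engine above lets a configuration prepend a base URL when the API returns relative paths. A hypothetical illustration with made-up values (not from any real engine configuration):

    url_prefix = "https://example.org"  # assumed per-engine setting
    url = "/post/42"                    # e.g. a relative value extracted via url_query
    assert url_prefix + url == "https://example.org/post/42"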

@@ -16,7 +16,7 @@ about = {
engine_type = 'online_dictionary'
categories = ['general']
- url = "https://lingva.ml"
+ url = "https://lingva.thedaviddelta.com/"
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"


@@ -0,0 +1,43 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Podcast Index
"""
from urllib.parse import quote_plus
from datetime import datetime
about = {
'website': 'https://podcastindex.org',
'official_api_documentation': None, # requires an account
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
}
categories = []
base_url = "https://podcastindex.org"
def request(query, params):
params['url'] = f"{base_url}/api/search/byterm?q={quote_plus(query)}"
return params
def response(resp):
results = []
json = resp.json()
for result in json['feeds']:
results.append(
{
'url': result['link'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['image'],
'publishedDate': datetime.utcfromtimestamp(result['newestItemPubdate']),
'metadata': f"{result['author']}, {result['episodeCount']} episodes",
}
)
return results

searx/engines/presearch.py (new file, 266 lines)

@@ -0,0 +1,266 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
images, videos, news).
Configured ``presearch`` engines:
.. code:: yaml
- name: presearch
engine: presearch
search_type: search
categories: [general, web]
- name: presearch images
...
search_type: images
categories: [images, web]
- name: presearch videos
...
search_type: videos
categories: [general, web]
- name: presearch news
...
search_type: news
categories: [news, web]
.. hint::
By default Presearch's video category is intentionally placed into::
categories: [general, web]
Search type ``video``
=====================
The results in the video category are most often links to pages that contain a
video, for instance many links from Presearch's video category point to content
on Facebook (aka Meta) or Twitter (aka X). Since these are not real links to
video streams, SearXNG can't use the video template for them, and if the video
template can't be used, the user doesn't want to see these hits in the videos
category.
Languages & Regions
===================
In Presearch there are languages for the UI and regions for narrowing down the
search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
``use_local_search_results=false``, then the defaults are set for both (the
language and the region) from the ``Accept-Language`` header.
Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
have to set these values in both requests we send to Presearch; in the first
request to get the request-ID from Presearch and in the final request to get the
result list (see ``send_accept_language_header``).
Implementations
===============
"""
from urllib.parse import urlencode
from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text
about = {
"website": "https://presearch.io",
"wikidata_id": "Q7240905",
"official_api_documentation": "https://docs.presearch.io/nodes/api",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"] # general, images, videos, news
search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""
base_url = "https://presearch.com"
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
def init(_):
if search_type not in ['search', 'images', 'videos', 'news']:
raise ValueError(f'presearch search_type: {search_type}')
def _get_request_id(query, params):
args = {
"q": query,
"page": params["pageno"],
}
if params["time_range"]:
args["time"] = params["time_range"]
url = f"{base_url}/{search_type}?{urlencode(args)}"
headers = {
'User-Agent': gen_useragent(),
'Cookie': (
f"b=1;"
f" presearch_session=;"
f" use_local_search_results=false;"
f" use_safe_search={safesearch_map[params['safesearch']]}"
),
}
if params['searxng_locale'] != 'all':
l = locales.get_locale(params['searxng_locale'])
# Presearch narrows down the search by region. In SearXNG when the user
# does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
# region.
# We could possibly use searx.locales.get_official_locales to determine
# in which regions this language is an official one, but then we still
# wouldn't know which region should be given more weight / Presearch
# performs an IP-based geolocation of the user, we don't want that in
# SearXNG ;-)
if l.territory:
headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
resp_text = get(url, headers=headers).text # type: ignore
for line in resp_text.split("\n"):
if "window.searchId = " in line:
return line.split("= ")[1][:-1].replace('"', "")
return None
def request(query, params):
request_id = _get_request_id(query, params)
params["headers"]["Accept"] = "application/json"
params["url"] = f"{base_url}/results?id={request_id}"
return params
def _strip_leading_strings(text):
for x in ['wikipedia', 'google']:
if text.lower().endswith(x):
text = text[: -len(x)]
return text.strip()
def parse_search_query(json_results):
results = []
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
result = {
'url': item['link'],
'title': item['title'],
'img_src': item['image'],
'content': '',
'metadata': item.get('source'),
}
results.append(result)
for item in json_results.get('standardResults', []):
result = {
'url': item['link'],
'title': item['title'],
'content': html_to_text(item['description']),
}
results.append(result)
info = json_results.get('infoSection', {}).get('data')
if info:
attributes = []
for item in info.get('about', []):
text = html_to_text(item)
if ':' in text:
# split text into key / value
label, value = text.split(':', 1)
else:
# In other languages (tested with zh-TW) a colon is represented
# by a different symbol --> then we split at the first space.
label, value = text.split(' ', 1)
label = label[:-1]
value = _strip_leading_strings(value)
attributes.append({'label': label, 'value': value})
content = []
for item in [info.get('subtitle'), info.get('description')]:
if not item:
continue
item = _strip_leading_strings(html_to_text(item))
if item:
content.append(item)
results.append(
{
'infobox': info['title'],
'id': info['title'],
'img_src': info.get('image'),
'content': ' | '.join(content),
'attributes': attributes,
}
)
return results
def response(resp):
results = []
json_resp = resp.json()
if search_type == 'search':
results = parse_search_query(json_resp.get('results'))
elif search_type == 'images':
for item in json_resp.get('images', []):
results.append(
{
'template': 'images.html',
'title': item['title'],
'url': item.get('link'),
'img_src': item.get('image'),
'thumbnail_src': item.get('thumbnail'),
}
)
elif search_type == 'videos':
# The results in the video category are most often links to pages that contain
# a video and not to a video stream --> SearXNG can't use the video template.
for item in json_resp.get('videos', []):
metadata = [x for x in [item.get('description'), item.get('duration')] if x]
results.append(
{
'title': item['title'],
'url': item.get('link'),
'content': '',
'metadata': ' / '.join(metadata),
'img_src': item.get('image'),
}
)
elif search_type == 'news':
for item in json_resp.get('news', []):
metadata = [x for x in [item.get('source'), item.get('time')] if x]
results.append(
{
'title': item['title'],
'url': item.get('link'),
'content': item.get('description', ''),
'metadata': ' / '.join(metadata),
'img_src': item.get('image'),
}
)
return results


@@ -75,6 +75,10 @@ about = {
# engine dependent config
categories = []
paging = True
max_page = 5
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
redirect"""
qwant_categ = None
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
@@ -112,10 +116,6 @@ def request(query, params):
args = {'q': query}
params['raise_for_httperror'] = False
- # all qwant engines (incl qwant-lite) delivers only 5 pages maximum
- if params['pageno'] > 5:
-     return None
if qwant_categ == 'web-lite':
url = web_lite_url + '?'


@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""RottenTomatoes (movies)
"""
from urllib.parse import quote_plus
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
# about
about = {
"website": 'https://www.rottentomatoes.com/',
"wikidata_id": 'Q105584',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['movies']
base_url = "https://www.rottentomatoes.com"
results_xpath = "//search-page-media-row"
url_xpath = "./a[1]/@href"
title_xpath = "./a/img/@alt"
img_src_xpath = "./a/img/@src"
release_year_xpath = "concat('From ', string(./@releaseyear))"
score_xpath = "concat('Score: ', string(./@tomatometerscore))"
cast_xpath = "concat('Starring ', string(./@cast))"
def request(query, params):
params["url"] = f"{base_url}/search?search={quote_plus(query)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, results_xpath):
content = []
for xpath in (release_year_xpath, score_xpath, cast_xpath):
info = extract_text(eval_xpath(result, xpath))
# a trailing space at the end means that no data was found
if info and info[-1] != " ":
content.append(info)
results.append(
{
'url': extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': ', '.join(content),
'img_src': extract_text(eval_xpath(result, img_src_xpath)),
}
)
return results


@@ -127,6 +127,9 @@ different to the UI language) and a region filter.
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``); to be safe, max is set to 18."""
time_range_support = True
safesearch = True

searx/engines/stract.py (new file, 43 lines)

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Stract is an independent open source search engine.
At this stage it is still in beta, so this implementation will need to be updated once the beta ends.
"""
from json import dumps
about = {
"website": "https://stract.com/",
"use_official_api": True,
"official_api_documentation": "https://stract.com/beta/api/docs/#/search/api",
"require_api_key": False,
"results": "JSON",
}
categories = ['general']
paging = True
search_url = "https://stract.com/beta/api/search"
def request(query, params):
params['url'] = search_url
params['method'] = "POST"
params['headers'] = {'Accept': 'application/json', 'Content-Type': 'application/json'}
params['data'] = dumps({'query': query, 'page': params['pageno'] - 1})
return params
def response(resp):
results = []
for result in resp.json()["webpages"]:
results.append(
{
'url': result['url'],
'title': result['title'],
'content': ''.join(fragment['text'] for fragment in result['snippet']['text']['fragments']),
}
)
return results


@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Tootfinder (social media)
"""
from datetime import datetime
from json import loads
from searx.utils import html_to_text
about = {
'website': "https://www.tootfinder.ch",
'official_api_documentation': "https://wiki.tootfinder.ch/index.php?name=the-tootfinder-rest-api",
'use_official_api': True,
'require_api_key': False,
'results': "JSON",
}
categories = ['social media']
base_url = "https://www.tootfinder.ch"
def request(query, params):
params['url'] = f"{base_url}/rest/api/search/{query}"
return params
def response(resp):
results = []
# the Tootfinder API has an issue where server-side errors are appended to the API response as HTML
# thus we only look for the line that contains the actual JSON data and ignore everything else
json_str = ""
for line in resp.text.split("\n"):
if line.startswith("[{"):
json_str = line
break
for result in loads(json_str):
thumbnail = None
attachments = result.get('media_attachments', [])
images = [attachment['preview_url'] for attachment in attachments if attachment['type'] == 'image']
if len(images) > 0:
thumbnail = images[0]
title = result.get('card', {}).get('title')
if not title:
title = html_to_text(result['content'])[:75]
results.append(
{
'url': result['url'],
'title': title,
'content': html_to_text(result['content']),
'thumbnail': thumbnail,
'publishedDate': datetime.strptime(result['created_at'], '%Y-%m-%d %H:%M:%S'),
}
)
return results

searx/engines/yep.py (new file, 79 lines)

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Yep (general, images, news)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
'website': 'https://yep.com/',
'official_api_documentation': 'https://docs.developer.yelp.com',
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
}
base_url = "https://api.yep.com"
search_type = "web" # 'web', 'images', 'news'
safesearch = True
safesearch_map = {0: 'off', 1: 'moderate', 2: 'strict'}
def request(query, params):
args = {
'client': 'web',
'no_correct': 'false',
'q': query,
'safeSearch': safesearch_map[params['safesearch']],
'type': search_type,
}
params['url'] = f"{base_url}/fs/2/search?{urlencode(args)}"
params['headers']['Referer'] = 'https://yep.com/'
return params
def _web_result(result):
return {
'url': result['url'],
'title': result['title'],
'content': html_to_text(result['snippet']),
}
def _images_result(result):
return {
'template': 'images.html',
'url': result['host_page'],
'title': result.get('title', ''),
'content': '',
'img_src': result['image_id'],
'thumbnail_src': result['src'],
}
def _news_result(result):
return {
'url': result['url'],
'title': result['title'],
'content': html_to_text(result['snippet']),
'publishedDate': datetime.strptime(result['first_seen'][:19], '%Y-%m-%dT%H:%M:%S'),
}
def response(resp):
results = []
for result in resp.json()[1]['results']:
if search_type == "web":
results.append(_web_result(result))
elif search_type == "images":
results.append(_images_result(result))
elif search_type == "news":
results.append(_news_result(result))
else:
raise ValueError(f"Unsupported yep search type: {search_type}")
return results


@@ -200,6 +200,8 @@ def fetch_traits(engine_traits: EngineTraits) -> None:
for locale in babel.core.localedata.locale_identifiers(): # type: ignore
# Create a Locale object for the current locale
loc = babel.Locale.parse(locale)
if loc.english_name is None:
continue
language_name_locale_map[loc.english_name.lower()] = loc # type: ignore
for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"):