Mirror of https://github.com/searxng/searxng, synced 2024-01-01 19:24:07 +01:00

Merge remote-tracking branch 'upstream/master' into pulling

Commit 72e8b5c354: 223 changed files with 24674 additions and 20896 deletions
@@ -45,6 +45,7 @@ ENGINE_DEFAULT_ARGS = {
    "using_tor_proxy": False,
    "send_accept_language_header": False,
    "tokens": [],
    "max_page": 0,
}

# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'
@@ -59,6 +59,9 @@ about = {
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 200
"""200 pages maximum (``&first=1991``)"""

time_range_support = True
safesearch = True
"""Bing results are always SFW. To get NSFW links from bing some age
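
As a quick sanity check of the ``max_page = 200`` limit above: Bing paginates with a ``first`` offset of 10 results per page, so page 200 corresponds to ``&first=1991``. A one-line sketch (standalone, not part of the diff)::

    page = 200
    print(f"&first={(page - 1) * 10 + 1}")  # -> &first=1991
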
searx/engines/bpb.py (new file, 68 lines)
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""BPB refers to ``Bundeszentrale für politische Bildung``, which is a German
governmental institution aiming to reduce misinformation by providing resources
about politics and history.
"""

from datetime import datetime
from urllib.parse import urlencode

about = {
    'website': "https://www.bpb.de",
    'official_api_documentation': None,
    'use_official_api': False,
    'require_api_key': False,
    'results': 'JSON',
    'language': 'de',
}

paging = True
categories = ['general']


base_url = "https://www.bpb.de"


def request(query, params):
    args = {
        'query[term]': query,
        'page': params['pageno'] - 1,
        'sort[direction]': 'descending',
        'payload[nid]': 65350,
    }
    params['url'] = f"{base_url}/bpbapi/filter/search?{urlencode(args)}"
    return params


def response(resp):
    results = []

    json_resp = resp.json()

    for result in json_resp['teaser']:
        img_src = None
        if result['teaser']['image']:
            img_src = base_url + result['teaser']['image']['sources'][-1]['url']

        metadata = result['extension']['overline']
        authors = ', '.join(author['name'] for author in result['extension'].get('authors', []))
        if authors:
            metadata += f" | {authors}"

        publishedDate = None
        if result['extension'].get('publishingDate'):
            publishedDate = datetime.utcfromtimestamp(result['extension']['publishingDate'])

        results.append(
            {
                'url': base_url + result['teaser']['link']['url'],
                'title': result['teaser']['title'],
                'content': result['teaser']['text'],
                'img_src': img_src,
                'publishedDate': publishedDate,
                'metadata': metadata,
            }
        )

    return results
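
A minimal offline sketch (not part of the changeset) of how this engine's request/response pair fits together: ``request()`` only fills ``params['url']`` and ``response()`` walks the JSON. The stub class and the trimmed-down payload below are hypothetical, mirroring just the keys the parser reads::

    from searx.engines import bpb  # assumes the searx package is importable

    class StubResponse:
        """Stands in for the HTTP response object the engine normally receives."""
        def __init__(self, data):
            self._data = data
        def json(self):
            return self._data

    params = bpb.request("wahlen", {'pageno': 1})
    print(params['url'])  # .../bpbapi/filter/search?query%5Bterm%5D=wahlen&page=0&...

    stub = StubResponse({
        'teaser': [{
            'teaser': {
                'title': 'Example title',
                'text': 'Example teaser text',
                'link': {'url': '/example'},
                'image': None,
            },
            'extension': {'overline': 'Dossier', 'authors': [], 'publishingDate': 1700000000},
        }]
    })
    for item in bpb.response(stub):
        print(item['url'], item['title'], item['publishedDate'])
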
@@ -152,6 +152,10 @@ send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``); to be safe, max is set to 10. Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off
searx/engines/destatis.py (new file, 68 lines)
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DeStatis
"""

from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text

about = {
    'website': 'https://www.destatis.de',
    'official_api_documentation': 'https://destatis.api.bund.dev/',
    'use_official_api': False,
    'require_api_key': False,
    'results': 'HTML',
    'language': 'de',
}

categories = []
paging = True

base_url = "https://www.destatis.de"
search_url = f"{base_url}/SiteGlobals/Forms/Suche/Expertensuche_Formular.html"

# pylint: disable-next=line-too-long
results_xpath = '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"
url_xpath = './/a/@href'
title_xpath = './/a/text()'
date_xpath = './/a/span[contains(@class, "c-result__date")]'
content_xpath = './/div[contains(@class, "column")]/p/text()'
doctype_xpath = './/div[contains(@class, "c-result__doctype")]/p'


def request(query, params):
    args = {
        'templateQueryString': query,
        'gtp': f"474_list%3D{params['pageno']}",
    }
    params['url'] = f"{search_url}?{urlencode(args)}"
    return params


def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # filter out suggested results on further pages because they're the same on each page
    extra_xpath = results_xpath_filter_recommended if resp.search_params['pageno'] > 1 else ''
    res_xpath = results_xpath.format(extra=extra_xpath)

    for result in eval_xpath_list(dom, res_xpath):
        doctype = extract_text(eval_xpath(result, doctype_xpath))
        date = extract_text(eval_xpath(result, date_xpath))

        metadata = [meta for meta in (doctype, date) if meta != ""]

        results.append(
            {
                'url': base_url + "/" + extract_text(eval_xpath(result, url_xpath)),
                'title': extract_text(eval_xpath(result, title_xpath)),
                'content': extract_text(eval_xpath(result, content_xpath)),
                'metadata': ', '.join(metadata),
            }
        )

    return results
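
The two-part XPath above is worth a closer look: on pages after the first, the ``{extra}`` placeholder receives the ``not(contains(...))`` clause, so the "recommended" teasers (which repeat on every page) are dropped. A trimmed-down sketch of the composition; the shortened XPath here is illustrative, not the full one from the file::

    results_xpath = '//div[contains(@class, "c-result"){extra}]'
    flt = " and not(contains(@class, 'c-result--recommended'))"

    print(results_xpath.format(extra=''))   # page 1: keep the recommended teasers
    print(results_xpath.format(extra=flt))  # page 2+: filter them out
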
searx/engines/fyyd.py (new file, 50 lines)
@@ -0,0 +1,50 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Fyyd (podcasts)
"""

from datetime import datetime
from urllib.parse import urlencode

about = {
    'website': 'https://fyyd.de',
    'official_api_documentation': 'https://github.com/eazyliving/fyyd-api',
    'use_official_api': True,
    'require_api_key': False,
    'results': 'JSON',
}
categories = []
paging = True

base_url = "https://api.fyyd.de"
page_size = 10


def request(query, params):
    args = {
        'term': query,
        'count': page_size,
        'page': params['pageno'] - 1,
    }
    params['url'] = f"{base_url}/0.2/search/podcast?{urlencode(args)}"
    return params


def response(resp):
    results = []

    json_results = resp.json()['data']

    for result in json_results:
        results.append(
            {
                'url': result['htmlURL'],
                'title': result['title'],
                'content': result['description'],
                'thumbnail': result['smallImageURL'],
                'publishedDate': datetime.strptime(result['status_since'], '%Y-%m-%d %H:%M:%S'),
                'metadata': f"Rank: {result['rank']} || {result['episode_count']} episodes",
            }
        )

    return results
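
Note that fyyd's ``page`` parameter is zero-based while SearXNG's ``pageno`` starts at 1, hence the ``- 1`` above. A quick sketch of the URL built for the first results page (standalone, query value illustrative)::

    from urllib.parse import urlencode

    pageno = 1  # SearXNG's first page
    args = {'term': 'python', 'count': 10, 'page': pageno - 1}
    print("https://api.fyyd.de/0.2/search/podcast?" + urlencode(args))
    # -> https://api.fyyd.de/0.2/search/podcast?term=python&count=10&page=0
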
@@ -48,6 +48,7 @@ about = {
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 50
time_range_support = True
safesearch = True
@@ -429,14 +430,13 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Google's preferences is not OK.")

    dom = html.fromstring(resp.text)  # type: ignore
    dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))

    # supported language codes

    lang_map = {'no': 'nb'}
    for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):

        eng_lang = x.get("value").split('_')[-1]
    for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
        eng_lang = x.get("value")
        try:
            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
        except babel.UnknownLocaleError:
@@ -456,7 +456,7 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):

    # supported region codes

    for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
    for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
        eng_country = x.get("value")

        if eng_country in skip_countries:
@@ -47,6 +47,7 @@ about = {
# engine dependent config
categories = ['images', 'web']
paging = True
max_page = 50
time_range_support = True
safesearch = True
send_accept_language_header = True
@@ -51,6 +51,7 @@ about = {
# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = False
@@ -57,6 +57,7 @@ about = {

categories = ['videos', 'web']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = True
@@ -86,7 +87,7 @@ def request(query, params):

    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
    if 'safesearch' in params:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
    params['url'] = query_url
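
The point of switching from ``if params['safesearch']:`` to ``if 'safesearch' in params:`` is that ``safesearch == 0`` ("off") is falsy, so the old truthiness test silently dropped the ``safe`` URL parameter for that setting. A small sketch; the mapping below is illustrative, not the engine's actual ``filter_mapping``::

    filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}  # illustrative only
    params = {'safesearch': 0}

    if params['safesearch']:            # old check: 0 is falsy, branch skipped
        print("old check would add safe=" + filter_mapping[params['safesearch']])
    if 'safesearch' in params:          # new check: key presence, value may be 0
        print("new check adds safe=" + filter_mapping[params['safesearch']])
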
@@ -8,6 +8,7 @@ from searx.utils import to_string, html_to_text

search_url = None
url_query = None
url_prefix = ""
content_query = None
title_query = None
content_html_to_text = False
@@ -129,7 +130,7 @@ def response(resp):
            content = ""
        results.append(
            {
                'url': to_string(url),
                'url': url_prefix + to_string(url),
                'title': title_filter(to_string(title)),
                'content': content_filter(to_string(content)),
            }
@@ -138,7 +139,7 @@ def response(resp):
    for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)):
        results.append(
            {
                'url': to_string(url),
                'url': url_prefix + to_string(url),
                'title': title_filter(to_string(title)),
                'content': content_filter(to_string(content)),
            }
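
The new ``url_prefix`` lets a JSON engine whose API returns relative paths produce absolute result links; with the default empty string the behaviour is unchanged. A tiny sketch with hypothetical values::

    url_prefix = "https://example.org"   # hypothetical engine setting
    url = "/item/42"                     # relative path as returned by some API
    print(url_prefix + str(url))         # -> https://example.org/item/42
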
@@ -16,7 +16,7 @@ about = {
engine_type = 'online_dictionary'
categories = ['general']

url = "https://lingva.ml"
url = "https://lingva.thedaviddelta.com/"
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
searx/engines/podcastindex.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Podcast Index
"""

from urllib.parse import quote_plus
from datetime import datetime

about = {
    'website': 'https://podcastindex.org',
    'official_api_documentation': None,  # requires an account
    'use_official_api': False,
    'require_api_key': False,
    'results': 'JSON',
}
categories = []

base_url = "https://podcastindex.org"


def request(query, params):
    params['url'] = f"{base_url}/api/search/byterm?q={quote_plus(query)}"
    return params


def response(resp):
    results = []

    json = resp.json()

    for result in json['feeds']:
        results.append(
            {
                'url': result['link'],
                'title': result['title'],
                'content': result['description'],
                'thumbnail': result['image'],
                'publishedDate': datetime.utcfromtimestamp(result['newestItemPubdate']),
                'metadata': f"{result['author']}, {result['episodeCount']} episodes",
            }
        )

    return results
searx/engines/presearch.py (new file, 266 lines)
@@ -0,0 +1,266 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
images, videos, news).

Configured ``presearch`` engines:

.. code:: yaml

  - name: presearch
    engine: presearch
    search_type: search
    categories: [general, web]

  - name: presearch images
    ...
    search_type: images
    categories: [images, web]

  - name: presearch videos
    ...
    search_type: videos
    categories: [general, web]

  - name: presearch news
    ...
    search_type: news
    categories: [news, web]

.. hint::

   By default Presearch's video category is intentionally placed into::

       categories: [general, web]


Search type ``video``
=====================

The results in the video category are most often links to pages that contain a
video; for instance, many links from Presearch's video category point to content
on facebook (aka Meta) or Twitter (aka X). Since these are not real links to
video streams, SearXNG can't use the video template for them, and if SearXNG
can't use this template, the user doesn't want to see these hits in the videos
category.


Languages & Regions
===================

In Presearch there are languages for the UI and regions for narrowing down the
search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
``use_local_search_results=false``, then the defaults are set for both (the
language and the region) from the ``Accept-Language`` header.

Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
have to set these values in both requests we send to Presearch; in the first
request to get the request-ID from Presearch and in the final request to get the
result list (see ``send_accept_language_header``).


Implementations
===============

"""

from urllib.parse import urlencode
from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text

about = {
    "website": "https://presearch.io",
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"]  # general, images, videos, news

search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""

base_url = "https://presearch.com"
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}


def init(_):
    if search_type not in ['search', 'images', 'videos', 'news']:
        raise ValueError(f'presearch search_type: {search_type}')


def _get_request_id(query, params):

    args = {
        "q": query,
        "page": params["pageno"],
    }

    if params["time_range"]:
        args["time"] = params["time_range"]

    url = f"{base_url}/{search_type}?{urlencode(args)}"

    headers = {
        'User-Agent': gen_useragent(),
        'Cookie': (
            f"b=1;"
            f" presearch_session=;"
            f" use_local_search_results=false;"
            f" use_safe_search={safesearch_map[params['safesearch']]}"
        ),
    }
    if params['searxng_locale'] != 'all':
        l = locales.get_locale(params['searxng_locale'])

        # Presearch narrows down the search by region. In SearXNG when the user
        # does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
        # region.

        # We could possibly use searx.locales.get_official_locales to determine
        # in which regions this language is an official one, but then we still
        # wouldn't know which region should be given more weight / Presearch
        # performs an IP-based geolocation of the user, we don't want that in
        # SearXNG ;-)

        if l.territory:
            headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"

    resp_text = get(url, headers=headers).text  # type: ignore

    for line in resp_text.split("\n"):
        if "window.searchId = " in line:
            return line.split("= ")[1][:-1].replace('"', "")

    return None


def request(query, params):
    request_id = _get_request_id(query, params)
    params["headers"]["Accept"] = "application/json"
    params["url"] = f"{base_url}/results?id={request_id}"

    return params


def _strip_leading_strings(text):
    for x in ['wikipedia', 'google']:
        if text.lower().endswith(x):
            text = text[: -len(x)]
    return text.strip()


def parse_search_query(json_results):
    results = []

    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
        result = {
            'url': item['link'],
            'title': item['title'],
            'img_src': item['image'],
            'content': '',
            'metadata': item.get('source'),
        }
        results.append(result)

    for item in json_results.get('standardResults', []):
        result = {
            'url': item['link'],
            'title': item['title'],
            'content': html_to_text(item['description']),
        }
        results.append(result)

    info = json_results.get('infoSection', {}).get('data')
    if info:
        attributes = []
        for item in info.get('about', []):

            text = html_to_text(item)
            if ':' in text:
                # split text into key / value
                label, value = text.split(':', 1)
            else:
                # In other languages (tested with zh-TW) a colon is represented
                # by a different symbol --> then we split at the first space.
                label, value = text.split(' ', 1)
                label = label[:-1]

            value = _strip_leading_strings(value)
            attributes.append({'label': label, 'value': value})
        content = []
        for item in [info.get('subtitle'), info.get('description')]:
            if not item:
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results


def response(resp):
    results = []
    json_resp = resp.json()

    if search_type == 'search':
        results = parse_search_query(json_resp.get('results'))

    elif search_type == 'images':
        for item in json_resp.get('images', []):
            results.append(
                {
                    'template': 'images.html',
                    'title': item['title'],
                    'url': item.get('link'),
                    'img_src': item.get('image'),
                    'thumbnail_src': item.get('thumbnail'),
                }
            )

    elif search_type == 'videos':
        # The results in the video category are most often links to pages that contain
        # a video and not to a video stream --> SearXNG can't use the video template.

        for item in json_resp.get('videos', []):
            metadata = [x for x in [item.get('description'), item.get('duration')] if x]
            results.append(
                {
                    'title': item['title'],
                    'url': item.get('link'),
                    'content': '',
                    'metadata': ' / '.join(metadata),
                    'img_src': item.get('image'),
                }
            )

    elif search_type == 'news':
        for item in json_resp.get('news', []):
            metadata = [x for x in [item.get('source'), item.get('time')] if x]
            results.append(
                {
                    'title': item['title'],
                    'url': item.get('link'),
                    'content': item.get('description', ''),
                    'metadata': ' / '.join(metadata),
                    'img_src': item.get('image'),
                }
            )

    return results
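
The engine talks to Presearch in two steps: ``_get_request_id()`` issues a first request to the HTML search page and scrapes the ``window.searchId`` value out of the returned script, and ``request()`` then asks ``/results?id=...`` for the JSON result list. A sketch of just the ID extraction, run against a hypothetical line from that HTML::

    line = 'window.searchId = "a1b2c3d4e5";'   # hypothetical line from the search page
    if "window.searchId = " in line:
        request_id = line.split("= ")[1][:-1].replace('"', "")
        print(request_id)  # -> a1b2c3d4e5
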
@@ -75,6 +75,10 @@ about = {
# engine dependent config
categories = []
paging = True
max_page = 5
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
redirect"""

qwant_categ = None
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
@@ -112,10 +116,6 @@ def request(query, params):
    args = {'q': query}
    params['raise_for_httperror'] = False

    # all qwant engines (incl qwant-lite) delivers only 5 pages maximum
    if params['pageno'] > 5:
        return None

    if qwant_categ == 'web-lite':

        url = web_lite_url + '?'
searx/engines/rottentomatoes.py (new file, 60 lines)
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""RottenTomatoes (movies)
"""

from urllib.parse import quote_plus
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text

# about
about = {
    "website": 'https://www.rottentomatoes.com/',
    "wikidata_id": 'Q105584',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
categories = ['movies']

base_url = "https://www.rottentomatoes.com"

results_xpath = "//search-page-media-row"
url_xpath = "./a[1]/@href"
title_xpath = "./a/img/@alt"
img_src_xpath = "./a/img/@src"
release_year_xpath = "concat('From ', string(./@releaseyear))"
score_xpath = "concat('Score: ', string(./@tomatometerscore))"
cast_xpath = "concat('Starring ', string(./@cast))"


def request(query, params):
    params["url"] = f"{base_url}/search?search={quote_plus(query)}"
    return params


def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, results_xpath):
        content = []
        for xpath in (release_year_xpath, score_xpath, cast_xpath):
            info = extract_text(eval_xpath(result, xpath))

            # a trailing space at the end means that no data was found
            if info and info[-1] != " ":
                content.append(info)

        results.append(
            {
                'url': extract_text(eval_xpath(result, url_xpath)),
                'title': extract_text(eval_xpath(result, title_xpath)),
                'content': ', '.join(content),
                'img_src': extract_text(eval_xpath(result, img_src_xpath)),
            }
        )

    return results
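
The ``concat('From ', string(./@releaseyear))`` pattern explains the trailing-space check in ``response()``: when the attribute is missing, ``string()`` yields an empty string, so the concatenation ends with the space of the literal prefix and the entry is skipped. A standalone sketch with a made-up element::

    from lxml import etree

    with_year = etree.fromstring('<row releaseyear="2019"/>')
    without_year = etree.fromstring('<row/>')

    xpath = "concat('From ', string(./@releaseyear))"
    print(repr(with_year.xpath(xpath)))     # 'From 2019' -> kept
    print(repr(without_year.xpath(xpath)))  # 'From '     -> ends with a space, dropped
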
@@ -127,6 +127,9 @@ different to the UI language) and a region filter.
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``), to be safe max is set to 20."""

time_range_support = True
safesearch = True
searx/engines/stract.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Stract is an independent open source search engine.
At this stage, it's still in beta and hence this implementation will need to be updated once beta ends.
"""

from json import dumps

about = {
    "website": "https://stract.com/",
    "use_official_api": True,
    "official_api_documentation": "https://stract.com/beta/api/docs/#/search/api",
    "require_api_key": False,
    "results": "JSON",
}
categories = ['general']
paging = True

search_url = "https://stract.com/beta/api/search"


def request(query, params):
    params['url'] = search_url
    params['method'] = "POST"
    params['headers'] = {'Accept': 'application/json', 'Content-Type': 'application/json'}
    params['data'] = dumps({'query': query, 'page': params['pageno'] - 1})

    return params


def response(resp):
    results = []

    for result in resp.json()["webpages"]:
        results.append(
            {
                'url': result['url'],
                'title': result['title'],
                'content': ''.join(fragment['text'] for fragment in result['snippet']['text']['fragments']),
            }
        )

    return results
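
Unlike most of the engines in this changeset, Stract is queried with a JSON ``POST`` body rather than URL parameters, and its ``page`` is zero-based. Roughly the equivalent request with httpx (a sketch; the beta endpoint and payload shape are taken from the code above and may change once the beta ends)::

    import httpx

    resp = httpx.post(
        "https://stract.com/beta/api/search",
        json={"query": "searxng", "page": 0},  # page 0 == SearXNG pageno 1
        headers={"Accept": "application/json"},
    )
    for page in resp.json()["webpages"]:
        print(page["url"], page["title"])
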
searx/engines/tootfinder.py (new file, 60 lines)
@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Tootfinder (social media)
"""

from datetime import datetime
from json import loads
from searx.utils import html_to_text

about = {
    'website': "https://www.tootfinder.ch",
    'official_api_documentation': "https://wiki.tootfinder.ch/index.php?name=the-tootfinder-rest-api",
    'use_official_api': True,
    'require_api_key': False,
    'results': "JSON",
}
categories = ['social media']

base_url = "https://www.tootfinder.ch"


def request(query, params):
    params['url'] = f"{base_url}/rest/api/search/{query}"
    return params


def response(resp):
    results = []

    # the Tootfinder API has an issue where server-side errors are appended to the API response as HTML,
    # thus we only look for the line that contains the actual JSON data and ignore everything else
    json_str = ""
    for line in resp.text.split("\n"):
        if line.startswith("[{"):
            json_str = line
            break

    for result in loads(json_str):
        thumbnail = None

        attachments = result.get('media_attachments', [])
        images = [attachment['preview_url'] for attachment in attachments if attachment['type'] == 'image']
        if len(images) > 0:
            thumbnail = images[0]

        title = result.get('card', {}).get('title')
        if not title:
            title = html_to_text(result['content'])[:75]

        results.append(
            {
                'url': result['url'],
                'title': title,
                'content': html_to_text(result['content']),
                'thumbnail': thumbnail,
                'publishedDate': datetime.strptime(result['created_at'], '%Y-%m-%d %H:%M:%S'),
            }
        )

    return results
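
Why the line-by-line scan instead of ``resp.json()``: the Tootfinder endpoint may mix server-side warnings rendered as HTML into the response, which would break a plain JSON parse. A sketch with a hypothetical mixed payload::

    from json import loads

    raw = (
        '[{"url": "https://example.social/@alice/1", "content": "<p>hello</p>"}]\n'
        '<br /><b>Warning</b>: something went wrong on the server ...'
    )

    json_str = next((line for line in raw.split("\n") if line.startswith("[{")), "")
    print(loads(json_str)[0]['url'])  # -> https://example.social/@alice/1
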
searx/engines/yep.py (new file, 79 lines)
@@ -0,0 +1,79 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Yep (general, images, news)
"""

from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text

about = {
    'website': 'https://yep.com/',
    'official_api_documentation': 'https://docs.developer.yelp.com',
    'use_official_api': False,
    'require_api_key': False,
    'results': 'JSON',
}

base_url = "https://api.yep.com"
search_type = "web"  # 'web', 'images', 'news'

safesearch = True
safesearch_map = {0: 'off', 1: 'moderate', 2: 'strict'}


def request(query, params):
    args = {
        'client': 'web',
        'no_correct': 'false',
        'q': query,
        'safeSearch': safesearch_map[params['safesearch']],
        'type': search_type,
    }
    params['url'] = f"{base_url}/fs/2/search?{urlencode(args)}"
    params['headers']['Referer'] = 'https://yep.com/'
    return params


def _web_result(result):
    return {
        'url': result['url'],
        'title': result['title'],
        'content': html_to_text(result['snippet']),
    }


def _images_result(result):
    return {
        'template': 'images.html',
        'url': result['host_page'],
        'title': result.get('title', ''),
        'content': '',
        'img_src': result['image_id'],
        'thumbnail_src': result['src'],
    }


def _news_result(result):
    return {
        'url': result['url'],
        'title': result['title'],
        'content': html_to_text(result['snippet']),
        'publishedDate': datetime.strptime(result['first_seen'][:19], '%Y-%m-%dT%H:%M:%S'),
    }


def response(resp):
    results = []

    for result in resp.json()[1]['results']:
        if search_type == "web":
            results.append(_web_result(result))
        elif search_type == "images":
            results.append(_images_result(result))
        elif search_type == "news":
            results.append(_news_result(result))
        else:
            raise ValueError(f"Unsupported yep search type: {search_type}")

    return results
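
One detail in ``_news_result()``: ``first_seen`` is sliced to its first 19 characters so that ``strptime`` only sees the ``YYYY-MM-DDTHH:MM:SS`` part, presumably to tolerate a trailing timezone or fractional-second suffix. A quick sketch (the timestamp is made up)::

    from datetime import datetime

    first_seen = "2023-11-23T08:15:30Z"  # hypothetical value with a trailing 'Z'
    print(datetime.strptime(first_seen[:19], '%Y-%m-%dT%H:%M:%S'))
    # -> 2023-11-23 08:15:30
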
@@ -200,6 +200,8 @@ def fetch_traits(engine_traits: EngineTraits) -> None:
    for locale in babel.core.localedata.locale_identifiers():  # type: ignore
        # Create a Locale object for the current locale
        loc = babel.Locale.parse(locale)
        if loc.english_name is None:
            continue
        language_name_locale_map[loc.english_name.lower()] = loc  # type: ignore

    for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"):