Merge remote-tracking branch 'upstream/master' into pulling

kvan7 2024-01-22 03:23:01 +00:00
commit 72e8b5c354
223 changed files with 24674 additions and 20896 deletions


@@ -45,6 +45,7 @@ ENGINE_DEFAULT_ARGS = {
"using_tor_proxy": False,
"send_accept_language_header": False,
"tokens": [],
"max_page": 0,
}
# set automatically when an engine does not have any tab category
DEFAULT_CATEGORY = 'other'

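The new ``max_page`` default (``0`` meaning no limit) lets each engine declare how many result pages it can actually serve. A minimal sketch of how a caller could honour the attribute — the helper name and the check are assumptions for illustration, not SearXNG's actual implementation:

    def page_is_supported(engine, pageno: int) -> bool:
        # a max_page of 0 is treated as "unlimited", matching the default above
        max_page = getattr(engine, 'max_page', 0)
        return max_page == 0 or pageno <= max_page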

@@ -59,6 +59,9 @@ about = {
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 200
"""200 pages maximum (``&first=1991``)"""
time_range_support = True
safesearch = True
"""Bing results are always SFW. To get NSFW links from bing some age

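The ``&first=1991`` figure in the Bing docstring above is consistent with paging by ten results, where ``first`` points at the first result of the requested page. A quick check of that arithmetic, assuming ``first = (pageno - 1) * 10 + 1`` (the page size is inferred from the numbers, not taken from the diff):

    pageno = 200
    first = (pageno - 1) * 10 + 1
    assert first == 1991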
searx/engines/bpb.py (new file, 68 lines)

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""BPB refers to ``Bundeszentrale für politische Bildung``, which is a German
governmental institution aiming to reduce misinformation by providing resources
about politics and history.
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': "https://www.bpb.de",
'official_api_documentation': None,
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
'language': 'de',
}
paging = True
categories = ['general']
base_url = "https://www.bpb.de"
def request(query, params):
args = {
'query[term]': query,
'page': params['pageno'] - 1,
'sort[direction]': 'descending',
'payload[nid]': 65350,
}
params['url'] = f"{base_url}/bpbapi/filter/search?{urlencode(args)}"
return params
def response(resp):
results = []
json_resp = resp.json()
for result in json_resp['teaser']:
img_src = None
if result['teaser']['image']:
img_src = base_url + result['teaser']['image']['sources'][-1]['url']
metadata = result['extension']['overline']
authors = ', '.join(author['name'] for author in result['extension'].get('authors', []))
if authors:
metadata += f" | {authors}"
publishedDate = None
if result['extension'].get('publishingDate'):
publishedDate = datetime.utcfromtimestamp(result['extension']['publishingDate'])
results.append(
{
'url': base_url + result['teaser']['link']['url'],
'title': result['teaser']['title'],
'content': result['teaser']['text'],
'img_src': img_src,
'publishedDate': publishedDate,
'metadata': metadata,
}
)
return results


@@ -152,6 +152,10 @@ send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``); to be safe, max is set to 10. Trying
to fetch more won't return any results and you will most likely be flagged as a bot.
"""
safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off

searx/engines/destatis.py (new file, 68 lines)

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DeStatis
"""
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.destatis.de',
'official_api_documentation': 'https://destatis.api.bund.dev/',
'use_official_api': False,
'require_api_key': False,
'results': 'HTML',
'language': 'de',
}
categories = []
paging = True
base_url = "https://www.destatis.de"
search_url = f"{base_url}/SiteGlobals/Forms/Suche/Expertensuche_Formular.html"
# pylint: disable-next=line-too-long
results_xpath = '//div[contains(@class, "l-content-wrapper")]/div[contains(@class, "row")]/div[contains(@class, "column")]/div[contains(@class, "c-result"){extra}]'
results_xpath_filter_recommended = " and not(contains(@class, 'c-result--recommended'))"
url_xpath = './/a/@href'
title_xpath = './/a/text()'
date_xpath = './/a/span[contains(@class, "c-result__date")]'
content_xpath = './/div[contains(@class, "column")]/p/text()'
doctype_xpath = './/div[contains(@class, "c-result__doctype")]/p'
def request(query, params):
args = {
'templateQueryString': query,
'gtp': f"474_list%3D{params['pageno']}",
}
params['url'] = f"{search_url}?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
# filter out suggested results on further pages because they are the same on each page
extra_xpath = results_xpath_filter_recommended if resp.search_params['pageno'] > 1 else ''
res_xpath = results_xpath.format(extra=extra_xpath)
for result in eval_xpath_list(dom, res_xpath):
doctype = extract_text(eval_xpath(result, doctype_xpath))
date = extract_text(eval_xpath(result, date_xpath))
metadata = [meta for meta in (doctype, date) if meta != ""]
results.append(
{
'url': base_url + "/" + extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': extract_text(eval_xpath(result, content_xpath)),
'metadata': ', '.join(metadata),
}
)
return results

searx/engines/fyyd.py (new file, 50 lines)

@@ -0,0 +1,50 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Fyyd (podcasts)
"""
from datetime import datetime
from urllib.parse import urlencode
about = {
'website': 'https://fyyd.de',
'official_api_documentation': 'https://github.com/eazyliving/fyyd-api',
'use_official_api': True,
'require_api_key': False,
'results': 'JSON',
}
categories = []
paging = True
base_url = "https://api.fyyd.de"
page_size = 10
def request(query, params):
args = {
'term': query,
'count': page_size,
'page': params['pageno'] - 1,
}
params['url'] = f"{base_url}/0.2/search/podcast?{urlencode(args)}"
return params
def response(resp):
results = []
json_results = resp.json()['data']
for result in json_results:
results.append(
{
'url': result['htmlURL'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['smallImageURL'],
'publishedDate': datetime.strptime(result['status_since'], '%Y-%m-%d %H:%M:%S'),
'metadata': f"Rank: {result['rank']} || {result['episode_count']} episodes",
}
)
return results


@@ -48,6 +48,7 @@ about = {
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 50
time_range_support = True
safesearch = True
@@ -429,14 +430,13 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
if not resp.ok: # type: ignore
raise RuntimeError("Response from Google's preferences is not OK.")
- dom = html.fromstring(resp.text) # type: ignore
+ dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
# supported language codes
lang_map = {'no': 'nb'}
- for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
-     eng_lang = x.get("value").split('_')[-1]
+ for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
+     eng_lang = x.get("value")
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
@@ -456,7 +456,7 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
# supported region codes
- for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
+ for x in eval_xpath_list(dom, "//select[@name='gl']/option"):
eng_country = x.get("value")
if eng_country in skip_countries:


@@ -47,6 +47,7 @@ about = {
# engine dependent config
categories = ['images', 'web']
paging = True
max_page = 50
time_range_support = True
safesearch = True
send_accept_language_header = True


@@ -51,6 +51,7 @@ about = {
# engine dependent config
categories = ['science', 'scientific publications']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = False


@@ -57,6 +57,7 @@ about = {
categories = ['videos', 'web']
paging = True
max_page = 50
language_support = True
time_range_support = True
safesearch = True
@@ -86,7 +87,7 @@ def request(query, params):
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
- if params['safesearch']:
+ if 'safesearch' in params:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url


@@ -8,6 +8,7 @@ from searx.utils import to_string, html_to_text
search_url = None
url_query = None
url_prefix = ""
content_query = None
title_query = None
content_html_to_text = False
@@ -129,7 +130,7 @@ def response(resp):
content = ""
results.append(
{
- 'url': to_string(url),
+ 'url': url_prefix + to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}
@@ -138,7 +139,7 @@ def response(resp):
for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)):
results.append(
{
- 'url': to_string(url),
+ 'url': url_prefix + to_string(url),
'title': title_filter(to_string(title)),
'content': content_filter(to_string(content)),
}

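The ``url_prefix`` option added to the JSON engine above lets a configuration prepend a base URL when the API returns relative paths. A hypothetical illustration with made-up values (not from any real engine configuration):

    url_prefix = "https://example.org"  # assumed per-engine setting
    url = "/post/42"                    # e.g. a relative value extracted via url_query
    assert url_prefix + url == "https://example.org/post/42"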

@@ -16,7 +16,7 @@ about = {
engine_type = 'online_dictionary'
categories = ['general']
- url = "https://lingva.ml"
+ url = "https://lingva.thedaviddelta.com/"
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"


@@ -0,0 +1,43 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Podcast Index
"""
from urllib.parse import quote_plus
from datetime import datetime
about = {
'website': 'https://podcastindex.org',
'official_api_documentation': None, # requires an account
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
}
categories = []
base_url = "https://podcastindex.org"
def request(query, params):
params['url'] = f"{base_url}/api/search/byterm?q={quote_plus(query)}"
return params
def response(resp):
results = []
json = resp.json()
for result in json['feeds']:
results.append(
{
'url': result['link'],
'title': result['title'],
'content': result['description'],
'thumbnail': result['image'],
'publishedDate': datetime.utcfromtimestamp(result['newestItemPubdate']),
'metadata': f"{result['author']}, {result['episodeCount']} episodes",
}
)
return results

searx/engines/presearch.py (new file, 266 lines)

@@ -0,0 +1,266 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
images, videos, news).
Configured ``presearch`` engines:
.. code:: yaml
- name: presearch
engine: presearch
search_type: search
categories: [general, web]
- name: presearch images
...
search_type: images
categories: [images, web]
- name: presearch videos
...
search_type: videos
categories: [general, web]
- name: presearch news
...
search_type: news
categories: [news, web]
.. hint::
By default Presearch's video category is intentionally placed into::
categories: [general, web]
Search type ``video``
=====================
The results in the video category are most often links to pages that contain a
video, for instance many links from Presearch's video category point to content
on Facebook (aka Meta) or Twitter (aka X). Since these are not real links to
video streams, SearXNG can't use the video template for them, and if the video
template can't be used, the user doesn't want to see these hits in the videos
category.
Languages & Regions
===================
In Presearch there are languages for the UI and regions for narrowing down the
search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
``use_local_search_results=false``, then the defaults are set for both (the
language and the region) from the ``Accept-Language`` header.
Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
have to set these values in both requests we send to Presearch; in the first
request to get the request-ID from Presearch and in the final request to get the
result list (see ``send_accept_language_header``).
Implementations
===============
"""
from urllib.parse import urlencode
from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text
about = {
"website": "https://presearch.io",
"wikidata_id": "Q7240905",
"official_api_documentation": "https://docs.presearch.io/nodes/api",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"] # general, images, videos, news
search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""
base_url = "https://presearch.com"
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
def init(_):
if search_type not in ['search', 'images', 'videos', 'news']:
raise ValueError(f'presearch search_type: {search_type}')
def _get_request_id(query, params):
args = {
"q": query,
"page": params["pageno"],
}
if params["time_range"]:
args["time"] = params["time_range"]
url = f"{base_url}/{search_type}?{urlencode(args)}"
headers = {
'User-Agent': gen_useragent(),
'Cookie': (
f"b=1;"
f" presearch_session=;"
f" use_local_search_results=false;"
f" use_safe_search={safesearch_map[params['safesearch']]}"
),
}
if params['searxng_locale'] != 'all':
l = locales.get_locale(params['searxng_locale'])
# Presearch narrows down the search by region. In SearXNG when the user
# does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
# region.
# We could possibly use searx.locales.get_official_locales to determine
# in which regions this language is an official one, but then we still
# wouldn't know which region should be given more weight / Presearch
# performs an IP-based geolocation of the user, we don't want that in
# SearXNG ;-)
if l.territory:
headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
resp_text = get(url, headers=headers).text # type: ignore
for line in resp_text.split("\n"):
if "window.searchId = " in line:
return line.split("= ")[1][:-1].replace('"', "")
return None
def request(query, params):
request_id = _get_request_id(query, params)
params["headers"]["Accept"] = "application/json"
params["url"] = f"{base_url}/results?id={request_id}"
return params
def _strip_leading_strings(text):
for x in ['wikipedia', 'google']:
if text.lower().endswith(x):
text = text[: -len(x)]
return text.strip()
def parse_search_query(json_results):
results = []
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
result = {
'url': item['link'],
'title': item['title'],
'img_src': item['image'],
'content': '',
'metadata': item.get('source'),
}
results.append(result)
for item in json_results.get('standardResults', []):
result = {
'url': item['link'],
'title': item['title'],
'content': html_to_text(item['description']),
}
results.append(result)
info = json_results.get('infoSection', {}).get('data')
if info:
attributes = []
for item in info.get('about', []):
text = html_to_text(item)
if ':' in text:
# split text into key / value
label, value = text.split(':', 1)
else:
# In other languages (tested with zh-TW) a colon is represented
# by a different symbol --> then we split at the first space.
label, value = text.split(' ', 1)
label = label[:-1]
value = _strip_leading_strings(value)
attributes.append({'label': label, 'value': value})
content = []
for item in [info.get('subtitle'), info.get('description')]:
if not item:
continue
item = _strip_leading_strings(html_to_text(item))
if item:
content.append(item)
results.append(
{
'infobox': info['title'],
'id': info['title'],
'img_src': info.get('image'),
'content': ' | '.join(content),
'attributes': attributes,
}
)
return results
def response(resp):
results = []
json_resp = resp.json()
if search_type == 'search':
results = parse_search_query(json_resp.get('results'))
elif search_type == 'images':
for item in json_resp.get('images', []):
results.append(
{
'template': 'images.html',
'title': item['title'],
'url': item.get('link'),
'img_src': item.get('image'),
'thumbnail_src': item.get('thumbnail'),
}
)
elif search_type == 'videos':
# The results in the video category are most often links to pages that contain
# a video and not to a video stream --> SearXNG can't use the video template.
for item in json_resp.get('videos', []):
metadata = [x for x in [item.get('description'), item.get('duration')] if x]
results.append(
{
'title': item['title'],
'url': item.get('link'),
'content': '',
'metadata': ' / '.join(metadata),
'img_src': item.get('image'),
}
)
elif search_type == 'news':
for item in json_resp.get('news', []):
metadata = [x for x in [item.get('source'), item.get('time')] if x]
results.append(
{
'title': item['title'],
'url': item.get('link'),
'content': item.get('description', ''),
'metadata': ' / '.join(metadata),
'img_src': item.get('image'),
}
)
return results


@@ -75,6 +75,10 @@ about = {
# engine dependent config
categories = []
paging = True
max_page = 5
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
redirect"""
qwant_categ = None
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
@@ -112,10 +116,6 @@ def request(query, params):
args = {'q': query}
params['raise_for_httperror'] = False
- # all qwant engines (incl qwant-lite) delivers only 5 pages maximum
- if params['pageno'] > 5:
-     return None
if qwant_categ == 'web-lite':
url = web_lite_url + '?'


@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""RottenTomatoes (movies)
"""
from urllib.parse import quote_plus
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
# about
about = {
"website": 'https://www.rottentomatoes.com/',
"wikidata_id": 'Q105584',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
categories = ['movies']
base_url = "https://www.rottentomatoes.com"
results_xpath = "//search-page-media-row"
url_xpath = "./a[1]/@href"
title_xpath = "./a/img/@alt"
img_src_xpath = "./a/img/@src"
release_year_xpath = "concat('From ', string(./@releaseyear))"
score_xpath = "concat('Score: ', string(./@tomatometerscore))"
cast_xpath = "concat('Starring ', string(./@cast))"
def request(query, params):
params["url"] = f"{base_url}/search?search={quote_plus(query)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, results_xpath):
content = []
for xpath in (release_year_xpath, score_xpath, cast_xpath):
info = extract_text(eval_xpath(result, xpath))
# a trailing space at the end means that no data was found
if info and info[-1] != " ":
content.append(info)
results.append(
{
'url': extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': ', '.join(content),
'img_src': extract_text(eval_xpath(result, img_src_xpath)),
}
)
return results


@@ -127,6 +127,9 @@ different to the UI language) and a region filter.
# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``); to be safe, max is set to 18."""
time_range_support = True
safesearch = True

searx/engines/stract.py (new file, 43 lines)

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Stract is an independent open source search engine.
At this stage it is still in beta, so this implementation will need to be updated once the beta ends.
"""
from json import dumps
about = {
"website": "https://stract.com/",
"use_official_api": True,
"official_api_documentation": "https://stract.com/beta/api/docs/#/search/api",
"require_api_key": False,
"results": "JSON",
}
categories = ['general']
paging = True
search_url = "https://stract.com/beta/api/search"
def request(query, params):
params['url'] = search_url
params['method'] = "POST"
params['headers'] = {'Accept': 'application/json', 'Content-Type': 'application/json'}
params['data'] = dumps({'query': query, 'page': params['pageno'] - 1})
return params
def response(resp):
results = []
for result in resp.json()["webpages"]:
results.append(
{
'url': result['url'],
'title': result['title'],
'content': ''.join(fragment['text'] for fragment in result['snippet']['text']['fragments']),
}
)
return results


@@ -0,0 +1,60 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Tootfinder (social media)
"""
from datetime import datetime
from json import loads
from searx.utils import html_to_text
about = {
'website': "https://www.tootfinder.ch",
'official_api_documentation': "https://wiki.tootfinder.ch/index.php?name=the-tootfinder-rest-api",
'use_official_api': True,
'require_api_key': False,
'results': "JSON",
}
categories = ['social media']
base_url = "https://www.tootfinder.ch"
def request(query, params):
params['url'] = f"{base_url}/rest/api/search/{query}"
return params
def response(resp):
results = []
# the Tootfinder API has an issue where server-side errors are appended to the API response as HTML
# thus we only look for the line that contains the actual JSON data and ignore everything else
json_str = ""
for line in resp.text.split("\n"):
if line.startswith("[{"):
json_str = line
break
for result in loads(json_str):
thumbnail = None
attachments = result.get('media_attachments', [])
images = [attachment['preview_url'] for attachment in attachments if attachment['type'] == 'image']
if len(images) > 0:
thumbnail = images[0]
title = result.get('card', {}).get('title')
if not title:
title = html_to_text(result['content'])[:75]
results.append(
{
'url': result['url'],
'title': title,
'content': html_to_text(result['content']),
'thumbnail': thumbnail,
'publishedDate': datetime.strptime(result['created_at'], '%Y-%m-%d %H:%M:%S'),
}
)
return results

searx/engines/yep.py (new file, 79 lines)

@@ -0,0 +1,79 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Yep (general, images, news)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
'website': 'https://yep.com/',
'official_api_documentation': 'https://docs.developer.yelp.com',
'use_official_api': False,
'require_api_key': False,
'results': 'JSON',
}
base_url = "https://api.yep.com"
search_type = "web" # 'web', 'images', 'news'
safesearch = True
safesearch_map = {0: 'off', 1: 'moderate', 2: 'strict'}
def request(query, params):
args = {
'client': 'web',
'no_correct': 'false',
'q': query,
'safeSearch': safesearch_map[params['safesearch']],
'type': search_type,
}
params['url'] = f"{base_url}/fs/2/search?{urlencode(args)}"
params['headers']['Referer'] = 'https://yep.com/'
return params
def _web_result(result):
return {
'url': result['url'],
'title': result['title'],
'content': html_to_text(result['snippet']),
}
def _images_result(result):
return {
'template': 'images.html',
'url': result['host_page'],
'title': result.get('title', ''),
'content': '',
'img_src': result['image_id'],
'thumbnail_src': result['src'],
}
def _news_result(result):
return {
'url': result['url'],
'title': result['title'],
'content': html_to_text(result['snippet']),
'publishedDate': datetime.strptime(result['first_seen'][:19], '%Y-%m-%dT%H:%M:%S'),
}
def response(resp):
results = []
for result in resp.json()[1]['results']:
if search_type == "web":
results.append(_web_result(result))
elif search_type == "images":
results.append(_images_result(result))
elif search_type == "news":
results.append(_news_result(result))
else:
raise ValueError(f"Unsupported yep search type: {search_type}")
return results


@@ -200,6 +200,8 @@ def fetch_traits(engine_traits: EngineTraits) -> None:
for locale in babel.core.localedata.locale_identifiers(): # type: ignore
# Create a Locale object for the current locale
loc = babel.Locale.parse(locale)
if loc.english_name is None:
continue
language_name_locale_map[loc.english_name.lower()] = loc # type: ignore
for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"):