From 5fca26f0b3ae18cb00b5182f3ad2150edb01acee Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 8 Apr 2022 13:24:17 +0200 Subject: [PATCH] [mod] engines_languages.json: add new type engine_properties This patch adds the boilerplate code needed to fetch properties from engines. In the past we only fetched *languages*, but some engines need *regions* to parameterize the engine request. To fit into our *fetch language* procedures, the boilerplate is implemented in `searxng_extra/update/update_languages.py` and the *engine_properties* are stored alongside the languages in `searx/data/engines_languages.json`. This implementation is backward compatible with the `_fetch_supported_languages()` infrastructure we have. Once all `_fetch_supported_languages()` implementations have been moved to `_fetch_engine_properties()` implementations, we can rename the files and scripts. The new type `engine_properties` is a dictionary with keys `languages` and `regions`. The values are dictionaries that map from SearXNG's language & region to the option values the engine uses:: engine_properties = { 'type' : 'engine_properties', # <-- !!! 'regions': { # 'ca-ES' : <engine's region name> }, 'languages': { # 'ca' : <engine's language name> }, } Similar to `supported_languages`, the properties are available in the engine under the name `supported_properties`. Initially we start with languages & regions, but in a wider sense the type is named *engine properties*. Engines can store whatever options they need, and maybe in the future there will be a need to fetch additional or completely different properties. 
Signed-off-by: Markus Heiser --- docs/admin/engines/configured_engines.rst | 2 +- manage | 2 +- searx/autocomplete.py | 2 +- searx/engines/__init__.py | 105 +++++++++++++--------- searx/engines/google_scholar.py | 1 - searx/engines/google_videos.py | 1 - searx/engines/yahoo_news.py | 1 - searx/engines/youtube_noapi.py | 1 - searxng_extra/update/update_languages.py | 105 ++++++++++++++++++---- 9 files changed, 152 insertions(+), 68 deletions(-) diff --git a/docs/admin/engines/configured_engines.rst b/docs/admin/engines/configured_engines.rst index c7b6a1f52..fa1e5a4b0 100644 --- a/docs/admin/engines/configured_engines.rst +++ b/docs/admin/engines/configured_engines.rst @@ -42,7 +42,7 @@ Explanation of the :ref:`general engine configuration` shown in the table - Timeout - Weight - Paging - - Language + - Language, Region - Safe search - Time range diff --git a/manage b/manage index 9e5b59fc7..20f8fc1b3 100755 --- a/manage +++ b/manage @@ -57,7 +57,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\ I,C,R,\ W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\ E1136" -PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_languages,language_aliases,logger,categories" +PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_properties,supported_languages,language_aliases,logger,categories" PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc" help() { diff --git a/searx/autocomplete.py b/searx/autocomplete.py index ff299d184..014f81685 100644 --- a/searx/autocomplete.py +++ b/searx/autocomplete.py @@ -15,7 +15,7 @@ from searx.data import ENGINES_LANGUAGES from searx.network import get as http_get from searx.exceptions import SearxEngineResponseException -# a fetch_supported_languages() for XPath engines isn't available right now +# a _fetch_supported_properites() for XPath engines isn't available right now # _brave = ENGINES_LANGUAGES['brave'].keys() diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index ae132f48d..ece3d3e5d 100644 --- a/searx/engines/__init__.py +++ 
b/searx/engines/__init__.py @@ -19,8 +19,7 @@ from os.path import realpath, dirname from babel.localedata import locale_identifiers from searx import logger, settings from searx.data import ENGINES_LANGUAGES -from searx.network import get -from searx.utils import load_module, match_language, gen_useragent +from searx.utils import load_module, match_language logger = logger.getChild('engines') @@ -36,8 +35,7 @@ ENGINE_DEFAULT_ARGS = { "timeout": settings["outgoing"]["request_timeout"], "shortcut": "-", "categories": ["general"], - "supported_languages": [], - "language_aliases": {}, + "language_support" : False, "paging": False, "safesearch": False, "time_range_support": False, @@ -58,10 +56,10 @@ class Engine: # pylint: disable=too-few-public-methods engine: str shortcut: str categories: List[str] - supported_languages: List[str] about: dict inactive: bool disabled: bool + # language support, either by selecting a region or by selecting a language language_support: bool paging: bool safesearch: bool @@ -143,6 +141,25 @@ def load_engine(engine_data: dict) -> Optional[Engine]: return engine +def engine_properties_template(): + """A dictionary with languages and regions to map from SearXNG' languages & + region tags to engine's language & region tags:: + + engine_properties = { + 'type' : 'engine_properties', + 'regions': { + # 'ca-ES' : + }, + 'languages': { + # 'ca' : + }, + } + """ + return { + 'type' : 'engine_properties', + 'regions': {}, + 'languages': {}, + } def set_loggers(engine, engine_name): # set the logger for engine @@ -179,8 +196,11 @@ def update_engine_attributes(engine: Engine, engine_data): def set_language_attributes(engine: Engine): # assign supported languages from json file + + supported_properties = None + if engine.name in ENGINES_LANGUAGES: - engine.supported_languages = ENGINES_LANGUAGES[engine.name] + supported_properties = ENGINES_LANGUAGES[engine.name] elif engine.engine in ENGINES_LANGUAGES: # The key of the dictionary ENGINES_LANGUAGES 
is the *engine name* @@ -188,47 +208,48 @@ def set_language_attributes(engine: Engine): # settings.yml to use the same origin engine (python module) these # additional engines can use the languages from the origin engine. # For this use the configured ``engine: ...`` from settings.yml - engine.supported_languages = ENGINES_LANGUAGES[engine.engine] + supported_properties = ENGINES_LANGUAGES[engine.engine] - if hasattr(engine, 'language'): - # For an engine, when there is `language: ...` in the YAML settings, the - # engine supports only one language, in this case - # engine.supported_languages should contains this value defined in - # settings.yml - if engine.language not in engine.supported_languages: - raise ValueError( - "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language) - ) + if not supported_properties: + return - if isinstance(engine.supported_languages, dict): - engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} - else: - engine.supported_languages = [engine.language] + if isinstance(supported_properties, dict) and supported_properties.get('type') == 'engine_properties': + engine.supported_properties = supported_properties + engine.language_support = len(supported_properties['languages']) or len(supported_properties['regions']) - # find custom aliases for non standard language codes - for engine_lang in engine.supported_languages: - iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) - if ( - iso_lang - and iso_lang != engine_lang - and not engine_lang.startswith(iso_lang) - and iso_lang not in engine.supported_languages - ): - engine.language_aliases[iso_lang] = engine_lang + else: + # depricated: does not work for engines that do support languages + # based on a region. 
+ engine.supported_languages = supported_properties + engine.language_support = len(engine.supported_languages) > 0 - # language_support - engine.language_support = len(engine.supported_languages) > 0 + if hasattr(engine, 'language'): + # For an engine, when there is `language: ...` in the YAML settings, the + # engine supports only one language, in this case + # engine.supported_languages should contains this value defined in + # settings.yml + if engine.language not in engine.supported_languages: + raise ValueError( + "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language) + ) - # assign language fetching method if auxiliary method exists - if hasattr(engine, '_fetch_supported_languages'): - headers = { - 'User-Agent': gen_useragent(), - 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language - } - engine.fetch_supported_languages = ( - # pylint: disable=protected-access - lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers)) - ) + if isinstance(engine.supported_languages, dict): + engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} + else: + engine.supported_languages = [engine.language] + + if not hasattr(engine, 'language_aliases'): + engine.language_aliases = {} + # find custom aliases for non standard language codes + for engine_lang in engine.supported_languages: + iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) + if ( + iso_lang + and iso_lang != engine_lang + and not engine_lang.startswith(iso_lang) + and iso_lang not in engine.supported_languages + ): + engine.language_aliases[iso_lang] = engine_lang def update_attributes_for_tor(engine: Engine) -> bool: diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index e0700957c..d48cbb4db 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -48,7 +48,6 @@ about = { # engine dependent config 
categories = ['science'] paging = True -language_support = True use_locale_domain = True time_range_support = True safesearch = False diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 06aac8ae1..e311e59c8 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -56,7 +56,6 @@ about = { categories = ['videos', 'web'] paging = False -language_support = True use_locale_domain = True time_range_support = True safesearch = True diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 00f208b17..ccb2f464e 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -32,7 +32,6 @@ about = { "results": 'HTML', } -language_support = False time_range_support = False safesearch = False paging = True diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 406314684..c909f0526 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -21,7 +21,6 @@ about = { # engine dependent config categories = ['videos', 'music'] paging = True -language_support = False time_range_support = True # search-url diff --git a/searxng_extra/update/update_languages.py b/searxng_extra/update/update_languages.py index 95ee5bd3e..53abbe929 100755 --- a/searxng_extra/update/update_languages.py +++ b/searxng_extra/update/update_languages.py @@ -1,14 +1,29 @@ #!/usr/bin/env python # lint: pylint - # SPDX-License-Identifier: AGPL-3.0-or-later -"""This script generates languages.py from intersecting each engine's supported -languages. +"""This script generates :origin:`searx/languages.py` from intersecting each +engine's supported properites. The script checks all engines about a function:: + + def _fetch_engine_properties(resp, engine_properties): + ... + +and a variable named ``supported_properties_url``. 
The HTTP get response of +``supported_properties_url`` is passed to the ``_fetch_engine_properties`` +function including a template of ``engine_properties`` (see +:py:obj:`searx.engines.engine_properties_template`). Output files: :origin:`searx/data/engines_languages.json` and :origin:`searx/languages.py` (:origin:`CI Update data ... <.github/workflows/data-update.yml>`). +.. hint:: + + This implementation is backward compatible and supports the (depricated) + ``_fetch_supported_languages`` interface. + + On the long term the depricated implementations in the engines will be + replaced by ``_fetch_engine_properties``. + """ # pylint: disable=invalid-name @@ -21,32 +36,67 @@ from babel.languages import get_global from babel.core import parse_locale from searx import settings, searx_dir -from searx.engines import load_engines, engines -from searx.network import set_timeout_for_thread +from searx import network +from searx.engines import load_engines, engines, engine_properties_template +from searx.utils import gen_useragent # Output files. engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' languages_file = Path(searx_dir) / 'languages.py' -# Fetchs supported languages for each engine and writes json file with those. def fetch_supported_languages(): - set_timeout_for_thread(10.0) + """Fetchs supported languages for each engine and writes json file with those. + """ + network.set_timeout_for_thread(10.0) engines_languages = {} names = list(engines) names.sort() + # The headers has been moved here from commit 9b6ffed06: Some engines (at + # least bing and startpage) return a different result list of supported + # languages depending on the IP location where the HTTP request comes from. + # The IP based results (from bing) can be avoided by setting a + # 'Accept-Language' in the HTTP request. 
+ + headers = { + 'User-Agent': gen_useragent(), + 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language + } + for engine_name in names: - if hasattr(engines[engine_name], 'fetch_supported_languages'): - engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() - print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name)) - if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck - engines_languages[engine_name] = sorted(engines_languages[engine_name]) + engine = engines[engine_name] + fetch_languages = getattr(engine, '_fetch_supported_languages', None) + fetch_properties = getattr(engine, '_fetch_engine_properties', None) - print("fetched languages from %s engines" % len(engines_languages)) + if fetch_properties is not None: + resp = network.get(engine.supported_properties_url, headers=headers) + engine_properties = engine_properties_template() + fetch_properties(resp, engine_properties) + print("%s: %s languages" % (engine_name, len(engine_properties['languages']))) + print("%s: %s regions" % (engine_name, len(engine_properties['regions']))) + + elif fetch_languages is not None: + # print("%s: using deepricated _fetch_fetch_languages()" % engine_name) + resp = network.get(engine.supported_languages_url, headers=headers) + engine_properties = fetch_languages(resp) + if isinstance(engine_properties, list): + engine_properties.sort() + + print("%s: fetched language %s containing %s items" % ( + engine_name, + engine_properties.__class__.__name__, + len(engine_properties) + )) + else: + continue + + engines_languages[engine_name] = engine_properties + + print("fetched properties from %s engines" % len(engines_languages)) + print("write json file: %s" % (engines_languages_file)) - # write json file with open(engines_languages_file, 'w', encoding='utf-8') as f: json.dump(engines_languages, f, indent=2, sort_keys=True) @@ -122,17 +172,33 @@ def 
get_territory_name(lang_code): print("ERROR: %s --> %s" % (locale, exc)) return country_name - -# Join all language lists. def join_language_lists(engines_languages): + """Join all languages of the engines into one list. The returned language list + contains language codes (``zh``) and region codes (``zh-TW``). The codes can + be parsed by babel:: + + babel.Locale.parse(language_list[n]) + + """ + # pylint: disable=too-many-branches language_list = {} + for engine_name in engines_languages: - for lang_code in engines_languages[engine_name]: + engine = engines[engine_name] + engine_codes = languages = engines_languages[engine_name] + + if isinstance(languages, dict): + engine_codes = languages.get('regions', engine_codes) + + if isinstance(engine_codes, dict): + engine_codes = engine_codes.keys() + + for lang_code in engine_codes: # apply custom fixes if necessary - if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values(): + if lang_code in getattr(engine, 'language_aliases', {}).values(): lang_code = next( - lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias + lc for lc, alias in engine.language_aliases.items() if lang_code == alias ) locale = get_locale(lang_code) @@ -197,6 +263,7 @@ def filter_language_list(all_languages): engine_name for engine_name in engines.keys() if 'general' in engines[engine_name].categories + and hasattr(engines[engine_name], 'supported_languages') and engines[engine_name].supported_languages and not engines[engine_name].disabled ]