[mod] engines_languages.json: add new type EngineProperties

This patch adds the boilerplate code needed to fetch properties from engines.
In the past we only fetched *languages*, but some engines need *regions* to
parameterize the engine request.

To fit into our *fetch language* procedures, the boilerplate is implemented in
`searxng_extra/update/update_languages.py` and the *engine_properties* are
stored alongside the languages in `searx/data/engines_languages.json`.

This implementation is backward compatible with the `_fetch_supported_languages()`
infrastructure we have.  Once all `_fetch_supported_languages()` implementations
have been moved to `_fetch_engine_properties()` implementations, we can rename
the files and scripts.

The new type `EngineProperties` is a dictionary with the keys `languages` and
`regions`.  The values are dictionaries that map SearXNG's language & region
codes to the option values the engine uses::

    engine_properties = {
        'type' : 'engine_properties',  # <-- !!!
        'regions': {
            # 'ca-ES' : <engine's region name>
        },
        'languages': {
            # 'ca' : <engine's language name>
        },
    }

Similar to the `supported_languages`, in the engine the properties are available
under the name `supported_properties`.

Initially we start with languages & regions, but in a wider sense the type is
named *engine properties*.  Engines can store whatever options they need, and
maybe in the future there will be a need to fetch additional or completely
different properties.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-04-08 13:24:17 +02:00
parent 13ef9cc125
commit 3b10d63e2f
9 changed files with 171 additions and 70 deletions

View file

@ -42,7 +42,7 @@ Explanation of the :ref:`general engine configuration` shown in the table
- Timeout - Timeout
- Weight - Weight
- Paging - Paging
- Language - Language, Region
- Safe search - Safe search
- Time range - Time range

3
manage
View file

@ -57,7 +57,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\
I,C,R,\ I,C,R,\
W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\ W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\
E1136" E1136"
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_languages,language_aliases,logger,categories" PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_properties,supported_languages,language_aliases,logger,categories"
PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc" PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc"
help() { help() {
@ -698,6 +698,7 @@ test.pyright() {
| grep -v '/engines/.*.py.* - warning: "logger" is not defined'\ | grep -v '/engines/.*.py.* - warning: "logger" is not defined'\
| grep -v '/plugins/.*.py.* - error: "logger" is not defined'\ | grep -v '/plugins/.*.py.* - error: "logger" is not defined'\
| grep -v '/engines/.*.py.* - warning: "supported_languages" is not defined' \ | grep -v '/engines/.*.py.* - warning: "supported_languages" is not defined' \
| grep -v '/engines/.*.py.* - warning: "supported_properties" is not defined' \
| grep -v '/engines/.*.py.* - warning: "language_aliases" is not defined' \ | grep -v '/engines/.*.py.* - warning: "language_aliases" is not defined' \
| grep -v '/engines/.*.py.* - warning: "categories" is not defined' | grep -v '/engines/.*.py.* - warning: "categories" is not defined'
dump_return $? dump_return $?

View file

@ -15,7 +15,7 @@ from searx.data import ENGINES_LANGUAGES
from searx.network import get as http_get from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException from searx.exceptions import SearxEngineResponseException
# a fetch_supported_languages() for XPath engines isn't available right now # a _fetch_engine_properties() for XPath engines isn't available right now
# _brave = ENGINES_LANGUAGES['brave'].keys() # _brave = ENGINES_LANGUAGES['brave'].keys()

View file

@ -13,14 +13,14 @@ usage::
import sys import sys
import copy import copy
import dataclasses
from typing import Dict, List, Optional from typing import Dict, List, Optional
from os.path import realpath, dirname from os.path import realpath, dirname
from babel.localedata import locale_identifiers from babel.localedata import locale_identifiers
from searx import logger, settings from searx import logger, settings
from searx.data import ENGINES_LANGUAGES from searx.data import ENGINES_LANGUAGES
from searx.network import get from searx.utils import load_module, match_language
from searx.utils import load_module, match_language, gen_useragent
logger = logger.getChild('engines') logger = logger.getChild('engines')
@ -36,8 +36,7 @@ ENGINE_DEFAULT_ARGS = {
"timeout": settings["outgoing"]["request_timeout"], "timeout": settings["outgoing"]["request_timeout"],
"shortcut": "-", "shortcut": "-",
"categories": ["general"], "categories": ["general"],
"supported_languages": [], "language_support": False,
"language_aliases": {},
"paging": False, "paging": False,
"safesearch": False, "safesearch": False,
"time_range_support": False, "time_range_support": False,
@ -52,6 +51,35 @@ ENGINE_DEFAULT_ARGS = {
OTHER_CATEGORY = 'other' OTHER_CATEGORY = 'other'
@dataclasses.dataclass
class EngineProperties(dict):
    """Container for the properties fetched from a single engine.

    The class is intended to be instantiated once for each engine.  The two
    fields map SearXNG's region / language codes to the option values the
    engine itself uses.

    NOTE(review): the class derives from :py:class:`dict`, but the field
    values are stored as dataclass attributes, not as dictionary items.  Use
    :py:meth:`asdict` to get the serializable dictionary representation.
    """

    regions: Dict[str, str] = dataclasses.field(default_factory=dict)
    """Maps a SearXNG region code to the engine's region name::

        {
            'fr-BE' : <engine's region name>
        },
    """

    languages: Dict[str, str] = dataclasses.field(default_factory=dict)
    """Maps a SearXNG language code to the engine's language name::

        {
            'ca' : <engine's language name>
        },
    """

    def asdict(self):
        """Return the properties as a plain dictionary.

        The result carries the marker ``'type': 'engine_properties'`` next to
        the ``regions`` and ``languages`` mappings.  The mappings are returned
        by reference (no copy is made).
        """
        return {
            'type': 'engine_properties',
            'regions': self.regions,
            'languages': self.languages,
        }
class Engine: # pylint: disable=too-few-public-methods class Engine: # pylint: disable=too-few-public-methods
"""This class is currently never initialized and only used for type hinting.""" """This class is currently never initialized and only used for type hinting."""
@ -59,15 +87,16 @@ class Engine: # pylint: disable=too-few-public-methods
engine: str engine: str
shortcut: str shortcut: str
categories: List[str] categories: List[str]
supported_languages: List[str]
about: dict about: dict
inactive: bool inactive: bool
disabled: bool disabled: bool
# language support, either by selecting a region or by selecting a language
language_support: bool language_support: bool
paging: bool paging: bool
safesearch: bool safesearch: bool
time_range_support: bool time_range_support: bool
timeout: float timeout: float
properties: EngineProperties
# Defaults for the namespace of an engine module, see :py:func:`load_engine` # Defaults for the namespace of an engine module, see :py:func:`load_engine`
@ -184,8 +213,11 @@ def update_engine_attributes(engine: Engine, engine_data):
def set_language_attributes(engine: Engine): def set_language_attributes(engine: Engine):
# assign supported languages from json file # assign supported languages from json file
supported_properties = None
if engine.name in ENGINES_LANGUAGES: if engine.name in ENGINES_LANGUAGES:
engine.supported_languages = ENGINES_LANGUAGES[engine.name] supported_properties = ENGINES_LANGUAGES[engine.name]
elif engine.engine in ENGINES_LANGUAGES: elif engine.engine in ENGINES_LANGUAGES:
# The key of the dictionary ENGINES_LANGUAGES is the *engine name* # The key of the dictionary ENGINES_LANGUAGES is the *engine name*
@ -193,47 +225,48 @@ def set_language_attributes(engine: Engine):
# settings.yml to use the same origin engine (python module) these # settings.yml to use the same origin engine (python module) these
# additional engines can use the languages from the origin engine. # additional engines can use the languages from the origin engine.
# For this use the configured ``engine: ...`` from settings.yml # For this use the configured ``engine: ...`` from settings.yml
engine.supported_languages = ENGINES_LANGUAGES[engine.engine] supported_properties = ENGINES_LANGUAGES[engine.engine]
if hasattr(engine, 'language'): if not supported_properties:
# For an engine, when there is `language: ...` in the YAML settings, the return
# engine supports only one language, in this case
# engine.supported_languages should contains this value defined in
# settings.yml
if engine.language not in engine.supported_languages:
raise ValueError(
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
)
if isinstance(engine.supported_languages, dict): if isinstance(supported_properties, dict) and supported_properties.get('type') == 'engine_properties':
engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} engine.supported_properties = supported_properties
else: engine.language_support = len(supported_properties['languages']) or len(supported_properties['regions'])
engine.supported_languages = [engine.language]
# find custom aliases for non standard language codes else:
for engine_lang in engine.supported_languages: # depricated: does not work for engines that do support languages
iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) # based on a region.
if ( engine.supported_languages = supported_properties
iso_lang engine.language_support = len(engine.supported_languages) > 0
and iso_lang != engine_lang
and not engine_lang.startswith(iso_lang)
and iso_lang not in engine.supported_languages
):
engine.language_aliases[iso_lang] = engine_lang
# language_support if hasattr(engine, 'language'):
engine.language_support = len(engine.supported_languages) > 0 # For an engine, when there is `language: ...` in the YAML settings, the
# engine supports only one language, in this case
# engine.supported_languages should contains this value defined in
# settings.yml
if engine.language not in engine.supported_languages:
raise ValueError(
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
)
# assign language fetching method if auxiliary method exists if isinstance(engine.supported_languages, dict):
if hasattr(engine, '_fetch_supported_languages'): engine.supported_languages = {engine.language: engine.supported_languages[engine.language]}
headers = { else:
'User-Agent': gen_useragent(), engine.supported_languages = [engine.language]
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
} if not hasattr(engine, 'language_aliases'):
engine.fetch_supported_languages = ( engine.language_aliases = {}
# pylint: disable=protected-access # find custom aliases for non standard language codes
lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers)) for engine_lang in engine.supported_languages:
) iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
if (
iso_lang
and iso_lang != engine_lang
and not engine_lang.startswith(iso_lang)
and iso_lang not in engine.supported_languages
):
engine.language_aliases[iso_lang] = engine_lang
def update_attributes_for_tor(engine: Engine) -> bool: def update_attributes_for_tor(engine: Engine) -> bool:

View file

@ -48,7 +48,6 @@ about = {
# engine dependent config # engine dependent config
categories = ['science'] categories = ['science']
paging = True paging = True
language_support = True
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = False safesearch = False

View file

@ -56,7 +56,6 @@ about = {
categories = ['videos', 'web'] categories = ['videos', 'web']
paging = False paging = False
language_support = True
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True

View file

@ -32,7 +32,6 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
language_support = False
time_range_support = False time_range_support = False
safesearch = False safesearch = False
paging = True paging = True

View file

@ -20,7 +20,6 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos', 'music'] categories = ['videos', 'music']
paging = True paging = True
language_support = False
time_range_support = True time_range_support = True
# search-url # search-url

View file

@ -1,14 +1,28 @@
#!/usr/bin/env python #!/usr/bin/env python
# lint: pylint # lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
"""This script generates languages.py from intersecting each engine's supported """This script generates :origin:`searx/languages.py` from intersecting each
languages. engine's supported properties. The script checks all engines for a function::
def _fetch_engine_properties(resp, engine_properties):
...
and a variable named ``supported_properties_url``. The HTTP GET response of
``supported_properties_url`` is passed to the ``_fetch_engine_properties``
function together with an instance of :py:obj:`searx.engines.EngineProperties`.
Output files: :origin:`searx/data/engines_languages.json` and Output files: :origin:`searx/data/engines_languages.json` and
:origin:`searx/languages.py` (:origin:`CI Update data ... :origin:`searx/languages.py` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`). <.github/workflows/data-update.yml>`).
.. hint::
This implementation is backward compatible and supports the (deprecated)
``_fetch_supported_languages`` interface.
In the long term the deprecated implementations in the engines will be
replaced by ``_fetch_engine_properties``.
""" """
# pylint: disable=invalid-name # pylint: disable=invalid-name
@ -21,32 +35,67 @@ from babel.languages import get_global
from babel.core import parse_locale from babel.core import parse_locale
from searx import settings, searx_dir from searx import settings, searx_dir
from searx.engines import load_engines, engines from searx import network
from searx.network import set_timeout_for_thread from searx.engines import load_engines, engines, EngineProperties
from searx.utils import gen_useragent
# Output files. # Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py' languages_file = Path(searx_dir) / 'languages.py'
# Fetchs supported languages for each engine and writes json file with those.
def fetch_supported_languages(): def fetch_supported_languages():
set_timeout_for_thread(10.0) """Fetchs supported languages for each engine and writes json file with those."""
network.set_timeout_for_thread(10.0)
engines_languages = {} engines_languages = {}
names = list(engines) names = list(engines)
names.sort() names.sort()
# The headers has been moved here from commit 9b6ffed06: Some engines (at
# least bing and startpage) return a different result list of supported
# languages depending on the IP location where the HTTP request comes from.
# The IP based results (from bing) can be avoided by setting a
# 'Accept-Language' in the HTTP request.
headers = {
'User-Agent': gen_useragent(),
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
}
for engine_name in names: for engine_name in names:
if hasattr(engines[engine_name], 'fetch_supported_languages'): engine = engines[engine_name]
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() fetch_languages = getattr(engine, '_fetch_supported_languages', None)
print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name)) fetch_properties = getattr(engine, '_fetch_engine_properties', None)
if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck
engines_languages[engine_name] = sorted(engines_languages[engine_name])
print("fetched languages from %s engines" % len(engines_languages)) if fetch_properties is not None:
resp = network.get(engine.supported_properties_url, headers=headers)
engine_properties = EngineProperties()
fetch_properties(resp, engine_properties)
print("%s: %s languages" % (engine_name, len(engine_properties.languages)))
print("%s: %s regions" % (engine_name, len(engine_properties.regions)))
engine_properties = engine_properties.asdict()
elif fetch_languages is not None:
# print("%s: using deepricated _fetch_fetch_languages()" % engine_name)
resp = network.get(engine.supported_languages_url, headers=headers)
engine_properties = fetch_languages(resp)
if isinstance(engine_properties, list):
engine_properties.sort()
print(
"%s: fetched language %s containing %s items"
% (engine_name, engine_properties.__class__.__name__, len(engine_properties))
)
else:
continue
engines_languages[engine_name] = engine_properties
print("fetched properties from %s engines" % len(engines_languages))
print("write json file: %s" % (engines_languages_file))
# write json file
with open(engines_languages_file, 'w', encoding='utf-8') as f: with open(engines_languages_file, 'w', encoding='utf-8') as f:
json.dump(engines_languages, f, indent=2, sort_keys=True) json.dump(engines_languages, f, indent=2, sort_keys=True)
@ -124,17 +173,38 @@ def get_territory_name(lang_code):
return country_name return country_name
# Join all language lists.
def join_language_lists(engines_languages): def join_language_lists(engines_languages):
"""Join all languages of the engines into one list. The returned language list
contains language codes (``zh``) and region codes (``zh-TW``). The codes can
be parsed by babel::
babel.Locale.parse(language_list[n])
"""
# pylint: disable=too-many-branches
language_list = {} language_list = {}
for engine_name in engines_languages: for engine_name in engines_languages:
for lang_code in engines_languages[engine_name]: engine = engines[engine_name]
engine_properties = engines_languages[engine_name]
if isinstance(engine_properties, dict) and engine_properties.get('type') == 'engine_properties':
# items of type 'engine_properties' do have regions & languages, the
# list of engine_codes should contain both.
engine_codes = engine_properties.get('regions', {})
engine_codes.update(engine_properties.get('languages', {}))
engine_codes = engine_codes.keys()
else:
engine_codes = engine_properties
engine_properties = {}
if isinstance(engine_codes, dict):
engine_codes = engine_codes.keys()
for lang_code in engine_codes:
# apply custom fixes if necessary # apply custom fixes if necessary
if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values(): if lang_code in getattr(engine, 'language_aliases', {}).values():
lang_code = next( lang_code = next(lc for lc, alias in engine.language_aliases.items() if lang_code == alias)
lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias
)
locale = get_locale(lang_code) locale = get_locale(lang_code)
@ -198,6 +268,7 @@ def filter_language_list(all_languages):
engine_name engine_name
for engine_name in engines.keys() for engine_name in engines.keys()
if 'general' in engines[engine_name].categories if 'general' in engines[engine_name].categories
and hasattr(engines[engine_name], 'supported_languages')
and engines[engine_name].supported_languages and engines[engine_name].supported_languages
and not engines[engine_name].disabled and not engines[engine_name].disabled
] ]