Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-08-24 18:57:36 +02:00
parent 744d96a16c
commit a5580c785d
13 changed files with 4555 additions and 4428 deletions

View file

@ -17,7 +17,7 @@ jobs:
- update_currencies.py - update_currencies.py
- update_external_bangs.py - update_external_bangs.py
- update_firefox_version.py - update_firefox_version.py
- update_languages.py - update_engine_data.py
- update_wikidata_units.py - update_wikidata_units.py
- update_engine_descriptions.py - update_engine_descriptions.py
steps: steps:

4
manage
View file

@ -57,7 +57,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\
I,C,R,\ I,C,R,\
W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\ W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\
E1136" E1136"
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_properties,supported_languages,language_aliases,logger,categories" PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="engine_data,supported_languages,language_aliases,logger,categories"
PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc" PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc"
help() { help() {
@ -698,7 +698,7 @@ test.pyright() {
| grep -v '/engines/.*.py.* - warning: "logger" is not defined'\ | grep -v '/engines/.*.py.* - warning: "logger" is not defined'\
| grep -v '/plugins/.*.py.* - error: "logger" is not defined'\ | grep -v '/plugins/.*.py.* - error: "logger" is not defined'\
| grep -v '/engines/.*.py.* - warning: "supported_languages" is not defined' \ | grep -v '/engines/.*.py.* - warning: "supported_languages" is not defined' \
| grep -v '/engines/.*.py.* - warning: "supported_properties" is not defined' \ | grep -v '/engines/.*.py.* - warning: "engine_data" is not defined' \
| grep -v '/engines/.*.py.* - warning: "language_aliases" is not defined' \ | grep -v '/engines/.*.py.* - warning: "language_aliases" is not defined' \
| grep -v '/engines/.*.py.* - warning: "categories" is not defined' | grep -v '/engines/.*.py.* - warning: "categories" is not defined'
dump_return $? dump_return $?

View file

@ -15,9 +15,6 @@ from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException from searx.exceptions import SearxEngineResponseException
from searx.engines import engines from searx.engines import engines
# a _fetch_supported_properites() for XPath engines isn't available right now
# _brave = ENGINES_LANGUAGES['brave'].keys()
def get(*args, **kwargs): def get(*args, **kwargs):
if 'timeout' not in kwargs: if 'timeout' not in kwargs:

View file

@ -7,7 +7,7 @@
""" """
__all__ = [ __all__ = [
'ENGINES_LANGUAGES', 'ENGINES_DATAS',
'CURRENCIES', 'CURRENCIES',
'USER_AGENTS', 'USER_AGENTS',
'EXTERNAL_URLS', 'EXTERNAL_URLS',
@ -42,7 +42,7 @@ def ahmia_blacklist_loader():
return f.read().split() return f.read().split()
ENGINES_LANGUAGES = _load('engines_languages.json') ENGINES_DATAS = _load('engines_datas.json')
CURRENCIES = _load('currencies.json') CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json') USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json') EXTERNAL_URLS = _load('external_urls.json')

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -12,19 +12,22 @@ usage::
""" """
import sys import sys
import json
import copy import copy
import pathlib
import dataclasses import dataclasses
from typing import Dict, List, Optional from typing import Dict, List, Optional
from os.path import realpath, dirname from os.path import realpath, dirname
from babel.localedata import locale_identifiers from babel.localedata import locale_identifiers
from searx import logger, settings from searx import logger, settings
from searx.data import ENGINES_LANGUAGES
from searx.utils import load_module, match_language from searx.utils import load_module, match_language
from searx.data import data_dir, ENGINES_DATAS
logger = logger.getChild('engines') logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
ENGINE_DIR = pathlib.Path(__file__).parent
BABEL_LANGS = [ BABEL_LANGS = [
lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0] lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers()) for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())
@ -52,32 +55,58 @@ OTHER_CATEGORY = 'other'
@dataclasses.dataclass @dataclasses.dataclass
class EngineProperties(dict): class EngineData:
""" """The class is intended to be instantiated for each engine."""
The class is intended to be instantiated for each engine.
"""
regions: Dict[str, str] = dataclasses.field(default_factory=dict) regions: Dict[str, str] = dataclasses.field(default_factory=dict)
""" """
{ .. code:: python
'fr-BE' : <engine's region name>
}, {
'fr-BE' : <engine's region name>,
}
""" """
languages: Dict[str, str] = dataclasses.field(default_factory=dict) languages: Dict[str, str] = dataclasses.field(default_factory=dict)
""" """
{ .. code:: python
'ca' : <engine's language name>
}, {
'ca' : <engine's language name>,
}
""" """
def asdict(self): data_type: str = 'engine_data'
return { """Data type, default is 'engine_data' for vintage use 'supported_languages'"""
'type': 'engine_properties',
'regions': self.regions,
'languages': self.languages, class EngineDataEncoder(json.JSONEncoder):
} """Encodes :class:`EngineData` to a serializable object, see
:class:`json.JSONEncoder`."""
def default(self, o):
"""Return dictionary of a :class:`EngineData` object."""
if isinstance(o, EngineData):
return o.__dict__
return super().default(o)
class EngineDataDict(dict):
"""A python dictionary to map :class:`EngineData` by engine name."""
ENGINE_DATA_FILE = (data_dir / 'engines_datas.json').resolve()
def save_data(self):
with open(self.ENGINE_DATA_FILE, 'w', encoding='utf-8') as f:
json.dump(self, f, indent=2, sort_keys=True, cls=EngineDataEncoder)
@classmethod
def from_data(cls):
"""Instantiate :class:`EngineDataDict` object from :py:obj:`ENGINES_DATAS`"""
obj = EngineDataDict()
for k, v in ENGINES_DATAS.items():
obj[k] = EngineData(**v)
return obj
class Engine: # pylint: disable=too-few-public-methods class Engine: # pylint: disable=too-few-public-methods
@ -96,7 +125,7 @@ class Engine: # pylint: disable=too-few-public-methods
safesearch: bool safesearch: bool
time_range_support: bool time_range_support: bool
timeout: float timeout: float
properties: EngineProperties engine_data: EngineData
# Defaults for the namespace of an engine module, see :py:func:`load_engine` # Defaults for the namespace of an engine module, see :py:func:`load_engine`
@ -110,18 +139,19 @@ engine_shortcuts = {}
engine_shortcuts[engine.shortcut] = engine.name engine_shortcuts[engine.shortcut] = engine.name
:meta hide-value:
""" """
def load_engine(engine_data: dict) -> Optional[Engine]: def load_engine(engine_setting: dict) -> Optional[Engine]:
"""Load engine from ``engine_data``. """Load engine from ``engine_setting``.
:param dict engine_data: Attributes from YAML ``settings:engines/<engine>`` :param dict engine_setting: Attributes from YAML ``settings:engines/<engine>``
:return: initialized namespace of the ``<engine>``. :return: initialized namespace of the ``<engine>``.
1. create a namespace and load module of the ``<engine>`` 1. create a namespace and load module of the ``<engine>``
2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS` 2. update namespace with the defaults from :py:obj:`ENGINE_DEFAULT_ARGS`
3. update namespace with values from ``engine_data`` 3. update namespace with values from ``engine_setting``
If engine *is active*, return namespace of the engine, otherwise return If engine *is active*, return namespace of the engine, otherwise return
``None``. ``None``.
@ -135,7 +165,7 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
""" """
engine_name = engine_data['name'] engine_name = engine_setting['name']
if '_' in engine_name: if '_' in engine_name:
logger.error('Engine name contains underscore: "{}"'.format(engine_name)) logger.error('Engine name contains underscore: "{}"'.format(engine_name))
return None return None
@ -143,10 +173,10 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
if engine_name.lower() != engine_name: if engine_name.lower() != engine_name:
logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name)) logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
engine_name = engine_name.lower() engine_name = engine_name.lower()
engine_data['name'] = engine_name engine_setting['name'] = engine_name
# load_module # load_module
engine_module = engine_data['engine'] engine_module = engine_setting['engine']
try: try:
engine = load_module(engine_module + '.py', ENGINE_DIR) engine = load_module(engine_module + '.py', ENGINE_DIR)
except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError): except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
@ -156,7 +186,7 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
logger.exception('Cannot load engine "{}"'.format(engine_module)) logger.exception('Cannot load engine "{}"'.format(engine_module))
return None return None
update_engine_attributes(engine, engine_data) update_engine_attributes(engine, engine_setting)
set_language_attributes(engine) set_language_attributes(engine)
update_attributes_for_tor(engine) update_attributes_for_tor(engine)
@ -193,15 +223,15 @@ def set_loggers(engine, engine_name):
module.logger = logger.getChild(module_engine_name) module.logger = logger.getChild(module_engine_name)
def update_engine_attributes(engine: Engine, engine_data): def update_engine_attributes(engine: Engine, engine_setting):
# set engine attributes from engine_data # set engine attributes from engine_setting
for param_name, param_value in engine_data.items(): for param_name, param_value in engine_setting.items():
if param_name == 'categories': if param_name == 'categories':
if isinstance(param_value, str): if isinstance(param_value, str):
param_value = list(map(str.strip, param_value.split(','))) param_value = list(map(str.strip, param_value.split(',')))
engine.categories = param_value engine.categories = param_value
elif hasattr(engine, 'about') and param_name == 'about': elif hasattr(engine, 'about') and param_name == 'about':
engine.about = {**engine.about, **engine_data['about']} engine.about = {**engine.about, **engine_setting['about']}
else: else:
setattr(engine, param_name, param_value) setattr(engine, param_name, param_value)
@ -211,33 +241,36 @@ def update_engine_attributes(engine: Engine, engine_data):
setattr(engine, arg_name, copy.deepcopy(arg_value)) setattr(engine, arg_name, copy.deepcopy(arg_value))
def set_language_attributes(engine: Engine): def set_language_attributes(engine: Engine): # pylint: disable=too-many-branches
# assign supported languages from json file # assign supported languages from json file
supported_properties = None engine_data_dict = EngineDataDict.from_data()
engine_data = None
if engine.name in ENGINES_LANGUAGES: if engine.name in engine_data_dict:
supported_properties = ENGINES_LANGUAGES[engine.name] engine_data = engine_data_dict[engine.name]
elif engine.engine in ENGINES_LANGUAGES: elif engine.engine in engine_data_dict:
# The key of the dictionary ENGINES_LANGUAGES is the *engine name* # The key of the dictionary engine_data_dict is the *engine name*
# configured in settings.yml. When multiple engines are configured in # configured in settings.yml. When multiple engines are configured in
# settings.yml to use the same origin engine (python module) these # settings.yml to use the same origin engine (python module) these
# additional engines can use the languages from the origin engine. # additional engines can use the languages from the origin engine.
# For this use the configured ``engine: ...`` from settings.yml # For this use the configured ``engine: ...`` from settings.yml
supported_properties = ENGINES_LANGUAGES[engine.engine] engine_data = engine_data_dict[engine.engine]
if not supported_properties: if not engine_data:
return return
if isinstance(supported_properties, dict) and supported_properties.get('type') == 'engine_properties': if engine_data.data_type == 'engine_data':
engine.supported_properties = supported_properties engine.engine_data = engine_data
engine.language_support = len(supported_properties['languages']) or len(supported_properties['regions']) engine.language_support = len(engine_data.languages) or len(engine_data.regions)
elif engine_data.data_type == 'supported_languages':
# vintage
else:
# deprecated: does not work for engines that do support languages # deprecated: does not work for engines that do support languages
# based on a region. # based on a region.
engine.supported_languages = supported_properties engine.supported_languages = engine_data.languages
engine.language_support = len(engine.supported_languages) > 0 engine.language_support = len(engine.supported_languages) > 0
if hasattr(engine, 'language'): if hasattr(engine, 'language'):
@ -268,6 +301,9 @@ def set_language_attributes(engine: Engine):
): ):
engine.language_aliases[iso_lang] = engine_lang engine.language_aliases[iso_lang] = engine_lang
else:
raise TypeError('unknown type of engine data: %s' % engine_data.data_type)
def update_attributes_for_tor(engine: Engine) -> bool: def update_attributes_for_tor(engine: Engine) -> bool:
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):

View file

@ -136,7 +136,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
:param dict params: request parameters of the engine :param dict params: request parameters of the engine
:param list lang_list: list of supported languages of the engine :param list lang_list: list of supported languages of the engine
:py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` :py:obj:`ENGINES_DATAS[engine-name].languages <searx.data.ENGINES_DATAS>`
:param dict custom_aliases: custom aliases for non standard language codes :param dict custom_aliases: custom aliases for non standard language codes
(used when calling :py:func:`searx.utils.match_language`) (used when calling :py:func:`searx.utils.match_language`)

View file

@ -49,7 +49,7 @@ about = {
# engine dependent config # engine dependent config
categories = [] categories = []
paging = True paging = True
supported_properties_url = about['website'] engine_data_url = about['website']
qwant_categ = None # web|news|images|videos qwant_categ = None # web|news|images|videos
safesearch = True safesearch = True
@ -95,7 +95,7 @@ def request(query, params):
) )
# add qwant's locale # add qwant's locale
q_locale = get_engine_locale(params['language'], supported_properties['regions'], default='en_US') q_locale = get_engine_locale(params['language'], engine_data.regions, default='en_US')
params['url'] += '&locale=' + q_locale params['url'] += '&locale=' + q_locale
# add safesearch option # add safesearch option
@ -243,7 +243,7 @@ def response(resp):
return results return results
def _fetch_engine_properties(resp, engine_properties): def _fetch_engine_data(resp, engine_data):
text = resp.text text = resp.text
text = text[text.find('INITIAL_PROPS') :] text = text[text.find('INITIAL_PROPS') :]
@ -270,7 +270,7 @@ def _fetch_engine_properties(resp, engine_properties):
print("ERROR: can't determine babel locale of quant's locale %s" % q_locale) print("ERROR: can't determine babel locale of quant's locale %s" % q_locale)
continue continue
# note: engine_properties.regions (dict) # note: engine_data.regions (dict)
# #
# dict's key is a string build up from a babel.Locale object / the # dict's key is a string build up from a babel.Locale object / the
# notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and # notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and
@ -278,6 +278,6 @@ def _fetch_engine_properties(resp, engine_properties):
# the engine. # the engine.
searxng_locale = locale.language + '-' + locale.territory # --> params['language'] searxng_locale = locale.language + '-' + locale.territory # --> params['language']
engine_properties.regions[searxng_locale] = q_locale engine_data.regions[searxng_locale] = q_locale
return engine_properties return engine_data

View file

@ -48,7 +48,7 @@ filter_mapping = {0: '0', 1: '1', 2: '1'}
time_range_support = True time_range_support = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
supported_properties_url = 'https://www.startpage.com/do/settings' engine_data_url = 'https://www.startpage.com/do/settings'
# search-url # search-url
base_url = 'https://www.startpage.com/' base_url = 'https://www.startpage.com/'
@ -249,7 +249,7 @@ def response(resp):
return results return results
def _fetch_engine_properties(resp, engine_properties): def _fetch_engine_data(resp, engine_data):
# startpage's language & region selectors are a mess. # startpage's language & region selectors are a mess.
# #
@ -341,7 +341,7 @@ def _fetch_engine_properties(resp, engine_properties):
region_tag = locale.language + '-' + locale.territory region_tag = locale.language + '-' + locale.territory
# print("SearXNG locale tag: %s --> Engine tag: %s" % (region_tag, engine_region_tag)) # print("SearXNG locale tag: %s --> Engine tag: %s" % (region_tag, engine_region_tag))
engine_properties.regions[region_tag] = engine_region_tag engine_data.regions[region_tag] = engine_region_tag
# languages # languages
@ -385,6 +385,6 @@ def _fetch_engine_properties(resp, engine_properties):
lang_code = catalog_engine2code[name] lang_code = catalog_engine2code[name]
# print("SearXNG language tag: %s --> Engine tag: %s" % (lang_code, engine_lang)) # print("SearXNG language tag: %s --> Engine tag: %s" % (lang_code, engine_lang))
engine_properties.languages[lang_code] = engine_lang engine_data.languages[lang_code] = engine_lang
return engine_properties return engine_data

View file

@ -1,6 +1,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# list of language codes # list of language codes
# this file is generated automatically by utils/fetch_languages.py # this file is generated automatically by:
#
# ./manage pyenv.cmd searxng_extra/update/update_languages.py
language_codes = ( language_codes = (
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'), ('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'), ('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),

View file

@ -19,13 +19,14 @@ from searx import logger
logger = logger.getChild('locales') logger = logger.getChild('locales')
LANGUAGES_FILE = pathlib.Path(__file__).parent / 'languages.py'
# save before monkey patching flask_babel.get_translations
_flask_babel_get_translations = flask_babel.get_translations _flask_babel_get_translations = flask_babel.get_translations
LOCALE_NAMES = {} LOCALE_NAMES = {}
"""Mapping of locales and their description. Locales e.g. 'fr' or 'pt-BR' (see """Mapping of locales and their description. Locales e.g. ``fr`` or ``pt-BR``
:py:obj:`locales_initialize`).""" (see :py:obj:`locales_initialize`)."""
RTL_LOCALES: Set[str] = set() RTL_LOCALES: Set[str] = set()
"""List of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (see """List of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (see
@ -161,6 +162,8 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
corresponding *engine locales*: corresponding *engine locales*:
.. code:: python
<engine>: { <engine>: {
# SearXNG string : engine-string # SearXNG string : engine-string
'ca-ES' : 'ca_ES', 'ca-ES' : 'ca_ES',

View file

@ -1,53 +1,56 @@
#!/usr/bin/env python #!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
"""This script generates :origin:`searx/languages.py` from intersecting each # lint: pylint
engine's supported properties. The script checks all engines about a function:: """This script fetches engine data from engines ``engine_data_url`` and updates:
def _fetch_engine_properties(resp, engine_properties): - :py:obj:`write_languages_file` updates :origin:`searx/languages.py`
... - :py:obj:`fetch_engine_data` updates :origin:`searx/data/engines_datas.json`
and a variable named ``supported_properties_url``. The HTTP get response of
``supported_properties_url`` is passed to the ``_fetch_engine_properties``
function including a instance of :py:obj:`searx.engines.EngineProperties`.
Output files: :origin:`searx/data/engines_languages.json` and
:origin:`searx/languages.py` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
.. hint::
This implementation is backward compatible and supports the (deprecated)
``_fetch_supported_languages`` interface.
On the long term the deprecated implementations in the engines will be
replaced by ``_fetch_engine_properties``.
This script is triggered by CI in job :origin:`updateData
<.github/workflows/data-update.yml>`.
""" """
# pylint: disable=invalid-name # pylint: disable=invalid-name
from unicodedata import lookup from unicodedata import lookup
import json
from pathlib import Path
from pprint import pformat from pprint import pformat
from babel import Locale, UnknownLocaleError from babel import Locale, UnknownLocaleError
from babel.languages import get_global from babel.languages import get_global
from babel.core import parse_locale from babel.core import parse_locale
from searx import settings, searx_dir from searx import settings
from searx import network from searx import network
from searx.engines import load_engines, engines, EngineProperties from searx.engines import (
load_engines,
engines,
EngineData,
EngineDataDict,
)
from searx.locales import LANGUAGES_FILE
from searx.utils import gen_useragent from searx.utils import gen_useragent
# Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py'
def fetch_engine_data():
"""Fetch :class:`EngineData` for each engine and persist JSON in file.
The script checks all engines about a function::
def _fetch_engine_data(resp, engine_data):
...
and a variable named ``engine_data_url``. The HTTP GET response of
``engine_data_url`` is passed to the ``_fetch_engine_data`` function including a
instance of :py:obj:`searx.engines.EngineData`.
.. hint::
This implementation is backward compatible and supports the (deprecated)
``_fetch_supported_languages`` interface.
On the long term the deprecated implementations in the engines will be
replaced by ``_fetch_engine_data``."""
def fetch_supported_languages():
"""Fetchs supported languages for each engine and writes json file with those."""
network.set_timeout_for_thread(10.0) network.set_timeout_for_thread(10.0)
engines_languages = {} engine_data_dict = EngineDataDict()
names = list(engines) names = list(engines)
names.sort() names.sort()
@ -64,42 +67,46 @@ def fetch_supported_languages():
for engine_name in names: for engine_name in names:
engine = engines[engine_name] engine = engines[engine_name]
fetch_data = getattr(engine, '_fetch_engine_data', None)
# deprecated: _fetch_supported_languages
fetch_languages = getattr(engine, '_fetch_supported_languages', None) fetch_languages = getattr(engine, '_fetch_supported_languages', None)
fetch_properties = getattr(engine, '_fetch_engine_properties', None)
if fetch_properties is not None: if fetch_data is not None:
resp = network.get(engine.supported_properties_url, headers=headers) # data_type = 'engine_data'
engine_properties = EngineProperties()
fetch_properties(resp, engine_properties) engine_data = EngineData()
print("%s: %s languages" % (engine_name, len(engine_properties.languages))) resp = network.get(engine.engine_data_url, headers=headers)
print("%s: %s regions" % (engine_name, len(engine_properties.regions))) fetch_data(resp, engine_data)
engine_properties = engine_properties.asdict() print("%s: %s languages" % (engine_name, len(engine_data.languages)))
print("%s: %s regions" % (engine_name, len(engine_data.regions)))
elif fetch_languages is not None: elif fetch_languages is not None:
# print("%s: using deepricated _fetch_fetch_languages()" % engine_name) # depricated: data_type = 'supported_languages'
resp = network.get(engine.supported_languages_url, headers=headers) print("%s: using deepricated _fetch_supported_languages" % engine_name)
engine_properties = fetch_languages(resp)
if isinstance(engine_properties, list):
engine_properties.sort()
resp = network.get(engine.supported_languages_url, headers=headers)
engine_languages = fetch_languages(resp)
if isinstance(engine_languages, list):
engine_languages.sort()
print( print(
"%s: fetched language %s containing %s items" "%s: fetched language %s containing %s items"
% (engine_name, engine_properties.__class__.__name__, len(engine_properties)) % (engine_name, engine_languages.__class__.__name__, len(engine_languages))
) )
engine_data = EngineData(data_type='supported_languages')
engine_data.languages = engine_languages
else: else:
continue continue
engines_languages[engine_name] = engine_properties engine_data_dict[engine_name] = engine_data
print("fetched properties from %s engines" % len(engines_languages)) print("fetched properties from %s engines" % len(engine_data_dict))
print("write json file: %s" % (engines_languages_file)) print("write json file: %s" % (engine_data_dict.ENGINE_DATA_FILE))
with open(engines_languages_file, 'w', encoding='utf-8') as f: engine_data_dict.save_data()
json.dump(engines_languages, f, indent=2, sort_keys=True) return engine_data_dict
return engines_languages
# Get babel Locale object from lang_code if possible. # Get babel Locale object from lang_code if possible.
@ -173,7 +180,7 @@ def get_territory_name(lang_code):
return country_name return country_name
def join_language_lists(engines_languages): def join_language_lists(engine_data_dict):
"""Join all languages of the engines into one list. The returned language list """Join all languages of the engines into one list. The returned language list
contains language codes (``zh``) and region codes (``zh-TW``). The codes can contains language codes (``zh``) and region codes (``zh-TW``). The codes can
be parsed by babel:: be parsed by babel::
@ -184,28 +191,36 @@ def join_language_lists(engines_languages):
# pylint: disable=too-many-branches # pylint: disable=too-many-branches
language_list = {} language_list = {}
for engine_name in engines_languages: for engine_name in engine_data_dict:
engine = engines[engine_name] engine = engines[engine_name]
engine_properties = engines_languages[engine_name] engine_data = engine_data_dict[engine_name]
if isinstance(engine_properties, dict) and engine_properties.get('type') == 'engine_properties': if engine_data.data_type == 'engine_data':
# items of type 'engine_properties' do have regions & languages, the # items of type 'engine_data' do have regions & languages, the list
# list of engine_codes should contain both. # of engine_codes should contain both.
engine_codes = engine_properties.get('regions', {})
engine_codes.update(engine_properties.get('languages', {})) engine_codes = engine_data.regions
engine_codes.update(engine_data.languages)
engine_codes = engine_codes.keys() engine_codes = engine_codes.keys()
elif engine_data.data_type == 'supported_languages': # deprecated
engine_languages = engine_data.languages
if isinstance(engine_languages, dict):
engine_languages = engine_languages.keys()
language_aliases_values = getattr(engine, 'language_aliases', {}).values()
engine_codes = []
for lang_code in engine_languages:
if lang_code in language_aliases_values:
lang_code = next(lc for lc, alias in engine.language_aliases.items() if lang_code == alias)
engine_codes.append(lang_code)
else: else:
engine_codes = engine_properties raise TypeError('unknown type of engine data: %s' % engine_data.data_type)
engine_properties = {}
if isinstance(engine_codes, dict):
engine_codes = engine_codes.keys()
for lang_code in engine_codes: for lang_code in engine_codes:
# apply custom fixes if necessary
if lang_code in getattr(engine, 'language_aliases', {}).values():
lang_code = next(lc for lc, alias in engine.language_aliases.items() if lang_code == alias)
locale = get_locale(lang_code) locale = get_locale(lang_code)
# ensure that lang_code uses standard language and country codes # ensure that lang_code uses standard language and country codes
@ -219,10 +234,10 @@ def join_language_lists(engines_languages):
# get language's data from babel's Locale object # get language's data from babel's Locale object
language_name = locale.get_language_name().title() language_name = locale.get_language_name().title()
english_name = locale.english_name.split(' (')[0] english_name = locale.english_name.split(' (')[0]
elif short_code in engines_languages['wikipedia']: elif short_code in engine_data_dict['wikipedia'].languages:
# get language's data from wikipedia if not known by babel # get language's data from wikipedia if not known by babel
language_name = engines_languages['wikipedia'][short_code]['name'] language_name = engine_data_dict['wikipedia'].languages[short_code]['name']
english_name = engines_languages['wikipedia'][short_code]['english_name'] english_name = engine_data_dict['wikipedia'].languages[short_code]['english_name']
else: else:
language_name = None language_name = None
english_name = None english_name = None
@ -259,8 +274,10 @@ def join_language_lists(engines_languages):
return language_list return language_list
# Filter language list so it only includes the most supported languages and countries
def filter_language_list(all_languages): def filter_language_list(all_languages):
"""Filter language list so it only includes the most supported languages and
countries.
"""
min_engines_per_lang = 12 min_engines_per_lang = 12
min_engines_per_country = 7 min_engines_per_country = 7
# pylint: disable=consider-using-dict-items, consider-iterating-dictionary # pylint: disable=consider-using-dict-items, consider-iterating-dictionary
@ -336,10 +353,14 @@ class UnicodeEscape(str):
# Write languages.py. # Write languages.py.
def write_languages_file(languages): def write_languages_file(languages):
"""Generates :origin:`searx/languages.py`."""
file_headers = ( file_headers = (
"# -*- coding: utf-8 -*-", "# -*- coding: utf-8 -*-",
"# list of language codes", "# list of language codes",
"# this file is generated automatically by utils/fetch_languages.py", "# this file is generated automatically by:",
"#",
"# ./manage pyenv.cmd searxng_extra/update/update_languages.py",
"language_codes = (\n", "language_codes = (\n",
) )
@ -365,7 +386,7 @@ def write_languages_file(languages):
language_codes = tuple(language_codes) language_codes = tuple(language_codes)
with open(languages_file, 'w', encoding='utf-8') as new_file: with open(LANGUAGES_FILE, 'w', encoding='utf-8') as new_file:
file_content = "{file_headers} {language_codes},\n)\n".format( file_content = "{file_headers} {language_codes},\n)\n".format(
# fmt: off # fmt: off
file_headers = '\n'.join(file_headers), file_headers = '\n'.join(file_headers),
@ -378,7 +399,7 @@ def write_languages_file(languages):
if __name__ == "__main__": if __name__ == "__main__":
load_engines(settings['engines']) load_engines(settings['engines'])
_engines_languages = fetch_supported_languages() _engine_data_dict = fetch_engine_data()
_all_languages = join_language_lists(_engines_languages) _all_languages = join_language_lists(_engine_data_dict)
_filtered_languages = filter_language_list(_all_languages) _filtered_languages = filter_language_list(_all_languages)
write_languages_file(_filtered_languages) write_languages_file(_filtered_languages)