diff --git a/searx/locales.py b/searx/locales.py index a4560aab7..ffa5e731c 100644 --- a/searx/locales.py +++ b/searx/locales.py @@ -4,7 +4,7 @@ """Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`. """ -from typing import Set +from typing import Set, Optional, List import os import pathlib @@ -177,6 +177,17 @@ def language_tag(locale: babel.Locale) -> str: return sxng_lang +def get_locale(locale_tag: str) -> Optional[babel.Locale]: + """Returns a :py:obj:`babel.Locale` object parsed from argument + ``locale_tag``""" + try: + locale = babel.Locale.parse(locale_tag, sep='-') + return locale + + except babel.core.UnknownLocaleError: + return None + + def get_offical_locales( territory: str, languages=None, regional: bool = False, de_facto: bool = True ) -> Set[babel.Locale]: @@ -363,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None): engine_locale = default return default + + +def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]: + """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``. + + :param str searxng_locale: SearXNG's internal representation of locale (de, + de-DE, fr-BE, zh, zh-CN, zh-TW ..). + + :param list locale_tag_list: The list of locale tags to select from + + :param str fallback: fallback locale tag (if unset --> ``None``) + + The rules to find a match are implemented in :py:obj:`get_engine_locale`, + the ``engine_locales`` is build up by :py:obj:`build_engine_locales`. + + .. hint:: + + The *SearXNG locale* string and the members of ``locale_tag_list`` has to + be known by babel! The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the + UI and are not known by babel --> will be ignored. + """ + + # searxng_locale = 'es' + # locale_tag_list = ['es-AR', 'es-ES', 'es-MX'] + + if not searxng_locale: + return fallback + + locale = get_locale(searxng_locale) + if locale is None: + return fallback + + # normalize to a SearXNG locale that can be passed to get_engine_locale + + searxng_locale = language_tag(locale) + if locale.territory: + searxng_locale = region_tag(locale) + + # clean up locale_tag_list + + tag_list = [] + for tag in locale_tag_list: + if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS: + continue + tag_list.append(tag) + + # emulate fetch_traits + engine_locales = build_engine_locales(tag_list) + return get_engine_locale(searxng_locale, engine_locales, default=fallback) + + +def build_engine_locales(tag_list: List[str]): + """From a list of locale tags a dictionary is build that can be passed by + argument ``engine_locales`` to :py:obj:`get_engine_locale`. This function + is mainly used by :py:obj:`match_locale` and is similar to what the + ``fetch_traits(..)`` function of engines do. + + If there are territory codes in the ``tag_list`` that have a *script code* + additional keys are added to the returned dictionary. + + .. code:: python + + >>> import locales + >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW']) + >>> engine_locales + { + 'en': 'en', 'en-US': 'en-US', + 'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN', + 'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW' + } + >>> get_engine_locale('zh-Hans', engine_locales) + 'zh-CN' + + This function is a good example to understand the language/region model + of SearXNG: + + SearXNG only distinguishes between **search languages** and **search + regions**, by adding the *script-tags*, languages with *script-tags* can + be assigned to the **regions** that SearXNG supports. + + """ + engine_locales = {} + + for tag in tag_list: + locale = get_locale(tag) + if locale is None: + logger.warn("build_engine_locales: skip locale tag %s / unknown by babel", tag) + continue + if locale.territory: + engine_locales[region_tag(locale)] = tag + if locale.script: + engine_locales[language_tag(locale)] = tag + else: + engine_locales[language_tag(locale)] = tag + return engine_locales diff --git a/searx/utils.py b/searx/utils.py index f7a71b649..161983011 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -18,8 +18,6 @@ from urllib.parse import urljoin, urlparse from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult -from babel.core import get_global - from searx import settings from searx.data import USER_AGENTS, data_dir @@ -365,92 +363,6 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]: return None -def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]: - key = str(lang_list) - value = _LANG_TO_LC_CACHE.get(key, None) - if value is None: - value = {} - for lang in lang_list: - value.setdefault(lang.split('-')[0], lang) - _LANG_TO_LC_CACHE[key] = value - return value - - -# babel's get_global contains all sorts of miscellaneous locale and territory related data -# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py -def _get_from_babel(lang_code: str, key): - match = get_global(key).get(lang_code.replace('-', '_')) - # for some keys, such as territory_aliases, match may be a list - if isinstance(match, str): - return match.replace('_', '-') - return match - - -def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102 - """auxiliary function to match lang_code in lang_list""" - # replace language code with a custom alias if necessary - if lang_code in custom_aliases: - lang_code = custom_aliases[lang_code] - - if lang_code in lang_list: - return lang_code - - # try to get the most likely country for this language - subtags = _get_from_babel(lang_code, 'likely_subtags') - if subtags: - if subtags in lang_list: - return subtags - subtag_parts = subtags.split('-') - new_code = subtag_parts[0] + '-' + subtag_parts[-1] - if new_code in custom_aliases: - new_code = custom_aliases[new_code] - if new_code in lang_list: - return new_code - - # try to get the any supported country for this language - return _get_lang_to_lc_dict(lang_list).get(lang_code) - - -def match_language( # pylint: disable=W0102 - locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US' -) -> Optional[str]: - """get the language code from lang_list that best matches locale_code""" - # try to get language from given locale_code - language = _match_language(locale_code, lang_list, custom_aliases) - if language: - return language - - locale_parts = locale_code.split('-') - lang_code = locale_parts[0] - - # if locale_code has script, try matching without it - if len(locale_parts) > 2: - language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases) - if language: - return language - - # try to get language using an equivalent country code - if len(locale_parts) > 1: - country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases') - if country_alias: - language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases) - if language: - return language - - # try to get language using an equivalent language code - alias = _get_from_babel(lang_code, 'language_aliases') - if alias: - language = _match_language(alias, lang_list, custom_aliases) - if language: - return language - - if lang_code != locale_code: - # try to get language from given language without giving the country - language = _match_language(lang_code, lang_list, custom_aliases) - - return language or fallback - - def load_module(filename: str, module_dir: str) -> types.ModuleType: modname = splitext(filename)[0] modpath = join(module_dir, filename) diff --git a/searx/webapp.py b/searx/webapp.py index bc2a50784..4ed6c2eb7 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -89,7 +89,6 @@ from searx.utils import ( html_to_text, gen_useragent, dict_subset, - match_language, ) from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH from searx.query import RawTextQuery @@ -117,6 +116,7 @@ from searx.locales import ( RTL_LOCALES, localeselector, locales_initialize, + match_locale, ) # renaming names from searx imports ... @@ -227,7 +227,7 @@ def _get_browser_language(req, lang_list): if '-' in lang: lang_parts = lang.split('-') lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper()) - locale = match_language(lang, lang_list, fallback=None) + locale = match_locale(lang, lang_list, fallback=None) if locale is not None: return locale return 'en' @@ -407,7 +407,7 @@ def get_client_settings(): def render(template_name: str, **kwargs): - + # pylint: disable=too-many-statements kwargs['client_settings'] = str( base64.b64encode( bytes( @@ -445,10 +445,13 @@ def render(template_name: str, **kwargs): if locale in RTL_LOCALES and 'rtl' not in kwargs: kwargs['rtl'] = True + if 'current_language' not in kwargs: - kwargs['current_language'] = match_language( - request.preferences.get_value('language'), settings['search']['languages'] - ) + _locale = request.preferences.get_value('language') + if _locale in ('auto', 'all'): + kwargs['current_language'] = _locale + else: + kwargs['current_language'] = match_locale(_locale, settings['search']['languages']) # values from settings kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html'] @@ -810,6 +813,13 @@ def search(): ) ) + if search_query.lang in ('auto', 'all'): + current_language = search_query.lang + else: + current_language = match_locale( + search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language") + ) + # search_query.lang contains the user choice (all, auto, en, ...) # when the user choice is "auto", search.search_query.lang contains the detected language # otherwise it is equals to search_query.lang @@ -832,12 +842,8 @@ def search(): result_container.unresponsive_engines ), current_locale = request.preferences.get_value("locale"), - current_language = match_language( - search_query.lang, - settings['search']['languages'], - fallback=request.preferences.get_value("language") - ), - search_language = match_language( + current_language = current_language, + search_language = match_locale( search.search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language") diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index 6052bf084..66bc303db 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ b/searxng_extra/update/update_engine_descriptions.py @@ -18,8 +18,8 @@ from os.path import join from lxml.html import fromstring from searx.engines import wikidata, set_loggers -from searx.utils import extract_text, match_language -from searx.locales import LOCALE_NAMES, locales_initialize +from searx.utils import extract_text +from searx.locales import LOCALE_NAMES, locales_initialize, match_locale from searx import searx_dir from searx.utils import gen_useragent, detect_language import searx.search @@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website): fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) if fetched_lang is None or desc is None: continue - matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None) + matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None) if matched_lang is None: - fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) + fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None) matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang) if matched_lang is not None: update_description(engine_name, matched_lang, desc, website, replace=False) diff --git a/tests/unit/test_locales.py b/tests/unit/test_locales.py new file mode 100644 index 000000000..61561c17b --- /dev/null +++ b/tests/unit/test_locales.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Test some code from module :py:obj:`searx.locales`""" + +from searx import locales +from searx.sxng_locales import sxng_locales +from tests import SearxTestCase + + +class TestLocales(SearxTestCase): + """Implemented tests: + + - :py:obj:`searx.locales.match_locale` + """ + + def test_match_locale(self): + + locale_tag_list = [x[0] for x in sxng_locales] + + # Test SearXNG search languages + + self.assertEqual(locales.match_locale('de', locale_tag_list), 'de') + self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr') + self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh') + + # Test SearXNG search regions + + self.assertEqual(locales.match_locale('ca-es', locale_tag_list), 'ca-ES') + self.assertEqual(locales.match_locale('de-at', locale_tag_list), 'de-AT') + self.assertEqual(locales.match_locale('de-de', locale_tag_list), 'de-DE') + self.assertEqual(locales.match_locale('en-UK', locale_tag_list), 'en-GB') + self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE') + self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE') + self.assertEqual(locales.match_locale('fr-ca', locale_tag_list), 'fr-CA') + self.assertEqual(locales.match_locale('fr-ch', locale_tag_list), 'fr-CH') + self.assertEqual(locales.match_locale('zh-cn', locale_tag_list), 'zh-CN') + self.assertEqual(locales.match_locale('zh-tw', locale_tag_list), 'zh-TW') + self.assertEqual(locales.match_locale('zh-hk', locale_tag_list), 'zh-HK') + + # Test language script code + + self.assertEqual(locales.match_locale('zh-hans', locale_tag_list), 'zh-CN') + self.assertEqual(locales.match_locale('zh-hans-cn', locale_tag_list), 'zh-CN') + self.assertEqual(locales.match_locale('zh-hant', locale_tag_list), 'zh-TW') + self.assertEqual(locales.match_locale('zh-hant-tw', locale_tag_list), 'zh-TW') + + # Test individual locale lists + + self.assertEqual(locales.match_locale('es', [], fallback='fallback'), 'fallback') + + self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE') + self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE') + self.assertEqual(locales.match_locale('es', ['ES']), 'ES') + self.assertEqual(locales.match_locale('es', ['es-AR', 'es-ES', 'es-MX']), 'es-ES') + self.assertEqual(locales.match_locale('es-AR', ['es-AR', 'es-ES', 'es-MX']), 'es-AR') + self.assertEqual(locales.match_locale('es-CO', ['es-AR', 'es-ES']), 'es-ES') + self.assertEqual(locales.match_locale('es-CO', ['es-AR']), 'es-AR') + + # Tests from the commit message of 9ae409a05a + + # Assumption: + # A. When a user selects a language the results should be optimized according to + # the selected language. + # + # B. When user selects a language and a territory the results should be + # optimized with first priority on territory and second on language. + + # Assume we have an engine that supports the follwoing locales: + locale_tag_list = ['zh-CN', 'zh-HK', 'nl-BE', 'fr-CA'] + + # Examples (Assumption A.) + # ------------------------ + + # A user selects region 'zh-TW' which should end in zh_HK. + # hint: CN is 'Hans' and HK ('Hant') fits better to TW ('Hant') + self.assertEqual(locales.match_locale('zh-TW', locale_tag_list), 'zh-HK') + + # A user selects only the language 'zh' which should end in CN + self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh-CN') + + # A user selects only the language 'fr' which should end in fr_CA + self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-CA') + + # The difference in priority on the territory is best shown with a + # engine that supports the following locales: + locale_tag_list = ['fr-FR', 'fr-CA', 'en-GB', 'nl-BE'] + + # A user selects only a language + self.assertEqual(locales.match_locale('en', locale_tag_list), 'en-GB') + + # hint: the engine supports fr_FR and fr_CA since no territory is given, + # fr_FR takes priority .. + self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-FR') + + # Examples (Assumption B.) + # ------------------------ + + # A user selects region 'fr-BE' which should end in nl-BE + self.assertEqual(locales.match_locale('fr-BE', locale_tag_list), 'nl-BE') + + # If the user selects a language and there are two locales like the + # following: + + locale_tag_list = ['fr-BE', 'fr-CH'] + + # The get_engine_locale selects the locale by looking at the "population + # percent" and this percentage has an higher amount in BE (68.%) + # compared to CH (21%) + + self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-BE') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 6f51f1ee3..2ad4593a1 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -87,39 +87,6 @@ class TestUtils(SearxTestCase): html = '
Lorem ipsumdolor sit amet
' self.assertEqual(utils.html_to_text(html), "Lorem ipsum") - def test_match_language(self): - self.assertEqual(utils.match_language('es', ['es']), 'es') - self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback') - self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp') - - # handle script tags - self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN') - self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW') - self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN') - self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW') - self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN') - self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW') - - aliases = {'en-GB': 'en-UK', 'he': 'iw'} - - # guess country - self.assertEqual(utils.match_language('de-DE', ['de']), 'de') - self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE') - self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES') - self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX') - self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB') - self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK') - - # language aliases - self.assertEqual(utils.match_language('iw', ['he']), 'he') - self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw') - self.assertEqual(utils.match_language('iw-IL', ['he']), 'he') - self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw') - self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL') - self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL') - self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL') - self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL') - def test_ecma_unscape(self): self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space') self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: รณ')