diff --git a/searx/autocomplete.py b/searx/autocomplete.py index f8a482deb..57f67239a 100644 --- a/searx/autocomplete.py +++ b/searx/autocomplete.py @@ -89,22 +89,28 @@ def seznam(query, _lang): # seznam search autocompleter url = 'https://suggest.seznam.cz/fulltext/cs?{query}' - resp = get(url.format(query=urlencode( - {'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'} - ))) + resp = get( + url.format( + query=urlencode( + {'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'} + ) + ) + ) if not resp.ok: return [] data = resp.json() - return [''.join( - [part.get('text', '') for part in item.get('text', [])] - ) for item in data.get('result', []) if item.get('itemType', None) == 'ItemType.TEXT'] + return [ + ''.join([part.get('text', '') for part in item.get('text', [])]) + for item in data.get('result', []) + if item.get('itemType', None) == 'ItemType.TEXT' + ] + def startpage(query, lang): # startpage autocompleter - engine = engines['startpage'] - _, engine_language, _ = engine.get_engine_locale(lang) + _, engine_language, _ = engines['startpage'].supported_locales.get(lang) url = 'https://startpage.com/suggestions?{query}' resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': engine_language}))) diff --git a/searx/data/engines_languages.json b/searx/data/engines_languages.json index 5ea3044c3..45be267d4 100644 --- a/searx/data/engines_languages.json +++ b/searx/data/engines_languages.json @@ -1561,6 +1561,7 @@ "zh-HK" ], "startpage": { + "all_language": "en-US", "languages": { "af": "afrikaans", "am": "amharic", @@ -1693,8 +1694,7 @@ "zh-CN": "zh-CN_CN", "zh-HK": "zh-TW_HK", "zh-TW": "zh-TW_TW" - }, - "type": "engine_properties" + } }, "wikidata": { "ab": { diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index ece3d3e5d..d1a370bda 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -13,13 +13,14 
@@ usage:: import sys import copy -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any from os.path import realpath, dirname from babel.localedata import locale_identifiers from searx import logger, settings from searx.data import ENGINES_LANGUAGES from searx.utils import load_module, match_language +from searx.locales import SupportedLocales logger = logger.getChild('engines') @@ -35,7 +36,7 @@ ENGINE_DEFAULT_ARGS = { "timeout": settings["outgoing"]["request_timeout"], "shortcut": "-", "categories": ["general"], - "language_support" : False, + "language_support": False, "paging": False, "safesearch": False, "time_range_support": False, @@ -56,11 +57,13 @@ class Engine: # pylint: disable=too-few-public-methods engine: str shortcut: str categories: List[str] + supported_languages: List[str] + supported_locales: SupportedLocales + # language support, either by selecting a region or by selecting a language + language_support: bool about: dict inactive: bool disabled: bool - # language support, either by selecting a region or by selecting a language - language_support: bool paging: bool safesearch: bool time_range_support: bool @@ -141,25 +144,6 @@ def load_engine(engine_data: dict) -> Optional[Engine]: return engine -def engine_properties_template(): - """A dictionary with languages and regions to map from SearXNG' languages & - region tags to engine's language & region tags:: - - engine_properties = { - 'type' : 'engine_properties', - 'regions': { - # 'ca-ES' : - }, - 'languages': { - # 'ca' : - }, - } - """ - return { - 'type' : 'engine_properties', - 'regions': {}, - 'languages': {}, - } def set_loggers(engine, engine_name): # set the logger for engine @@ -197,10 +181,10 @@ def update_engine_attributes(engine: Engine, engine_data): def set_language_attributes(engine: Engine): # assign supported languages from json file - supported_properties = None + data: Any = None if engine.name in ENGINES_LANGUAGES: - supported_properties = 
ENGINES_LANGUAGES[engine.name] + data = ENGINES_LANGUAGES[engine.name] elif engine.engine in ENGINES_LANGUAGES: # The key of the dictionary ENGINES_LANGUAGES is the *engine name* @@ -208,48 +192,48 @@ def set_language_attributes(engine: Engine): # settings.yml to use the same origin engine (python module) these # additional engines can use the languages from the origin engine. # For this use the configured ``engine: ...`` from settings.yml - supported_properties = ENGINES_LANGUAGES[engine.engine] + data = ENGINES_LANGUAGES[engine.engine] - if not supported_properties: + # supported_locales is always defined + engine.supported_locales = SupportedLocales.loads(data) + engine.language_support = not engine.supported_locales.empty() + + if data is None: return - if isinstance(supported_properties, dict) and supported_properties.get('type') == 'engine_properties': - engine.supported_properties = supported_properties - engine.language_support = len(supported_properties['languages']) or len(supported_properties['regions']) + if engine.language_support: + # to do: implement engine.language equivalent by calling a method of SupportedLocales + return - else: - # depricated: does not work for engines that do support languages - # based on a region. - engine.supported_languages = supported_properties - engine.language_support = len(engine.supported_languages) > 0 + # deprecated: does not work for engines that do support languages based on a region. 
+ engine.supported_languages = data + engine.language_support = len(engine.supported_languages) > 0 - if hasattr(engine, 'language'): - # For an engine, when there is `language: ...` in the YAML settings, the - # engine supports only one language, in this case - # engine.supported_languages should contains this value defined in - # settings.yml - if engine.language not in engine.supported_languages: - raise ValueError( - "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language) - ) + if hasattr(engine, 'language'): + # For an engine, when there is `language: ...` in the YAML settings, the engine supports only one language, + # in this case engine.supported_languages should contains this value defined in settings.yml + if engine.language not in engine.supported_languages: + raise ValueError( + "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language) + ) - if isinstance(engine.supported_languages, dict): - engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} - else: - engine.supported_languages = [engine.language] + if isinstance(engine.supported_languages, dict): + engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} + else: + engine.supported_languages = [engine.language] - if not hasattr(engine, 'language_aliases'): - engine.language_aliases = {} - # find custom aliases for non standard language codes - for engine_lang in engine.supported_languages: - iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) - if ( - iso_lang - and iso_lang != engine_lang - and not engine_lang.startswith(iso_lang) - and iso_lang not in engine.supported_languages - ): - engine.language_aliases[iso_lang] = engine_lang + if not hasattr(engine, 'language_aliases'): + engine.language_aliases = {} + # find custom aliases for non standard language codes + for engine_lang in engine.supported_languages: + iso_lang = 
match_language(engine_lang, BABEL_LANGS, fallback=None) + if ( + iso_lang + and iso_lang != engine_lang + and not engine_lang.startswith(iso_lang) + and iso_lang not in engine.supported_languages + ): + engine.language_aliases[iso_lang] = engine_lang def update_attributes_for_tor(engine: Engine) -> bool: diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index fe4d54682..acc5e5741 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -18,6 +18,7 @@ import babel from searx.network import get from searx.utils import extract_text, eval_xpath +from searx.locales import SupportedLocales from searx.exceptions import ( SearxEngineResponseException, SearxEngineCaptchaException, @@ -46,7 +47,7 @@ filter_mapping = {0: '0', 1: '1', 2: '1'} time_range_support = True time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} -supported_properties_url = 'https://www.startpage.com/do/settings' +supported_locales_url = 'https://www.startpage.com/do/settings' # search-url base_url = 'https://www.startpage.com/' @@ -109,33 +110,12 @@ def get_sc_code(headers): return sc_code -def get_engine_locale(language): - - if language == 'all': - language = 'en-US' - locale = babel.Locale.parse(language, sep='-') - - engine_language = supported_properties['languages'].get(locale.language) - if not engine_language: - logger.debug("startpage does NOT support language: %s", locale.language) - - engine_region = None - if locale.territory: - engine_region = supported_properties['regions'].get(locale.language + '-' + locale.territory) - if not engine_region: - logger.debug("no region in selected (only lang: '%s'), using region 'all'", language) - engine_region = 'all' - - logger.debug( - "UI language: %s --> engine language: %s // engine region: %s", - language, engine_language, engine_region - ) - return locale, engine_language, engine_region - - def request(query, params): - locale, engine_language, engine_region = 
get_engine_locale(params['language']) + locale, engine_language, engine_region = params['locale'], params['engine_language'], params['engine_region'] + + if engine_region is None: + engine_region = 'all' # prepare HTTP headers ac_lang = locale.language @@ -151,7 +131,7 @@ def request(query, params): 'cat': 'web', 't': 'device', 'sc': get_sc_code(params['headers']), # hint: this func needs HTTP headers - 'with_date' : time_range_dict.get(params['time_range'], '') + 'with_date': time_range_dict.get(params['time_range'], ''), } if engine_language: @@ -187,7 +167,7 @@ def request(query, params): if engine_region: cookie['search_results_region'] = engine_region - params['cookies']['preferences'] = 'N1N'.join([ "%sEEE%s" % x for x in cookie.items() ]) + params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()]) logger.debug('cookie preferences: %s', params['cookies']['preferences']) params['method'] = 'POST' @@ -263,7 +243,7 @@ def response(resp): return results -def _fetch_engine_properties(resp, engine_properties): +def _fetch_supported_locales(resp): # startpage's language & region selectors are a mess. # @@ -291,6 +271,8 @@ def _fetch_engine_properties(resp, engine_properties): # name of the writing script used by the language, or occasionally # something else entirely. 
+ supported_locales = SupportedLocales(all_language='en-US') + dom = html.fromstring(resp.text) # regions @@ -305,27 +287,21 @@ def _fetch_engine_properties(resp, engine_properties): if '-' in sp_region_tag: l, r = sp_region_tag.split('-') r = r.split('_')[-1] - locale = babel.Locale.parse(l +'_'+ r, sep='_') + locale = babel.Locale.parse(l + '_' + r, sep='_') else: locale = babel.Locale.parse(sp_region_tag, sep='_') region_tag = locale.language + '-' + locale.territory # print("internal: %s --> engine: %s" % (region_tag, sp_region_tag)) - engine_properties['regions'][region_tag] = sp_region_tag + supported_locales.regions[region_tag] = sp_region_tag # languages - catalog_engine2code = { - name.lower(): lang_code - for lang_code, name in babel.Locale('en').languages.items() - } + catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()} # get the native name of every language known by babel - for lang_code in filter( - lambda lang_code: lang_code.find('_') == -1, - babel.localedata.locale_identifiers() - ): + for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()): native_name = babel.Locale(lang_code).get_language_name().lower() # add native name exactly as it is catalog_engine2code[native_name] = lang_code @@ -338,17 +314,19 @@ def _fetch_engine_properties(resp, engine_properties): # values that can't be determined by babel's languages names - catalog_engine2code.update({ - 'english_uk': 'en', - # traditional chinese used in .. - 'fantizhengwen': 'zh_Hant', - # Korean alphabet - 'hangul': 'ko', - # Malayalam is one of 22 scheduled languages of India. - 'malayam': 'ml', - 'norsk': 'nb', - 'sinhalese': 'si', - }) + catalog_engine2code.update( + { + 'english_uk': 'en', + # traditional chinese used in .. + 'fantizhengwen': 'zh_Hant', + # Korean alphabet + 'hangul': 'ko', + # Malayalam is one of 22 scheduled languages of India. 
+ 'malayam': 'ml', + 'norsk': 'nb', + 'sinhalese': 'si', + } + ) for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'): engine_lang = option.get('value') @@ -359,6 +337,6 @@ def _fetch_engine_properties(resp, engine_properties): lang_code = catalog_engine2code[name] # print("internal: %s --> engine: %s" % (lang_code, engine_lang)) - engine_properties['languages'][lang_code] = engine_lang + supported_locales.languages[lang_code] = engine_lang - return engine_properties + return supported_locales diff --git a/searx/locales.py b/searx/locales.py index 62f64204f..f6e1970dc 100644 --- a/searx/locales.py +++ b/searx/locales.py @@ -4,7 +4,7 @@ """Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`. """ -from typing import Set +from typing import Set, Dict, Optional, Tuple import os import pathlib @@ -22,6 +22,70 @@ RTL_LOCALES: Set[str] = set() *underline* '-')""" +class SupportedLocales: + """Map the Preferences.get("languages") value to a Locale, a language and a region. + + The class is intended to be instantiated for each engine.
+ """ + + all_language: Optional[str] + """ + To which locale value the "all" language is mapped (shown as "Default language") + """ + + regions: Dict[str, str] + """ + { + 'fr-BE' : + }, + """ + + languages: Dict[str, str] + """ + { + 'ca' : + }, + """ + + @classmethod + def loads(cls, data): + if isinstance(data, dict) and 'all_language' in data and 'languages' in data and 'regions' in data: + return cls(data['all_language'], data['regions'], data['languages']) + return cls() + + def __init__(self, all_language=None, regions=None, languages=None): + self.all_language = all_language + self.regions = regions or {} + self.languages = languages or {} + + def empty(self): + return len(self.regions) == 0 and len(self.languages) == 0 + + def get(self, language: str) -> Tuple[Optional[Locale], Optional[str], Optional[str]]: + if language == 'all' and self.all_language is None: + return None, None, None + + if language == 'all' and self.all_language is not None: + language = self.all_language + + locale = Locale.parse(language, sep='-') + + engine_language = self.languages.get(locale.language) + + engine_region = None + if locale.territory: + engine_region = self.regions.get(locale.language + '-' + locale.territory) + + return locale, engine_language, engine_region + + def dumps(self): + return { + 'all_language': self.all_language, + 'regions': self.regions, + 'languages': self.languages, + } + + def _get_name(locale, language_code): language_name = locale.get_language_name(language_code).capitalize() if language_name and ('a' <= language_name[0] <= 'z'): diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py index b7703496b..d6f3d3fb9 100644 --- a/searx/search/processors/abstract.py +++ b/searx/search/processors/abstract.py @@ -157,6 +157,18 @@ class EngineProcessor(ABC): params['language'] = self.engine.language else: params['language'] = search_query.lang + + params['locale'], params['engine_language'], params['engine_region'] =
self.engine.supported_locales.get( + params['language'] + ) + if params['engine_language']: + self.logger.debug( + 'language:"%s" --> %s, engine_language:"%s", engine_region:"%s"', + params['language'], + repr(params['locale']), + params['engine_language'], + params['engine_region'], + ) return params @abstractmethod diff --git a/searxng_extra/update/update_languages.py b/searxng_extra/update/update_languages.py index 53abbe929..00a5b5dfb 100755 --- a/searxng_extra/update/update_languages.py +++ b/searxng_extra/update/update_languages.py @@ -37,18 +37,17 @@ from babel.core import parse_locale from searx import settings, searx_dir from searx import network -from searx.engines import load_engines, engines, engine_properties_template +from searx.engines import load_engines, engines from searx.utils import gen_useragent + # Output files. engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' languages_file = Path(searx_dir) / 'languages.py' def fetch_supported_languages(): - """Fetchs supported languages for each engine and writes json file with those. 
- - """ + """Fetches supported languages for each engine and writes a JSON file with them.""" network.set_timeout_for_thread(10.0) engines_languages = {} names = list(engines) @@ -68,31 +67,29 @@ def fetch_supported_languages(): for engine_name in names: engine = engines[engine_name] fetch_languages = getattr(engine, '_fetch_supported_languages', None) - fetch_properties = getattr(engine, '_fetch_engine_properties', None) + fetch_locales = getattr(engine, '_fetch_supported_locales', None) - if fetch_properties is not None: - resp = network.get(engine.supported_properties_url, headers=headers) - engine_properties = engine_properties_template() - fetch_properties(resp, engine_properties) - print("%s: %s languages" % (engine_name, len(engine_properties['languages']))) - print("%s: %s regions" % (engine_name, len(engine_properties['regions']))) + if fetch_locales is not None and fetch_languages is not None: + print('%s: Both _fetch_supported_languages and _fetch_supported_locales are defined.'
% (engine_name,)) + if fetch_locales is not None: + resp = network.get(engine.supported_locales_url, headers=headers) + supported_locales = fetch_locales(resp) + print("%s: %s languages" % (engine_name, len(supported_locales.languages))) + print("%s: %s regions" % (engine_name, len(supported_locales.regions))) + data = supported_locales.dumps() elif fetch_languages is not None: - # print("%s: using deepricated _fetch_fetch_languages()" % engine_name) + # print("%s: using deprecated _fetch_fetch_languages()" % engine_name) resp = network.get(engine.supported_languages_url, headers=headers) - engine_properties = fetch_languages(resp) - if isinstance(engine_properties, list): - engine_properties.sort() + data = fetch_languages(resp) + if isinstance(data, list): + data.sort() - print("%s: fetched language %s containing %s items" % ( - engine_name, - engine_properties.__class__.__name__, - len(engine_properties) - )) + print("%s: fetched language %s containing %s items" % (engine_name, data.__class__.__name__, len(data))) else: continue - engines_languages[engine_name] = engine_properties + engines_languages[engine_name] = data print("fetched properties from %s engines" % len(engines_languages)) print("write json file: %s" % (engines_languages_file)) @@ -172,6 +169,7 @@ def get_territory_name(lang_code): print("ERROR: %s --> %s" % (locale, exc)) return country_name + def join_language_lists(engines_languages): """Join all languages of the engines into one list. The returned language list contains language codes (``zh``) and region codes (``zh-TW``). The codes can @@ -197,9 +195,7 @@ def join_language_lists(engines_languages): # apply custom fixes if necessary if lang_code in getattr(engine, 'language_aliases', {}).values(): - lang_code = next( - lc for lc, alias in engine.language_aliases.items() if lang_code == alias - ) + lang_code = next(lc for lc, alias in engine.language_aliases.items() if lang_code == alias) locale = get_locale(lang_code)