Add searx.locales.SupportedLocales

This commit is contained in:
Alexandre Flament 2022-04-23 13:23:16 +02:00
parent 9c68980ea1
commit 7f78b69e86
7 changed files with 187 additions and 147 deletions

View file

@ -89,22 +89,28 @@ def seznam(query, _lang):
# seznam search autocompleter # seznam search autocompleter
url = 'https://suggest.seznam.cz/fulltext/cs?{query}' url = 'https://suggest.seznam.cz/fulltext/cs?{query}'
resp = get(url.format(query=urlencode( resp = get(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'} url.format(
))) query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)
)
)
if not resp.ok: if not resp.ok:
return [] return []
data = resp.json() data = resp.json()
return [''.join( return [
[part.get('text', '') for part in item.get('text', [])] ''.join([part.get('text', '') for part in item.get('text', [])])
) for item in data.get('result', []) if item.get('itemType', None) == 'ItemType.TEXT'] for item in data.get('result', [])
if item.get('itemType', None) == 'ItemType.TEXT'
]
def startpage(query, lang): def startpage(query, lang):
# startpage autocompleter # startpage autocompleter
engine = engines['startpage'] _, engine_language, _ = engines['startpage'].supported_locales.get(lang)
_, engine_language, _ = engine.get_engine_locale(lang)
url = 'https://startpage.com/suggestions?{query}' url = 'https://startpage.com/suggestions?{query}'
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': engine_language}))) resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': engine_language})))

View file

@ -1561,6 +1561,7 @@
"zh-HK" "zh-HK"
], ],
"startpage": { "startpage": {
"all_language": "en-US",
"languages": { "languages": {
"af": "afrikaans", "af": "afrikaans",
"am": "amharic", "am": "amharic",
@ -1693,8 +1694,7 @@
"zh-CN": "zh-CN_CN", "zh-CN": "zh-CN_CN",
"zh-HK": "zh-TW_HK", "zh-HK": "zh-TW_HK",
"zh-TW": "zh-TW_TW" "zh-TW": "zh-TW_TW"
}, }
"type": "engine_properties"
}, },
"wikidata": { "wikidata": {
"ab": { "ab": {

View file

@ -13,13 +13,14 @@ usage::
import sys import sys
import copy import copy
from typing import Dict, List, Optional from typing import Dict, List, Optional, Any
from os.path import realpath, dirname from os.path import realpath, dirname
from babel.localedata import locale_identifiers from babel.localedata import locale_identifiers
from searx import logger, settings from searx import logger, settings
from searx.data import ENGINES_LANGUAGES from searx.data import ENGINES_LANGUAGES
from searx.utils import load_module, match_language from searx.utils import load_module, match_language
from searx.locales import SupportedLocales
logger = logger.getChild('engines') logger = logger.getChild('engines')
@ -35,7 +36,7 @@ ENGINE_DEFAULT_ARGS = {
"timeout": settings["outgoing"]["request_timeout"], "timeout": settings["outgoing"]["request_timeout"],
"shortcut": "-", "shortcut": "-",
"categories": ["general"], "categories": ["general"],
"language_support" : False, "language_support": False,
"paging": False, "paging": False,
"safesearch": False, "safesearch": False,
"time_range_support": False, "time_range_support": False,
@ -56,11 +57,13 @@ class Engine: # pylint: disable=too-few-public-methods
engine: str engine: str
shortcut: str shortcut: str
categories: List[str] categories: List[str]
supported_languages: List[str]
supported_locales: SupportedLocales
# language support, either by selecting a region or by selecting a language
language_support: bool
about: dict about: dict
inactive: bool inactive: bool
disabled: bool disabled: bool
# language support, either by selecting a region or by selecting a language
language_support: bool
paging: bool paging: bool
safesearch: bool safesearch: bool
time_range_support: bool time_range_support: bool
@ -141,25 +144,6 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
return engine return engine
def engine_properties_template():
    """Build an empty *engine properties* dictionary.

    The dictionary is the container used to map SearXNG's language and region
    tags to the tags of an engine.  A filled one looks like::

        engine_properties = {
            'type' : 'engine_properties',
            'regions': {
                # 'ca-ES' : <engine's region name>
            },
            'languages': {
                # 'ca' : <engine's language name>
            },
        }

    Every call returns new objects, so the result can be filled in place.
    """
    template = {'type': 'engine_properties'}
    # one independent, empty mapping per section
    for section in ('regions', 'languages'):
        template[section] = {}
    return template
def set_loggers(engine, engine_name): def set_loggers(engine, engine_name):
# set the logger for engine # set the logger for engine
@ -197,10 +181,10 @@ def update_engine_attributes(engine: Engine, engine_data):
def set_language_attributes(engine: Engine): def set_language_attributes(engine: Engine):
# assign supported languages from json file # assign supported languages from json file
supported_properties = None data: Any = None
if engine.name in ENGINES_LANGUAGES: if engine.name in ENGINES_LANGUAGES:
supported_properties = ENGINES_LANGUAGES[engine.name] data = ENGINES_LANGUAGES[engine.name]
elif engine.engine in ENGINES_LANGUAGES: elif engine.engine in ENGINES_LANGUAGES:
# The key of the dictionary ENGINES_LANGUAGES is the *engine name* # The key of the dictionary ENGINES_LANGUAGES is the *engine name*
@ -208,48 +192,48 @@ def set_language_attributes(engine: Engine):
# settings.yml to use the same origin engine (python module) these # settings.yml to use the same origin engine (python module) these
# additional engines can use the languages from the origin engine. # additional engines can use the languages from the origin engine.
# For this use the configured ``engine: ...`` from settings.yml # For this use the configured ``engine: ...`` from settings.yml
supported_properties = ENGINES_LANGUAGES[engine.engine] data = ENGINES_LANGUAGES[engine.engine]
if not supported_properties: # supported_locales is always defined
engine.supported_locales = SupportedLocales.loads(data)
engine.language_support = not engine.supported_locales.empty()
if data is None:
return return
if isinstance(supported_properties, dict) and supported_properties.get('type') == 'engine_properties': if engine.language_support:
engine.supported_properties = supported_properties # to do: implement engine.language equivalent by calling a method of SupportedLocales
engine.language_support = len(supported_properties['languages']) or len(supported_properties['regions']) return
else: # deprecated: does not work for engines that do support languages based on a region.
# depricated: does not work for engines that do support languages engine.supported_languages = data
# based on a region. engine.language_support = len(engine.supported_languages) > 0
engine.supported_languages = supported_properties
engine.language_support = len(engine.supported_languages) > 0
if hasattr(engine, 'language'): if hasattr(engine, 'language'):
# For an engine, when there is `language: ...` in the YAML settings, the # For an engine, when there is `language: ...` in the YAML settings, the engine supports only one language,
# engine supports only one language, in this case # in this case engine.supported_languages should contains this value defined in settings.yml
# engine.supported_languages should contains this value defined in if engine.language not in engine.supported_languages:
# settings.yml raise ValueError(
if engine.language not in engine.supported_languages: "settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
raise ValueError( )
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
)
if isinstance(engine.supported_languages, dict): if isinstance(engine.supported_languages, dict):
engine.supported_languages = {engine.language: engine.supported_languages[engine.language]} engine.supported_languages = {engine.language: engine.supported_languages[engine.language]}
else: else:
engine.supported_languages = [engine.language] engine.supported_languages = [engine.language]
if not hasattr(engine, 'language_aliases'): if not hasattr(engine, 'language_aliases'):
engine.language_aliases = {} engine.language_aliases = {}
# find custom aliases for non standard language codes # find custom aliases for non standard language codes
for engine_lang in engine.supported_languages: for engine_lang in engine.supported_languages:
iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None) iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
if ( if (
iso_lang iso_lang
and iso_lang != engine_lang and iso_lang != engine_lang
and not engine_lang.startswith(iso_lang) and not engine_lang.startswith(iso_lang)
and iso_lang not in engine.supported_languages and iso_lang not in engine.supported_languages
): ):
engine.language_aliases[iso_lang] = engine_lang engine.language_aliases[iso_lang] = engine_lang
def update_attributes_for_tor(engine: Engine) -> bool: def update_attributes_for_tor(engine: Engine) -> bool:

View file

@ -18,6 +18,7 @@ import babel
from searx.network import get from searx.network import get
from searx.utils import extract_text, eval_xpath from searx.utils import extract_text, eval_xpath
from searx.locales import SupportedLocales
from searx.exceptions import ( from searx.exceptions import (
SearxEngineResponseException, SearxEngineResponseException,
SearxEngineCaptchaException, SearxEngineCaptchaException,
@ -46,7 +47,7 @@ filter_mapping = {0: '0', 1: '1', 2: '1'}
time_range_support = True time_range_support = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
supported_properties_url = 'https://www.startpage.com/do/settings' supported_locales_url = 'https://www.startpage.com/do/settings'
# search-url # search-url
base_url = 'https://www.startpage.com/' base_url = 'https://www.startpage.com/'
@ -109,33 +110,12 @@ def get_sc_code(headers):
return sc_code return sc_code
def get_engine_locale(language):
    """Translate a SearXNG language tag into startpage's language and region.

    The special value ``all`` is handled as ``en-US``.  The tag is parsed with
    babel (``-`` as separator); the language code is looked up in
    ``supported_properties['languages']`` and, when the tag carries a
    territory, ``<language>-<territory>`` is looked up in
    ``supported_properties['regions']``.

    Returns the tuple ``(locale, engine_language, engine_region)``.
    ``engine_language`` is ``None`` when the language is not in the table,
    ``engine_region`` falls back to ``'all'`` when no region was found.
    """
    tag = 'en-US' if language == 'all' else language
    locale = babel.Locale.parse(tag, sep='-')
    lang_code = locale.language

    engine_language = supported_properties['languages'].get(lang_code)
    if not engine_language:
        logger.debug("startpage does NOT support language: %s", lang_code)

    if locale.territory:
        engine_region = supported_properties['regions'].get(lang_code + '-' + locale.territory)
    else:
        engine_region = None

    # covers both "tag without territory" and "territory unknown to the engine"
    if not engine_region:
        logger.debug("no region in selected (only lang: '%s'), using region 'all'", tag)
        engine_region = 'all'

    logger.debug(
        "UI language: %s --> engine language: %s // engine region: %s",
        tag, engine_language, engine_region
    )
    return locale, engine_language, engine_region
def request(query, params): def request(query, params):
locale, engine_language, engine_region = get_engine_locale(params['language']) locale, engine_language, engine_region = params['locale'], params['engine_language'], params['engine_region']
if engine_region is None:
engine_region = 'all'
# prepare HTTP headers # prepare HTTP headers
ac_lang = locale.language ac_lang = locale.language
@ -151,7 +131,7 @@ def request(query, params):
'cat': 'web', 'cat': 'web',
't': 'device', 't': 'device',
'sc': get_sc_code(params['headers']), # hint: this func needs HTTP headers 'sc': get_sc_code(params['headers']), # hint: this func needs HTTP headers
'with_date' : time_range_dict.get(params['time_range'], '') 'with_date': time_range_dict.get(params['time_range'], ''),
} }
if engine_language: if engine_language:
@ -187,7 +167,7 @@ def request(query, params):
if engine_region: if engine_region:
cookie['search_results_region'] = engine_region cookie['search_results_region'] = engine_region
params['cookies']['preferences'] = 'N1N'.join([ "%sEEE%s" % x for x in cookie.items() ]) params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
logger.debug('cookie preferences: %s', params['cookies']['preferences']) logger.debug('cookie preferences: %s', params['cookies']['preferences'])
params['method'] = 'POST' params['method'] = 'POST'
@ -263,7 +243,7 @@ def response(resp):
return results return results
def _fetch_engine_properties(resp, engine_properties): def _fetch_supported_locales(resp):
# startpage's language & region selectors are a mess. # startpage's language & region selectors are a mess.
# #
@ -291,6 +271,8 @@ def _fetch_engine_properties(resp, engine_properties):
# name of the writing script used by the language, or occasionally # name of the writing script used by the language, or occasionally
# something else entirely. # something else entirely.
supported_locales = SupportedLocales(all_language='en-US')
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# regions # regions
@ -305,27 +287,21 @@ def _fetch_engine_properties(resp, engine_properties):
if '-' in sp_region_tag: if '-' in sp_region_tag:
l, r = sp_region_tag.split('-') l, r = sp_region_tag.split('-')
r = r.split('_')[-1] r = r.split('_')[-1]
locale = babel.Locale.parse(l +'_'+ r, sep='_') locale = babel.Locale.parse(l + '_' + r, sep='_')
else: else:
locale = babel.Locale.parse(sp_region_tag, sep='_') locale = babel.Locale.parse(sp_region_tag, sep='_')
region_tag = locale.language + '-' + locale.territory region_tag = locale.language + '-' + locale.territory
# print("internal: %s --> engine: %s" % (region_tag, sp_region_tag)) # print("internal: %s --> engine: %s" % (region_tag, sp_region_tag))
engine_properties['regions'][region_tag] = sp_region_tag supported_locales.regions[region_tag] = sp_region_tag
# languages # languages
catalog_engine2code = { catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}
name.lower(): lang_code
for lang_code, name in babel.Locale('en').languages.items()
}
# get the native name of every language known by babel # get the native name of every language known by babel
for lang_code in filter( for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
lambda lang_code: lang_code.find('_') == -1,
babel.localedata.locale_identifiers()
):
native_name = babel.Locale(lang_code).get_language_name().lower() native_name = babel.Locale(lang_code).get_language_name().lower()
# add native name exactly as it is # add native name exactly as it is
catalog_engine2code[native_name] = lang_code catalog_engine2code[native_name] = lang_code
@ -338,17 +314,19 @@ def _fetch_engine_properties(resp, engine_properties):
# values that can't be determined by babel's languages names # values that can't be determined by babel's languages names
catalog_engine2code.update({ catalog_engine2code.update(
'english_uk': 'en', {
# traditional chinese used in .. 'english_uk': 'en',
'fantizhengwen': 'zh_Hant', # traditional chinese used in ..
# Korean alphabet 'fantizhengwen': 'zh_Hant',
'hangul': 'ko', # Korean alphabet
# Malayalam is one of 22 scheduled languages of India. 'hangul': 'ko',
'malayam': 'ml', # Malayalam is one of 22 scheduled languages of India.
'norsk': 'nb', 'malayam': 'ml',
'sinhalese': 'si', 'norsk': 'nb',
}) 'sinhalese': 'si',
}
)
for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'): for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
engine_lang = option.get('value') engine_lang = option.get('value')
@ -359,6 +337,6 @@ def _fetch_engine_properties(resp, engine_properties):
lang_code = catalog_engine2code[name] lang_code = catalog_engine2code[name]
# print("internal: %s --> engine: %s" % (lang_code, engine_lang)) # print("internal: %s --> engine: %s" % (lang_code, engine_lang))
engine_properties['languages'][lang_code] = engine_lang supported_locales.languages[lang_code] = engine_lang
return engine_properties return supported_locales

View file

@ -4,7 +4,7 @@
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`. """Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
""" """
from typing import Set from typing import Set, Dict, Optional, Tuple
import os import os
import pathlib import pathlib
@ -22,6 +22,70 @@ RTL_LOCALES: Set[str] = set()
*underline* '-')""" *underline* '-')"""
class SupportedLocales:
    """Map the ``Preferences.get("languages")`` value to a Locale, a language and a region.

    The class is intended to be instantiated once for each engine.
    """

    all_language: Optional[str]
    """
    To which locale value the "all" language is mapped (shown as "Default language")
    """

    regions: Dict[str, str]
    """
    {
      'fr-BE' : <engine's region name>
    },
    """

    languages: Dict[str, str]
    """
    {
      'ca' : <engine's language name>
    },
    """

    @classmethod
    def loads(cls, data) -> "SupportedLocales":
        """Build an instance from the structure produced by :py:meth:`dumps`.

        ``data`` is accepted only when it is a dict that has all three keys
        ``all_language``, ``regions`` and ``languages``; any other value
        (``None``, a list, a dict of another layout) yields an empty instance.

        The ``regions`` and ``languages`` mappings are copied, so changing the
        returned object never modifies ``data``.
        """
        required_keys = ('all_language', 'languages', 'regions')
        if isinstance(data, dict) and all(key in data for key in required_keys):
            # ``or {}`` keeps the tolerance for null values in the JSON data
            return cls(
                data['all_language'],
                dict(data['regions'] or {}),
                dict(data['languages'] or {}),
            )
        return cls()

    def __init__(self, all_language=None, regions=None, languages=None):
        """Store the mapping tables; a missing or empty table becomes a new ``{}``."""
        self.all_language = all_language
        self.regions = regions or {}
        self.languages = languages or {}

    def empty(self) -> bool:
        """Return ``True`` when neither a region nor a language is known."""
        return not self.regions and not self.languages

    def get(self, language: str) -> "Tuple[Optional[Locale], Optional[str], Optional[str]]":
        """Resolve ``language`` to ``(locale, engine_language, engine_region)``.

        ``language`` is a tag with ``-`` as separator (``fr``, ``fr-BE``) or the
        special value ``all``.  ``all`` is replaced by :py:obj:`all_language`;
        when no such replacement is configured ``(None, None, None)`` is
        returned.

        ``engine_language`` / ``engine_region`` are ``None`` when the language /
        the ``<language>-<territory>`` pair has no entry in the tables.  The
        region is looked up only when the tag carries a territory.

        ``Locale.parse`` is not guarded: whatever it raises for a tag it can not
        parse is propagated to the caller.
        """
        if language == 'all':
            if self.all_language is None:
                return None, None, None
            language = self.all_language

        locale = Locale.parse(language, sep='-')
        engine_language = self.languages.get(locale.language)

        engine_region = None
        if locale.territory:
            engine_region = self.regions.get(locale.language + '-' + locale.territory)

        return locale, engine_language, engine_region

    def dumps(self) -> dict:
        """Return the plain dict representation understood by :py:meth:`loads`."""
        return {
            'all_language': self.all_language,
            'regions': self.regions,
            'languages': self.languages,
        }
def _get_name(locale, language_code): def _get_name(locale, language_code):
language_name = locale.get_language_name(language_code).capitalize() language_name = locale.get_language_name(language_code).capitalize()
if language_name and ('a' <= language_name[0] <= 'z'): if language_name and ('a' <= language_name[0] <= 'z'):

View file

@ -157,6 +157,18 @@ class EngineProcessor(ABC):
params['language'] = self.engine.language params['language'] = self.engine.language
else: else:
params['language'] = search_query.lang params['language'] = search_query.lang
params['locale'], params['engine_language'], params['engine_region'] = self.engine.supported_locales.get(
params['language']
)
if params['engine_language']:
self.logger.debug(
'language:"%s" --> %s, engine_language:"%s", engine_region:"%s"',
params['language'],
repr(params['locale']),
params['engine_language'],
params['engine_region'],
)
return params return params
@abstractmethod @abstractmethod

View file

@ -37,18 +37,17 @@ from babel.core import parse_locale
from searx import settings, searx_dir from searx import settings, searx_dir
from searx import network from searx import network
from searx.engines import load_engines, engines, engine_properties_template from searx.engines import load_engines, engines
from searx.utils import gen_useragent from searx.utils import gen_useragent
# Output files. # Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json' engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py' languages_file = Path(searx_dir) / 'languages.py'
def fetch_supported_languages(): def fetch_supported_languages():
"""Fetchs supported languages for each engine and writes json file with those. """Fetchs supported languages for each engine and writes json file with those."""
"""
network.set_timeout_for_thread(10.0) network.set_timeout_for_thread(10.0)
engines_languages = {} engines_languages = {}
names = list(engines) names = list(engines)
@ -68,31 +67,29 @@ def fetch_supported_languages():
for engine_name in names: for engine_name in names:
engine = engines[engine_name] engine = engines[engine_name]
fetch_languages = getattr(engine, '_fetch_supported_languages', None) fetch_languages = getattr(engine, '_fetch_supported_languages', None)
fetch_properties = getattr(engine, '_fetch_engine_properties', None) fetch_locales = getattr(engine, '_fetch_supported_locales', None)
if fetch_properties is not None: if fetch_locales is not None and fetch_languages is not None:
resp = network.get(engine.supported_properties_url, headers=headers) print('%s: Both _fetch_supported_languages and _fetch_supported_locales are defined.' % (engine_name,))
engine_properties = engine_properties_template() if fetch_locales is not None:
fetch_properties(resp, engine_properties) resp = network.get(engine.supported_locales_url, headers=headers)
print("%s: %s languages" % (engine_name, len(engine_properties['languages']))) supported_locales = fetch_locales(resp)
print("%s: %s regions" % (engine_name, len(engine_properties['regions']))) print("%s: %s languages" % (engine_name, len(supported_locales.languages)))
print("%s: %s regions" % (engine_name, len(supported_locales.regions)))
data = supported_locales.dumps()
elif fetch_languages is not None: elif fetch_languages is not None:
# print("%s: using deepricated _fetch_fetch_languages()" % engine_name) # print("%s: using deprecated _fetch_fetch_languages()" % engine_name)
resp = network.get(engine.supported_languages_url, headers=headers) resp = network.get(engine.supported_languages_url, headers=headers)
engine_properties = fetch_languages(resp) data = fetch_languages(resp)
if isinstance(engine_properties, list): if isinstance(data, list):
engine_properties.sort() data.sort()
print("%s: fetched language %s containing %s items" % ( print("%s: fetched language %s containing %s items" % (engine_name, data.__class__.__name__, len(data)))
engine_name,
engine_properties.__class__.__name__,
len(engine_properties)
))
else: else:
continue continue
engines_languages[engine_name] = engine_properties engines_languages[engine_name] = data
print("fetched properties from %s engines" % len(engines_languages)) print("fetched properties from %s engines" % len(engines_languages))
print("write json file: %s" % (engines_languages_file)) print("write json file: %s" % (engines_languages_file))
@ -172,6 +169,7 @@ def get_territory_name(lang_code):
print("ERROR: %s --> %s" % (locale, exc)) print("ERROR: %s --> %s" % (locale, exc))
return country_name return country_name
def join_language_lists(engines_languages): def join_language_lists(engines_languages):
"""Join all languages of the engines into one list. The returned language list """Join all languages of the engines into one list. The returned language list
contains language codes (``zh``) and region codes (``zh-TW``). The codes can contains language codes (``zh``) and region codes (``zh-TW``). The codes can
@ -197,9 +195,7 @@ def join_language_lists(engines_languages):
# apply custom fixes if necessary # apply custom fixes if necessary
if lang_code in getattr(engine, 'language_aliases', {}).values(): if lang_code in getattr(engine, 'language_aliases', {}).values():
lang_code = next( lang_code = next(lc for lc, alias in engine.language_aliases.items() if lang_code == alias)
lc for lc, alias in engine.language_aliases.items() if lang_code == alias
)
locale = get_locale(lang_code) locale = get_locale(lang_code)