Add searx.locales.SupportedLocales

This commit is contained in:
Alexandre Flament 2022-04-23 13:23:16 +02:00
parent 9c68980ea1
commit 7f78b69e86
7 changed files with 187 additions and 147 deletions

View file

@ -89,22 +89,28 @@ def seznam(query, _lang):
# seznam search autocompleter
url = 'https://suggest.seznam.cz/fulltext/cs?{query}'
resp = get(url.format(query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)))
resp = get(
url.format(
query=urlencode(
{'phrase': query, 'cursorPosition': len(query), 'format': 'json-2', 'highlight': '1', 'count': '6'}
)
)
)
if not resp.ok:
return []
data = resp.json()
return [''.join(
[part.get('text', '') for part in item.get('text', [])]
) for item in data.get('result', []) if item.get('itemType', None) == 'ItemType.TEXT']
return [
''.join([part.get('text', '') for part in item.get('text', [])])
for item in data.get('result', [])
if item.get('itemType', None) == 'ItemType.TEXT'
]
def startpage(query, lang):
# startpage autocompleter
engine = engines['startpage']
_, engine_language, _ = engine.get_engine_locale(lang)
_, engine_language, _ = engines['startpage'].supported_locales.get(lang)
url = 'https://startpage.com/suggestions?{query}'
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': engine_language})))

View file

@ -1561,6 +1561,7 @@
"zh-HK"
],
"startpage": {
"all_language": "en-US",
"languages": {
"af": "afrikaans",
"am": "amharic",
@ -1693,8 +1694,7 @@
"zh-CN": "zh-CN_CN",
"zh-HK": "zh-TW_HK",
"zh-TW": "zh-TW_TW"
},
"type": "engine_properties"
}
},
"wikidata": {
"ab": {

View file

@ -13,13 +13,14 @@ usage::
import sys
import copy
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Any
from os.path import realpath, dirname
from babel.localedata import locale_identifiers
from searx import logger, settings
from searx.data import ENGINES_LANGUAGES
from searx.utils import load_module, match_language
from searx.locales import SupportedLocales
logger = logger.getChild('engines')
@ -35,7 +36,7 @@ ENGINE_DEFAULT_ARGS = {
"timeout": settings["outgoing"]["request_timeout"],
"shortcut": "-",
"categories": ["general"],
"language_support" : False,
"language_support": False,
"paging": False,
"safesearch": False,
"time_range_support": False,
@ -56,11 +57,13 @@ class Engine: # pylint: disable=too-few-public-methods
engine: str
shortcut: str
categories: List[str]
supported_languages: List[str]
supported_locales: SupportedLocales
# language support, either by selecting a region or by selecting a language
language_support: bool
about: dict
inactive: bool
disabled: bool
# language support, either by selecting a region or by selecting a language
language_support: bool
paging: bool
safesearch: bool
time_range_support: bool
@ -141,25 +144,6 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
return engine
def engine_properties_template():
    """Return a new, empty *engine properties* dictionary.

    The dictionary maps SearXNG's language & region tags to the engine's own
    language & region tags::

        engine_properties = {
            'type' : 'engine_properties',
            'regions': {
                # 'ca-ES' : <engine's region name>
            },
            'languages': {
                # 'ca' : <engine's language name>
            },
        }

    Every call builds fresh containers, so the result can be filled in place
    without affecting other callers.
    """
    template = {'type': 'engine_properties'}
    # same key order as the documented layout: regions first, then languages
    for section in ('regions', 'languages'):
        template[section] = {}
    return template
def set_loggers(engine, engine_name):
# set the logger for engine
@ -197,10 +181,10 @@ def update_engine_attributes(engine: Engine, engine_data):
def set_language_attributes(engine: Engine):
# assign supported languages from json file
supported_properties = None
data: Any = None
if engine.name in ENGINES_LANGUAGES:
supported_properties = ENGINES_LANGUAGES[engine.name]
data = ENGINES_LANGUAGES[engine.name]
elif engine.engine in ENGINES_LANGUAGES:
# The key of the dictionary ENGINES_LANGUAGES is the *engine name*
@ -208,48 +192,48 @@ def set_language_attributes(engine: Engine):
# settings.yml to use the same origin engine (python module) these
# additional engines can use the languages from the origin engine.
# For this use the configured ``engine: ...`` from settings.yml
supported_properties = ENGINES_LANGUAGES[engine.engine]
data = ENGINES_LANGUAGES[engine.engine]
if not supported_properties:
# supported_locales is always defined
engine.supported_locales = SupportedLocales.loads(data)
engine.language_support = not engine.supported_locales.empty()
if data is None:
return
if isinstance(supported_properties, dict) and supported_properties.get('type') == 'engine_properties':
engine.supported_properties = supported_properties
engine.language_support = len(supported_properties['languages']) or len(supported_properties['regions'])
if engine.language_support:
# to do: implement engine.language equivalent by calling a method of SupportedLocales
return
else:
# depricated: does not work for engines that do support languages
# based on a region.
engine.supported_languages = supported_properties
engine.language_support = len(engine.supported_languages) > 0
# deprecated: does not work for engines that do support languages based on a region.
engine.supported_languages = data
engine.language_support = len(engine.supported_languages) > 0
if hasattr(engine, 'language'):
# For an engine, when there is `language: ...` in the YAML settings, the
# engine supports only one language, in this case
# engine.supported_languages should contains this value defined in
# settings.yml
if engine.language not in engine.supported_languages:
raise ValueError(
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
)
if hasattr(engine, 'language'):
# For an engine, when there is `language: ...` in the YAML settings, the engine supports only one language;
# in this case engine.supported_languages must contain the value defined in settings.yml
if engine.language not in engine.supported_languages:
raise ValueError(
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
)
if isinstance(engine.supported_languages, dict):
engine.supported_languages = {engine.language: engine.supported_languages[engine.language]}
else:
engine.supported_languages = [engine.language]
if isinstance(engine.supported_languages, dict):
engine.supported_languages = {engine.language: engine.supported_languages[engine.language]}
else:
engine.supported_languages = [engine.language]
if not hasattr(engine, 'language_aliases'):
engine.language_aliases = {}
# find custom aliases for non standard language codes
for engine_lang in engine.supported_languages:
iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
if (
iso_lang
and iso_lang != engine_lang
and not engine_lang.startswith(iso_lang)
and iso_lang not in engine.supported_languages
):
engine.language_aliases[iso_lang] = engine_lang
if not hasattr(engine, 'language_aliases'):
engine.language_aliases = {}
# find custom aliases for non standard language codes
for engine_lang in engine.supported_languages:
iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
if (
iso_lang
and iso_lang != engine_lang
and not engine_lang.startswith(iso_lang)
and iso_lang not in engine.supported_languages
):
engine.language_aliases[iso_lang] = engine_lang
def update_attributes_for_tor(engine: Engine) -> bool:

View file

@ -18,6 +18,7 @@ import babel
from searx.network import get
from searx.utils import extract_text, eval_xpath
from searx.locales import SupportedLocales
from searx.exceptions import (
SearxEngineResponseException,
SearxEngineCaptchaException,
@ -46,7 +47,7 @@ filter_mapping = {0: '0', 1: '1', 2: '1'}
time_range_support = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
supported_properties_url = 'https://www.startpage.com/do/settings'
supported_locales_url = 'https://www.startpage.com/do/settings'
# search-url
base_url = 'https://www.startpage.com/'
@ -109,33 +110,12 @@ def get_sc_code(headers):
return sc_code
# Translate a SearXNG language tag into the tuple
# (babel locale, engine language, engine region) used to build a request.
def get_engine_locale(language):
# 'all' has no locale of its own: it is looked up as 'en-US'
if language == 'all':
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
# NOTE(review): supported_properties is not defined in this function --
# presumably a module-level mapping with 'languages' and 'regions' keys; confirm.
engine_language = supported_properties['languages'].get(locale.language)
# an unmapped language is only logged, engine_language is returned as is
if not engine_language:
logger.debug("startpage does NOT support language: %s", locale.language)
engine_region = None
# regions are keyed by '<language>-<territory>', so a lookup needs a territory
if locale.territory:
engine_region = supported_properties['regions'].get(locale.language + '-' + locale.territory)
# no region found: log it and use the region value 'all'
if not engine_region:
logger.debug("no region in selected (only lang: '%s'), using region 'all'", language)
engine_region = 'all'
logger.debug(
"UI language: %s --> engine language: %s // engine region: %s",
language, engine_language, engine_region
)
return locale, engine_language, engine_region
def request(query, params):
locale, engine_language, engine_region = get_engine_locale(params['language'])
locale, engine_language, engine_region = params['locale'], params['engine_language'], params['engine_region']
if engine_region is None:
engine_region = 'all'
# prepare HTTP headers
ac_lang = locale.language
@ -151,7 +131,7 @@ def request(query, params):
'cat': 'web',
't': 'device',
'sc': get_sc_code(params['headers']), # hint: this func needs HTTP headers
'with_date' : time_range_dict.get(params['time_range'], '')
'with_date': time_range_dict.get(params['time_range'], ''),
}
if engine_language:
@ -187,7 +167,7 @@ def request(query, params):
if engine_region:
cookie['search_results_region'] = engine_region
params['cookies']['preferences'] = 'N1N'.join([ "%sEEE%s" % x for x in cookie.items() ])
params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
logger.debug('cookie preferences: %s', params['cookies']['preferences'])
params['method'] = 'POST'
@ -263,7 +243,7 @@ def response(resp):
return results
def _fetch_engine_properties(resp, engine_properties):
def _fetch_supported_locales(resp):
# startpage's language & region selectors are a mess.
#
@ -291,6 +271,8 @@ def _fetch_engine_properties(resp, engine_properties):
# name of the writing script used by the language, or occasionally
# something else entirely.
supported_locales = SupportedLocales(all_language='en-US')
dom = html.fromstring(resp.text)
# regions
@ -305,27 +287,21 @@ def _fetch_engine_properties(resp, engine_properties):
if '-' in sp_region_tag:
l, r = sp_region_tag.split('-')
r = r.split('_')[-1]
locale = babel.Locale.parse(l +'_'+ r, sep='_')
locale = babel.Locale.parse(l + '_' + r, sep='_')
else:
locale = babel.Locale.parse(sp_region_tag, sep='_')
region_tag = locale.language + '-' + locale.territory
# print("internal: %s --> engine: %s" % (region_tag, sp_region_tag))
engine_properties['regions'][region_tag] = sp_region_tag
supported_locales.regions[region_tag] = sp_region_tag
# languages
catalog_engine2code = {
name.lower(): lang_code
for lang_code, name in babel.Locale('en').languages.items()
}
catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}
# get the native name of every language known by babel
for lang_code in filter(
lambda lang_code: lang_code.find('_') == -1,
babel.localedata.locale_identifiers()
):
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
native_name = babel.Locale(lang_code).get_language_name().lower()
# add native name exactly as it is
catalog_engine2code[native_name] = lang_code
@ -338,17 +314,19 @@ def _fetch_engine_properties(resp, engine_properties):
# values that can't be determined by babel's languages names
catalog_engine2code.update({
'english_uk': 'en',
# traditional chinese used in ..
'fantizhengwen': 'zh_Hant',
# Korean alphabet
'hangul': 'ko',
# Malayalam is one of 22 scheduled languages of India.
'malayam': 'ml',
'norsk': 'nb',
'sinhalese': 'si',
})
catalog_engine2code.update(
{
'english_uk': 'en',
# traditional chinese used in ..
'fantizhengwen': 'zh_Hant',
# Korean alphabet
'hangul': 'ko',
# Malayalam is one of 22 scheduled languages of India.
'malayam': 'ml',
'norsk': 'nb',
'sinhalese': 'si',
}
)
for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
engine_lang = option.get('value')
@ -359,6 +337,6 @@ def _fetch_engine_properties(resp, engine_properties):
lang_code = catalog_engine2code[name]
# print("internal: %s --> engine: %s" % (lang_code, engine_lang))
engine_properties['languages'][lang_code] = engine_lang
supported_locales.languages[lang_code] = engine_lang
return engine_properties
return supported_locales

View file

@ -4,7 +4,7 @@
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
"""
from typing import Set
from typing import Set, Dict, Optional, Tuple
import os
import pathlib
@ -22,6 +22,70 @@ RTL_LOCALES: Set[str] = set()
*underline* '-')"""
class SupportedLocales:
    """Map the ``Preferences.get("languages")`` value to a Locale, an engine
    language and an engine region.

    The class is intended to be instantiated once for each engine.
    """

    all_language: Optional[str]
    """
    To which locale value the "all" language is mapped (shown as "Default language")
    """

    regions: Dict[str, str]
    """
    {
        'fr-BE' : <engine's region name>
    },
    """

    languages: Dict[str, str]
    """
    {
        'ca' : <engine's language name>
    },
    """

    @classmethod
    def loads(cls, data):
        """Build an instance from the structure produced by :py:obj:`dumps`.

        ``data`` is accepted only if it is a dict holding all of the keys
        ``all_language``, ``languages`` and ``regions``.  Anything else
        (``None``, a list, a dict in another layout) yields an empty instance,
        so the caller always gets a usable object.
        """
        required_keys = ('all_language', 'languages', 'regions')
        if isinstance(data, dict) and all(key in data for key in required_keys):
            return cls(data['all_language'], data['regions'], data['languages'])
        return cls()

    def __init__(self, all_language=None, regions=None, languages=None):
        """Store the mappings; a missing (or empty) mapping becomes a new dict."""
        self.all_language = all_language
        self.regions = regions or {}
        self.languages = languages or {}

    def empty(self):
        """Return ``True`` when neither a region nor a language is mapped."""
        return not self.regions and not self.languages

    def get(self, language: str) -> Tuple[Optional[Locale], Optional[str], Optional[str]]:
        """Return the tuple ``(locale, engine_language, engine_region)`` for ``language``.

        ``language`` is either ``'all'`` or a tag using ``-`` as separator.
        ``'all'`` is replaced by :py:obj:`all_language`; when that is ``None``
        the result is ``(None, None, None)``.

        ``engine_language`` and ``engine_region`` are ``None`` when there is no
        mapping for them.  The region is looked up only if the locale has a
        territory.  Whatever ``Locale.parse`` raises for a tag it cannot parse
        is propagated unchanged.
        """
        if language == 'all':
            if self.all_language is None:
                return None, None, None
            language = self.all_language

        locale = Locale.parse(language, sep='-')
        engine_language = self.languages.get(locale.language)

        engine_region = None
        if locale.territory:
            # regions are keyed by '<language>-<territory>', e.g. 'fr-BE'
            engine_region = self.regions.get(locale.language + '-' + locale.territory)

        return locale, engine_language, engine_region

    def dumps(self):
        """Return the mappings as a plain dict, the layout read by :py:obj:`loads`."""
        return {
            'all_language': self.all_language,
            'regions': self.regions,
            'languages': self.languages,
        }
def _get_name(locale, language_code):
language_name = locale.get_language_name(language_code).capitalize()
if language_name and ('a' <= language_name[0] <= 'z'):

View file

@ -157,6 +157,18 @@ class EngineProcessor(ABC):
params['language'] = self.engine.language
else:
params['language'] = search_query.lang
params['locale'], params['engine_language'], params['engine_region'] = self.engine.supported_locales.get(
params['language']
)
if params['engine_language']:
self.logger.debug(
'language:"%s" --> %s, engine_language:"%s", engine_region:"%s"',
params['language'],
repr(params['locale']),
params['engine_language'],
params['engine_region'],
)
return params
@abstractmethod

View file

@ -37,18 +37,17 @@ from babel.core import parse_locale
from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines, engines, engine_properties_template
from searx.engines import load_engines, engines
from searx.utils import gen_useragent
# Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py'
def fetch_supported_languages():
"""Fetchs supported languages for each engine and writes json file with those.
"""
"""Fetchs supported languages for each engine and writes json file with those."""
network.set_timeout_for_thread(10.0)
engines_languages = {}
names = list(engines)
@ -68,31 +67,29 @@ def fetch_supported_languages():
for engine_name in names:
engine = engines[engine_name]
fetch_languages = getattr(engine, '_fetch_supported_languages', None)
fetch_properties = getattr(engine, '_fetch_engine_properties', None)
fetch_locales = getattr(engine, '_fetch_supported_locales', None)
if fetch_properties is not None:
resp = network.get(engine.supported_properties_url, headers=headers)
engine_properties = engine_properties_template()
fetch_properties(resp, engine_properties)
print("%s: %s languages" % (engine_name, len(engine_properties['languages'])))
print("%s: %s regions" % (engine_name, len(engine_properties['regions'])))
if fetch_locales is not None and fetch_languages is not None:
print('%s: Both _fetch_supported_languages and _fetch_supported_locales are defined.' % (engine_name,))
if fetch_locales is not None:
resp = network.get(engine.supported_locales_url, headers=headers)
supported_locales = fetch_locales(resp)
print("%s: %s languages" % (engine_name, len(supported_locales.languages)))
print("%s: %s regions" % (engine_name, len(supported_locales.regions)))
data = supported_locales.dumps()
elif fetch_languages is not None:
# print("%s: using deepricated _fetch_fetch_languages()" % engine_name)
# print("%s: using deprecated _fetch_fetch_languages()" % engine_name)
resp = network.get(engine.supported_languages_url, headers=headers)
engine_properties = fetch_languages(resp)
if isinstance(engine_properties, list):
engine_properties.sort()
data = fetch_languages(resp)
if isinstance(data, list):
data.sort()
print("%s: fetched language %s containing %s items" % (
engine_name,
engine_properties.__class__.__name__,
len(engine_properties)
))
print("%s: fetched language %s containing %s items" % (engine_name, data.__class__.__name__, len(data)))
else:
continue
engines_languages[engine_name] = engine_properties
engines_languages[engine_name] = data
print("fetched properties from %s engines" % len(engines_languages))
print("write json file: %s" % (engines_languages_file))
@ -172,6 +169,7 @@ def get_territory_name(lang_code):
print("ERROR: %s --> %s" % (locale, exc))
return country_name
def join_language_lists(engines_languages):
"""Join all languages of the engines into one list. The returned language list
contains language codes (``zh``) and region codes (``zh-TW``). The codes can
@ -197,9 +195,7 @@ def join_language_lists(engines_languages):
# apply custom fixes if necessary
if lang_code in getattr(engine, 'language_aliases', {}).values():
lang_code = next(
lc for lc, alias in engine.language_aliases.items() if lang_code == alias
)
lang_code = next(lc for lc, alias in engine.language_aliases.items() if lang_code == alias)
locale = get_locale(lang_code)