fetch supported languages for startpage engine

This commit is contained in:
Marc Abonce Seguin 2020-09-14 00:06:58 -07:00 committed by Alexandre Flament
parent ea9d979cc3
commit 41800835f9

View File

@ -14,9 +14,12 @@ from lxml import html
from dateutil import parser from dateutil import parser
from datetime import datetime, timedelta from datetime import datetime, timedelta
import re import re
from unicodedata import normalize, combining
from babel import Locale
from babel.localedata import locale_identifiers
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.languages import language_codes from searx.languages import language_codes
from searx.utils import eval_xpath from searx.utils import eval_xpath, match_language
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
@ -26,6 +29,7 @@ categories = ['general']
paging = True paging = True
language_support = True language_support = True
supported_languages_url = 'https://www.startpage.com/do/settings'
# search-url # search-url
base_url = 'https://startpage.com/' base_url = 'https://startpage.com/'
@ -54,12 +58,11 @@ def request(query, params):
# set language if specified # set language if specified
if params['language'] != 'all': if params['language'] != 'all':
language = 'english' lang_code = match_language(params['language'], supported_languages, fallback=None)
for lc, _, _, lang in language_codes: if lang_code:
if lc == params['language']: language_name = supported_languages[lang_code]['alias']
language = lang params['data']['language'] = language_name
params['data']['language'] = language params['data']['lui'] = language_name
params['data']['lui'] = language
return params return params
@ -132,3 +135,55 @@ def response(resp):
# return results # return results
return results return results
# get supported languages from their site
def _fetch_supported_languages(resp):
# startpage's language selector is a mess
# each option has a displayed name and a value, either of which may represent the language name
# in the native script, the language name in English, an English transliteration of the native name,
# the English name of the writing script used by the language, or occasionally something else entirely.
# this cases are so special they need to be hardcoded, a couple of them are mispellings
language_names = {
'english_uk': 'en-GB',
'fantizhengwen': ['zh-TW', 'zh-HK'],
'hangul': 'ko',
'malayam': 'ml',
'norsk': 'nb',
'sinhalese': 'si',
'sudanese': 'su'
}
# get the English name of every language known by babel
language_names.update({name.lower(): lang_code for lang_code, name in Locale('en')._data['languages'].items()})
# get the native name of every language known by babel
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
native_name = Locale(lang_code).get_language_name().lower()
# add native name exactly as it is
language_names[native_name] = lang_code
# add "normalized" language name (i.e. français becomes francais and español becomes espanol)
unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
if len(unaccented_name) == len(unaccented_name.encode()):
# add only if result is ascii (otherwise "normalization" didn't work)
language_names[unaccented_name] = lang_code
dom = html.fromstring(resp.text)
sp_lang_names = []
for option in dom.xpath('//form[@id="settings-form"]//select[@name="language"]/option'):
sp_lang_names.append((option.get('value'), extract_text(option).lower()))
supported_languages = {}
for sp_option_value, sp_option_text in sp_lang_names:
lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
if isinstance(lang_code, str):
supported_languages[lang_code] = {'alias': sp_option_value}
elif isinstance(lang_code, list):
for lc in lang_code:
supported_languages[lc] = {'alias': sp_option_value}
else:
print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))
return supported_languages