From 61ce0c2244135e88c6c015ff29d5e896a49f46b6 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 1 Jan 2022 16:47:47 +0100 Subject: [PATCH 1/2] [fix] bing engines: fetch_supported_languages The Request to and the Response from https://www.bing.com/account/general have been changed. [1] https://github.com/searxng/searxng/pull/672#discussion_r777104919 Signed-off-by: Markus Heiser --- searx/engines/__init__.py | 2 +- searx/engines/bing.py | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index fa9749e9d..a3dd7a95a 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -193,7 +193,7 @@ def set_language_attributes(engine): if hasattr(engine, '_fetch_supported_languages'): headers = { 'User-Agent': gen_useragent(), - 'Accept-Language': 'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3', # bing needs a non-English language + 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language } engine.fetch_supported_languages = ( # pylint: disable=protected-access diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 59fc22be4..1170227ad 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -6,7 +6,7 @@ """ import re -from urllib.parse import urlencode +from urllib.parse import urlencode, urlparse, parse_qs from lxml import html from searx.utils import eval_xpath, extract_text, match_language @@ -25,7 +25,7 @@ paging = True time_range_support = False safesearch = False supported_languages_url = 'https://www.bing.com/account/general' -language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} +language_aliases = {} # search-url base_url = 'https://www.bing.com/' @@ -127,18 +127,27 @@ def response(resp): # get supported languages from their site def _fetch_supported_languages(resp): + lang_tags = set() - setmkt = re.compile('setmkt=([^&]*)') dom = html.fromstring(resp.text) - lang_links = eval_xpath(dom, 
"//li/a[contains(@href, 'setmkt')]") + lang_links = eval_xpath(dom, '//div[@id="language-section"]//li') - for a in lang_links: - href = eval_xpath(a, './@href')[0] - match = setmkt.search(href) - l_tag = match.groups()[0] - _lang, _nation = l_tag.split('-', 1) - l_tag = _lang.lower() + '-' + _nation.upper() - lang_tags.add(l_tag) + for _li in lang_links: + + href = eval_xpath(_li, './/@href')[0] + (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href) + query = parse_qs(query, keep_blank_values=True) + + # fmt: off + setlang = query.get('setlang', [None, ])[0] + # example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN'] + lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip + # fmt: on + + if not nation: + nation = lang.upper() + tag = lang + '-' + nation + lang_tags.add(tag) return list(lang_tags) From 8a07559ab54da64b916552ef59e86f5cceab623a Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 1 Jan 2022 17:22:22 +0100 Subject: [PATCH 2/2] [fix] update_languages.py: no exception on unknown locale & language Fix exception handling of unknown locales and languages:: ERROR: ca_ES_valencia --> [Errno 2] No such file or directory: 'local/py3/lib/python3.8/site-packages/babel/locale-data/ca_ES_valencia.dat' ERROR: languages['fil-PH'] --> {'name': None, 'english_name': None} ERROR: languages['nb-NO'] --> {'name': None, 'english_name': None} Signed-off-by: Markus Heiser --- searxng_extra/update/update_languages.py | 37 ++++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/searxng_extra/update/update_languages.py b/searxng_extra/update/update_languages.py index 526469342..3a172d5dc 100755 --- a/searxng_extra/update/update_languages.py +++ b/searxng_extra/update/update_languages.py @@ -35,6 +35,8 @@ def fetch_supported_languages(): if type(engines_languages[engine_name]) == list: engines_languages[engine_name] = sorted(engines_languages[engine_name]) + print("fetched languages from %s engines" % 
len(engines_languages)) + # write json file with open(engines_languages_file, 'w', encoding='utf-8') as f: json.dump(engines_languages, f, indent=2, sort_keys=True) @@ -97,7 +99,11 @@ def join_language_lists(engines_languages): country_name = '' if locale: # get country name from babel's Locale object - country_name = locale.get_territory_name() + try: + country_name = locale.get_territory_name() + except FileNotFoundError as exc: + print("ERROR: %s --> %s" % (locale, exc)) + locale = None language_list[short_code]['countries'][lang_code] = {'country_name': country_name, 'counter': set()} @@ -186,17 +192,24 @@ def write_languages_file(languages): "language_codes =", ) - language_codes = tuple( - [ - ( - code, - languages[code]['name'].split(' (')[0], - languages[code].get('country_name') or '', - languages[code].get('english_name') or '', - ) - for code in sorted(languages) - ] - ) + language_codes = [] + + for code in sorted(languages): + + name = languages[code]['name'] + if name is None: + print("ERROR: languages['%s'] --> %s" % (code, languages[code])) + continue + item = ( + code, + languages[code]['name'].split(' (')[0], + languages[code].get('country_name') or '', + languages[code].get('english_name') or '', + ) + + language_codes.append(item) + + language_codes = tuple(language_codes) with open(languages_file, 'w') as new_file: file_content = "{file_headers} \\\n{language_codes}".format(