[fix] bing engines: fetch_supported_languages

The request to and the response from https://www.bing.com/account/general have
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-01-01 16:47:47 +01:00
parent 021b4a0a02
commit 61ce0c2244
2 changed files with 21 additions and 12 deletions

View File

@ -193,7 +193,7 @@ def set_language_attributes(engine):
if hasattr(engine, '_fetch_supported_languages'): if hasattr(engine, '_fetch_supported_languages'):
headers = { headers = {
'User-Agent': gen_useragent(), 'User-Agent': gen_useragent(),
'Accept-Language': 'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3', # bing needs a non-English language 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
} }
engine.fetch_supported_languages = ( engine.fetch_supported_languages = (
# pylint: disable=protected-access # pylint: disable=protected-access

View File

@ -6,7 +6,7 @@
""" """
import re import re
from urllib.parse import urlencode from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html from lxml import html
from searx.utils import eval_xpath, extract_text, match_language from searx.utils import eval_xpath, extract_text, match_language
@ -25,7 +25,7 @@ paging = True
time_range_support = False time_range_support = False
safesearch = False safesearch = False
supported_languages_url = 'https://www.bing.com/account/general' supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'} language_aliases = {}
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'
@ -127,18 +127,27 @@ def response(resp):
# get supported languages from their site # get supported languages from their site
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
lang_tags = set() lang_tags = set()
setmkt = re.compile('setmkt=([^&]*)')
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
lang_links = eval_xpath(dom, "//li/a[contains(@href, 'setmkt')]") lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')
for a in lang_links: for _li in lang_links:
href = eval_xpath(a, './@href')[0]
match = setmkt.search(href) href = eval_xpath(_li, './/@href')[0]
l_tag = match.groups()[0] (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href)
_lang, _nation = l_tag.split('-', 1) query = parse_qs(query, keep_blank_values=True)
l_tag = _lang.lower() + '-' + _nation.upper()
lang_tags.add(l_tag) # fmt: off
setlang = query.get('setlang', [None, ])[0]
# example: 'mn-Cyrl-MN' --> ['mn', 'Cyrl-MN']
lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip
# fmt: on
if not nation:
nation = lang.upper()
tag = lang + '-' + nation
lang_tags.add(tag)
return list(lang_tags) return list(lang_tags)