[fix] bing-news engine: language support (by bing-news market code)

- bing-news uses market codes / see query-parameter 'mkt' at [1]:

  The market must be in the form <language>-<country/region>.
  ...
  If known, you are encouraged to always specify the market. Specifying the
  market helps Bing route the request and return an appropriate and optimal
  response. If you specify a market that is not listed in Market codes, Bing
  uses a best fit market code based on an internal mapping that is subject to
  change.

- fetch market codes from [2]

[1] https://docs.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/query-parameters
[2] https://docs.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-04-11 17:33:07 +02:00
parent 81ffabee4f
commit aed92b96f9
3 changed files with 147 additions and 165 deletions

View file

@ -204,106 +204,44 @@
"zu" "zu"
], ],
"bing news": [ "bing news": [
"af", "da-DK",
"am", "de-AT",
"ar", "de-CH",
"as", "de-DE",
"az", "en-AU",
"be", "en-CA",
"bg", "en-GB",
"bn", "en-ID",
"bs", "en-IN",
"ca", "en-MY",
"chr", "en-NZ",
"cs", "en-PH",
"cy", "en-US",
"da", "en-ZA",
"de", "es-AR",
"el", "es-CL",
"en", "es-ES",
"es", "es-MX",
"et", "es-US",
"eu", "fi-FI",
"fa", "fr-BE",
"fi", "fr-CA",
"fil", "fr-CH",
"fr", "fr-FR",
"ga", "it-IT",
"gd", "ja-JP",
"gl", "ko-KR",
"gu", "nl-BE",
"ha", "nl-NL",
"he", "no-NO",
"hi", "pl-PL",
"hr", "pt-BR",
"hu", "ru-RU",
"hy", "sv-SE",
"id", "tr-TR",
"ig", "zh-CN",
"is", "zh-HK",
"it", "zh-TW"
"ja",
"ka",
"kk",
"km",
"kn",
"ko",
"kok",
"ku",
"ky",
"lb",
"lo",
"lt",
"lv",
"mi",
"mk",
"ml",
"mn",
"mr",
"ms",
"mt",
"nb",
"ne",
"nl",
"nn",
"nso",
"or",
"pa",
"pl",
"prs",
"pt",
"quc",
"quz",
"ro",
"ru",
"rw",
"sd",
"si",
"sk",
"sl",
"sq",
"sr",
"sv",
"sw",
"ta",
"te",
"tg",
"th",
"ti",
"tk",
"tn",
"tr",
"tt",
"ug",
"uk",
"ur",
"uz",
"vi",
"wo",
"xh",
"yo",
"zh",
"zu"
], ],
"bing videos": [ "bing videos": [
"af", "af",

View file

@ -11,20 +11,21 @@ from urllib.parse import (
) )
from datetime import datetime from datetime import datetime
from dateutil import parser from dateutil import parser
from lxml import etree
import babel
from lxml import etree, html
from lxml.etree import XPath from lxml.etree import XPath
from searx.utils import match_language, eval_xpath_getindex
from searx.engines.bing import ( # pylint: disable=unused-import from searx.utils import (
language_aliases, eval_xpath_getindex,
_fetch_supported_languages, eval_xpath,
supported_languages_url,
) )
# about # about
about = { about = {
"website": 'https://www.bing.com/news', "website": 'https://www.bing.com/news',
"wikidata_id": 'Q2878637', "wikidata_id": 'Q2878637',
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api', "official_api_documentation": 'https://docs.microsoft.com/en-us/bing/search-apis/bing-news-search',
"use_official_api": False, "use_official_api": False,
"require_api_key": False, "require_api_key": False,
"results": 'RSS', "results": 'RSS',
@ -34,79 +35,72 @@ about = {
categories = ['news'] categories = ['news']
paging = True paging = True
time_range_support = True time_range_support = True
supported_languages_url = 'https://docs.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/news'
search_string = 'news/search?{query}&first={offset}&format=RSS'
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS' #search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
#https://www.bing.com/news/search?q=foo&format=RSS
#https://www.bing.com/news/search?q=foo&setmkt=de&first=1&qft=interval%3D%227%22&format=RSS
# https://www.bing.com/news/search?q=foo&cc=en-UK&first=1&qft=interval%3D%227%22&format=RSS
time_range_dict = {'day': '7', 'week': '8', 'month': '9'} time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
def url_cleanup(url_string):
    """Strip Bing's click-tracking redirect and return the target URL.

    Bing news results point at ``www.bing.com/news/apiclick.aspx`` with the
    real article URL carried in the ``url`` query argument; anything else is
    returned untouched.
    """
    parts = urlparse(url_string)
    is_click_redirect = (
        parts.netloc == 'www.bing.com' and parts.path == '/news/apiclick.aspx'
    )
    if is_click_redirect:
        return dict(parse_qsl(parts.query)).get('url', None)
    return url_string
def image_url_cleanup(url_string):
    """Rewrite ``http://*bing.com/th?id=...`` thumbnail URLs to the canonical
    ``https://www.bing.com/th?id=...`` form; other URLs pass through unchanged.
    """
    parts = urlparse(url_string)
    if parts.path == '/th' and parts.netloc.endswith('bing.com'):
        thumbnail_id = dict(parse_qsl(parts.query)).get('id')
        return "https://www.bing.com/th?id=" + quote(thumbnail_id)
    return url_string
def _get_url(query, language, offset, time_range):
    """Assemble the bing-news RSS search URL.

    The market ('setmkt') is encoded alongside the query; when *time_range*
    is one of the supported intervals the time-filtered search path is used.
    """
    encoded_query = urlencode({
        'q': query,
        'setmkt': language,
    })
    if time_range in time_range_dict:
        search_path = search_string_with_time.format(
            query=encoded_query,
            offset=offset,
            interval=time_range_dict[time_range],
        )
    else:
        # e.g. setmkt=de-de&setlang=de
        search_path = search_string.format(
            query=encoded_query,
            offset=offset,
        )
    return base_url + search_path
def request(query, params): def request(query, params):
if params['time_range'] and params['time_range'] not in time_range_dict: language = params['language']
return params if language == 'all':
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
language = 'en-US' language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
req_args = {
'q' : query,
'format': 'RSS'
}
if locale.territory:
market_code = locale.language + '-' + locale.territory
if market_code in supported_languages:
req_args['setmkt'] = market_code
else: else:
language = match_language(params['language'], supported_languages, language_aliases) # Seems that language code can be used as market_code alternative,
params['url'] = _get_url(query, language, offset, params['time_range']) # when bing-news does not support the market_code (including
# territory), but news results are better if there is a territory
# given.
req_args['setmkt'] = locale.language
if params['pageno'] > 1:
req_args['first'] = (params['pageno'] - 1) * 10 + 1
params['url'] = base_url + '/search?' + urlencode(req_args)
interval = time_range_dict.get(params['time_range'])
if interval:
params['url'] += f'&qft=interval%3d"{interval}"'
ac_lang = locale.language
if locale.territory:
ac_lang = "%s-%s,%s;q=0.5" % (locale.language, locale.territory, locale.language)
logger.debug("headers.Accept-Language --> %s", ac_lang)
params['headers']['Accept-Language'] = ac_lang
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params return params
def response(resp): def response(resp):
results = [] try:
rss = etree.fromstring(resp.content) rss = etree.fromstring(resp.content)
except etree.XMLSyntaxError:
return []
results = []
namespaces = rss.nsmap namespaces = rss.nsmap
for item in rss.xpath('./channel/item'): for item in rss.xpath('./channel/item'):
@ -138,3 +132,54 @@ def response(resp):
results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}) results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content})
return results return results
def url_cleanup(url_string):
    """Remove Bing's news click-tracker, returning the real article URL.

    Only ``www.bing.com/news/apiclick.aspx`` links are unwrapped (via their
    ``url`` query argument); every other URL is returned as-is.
    """
    url = urlparse(url_string)
    if (url.netloc, url.path) == ('www.bing.com', '/news/apiclick.aspx'):
        query_args = dict(parse_qsl(url.query))
        return query_args.get('url', None)
    return url_string
def image_url_cleanup(url_string):
    """Replace ``http://*bing.com/th?id=...`` by ``https://www.bing.com/th?id=...``."""
    url = urlparse(url_string)
    on_bing_host = url.netloc.endswith('bing.com')
    if on_bing_host and url.path == '/th':
        image_id = dict(parse_qsl(url.query)).get('id')
        url_string = "https://www.bing.com/th?id=" + quote(image_id)
    return url_string
def _fetch_supported_languages(resp):
    """Scrape the market codes supported by the Bing Web Search API.

    Parses the HTML table of the official market-code documentation page
    (``resp.text``) and returns a list of unique market codes in
    ``<language>-<country/region>`` form (e.g. ``de-DE``).  The market code
    itself sits in the third column of the table headed 'Market code'.
    """
    dom = html.fromstring(resp.text)
    market_codes = eval_xpath(
        dom,
        "//th[normalize-space(text()) = 'Market code']/../../../tbody/tr/td[3]/text()",
    )
    # de-duplicate directly via set(); keep the list return type callers expect
    return list(set(market_codes))

View file

@ -32,7 +32,6 @@ language_codes = (
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'), ('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),