mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
[fix] bing-news engine: language support (by bing-news market code)
- bing-news uses market codes / see query-parameter 'mkt' at [1]: The market must be in the form <language>-<country/region>. ... If known, you are encouraged to always specify the market. Specifying the market helps Bing route the request and return an appropriate and optimal response. If you specify a market that is not listed in Market codes, Bing uses a best fit market code based on an internal mapping that is subject to change. - fetch market codes from [2] [1] https://docs.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/query-parameters [2] https://docs.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
81ffabee4f
commit
aed92b96f9
3 changed files with 147 additions and 165 deletions
|
@ -204,106 +204,44 @@
|
||||||
"zu"
|
"zu"
|
||||||
],
|
],
|
||||||
"bing news": [
|
"bing news": [
|
||||||
"af",
|
"da-DK",
|
||||||
"am",
|
"de-AT",
|
||||||
"ar",
|
"de-CH",
|
||||||
"as",
|
"de-DE",
|
||||||
"az",
|
"en-AU",
|
||||||
"be",
|
"en-CA",
|
||||||
"bg",
|
"en-GB",
|
||||||
"bn",
|
"en-ID",
|
||||||
"bs",
|
"en-IN",
|
||||||
"ca",
|
"en-MY",
|
||||||
"chr",
|
"en-NZ",
|
||||||
"cs",
|
"en-PH",
|
||||||
"cy",
|
"en-US",
|
||||||
"da",
|
"en-ZA",
|
||||||
"de",
|
"es-AR",
|
||||||
"el",
|
"es-CL",
|
||||||
"en",
|
"es-ES",
|
||||||
"es",
|
"es-MX",
|
||||||
"et",
|
"es-US",
|
||||||
"eu",
|
"fi-FI",
|
||||||
"fa",
|
"fr-BE",
|
||||||
"fi",
|
"fr-CA",
|
||||||
"fil",
|
"fr-CH",
|
||||||
"fr",
|
"fr-FR",
|
||||||
"ga",
|
"it-IT",
|
||||||
"gd",
|
"ja-JP",
|
||||||
"gl",
|
"ko-KR",
|
||||||
"gu",
|
"nl-BE",
|
||||||
"ha",
|
"nl-NL",
|
||||||
"he",
|
"no-NO",
|
||||||
"hi",
|
"pl-PL",
|
||||||
"hr",
|
"pt-BR",
|
||||||
"hu",
|
"ru-RU",
|
||||||
"hy",
|
"sv-SE",
|
||||||
"id",
|
"tr-TR",
|
||||||
"ig",
|
"zh-CN",
|
||||||
"is",
|
"zh-HK",
|
||||||
"it",
|
"zh-TW"
|
||||||
"ja",
|
|
||||||
"ka",
|
|
||||||
"kk",
|
|
||||||
"km",
|
|
||||||
"kn",
|
|
||||||
"ko",
|
|
||||||
"kok",
|
|
||||||
"ku",
|
|
||||||
"ky",
|
|
||||||
"lb",
|
|
||||||
"lo",
|
|
||||||
"lt",
|
|
||||||
"lv",
|
|
||||||
"mi",
|
|
||||||
"mk",
|
|
||||||
"ml",
|
|
||||||
"mn",
|
|
||||||
"mr",
|
|
||||||
"ms",
|
|
||||||
"mt",
|
|
||||||
"nb",
|
|
||||||
"ne",
|
|
||||||
"nl",
|
|
||||||
"nn",
|
|
||||||
"nso",
|
|
||||||
"or",
|
|
||||||
"pa",
|
|
||||||
"pl",
|
|
||||||
"prs",
|
|
||||||
"pt",
|
|
||||||
"quc",
|
|
||||||
"quz",
|
|
||||||
"ro",
|
|
||||||
"ru",
|
|
||||||
"rw",
|
|
||||||
"sd",
|
|
||||||
"si",
|
|
||||||
"sk",
|
|
||||||
"sl",
|
|
||||||
"sq",
|
|
||||||
"sr",
|
|
||||||
"sv",
|
|
||||||
"sw",
|
|
||||||
"ta",
|
|
||||||
"te",
|
|
||||||
"tg",
|
|
||||||
"th",
|
|
||||||
"ti",
|
|
||||||
"tk",
|
|
||||||
"tn",
|
|
||||||
"tr",
|
|
||||||
"tt",
|
|
||||||
"ug",
|
|
||||||
"uk",
|
|
||||||
"ur",
|
|
||||||
"uz",
|
|
||||||
"vi",
|
|
||||||
"wo",
|
|
||||||
"xh",
|
|
||||||
"yo",
|
|
||||||
"zh",
|
|
||||||
"zu"
|
|
||||||
],
|
],
|
||||||
"bing videos": [
|
"bing videos": [
|
||||||
"af",
|
"af",
|
||||||
|
|
|
@ -11,20 +11,21 @@ from urllib.parse import (
|
||||||
)
|
)
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from dateutil import parser
|
from dateutil import parser
|
||||||
from lxml import etree
|
|
||||||
|
import babel
|
||||||
|
from lxml import etree, html
|
||||||
from lxml.etree import XPath
|
from lxml.etree import XPath
|
||||||
from searx.utils import match_language, eval_xpath_getindex
|
|
||||||
from searx.engines.bing import ( # pylint: disable=unused-import
|
from searx.utils import (
|
||||||
language_aliases,
|
eval_xpath_getindex,
|
||||||
_fetch_supported_languages,
|
eval_xpath,
|
||||||
supported_languages_url,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# about
|
# about
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://www.bing.com/news',
|
"website": 'https://www.bing.com/news',
|
||||||
"wikidata_id": 'Q2878637',
|
"wikidata_id": 'Q2878637',
|
||||||
"official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api',
|
"official_api_documentation": 'https://docs.microsoft.com/en-us/bing/search-apis/bing-news-search',
|
||||||
"use_official_api": False,
|
"use_official_api": False,
|
||||||
"require_api_key": False,
|
"require_api_key": False,
|
||||||
"results": 'RSS',
|
"results": 'RSS',
|
||||||
|
@ -34,79 +35,72 @@ about = {
|
||||||
categories = ['news']
|
categories = ['news']
|
||||||
paging = True
|
paging = True
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
|
supported_languages_url = 'https://docs.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
|
||||||
|
|
||||||
# search-url
|
# search-url
|
||||||
base_url = 'https://www.bing.com/'
|
base_url = 'https://www.bing.com/news'
|
||||||
search_string = 'news/search?{query}&first={offset}&format=RSS'
|
|
||||||
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
|
#search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
|
||||||
|
|
||||||
|
#https://www.bing.com/news/search?q=foo&format=RSS
|
||||||
|
#https://www.bing.com/news/search?q=foo&setmkt=de&first=1&qft=interval%3D%227%22&format=RSS
|
||||||
|
|
||||||
|
# https://www.bing.com/news/search?q=foo&cc=en-UK&first=1&qft=interval%3D%227%22&format=RSS
|
||||||
|
|
||||||
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
|
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
|
||||||
|
|
||||||
|
|
||||||
def url_cleanup(url_string):
|
|
||||||
"""remove click"""
|
|
||||||
|
|
||||||
parsed_url = urlparse(url_string)
|
|
||||||
if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
|
|
||||||
query = dict(parse_qsl(parsed_url.query))
|
|
||||||
url_string = query.get('url', None)
|
|
||||||
return url_string
|
|
||||||
|
|
||||||
|
|
||||||
def image_url_cleanup(url_string):
|
|
||||||
"""replace the http://*bing.com/th?id=... by https://www.bing.com/th?id=..."""
|
|
||||||
|
|
||||||
parsed_url = urlparse(url_string)
|
|
||||||
if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
|
|
||||||
query = dict(parse_qsl(parsed_url.query))
|
|
||||||
url_string = "https://www.bing.com/th?id=" + quote(query.get('id'))
|
|
||||||
return url_string
|
|
||||||
|
|
||||||
|
|
||||||
def _get_url(query, language, offset, time_range):
|
|
||||||
if time_range in time_range_dict:
|
|
||||||
search_path = search_string_with_time.format(
|
|
||||||
# fmt: off
|
|
||||||
query = urlencode({
|
|
||||||
'q': query,
|
|
||||||
'setmkt': language
|
|
||||||
}),
|
|
||||||
offset = offset,
|
|
||||||
interval = time_range_dict[time_range]
|
|
||||||
# fmt: on
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# e.g. setmkt=de-de&setlang=de
|
|
||||||
search_path = search_string.format(
|
|
||||||
# fmt: off
|
|
||||||
query = urlencode({
|
|
||||||
'q': query,
|
|
||||||
'setmkt': language
|
|
||||||
}),
|
|
||||||
offset = offset
|
|
||||||
# fmt: on
|
|
||||||
)
|
|
||||||
return base_url + search_path
|
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
|
|
||||||
if params['time_range'] and params['time_range'] not in time_range_dict:
|
language = params['language']
|
||||||
return params
|
if language == 'all':
|
||||||
|
|
||||||
offset = (params['pageno'] - 1) * 10 + 1
|
|
||||||
if params['language'] == 'all':
|
|
||||||
language = 'en-US'
|
language = 'en-US'
|
||||||
else:
|
locale = babel.Locale.parse(language, sep='-')
|
||||||
language = match_language(params['language'], supported_languages, language_aliases)
|
|
||||||
params['url'] = _get_url(query, language, offset, params['time_range'])
|
req_args = {
|
||||||
|
'q' : query,
|
||||||
|
'format': 'RSS'
|
||||||
|
}
|
||||||
|
|
||||||
|
if locale.territory:
|
||||||
|
market_code = locale.language + '-' + locale.territory
|
||||||
|
if market_code in supported_languages:
|
||||||
|
req_args['setmkt'] = market_code
|
||||||
|
else:
|
||||||
|
# Seems that language code can be used as market_code alternative,
|
||||||
|
# when bing-news does not support the market_code (including
|
||||||
|
# territory), but news results are better if there is a territory
|
||||||
|
# given.
|
||||||
|
req_args['setmkt'] = locale.language
|
||||||
|
|
||||||
|
if params['pageno'] > 1:
|
||||||
|
req_args['first'] = (params['pageno'] - 1) * 10 + 1
|
||||||
|
|
||||||
|
params['url'] = base_url + '/search?' + urlencode(req_args)
|
||||||
|
|
||||||
|
interval = time_range_dict.get(params['time_range'])
|
||||||
|
if interval:
|
||||||
|
params['url'] += f'&qft=interval%3d"{interval}"'
|
||||||
|
|
||||||
|
ac_lang = locale.language
|
||||||
|
if locale.territory:
|
||||||
|
ac_lang = "%s-%s,%s;q=0.5" % (locale.language, locale.territory, locale.language)
|
||||||
|
logger.debug("headers.Accept-Language --> %s", ac_lang)
|
||||||
|
params['headers']['Accept-Language'] = ac_lang
|
||||||
|
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def response(resp):
|
||||||
|
|
||||||
|
try:
|
||||||
|
rss = etree.fromstring(resp.content)
|
||||||
|
except etree.XMLSyntaxError:
|
||||||
|
return []
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
rss = etree.fromstring(resp.content)
|
|
||||||
namespaces = rss.nsmap
|
namespaces = rss.nsmap
|
||||||
|
|
||||||
for item in rss.xpath('./channel/item'):
|
for item in rss.xpath('./channel/item'):
|
||||||
|
@ -138,3 +132,54 @@ def response(resp):
|
||||||
results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content})
|
results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
def url_cleanup(url_string):
    """Unwrap a Bing click-tracking link and return the target URL.

    Links of the form ``https://www.bing.com/news/apiclick.aspx?...&url=<target>``
    are replaced by the (URL-decoded) value of their ``url`` query parameter.
    Any other URL is returned unchanged.

    :param url_string: URL taken from a result item.
    :returns: the redirect target, or ``url_string`` itself when it is not a
        click-tracking link or the link carries no ``url`` parameter.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        # Keep the tracking link when no target is present instead of
        # returning None, which would yield a result without a URL.
        url_string = query.get('url', url_string)
    return url_string
|
||||||
|
|
||||||
|
|
||||||
|
def image_url_cleanup(url_string):
    """Rewrite a Bing thumbnail URL to ``https://www.bing.com/th?id=...``.

    A URL whose host ends with ``bing.com`` and whose path is ``/th`` is
    replaced by the canonical HTTPS thumbnail URL carrying only its ``id``
    query parameter.  Any other URL is returned unchanged.

    :param url_string: image URL taken from a result item.
    :returns: the normalized thumbnail URL, or ``url_string`` itself when it
        is not a Bing thumbnail URL or has no ``id`` parameter.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
        image_id = dict(parse_qsl(parsed_url.query)).get('id')
        # quote(None) raises TypeError; without an id there is nothing to
        # rewrite, so the URL is passed through untouched.
        if image_id:
            url_string = "https://www.bing.com/th?id=" + quote(image_id)
    return url_string
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_supported_languages(resp):
    """Extract the market codes from the HTML page at ``supported_languages_url``.

    The codes are read from the third column of every body row of the table
    whose header contains a ``Market code`` cell.

    :param resp: HTTP response whose ``text`` attribute holds the HTML page.
    :returns: sorted list of the distinct, whitespace-stripped market codes
        (e.g. ``de-DE``) found in that table.
    """
    dom = html.fromstring(resp.text)

    market_codes = eval_xpath(
        dom,
        "//th[normalize-space(text()) = 'Market code']/../../../tbody/tr/td[3]/text()",
    )

    # Text nodes may carry surrounding whitespace; normalize them, drop empty
    # cells and sort so that the result is stable between runs.
    m_codes = {value.strip() for value in market_codes}
    m_codes.discard('')
    return sorted(m_codes)
|
||||||
|
|
|
@ -32,7 +32,6 @@ language_codes = (
|
||||||
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
||||||
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
|
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
|
||||||
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
|
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
|
||||||
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
|
|
||||||
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
|
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
|
||||||
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
|
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
|
||||||
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
|
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
|
||||||
|
|
Loading…
Add table
Reference in a new issue