[mod] add 'Accept-Language' HTTP header to online processors

Most engines that support languages (and regions) use the Accept-Language header
sent by the web browser to build a response that fits the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-08-01 17:01:59 +02:00
parent a2badb4fe4
commit 8df1f0c47e
21 changed files with 52 additions and 26 deletions

View File

@ -440,6 +440,7 @@ engine is shown. Most of the options have a default value or even are optional.
engine: example engine: example
shortcut: demo shortcut: demo
base_url: 'https://{language}.example.com/' base_url: 'https://{language}.example.com/'
send_accept_language_header: false
categories: general categories: general
timeout: 3.0 timeout: 3.0
api_key: 'apikey' api_key: 'apikey'
@ -488,6 +489,13 @@ engine is shown. Most of the options have a default value or even are optional.
use multiple sites using only one engine, or updating the site URL without use multiple sites using only one engine, or updating the site URL without
touching at the code. touching at the code.
``send_accept_language_header`` :
Several engines that support languages (or regions) use the HTTP header
``Accept-Language`` to build a response that fits the locale. When this
option is activated, the language (locale) selected by the user is used
to build and send an ``Accept-Language`` header in the request to the origin
search engine.
``categories`` : optional ``categories`` : optional
Define in which categories this engine will be active. Most of the time, it is Define in which categories this engine will be active. Most of the time, it is
defined in the code of the engine, but in a few cases it is useful, like when defined in the code of the engine, but in a few cases it is useful, like when

View File

@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
"enable_http": False, "enable_http": False,
"using_tor_proxy": False, "using_tor_proxy": False,
"display_error_messages": True, "display_error_messages": True,
"send_accept_language_header": False,
"tokens": [], "tokens": [],
"about": {}, "about": {},
} }

View File

@ -25,6 +25,7 @@ categories = ['general', 'web']
paging = True paging = True
time_range_support = False time_range_support = False
safesearch = False safesearch = False
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general' supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {} language_aliases = {}
@ -68,7 +69,6 @@ def request(query, params):
logger.debug("headers.Referer --> %s", referer) logger.debug("headers.Referer --> %s", referer)
params['url'] = base_url + search_path params['url'] = base_url + search_path
params['headers']['Accept-Language'] = "en-US,en;q=0.5"
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params return params

View File

@ -31,6 +31,7 @@ categories = ['images', 'web']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general' supported_languages_url = 'https://www.bing.com/account/general'
number_of_results = 28 number_of_results = 28

View File

@ -34,6 +34,7 @@ about = {
categories = ['news'] categories = ['news']
paging = True paging = True
time_range_support = True time_range_support = True
send_accept_language_header = True
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'

View File

@ -30,6 +30,7 @@ categories = ['videos', 'web']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
number_of_results = 28 number_of_results = 28
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'
@ -70,10 +71,6 @@ def request(query, params):
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
# bing videos did not like "older" versions < 70.0.1 when selecting other
# languages than 'en' .. very strange ?!?!
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
return params return params

View File

@ -20,6 +20,7 @@ from json import loads
from urllib.parse import urlencode from urllib.parse import urlencode
engine_type = 'online' engine_type = 'online'
send_accept_language_header = True
categories = ['general'] categories = ['general']
disabled = True disabled = True
timeout = 2.0 timeout = 2.0

View File

@ -31,6 +31,7 @@ categories = ['general', 'web']
paging = True paging = True
supported_languages_url = 'https://duckduckgo.com/util/u588.js' supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True time_range_support = True
send_accept_language_header = True
language_aliases = { language_aliases = {
'ar-SA': 'ar-XA', 'ar-SA': 'ar-XA',

View File

@ -27,6 +27,8 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/'] WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@ -62,7 +64,6 @@ def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query})) params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases) language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0] language = language.split('-')[0]
params['headers']['Accept-Language'] = language
return params return params

View File

@ -30,6 +30,7 @@ about = {
categories = ['images', 'web'] categories = ['images', 'web']
paging = True paging = True
safesearch = True safesearch = True
send_accept_language_header = True
# search-url # search-url
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'

View File

@ -45,6 +45,7 @@ categories = ['general', 'web']
paging = True paging = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
use_mobile_ui = False use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages' supported_languages_url = 'https://www.google.com/preferences?#languages'
@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
# language. # language.
ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
# Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
ret_val['headers']['Accept-Language'] = ','.join(
[
lang_country,
language + ';q=0.8,',
'en;q=0.6',
'*;q=0.5',
]
)
return ret_val return ret_val

View File

@ -51,6 +51,7 @@ paging = False
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'} filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
@ -125,7 +126,6 @@ def request(query, params):
"""Google-Video search request""" """Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
query_url = ( query_url = (
'https://' 'https://'

View File

@ -70,13 +70,13 @@ time_range_support = True
# #
# safesearch : results are identical for safesearch=0 and safesearch=2 # safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = False safesearch = False
send_accept_language_header = True
def request(query, params): def request(query, params):
"""Google-News search request""" """Google-News search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
# google news has only one domain # google news has only one domain
lang_info['subdomain'] = 'news.google.com' lang_info['subdomain'] = 'news.google.com'

View File

@ -22,6 +22,8 @@ about = {
} }
categories = ["files", "apps"] categories = ["files", "apps"]
send_accept_language_header = True
search_url = "https://play.google.com/store/search?{query}&c=apps" search_url = "https://play.google.com/store/search?{query}&c=apps"

View File

@ -52,6 +52,7 @@ language_support = True
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = False safesearch = False
send_accept_language_header = True
def time_range_url(params): def time_range_url(params):
@ -75,7 +76,6 @@ def request(query, params):
offset = (params['pageno'] - 1) * 10 offset = (params['pageno'] - 1) * 10
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
# subdomain is: scholar.google.xy # subdomain is: scholar.google.xy
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

View File

@ -60,6 +60,7 @@ language_support = True
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
RE_CACHE = {} RE_CACHE = {}
@ -111,7 +112,6 @@ def request(query, params):
"""Google-Video search request""" """Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
query_url = ( query_url = (
'https://' 'https://'

View File

@ -30,6 +30,7 @@ about = {
categories = ['map'] categories = ['map']
paging = False paging = False
language_support = True language_support = True
send_accept_language_header = True
# search-url # search-url
base_url = 'https://nominatim.openstreetmap.org/' base_url = 'https://nominatim.openstreetmap.org/'
@ -142,9 +143,8 @@ def request(query, params):
params['url'] = base_url + search_string.format(query=urlencode({'q': query})) params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
params['route'] = route_re.match(query) params['route'] = route_re.match(query)
params['headers']['User-Agent'] = searx_useragent() params['headers']['User-Agent'] = searx_useragent()
if 'Accept-Language' not in params['headers']:
accept_language = 'en' if params['language'] == 'all' else params['language'] params['headers']['Accept-Language'] = 'en'
params['headers']['Accept-Language'] = accept_language
return params return params

View File

@ -19,6 +19,9 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
send_accept_language_header = True
# search-url # search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@ -41,9 +44,6 @@ def request(query, params):
language = url_lang(params['language']) language = url_lang(params['language'])
params['url'] = search_url.format(title=quote(query), language=language) params['url'] = search_url.format(title=quote(query), language=language)
if params['language'].lower() in language_variants.get(language, []):
params['headers']['Accept-Language'] = params['language'].lower()
params['headers']['User-Agent'] = searx_useragent() params['headers']['User-Agent'] = searx_useragent()
params['raise_for_httperror'] = False params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2 params['soft_max_redirects'] = 2

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
import typing import typing
import babel
class EngineRef: class EngineRef:
@ -29,6 +30,7 @@ class SearchQuery:
'query', 'query',
'engineref_list', 'engineref_list',
'lang', 'lang',
'locale',
'safesearch', 'safesearch',
'pageno', 'pageno',
'time_range', 'time_range',
@ -59,6 +61,13 @@ class SearchQuery:
self.external_bang = external_bang self.external_bang = external_bang
self.engine_data = engine_data or {} self.engine_data = engine_data or {}
self.locale = None
if self.lang:
try:
self.locale = babel.Locale.parse(self.lang, sep='-')
except babel.core.UnknownLocaleError:
pass
@property @property
def categories(self): def categories(self):
return list(set(map(lambda engineref: engineref.category, self.engineref_list))) return list(set(map(lambda engineref: engineref.category, self.engineref_list)))

View File

@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor):
# add a user agent # add a user agent
params['headers']['User-Agent'] = gen_useragent() params['headers']['User-Agent'] = gen_useragent()
# add Accept-Language header
if self.engine.send_accept_language_header and search_query.locale:
ac_lang = search_query.locale.language
if search_query.locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
search_query.locale.language,
search_query.locale.territory,
search_query.locale.language,
)
params['headers']['Accept-Language'] = ac_lang
return params return params
def _send_http_request(self, params): def _send_http_request(self, params):

View File

@ -748,6 +748,7 @@ engines:
- name: google play movies - name: google play movies
engine: xpath engine: xpath
send_accept_language_header: true
search_url: https://play.google.com/store/search?q={query}&c=movies search_url: https://play.google.com/store/search?q={query}&c=movies
results_xpath: '//div[@class="ImZGtf mpg5gc"]' results_xpath: '//div[@class="ImZGtf mpg5gc"]'
title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a' title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'