From 8df1f0c47e03fe7525c40a2856dba950bab8998b Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 1 Aug 2022 17:01:59 +0200 Subject: [PATCH] [mod] add 'Accept-Language' HTTP header to online processors Most engines that support languages (and regions) use the Accept-Language from the web browser to build a response that fits the language (and region). - add new engine option: send_accept_language_header Signed-off-by: Markus Heiser --- docs/admin/engines/settings.rst | 8 ++++++++ searx/engines/__init__.py | 1 + searx/engines/bing.py | 2 +- searx/engines/bing_images.py | 1 + searx/engines/bing_news.py | 1 + searx/engines/bing_videos.py | 5 +---- searx/engines/demo_online.py | 1 + searx/engines/duckduckgo.py | 1 + searx/engines/duckduckgo_definitions.py | 3 ++- searx/engines/duckduckgo_images.py | 1 + searx/engines/google.py | 11 +---------- searx/engines/google_images.py | 2 +- searx/engines/google_news.py | 2 +- searx/engines/google_play_apps.py | 2 ++ searx/engines/google_scholar.py | 2 +- searx/engines/google_videos.py | 2 +- searx/engines/openstreetmap.py | 6 +++--- searx/engines/wikipedia.py | 6 +++--- searx/search/models.py | 9 +++++++++ searx/search/processors/online.py | 11 +++++++++++ searx/settings.yml | 1 + 21 files changed, 52 insertions(+), 26 deletions(-) diff --git a/docs/admin/engines/settings.rst b/docs/admin/engines/settings.rst index 0b4b984d7..f85c6a75f 100644 --- a/docs/admin/engines/settings.rst +++ b/docs/admin/engines/settings.rst @@ -440,6 +440,7 @@ engine is shown. Most of the options have a default value or even are optional. engine: example shortcut: demo base_url: 'https://{language}.example.com/' + send_accept_language_header: false categories: general timeout: 3.0 api_key: 'apikey' @@ -488,6 +489,13 @@ engine is shown. Most of the options have a default value or even are optional. use multiple sites using only one engine, or updating the site URL without touching at the code. 
+``send_accept_language_header`` : + Several engines that support languages (or regions) use the HTTP header + ``Accept-Language`` to build a response that fits the locale. When this + option is activated, the language (locale) that is selected by the user is used + to build and send an ``Accept-Language`` header in the request to the origin + search engine. + ``categories`` : optional Define in which categories this engine will be active. Most of the time, it is defined in the code of the engine, but in a few cases it is useful, like when diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 3fb0bcfb1..07d5b226c 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = { "enable_http": False, "using_tor_proxy": False, "display_error_messages": True, + "send_accept_language_header": False, "tokens": [], "about": {}, } diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 3d4ac08bd..8d024fed0 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -25,6 +25,7 @@ categories = ['general', 'web'] paging = True time_range_support = False safesearch = False +send_accept_language_header = True supported_languages_url = 'https://www.bing.com/account/general' language_aliases = {} @@ -68,7 +69,6 @@ def request(query, params): logger.debug("headers.Referer --> %s", referer) params['url'] = base_url + search_path - params['headers']['Accept-Language'] = "en-US,en;q=0.5" params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' return params diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index cb69dc172..107ce3cff 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -31,6 +31,7 @@ categories = ['images', 'web'] paging = True safesearch = True time_range_support = True +send_accept_language_header = True supported_languages_url = 'https://www.bing.com/account/general' 
number_of_results = 28 diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 22856541b..7eea17bb4 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -34,6 +34,7 @@ about = { categories = ['news'] paging = True time_range_support = True +send_accept_language_header = True # search-url base_url = 'https://www.bing.com/' diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index ae8e8d49a..9be8eeaef 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -30,6 +30,7 @@ categories = ['videos', 'web'] paging = True safesearch = True time_range_support = True +send_accept_language_header = True number_of_results = 28 base_url = 'https://www.bing.com/' @@ -70,10 +71,6 @@ def request(query, params): if params['time_range'] in time_range_dict: params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) - # bing videos did not like "older" versions < 70.0.1 when selectin other - # languages then 'en' .. very strange ?!?! 
- params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1' - return params diff --git a/searx/engines/demo_online.py b/searx/engines/demo_online.py index ee39a2f5a..08add5371 100644 --- a/searx/engines/demo_online.py +++ b/searx/engines/demo_online.py @@ -20,6 +20,7 @@ from json import loads from urllib.parse import urlencode engine_type = 'online' +send_accept_language_header = True categories = ['general'] disabled = True timeout = 2.0 diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 71da72677..17f0fae1c 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -31,6 +31,7 @@ categories = ['general', 'web'] paging = True supported_languages_url = 'https://duckduckgo.com/util/u588.js' time_range_support = True +send_accept_language_header = True language_aliases = { 'ar-SA': 'ar-XA', diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index ad3c92169..a73ee55ff 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -27,6 +27,8 @@ about = { "results": 'JSON', } +send_accept_language_header = True + URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/'] @@ -62,7 +64,6 @@ def request(query, params): params['url'] = URL.format(query=urlencode({'q': query})) language = match_language(params['language'], supported_languages, language_aliases) language = language.split('-')[0] - params['headers']['Accept-Language'] = language return params diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 7d844b543..19f649ef4 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -30,6 +30,7 @@ about = { categories = ['images', 'web'] paging = True safesearch = True +send_accept_language_header = True # 
search-url images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' diff --git a/searx/engines/google.py b/searx/engines/google.py index 0d116db9f..5e80f6dcc 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -45,6 +45,7 @@ categories = ['general', 'web'] paging = True time_range_support = True safesearch = True +send_accept_language_header = True use_mobile_ui = False supported_languages_url = 'https://www.google.com/preferences?#languages' @@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): # language. ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) - # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 - ret_val['headers']['Accept-Language'] = ','.join( - [ - lang_country, - language + ';q=0.8,', - 'en;q=0.6', - '*;q=0.5', - ] - ) - return ret_val diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index a65c0ce37..e1f676dd6 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -51,6 +51,7 @@ paging = False use_locale_domain = True time_range_support = True safesearch = True +send_accept_language_header = True filter_mapping = {0: 'images', 1: 'active', 2: 'active'} @@ -125,7 +126,6 @@ def request(query, params): """Google-Video search request""" lang_info = get_lang_info(params, supported_languages, language_aliases, False) - logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) query_url = ( 'https://' diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 0f97f9289..8f5a4b104 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -70,13 +70,13 @@ time_range_support = True # # safesearch : results are identitical for safesearch=0 and safesearch=2 safesearch = False +send_accept_language_header = True def request(query, params): """Google-News search request""" lang_info = get_lang_info(params, 
supported_languages, language_aliases, False) - logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) # google news has only one domain lang_info['subdomain'] = 'news.google.com' diff --git a/searx/engines/google_play_apps.py b/searx/engines/google_play_apps.py index 226e48dab..6506a446a 100644 --- a/searx/engines/google_play_apps.py +++ b/searx/engines/google_play_apps.py @@ -22,6 +22,8 @@ about = { } categories = ["files", "apps"] +send_accept_language_header = True + search_url = "https://play.google.com/store/search?{query}&c=apps" diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index f9c73097d..41c62886b 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -52,6 +52,7 @@ language_support = True use_locale_domain = True time_range_support = True safesearch = False +send_accept_language_header = True def time_range_url(params): @@ -75,7 +76,6 @@ def request(query, params): offset = (params['pageno'] - 1) * 10 lang_info = get_lang_info(params, supported_languages, language_aliases, False) - logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) # subdomain is: scholar.google.xy lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 6eb051e0a..26dbcdd3c 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -60,6 +60,7 @@ language_support = True use_locale_domain = True time_range_support = True safesearch = True +send_accept_language_header = True RE_CACHE = {} @@ -111,7 +112,6 @@ def request(query, params): """Google-Video search request""" lang_info = get_lang_info(params, supported_languages, language_aliases, False) - logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) query_url = ( 'https://' diff --git a/searx/engines/openstreetmap.py 
b/searx/engines/openstreetmap.py index d44792077..4f799fce7 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -30,6 +30,7 @@ about = { categories = ['map'] paging = False language_support = True +send_accept_language_header = True # search-url base_url = 'https://nominatim.openstreetmap.org/' @@ -142,9 +143,8 @@ def request(query, params): params['url'] = base_url + search_string.format(query=urlencode({'q': query})) params['route'] = route_re.match(query) params['headers']['User-Agent'] = searx_useragent() - - accept_language = 'en' if params['language'] == 'all' else params['language'] - params['headers']['Accept-Language'] = accept_language + if 'Accept-Language' not in params['headers']: + params['headers']['Accept-Language'] = 'en' return params diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index cc806a8de..52b1053ed 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -19,6 +19,9 @@ about = { "results": 'JSON', } + +send_accept_language_header = True + # search-url search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' @@ -41,9 +44,6 @@ def request(query, params): language = url_lang(params['language']) params['url'] = search_url.format(title=quote(query), language=language) - if params['language'].lower() in language_variants.get(language, []): - params['headers']['Accept-Language'] = params['language'].lower() - params['headers']['User-Agent'] = searx_useragent() params['raise_for_httperror'] = False params['soft_max_redirects'] = 2 diff --git a/searx/search/models.py b/searx/search/models.py index ff5897966..bbca1cd1d 100644 --- a/searx/search/models.py +++ b/searx/search/models.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later import typing +import babel class EngineRef: @@ -29,6 +30,7 @@ class SearchQuery: 'query', 'engineref_list', 'lang', + 'locale', 
'safesearch', 'pageno', 'time_range', @@ -59,6 +61,13 @@ class SearchQuery: self.external_bang = external_bang self.engine_data = engine_data or {} + self.locale = None + if self.lang: + try: + self.locale = babel.Locale.parse(self.lang, sep='-') + except babel.core.UnknownLocaleError: + pass + @property def categories(self): return list(set(map(lambda engineref: engineref.category, self.engineref_list))) diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 0cfe6e123..17e9b6a96 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor): # add an user agent params['headers']['User-Agent'] = gen_useragent() + # add Accept-Language header + if self.engine.send_accept_language_header and search_query.locale: + ac_lang = search_query.locale.language + if search_query.locale.territory: + ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( + search_query.locale.language, + search_query.locale.territory, + search_query.locale.language, + ) + params['headers']['Accept-Language'] = ac_lang + return params def _send_http_request(self, params): diff --git a/searx/settings.yml b/searx/settings.yml index 949550831..d98828ae1 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -748,6 +748,7 @@ engines: - name: google play movies engine: xpath + send_accept_language_header: true search_url: https://play.google.com/store/search?q={query}&c=movies results_xpath: '//div[@class="ImZGtf mpg5gc"]' title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'