From b8352eca0c23b00993a2f1bc659d98022dc07894 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 8 Aug 2023 11:20:10 +0200 Subject: [PATCH] [mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser --- searx/data/engine_traits.json | 280 ++++++++++++++++++++++++++++++++++ searx/engines/brave.py | 166 ++++++++++++++++++-- 2 files changed, 431 insertions(+), 15 deletions(-) diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json index e6774d696..9f14320cd 100644 --- a/searx/data/engine_traits.json +++ b/searx/data/engine_traits.json @@ -541,6 +541,286 @@ "zh-TW": "zh-TW" } }, + "brave": { + "all_locale": "all", + "custom": { + "ui_lang": { + "ca": "ca", + "de-DE": "de-de", + "en-CA": "en-ca", + "en-GB": "en-gb", + "en-US": "en-us", + "es": "es", + "fr-CA": "fr-ca", + "fr-FR": "fr-fr", + "ja-JP": "ja-jp", + "pt-BR": "pt-br", + "sq-AL": "sq-al" + } + }, + "data_type": "traits_v1", + "languages": {}, + "regions": { + "ar-SA": "sa", + "da-DK": "dk", + "de-AT": "at", + "de-BE": "be", + "de-CH": "ch", + "de-DE": "de", + "en-AU": "au", + "en-CA": "ca", + "en-GB": "gb", + "en-HK": "hk", + "en-IN": "in", + "en-NZ": "nz", + "en-PH": "ph", + "en-US": "us", + "en-ZA": "za", + "es-AR": "ar", + "es-CL": "cl", + "es-ES": "es", + "es-MX": "mx", + "fi-FI": "fi", + "fil-PH": "ph", + "fr-BE": "be", + "fr-CA": "ca", + "fr-CH": "ch", + "fr-FR": "fr", + "gsw-CH": "ch", + "hi-IN": "in", + "id-ID": "id", + "it-CH": "ch", + "it-IT": "it", + "ja-JP": "jp", + "ko-KR": "kr", + "mi-NZ": "nz", + "ms-MY": "my", + "nb-NO": "no", + "nl-BE": "be", + "nl-NL": "nl", + "nn-NO": "no", + "pl-PL": "pl", + "pt-BR": "br", + "pt-PT": "pt", + "ru-RU": "ru", + "sv-FI": "fi", + "sv-SE": "se", + "tr-TR": "tr", + "zh-CN": "cn", + "zh-HK": "hk", + "zh-TW": "tw" + } + }, + "brave.images": { + "all_locale": "all", + "custom": { + "ui_lang": { + "ca": "ca", + "de-DE": "de-de", + "en-CA": "en-ca", + "en-GB": "en-gb", + "en-US": "en-us", + "es": "es", + "fr-CA": "fr-ca", + "fr-FR": "fr-fr", + "ja-JP": "ja-jp", + "pt-BR": "pt-br", + "sq-AL": "sq-al" + } + }, + "data_type": "traits_v1", + "languages": {}, + "regions": { + "ar-SA": "sa", + "da-DK": "dk", + "de-AT": "at", + "de-BE": "be", + "de-CH": "ch", + "de-DE": "de", + "en-AU": "au", + "en-CA": "ca", + "en-GB": "gb", + "en-HK": "hk", + "en-IN": "in", + "en-NZ": "nz", + "en-PH": "ph", + "en-US": "us", + "en-ZA": "za", + "es-AR": "ar", + "es-CL": "cl", + "es-ES": "es", + "es-MX": "mx", + "fi-FI": "fi", + "fil-PH": "ph", + "fr-BE": "be", + "fr-CA": "ca", + "fr-CH": "ch", + "fr-FR": "fr", + "gsw-CH": "ch", + "hi-IN": "in", + "id-ID": "id", + "it-CH": "ch", + "it-IT": "it", + "ja-JP": "jp", + "ko-KR": "kr", + "mi-NZ": "nz", + "ms-MY": "my", + "nb-NO": "no", + "nl-BE": "be", + "nl-NL": "nl", + "nn-NO": "no", + "pl-PL": "pl", + "pt-BR": "br", + "pt-PT": "pt", + "ru-RU": "ru", + "sv-FI": "fi", + "sv-SE": "se", + "tr-TR": "tr", + "zh-CN": "cn", + "zh-HK": "hk", + "zh-TW": "tw" + } + }, + "brave.news": { + "all_locale": "all", + "custom": { + "ui_lang": { + "ca": "ca", + "de-DE": "de-de", + "en-CA": "en-ca", + "en-GB": "en-gb", + "en-US": "en-us", + "es": "es", + "fr-CA": "fr-ca", + "fr-FR": "fr-fr", + "ja-JP": "ja-jp", + "pt-BR": "pt-br", + "sq-AL": "sq-al" + } + }, + "data_type": "traits_v1", + "languages": {}, + "regions": { + "ar-SA": "sa", + "da-DK": "dk", + "de-AT": "at", + "de-BE": "be", + "de-CH": "ch", + "de-DE": "de", + "en-AU": "au", + "en-CA": "ca", + "en-GB": "gb", + "en-HK": "hk", + "en-IN": "in", + "en-NZ": "nz", + "en-PH": "ph", + "en-US": "us", + "en-ZA": "za", + "es-AR": "ar", + "es-CL": "cl", + "es-ES": "es", + "es-MX": "mx", + "fi-FI": "fi", + "fil-PH": "ph", + "fr-BE": "be", + "fr-CA": "ca", + "fr-CH": "ch", + "fr-FR": "fr", + "gsw-CH": "ch", + "hi-IN": "in", + "id-ID": "id", + "it-CH": "ch", + "it-IT": "it", + "ja-JP": "jp", + "ko-KR": "kr", + "mi-NZ": "nz", + "ms-MY": "my", + "nb-NO": "no", + "nl-BE": "be", + "nl-NL": "nl", + "nn-NO": "no", + "pl-PL": "pl", + "pt-BR": "br", + "pt-PT": "pt", + "ru-RU": "ru", + "sv-FI": "fi", + "sv-SE": "se", + "tr-TR": "tr", + "zh-CN": "cn", + "zh-HK": "hk", + "zh-TW": "tw" + } + }, + "brave.videos": { + "all_locale": "all", + "custom": { + "ui_lang": { + "ca": "ca", + "de-DE": "de-de", + "en-CA": "en-ca", + "en-GB": "en-gb", + "en-US": "en-us", + "es": "es", + "fr-CA": "fr-ca", + "fr-FR": "fr-fr", + "ja-JP": "ja-jp", + "pt-BR": "pt-br", + "sq-AL": "sq-al" + } + }, + "data_type": "traits_v1", + "languages": {}, + "regions": { + "ar-SA": "sa", + "da-DK": "dk", + "de-AT": "at", + "de-BE": "be", + "de-CH": "ch", + "de-DE": "de", + "en-AU": "au", + "en-CA": "ca", + "en-GB": "gb", + "en-HK": "hk", + "en-IN": "in", + "en-NZ": "nz", + "en-PH": "ph", + "en-US": "us", + "en-ZA": "za", + "es-AR": "ar", + "es-CL": "cl", + "es-ES": "es", + "es-MX": "mx", + "fi-FI": "fi", + "fil-PH": "ph", + "fr-BE": "be", + "fr-CA": "ca", + "fr-CH": "ch", + "fr-FR": "fr", + "gsw-CH": "ch", + "hi-IN": "in", + "id-ID": "id", + "it-CH": "ch", + "it-IT": "it", + "ja-JP": "jp", + "ko-KR": "kr", + "mi-NZ": "nz", + "ms-MY": "my", + "nb-NO": "no", + "nl-BE": "be", + "nl-NL": "nl", + "nn-NO": "no", + "pl-PL": "pl", + "pt-BR": "br", + "pt-PT": "pt", + "ru-RU": "ru", + "sv-FI": "fi", + "sv-SE": "se", + "tr-TR": "tr", + "zh-CN": "cn", + "zh-HK": "hk", + "zh-TW": "tw" + } + }, "dailymotion": { "all_locale": null, "custom": {}, diff --git a/searx/engines/brave.py b/searx/engines/brave.py index c32d0f39e..c27d113f6 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -31,12 +31,73 @@ Configured ``brave`` engines: brave_category: news +.. _brave regions: + +Brave regions +============= + +Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with +locales. To get a mapping, all *officatl de-facto* languages of the Brave +region are mapped to regions in SearXNG (see :py:obj:`babel +`): + +.. code:: python + + "regions": { + .. + "en-CA": "ca", + "fr-CA": "ca", + .. + } + + +.. note:: + + The language (aka region) support of Brave's index is limited to very basic + languages. The search results for languages like Chinese or Arabic are of + low quality. + + +.. _brave languages: + +Brave languages +=============== + +Brave's language support is limited to the UI (menues, area local notations, +etc). Brave's index only seems to support a locale, but it does not seem to +support any languages in its index. The choice of available languages is very +small (and its not clear to me where the differencee in UI is when switching +from en-us to en-ca or en-gb). + +In the :py:obj:`EngineTraits object ` the +UI languages are stored in a custom field named ``ui_lang``: + +.. code:: python + + "custom": { + "ui_lang": { + "ca": "ca", + "de-DE": "de-de", + "en-CA": "en-ca", + "en-GB": "en-gb", + "en-US": "en-us", + "es": "es", + "fr-CA": "fr-ca", + "fr-FR": "fr-fr", + "ja-JP": "ja-jp", + "pt-BR": "pt-br", + "sq-AL": "sq-al" + } + }, + Implementations =============== """ -# pylint: disable=fixme +from typing import TYPE_CHECKING + +import re from urllib.parse import ( urlencode, urlparse, @@ -46,11 +107,20 @@ from urllib.parse import ( import chompjs from lxml import html +from searx import locales from searx.utils import ( extract_text, eval_xpath_list, eval_xpath_getindex, ) +from searx.enginelib.traits import EngineTraits + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits about = { "website": 'https://search.brave.com/', @@ -118,23 +188,20 @@ def request(query, params): params["url"] = f"{base_url}{brave_category}?{urlencode(args)}" - # set preferences in cookie + # set properties in the cookies + params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off') - - # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are - # limited and the country handling has it quirks - - eng_locale = params.get('searxng_locale') - params['cookies']['useLocation'] = '0' # the useLocation is IP based, we use 'country' + # the useLocation is IP based, we use cookie 'country' for the region + params['cookies']['useLocation'] = '0' params['cookies']['summarizer'] = '0' - if not eng_locale or eng_locale == 'all': - params['cookies']['country'] = 'all' # country=all - else: - params['cookies']['country'] = eng_locale.split('-')[-1].lower() - params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower() + engine_region = traits.get_region(params['searxng_locale'], 'all') + params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore - # logger.debug("cookies %s", params['cookies']) + ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us') + params['cookies']['ui_lang'] = ui_lang + + logger.debug("cookies %s", params['cookies']) def response(resp): @@ -195,7 +262,7 @@ def _parse_search(resp): video_tag = eval_xpath_getindex( result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None ) - if video_tag: + if video_tag is not None: # In my tests a video tag in the WEB search was mostoften not a # video, except the ones from youtube .. @@ -281,3 +348,72 @@ def _parse_videos(json_resp): result_list.append(item) return result_list + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch :ref:`languages ` and :ref:`regions ` from Brave.""" + + # pylint: disable=import-outside-toplevel + + import babel.languages + from searx.locales import region_tag, language_tag + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + + engine_traits.custom["ui_lang"] = {} + + headers = { + 'Accept-Encoding': 'gzip, deflate', + } + lang_map = {'no': 'nb'} # norway + + # languages (UI) + + resp = get('https://search.brave.com/settings', headers=headers) + + if not resp.ok: # type: ignore + print("ERROR: response from Brave is not OK.") + dom = html.fromstring(resp.text) # type: ignore + + for option in dom.xpath('//div[@id="language-select"]//option'): + + ui_lang = option.get('value') + try: + if '-' in ui_lang: + sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-')) + else: + sxng_tag = language_tag(babel.Locale.parse(ui_lang)) + + except babel.UnknownLocaleError: + print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang) + continue + + conflict = engine_traits.custom["ui_lang"].get(sxng_tag) + if conflict: + if conflict != ui_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang)) + continue + engine_traits.custom["ui_lang"][sxng_tag] = ui_lang + + # search regions of brave + + engine_traits.all_locale = 'all' + + for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'): + + flag = country.xpath('./span[contains(@class, "flag")]')[0] + # country_name = extract_text(flag.xpath('./following-sibling::*')[0]) + country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1) # type: ignore + + # add offical languages of the country .. + for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True): + lang_tag = lang_map.get(lang_tag, lang_tag) + sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper()))) + # print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag)) + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != country_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag)) + continue + engine_traits.regions[sxng_tag] = country_tag