From d97e1a479fcd1afc1f059529fa060f8c7597be17 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 10 Apr 2022 15:43:41 +0200 Subject: [PATCH] [fix] bing engine: query string only support 2 letter language code The 'language:xx' query string in the request() function does only support the 2 letter language codes from the "Display languages" list. Examples of items from the "Display languages" not sopported in the query string: pt-BR, pt-BR, mn-Cyrl-MN, chr-cher, zh-Hans, ha-latn, ca-es-valencia Signed-off-by: Markus Heiser --- searx/data/engines_languages.json | 124 +++++++++++++----------------- searx/engines/bing.py | 21 ++--- searx/languages.py | 2 - 3 files changed, 64 insertions(+), 83 deletions(-) diff --git a/searx/data/engines_languages.json b/searx/data/engines_languages.json index fe648499a..b6ac0c535 100644 --- a/searx/data/engines_languages.json +++ b/searx/data/engines_languages.json @@ -4,14 +4,13 @@ "am", "ar", "as", - "az-latn", + "az", "be", "bg", "bn", - "bs-latn", + "bs", "ca", - "ca-es-valencia", - "chr-cher", + "chr", "cs", "cy", "da", @@ -29,7 +28,7 @@ "gd", "gl", "gu", - "ha-latn", + "ha", "he", "hi", "hr", @@ -46,7 +45,7 @@ "kn", "ko", "kok", - "ku-arab", + "ku", "ky", "lb", "lo", @@ -55,7 +54,7 @@ "mi", "mk", "ml", - "mn-Cyrl-MN", + "mn", "mr", "ms", "mt", @@ -65,29 +64,26 @@ "nn", "nso", "or", - "pa-arab", - "pa-guru", + "pa", "pl", "prs", - "pt-BR", - "pt-PT", + "pt", "quc", "quz", "ro", "ru", "rw", - "sd-arab", + "sd", "si", "sk", "sl", "sq", - "sr-cyrl", - "sr-latn", + "sr", "sv", "sw", "ta", "te", - "tg-cyrl", + "tg", "th", "ti", "tk", @@ -97,13 +93,12 @@ "ug", "uk", "ur", - "uz-latn", + "uz", "vi", "wo", "xh", "yo", - "zh-Hans", - "zh-Hant", + "zh", "zu" ], "bing images": [ @@ -111,14 +106,13 @@ "am", "ar", "as", - "az-latn", + "az", "be", "bg", "bn", - "bs-latn", + "bs", "ca", - "ca-es-valencia", - "chr-cher", + "chr", "cs", "cy", "da", @@ -136,7 +130,7 @@ "gd", "gl", "gu", - "ha-latn", + "ha", "he", "hi", "hr", @@ -153,7 +147,7 
@@ "kn", "ko", "kok", - "ku-arab", + "ku", "ky", "lb", "lo", @@ -162,7 +156,7 @@ "mi", "mk", "ml", - "mn-Cyrl-MN", + "mn", "mr", "ms", "mt", @@ -172,29 +166,26 @@ "nn", "nso", "or", - "pa-arab", - "pa-guru", + "pa", "pl", "prs", - "pt-BR", - "pt-PT", + "pt", "quc", "quz", "ro", "ru", "rw", - "sd-arab", + "sd", "si", "sk", "sl", "sq", - "sr-cyrl", - "sr-latn", + "sr", "sv", "sw", "ta", "te", - "tg-cyrl", + "tg", "th", "ti", "tk", @@ -204,13 +195,12 @@ "ug", "uk", "ur", - "uz-latn", + "uz", "vi", "wo", "xh", "yo", - "zh-Hans", - "zh-Hant", + "zh", "zu" ], "bing news": [ @@ -218,14 +208,13 @@ "am", "ar", "as", - "az-latn", + "az", "be", "bg", "bn", - "bs-latn", + "bs", "ca", - "ca-es-valencia", - "chr-cher", + "chr", "cs", "cy", "da", @@ -243,7 +232,7 @@ "gd", "gl", "gu", - "ha-latn", + "ha", "he", "hi", "hr", @@ -260,7 +249,7 @@ "kn", "ko", "kok", - "ku-arab", + "ku", "ky", "lb", "lo", @@ -269,7 +258,7 @@ "mi", "mk", "ml", - "mn-Cyrl-MN", + "mn", "mr", "ms", "mt", @@ -279,29 +268,26 @@ "nn", "nso", "or", - "pa-arab", - "pa-guru", + "pa", "pl", "prs", - "pt-BR", - "pt-PT", + "pt", "quc", "quz", "ro", "ru", "rw", - "sd-arab", + "sd", "si", "sk", "sl", "sq", - "sr-cyrl", - "sr-latn", + "sr", "sv", "sw", "ta", "te", - "tg-cyrl", + "tg", "th", "ti", "tk", @@ -311,13 +297,12 @@ "ug", "uk", "ur", - "uz-latn", + "uz", "vi", "wo", "xh", "yo", - "zh-Hans", - "zh-Hant", + "zh", "zu" ], "bing videos": [ @@ -325,14 +310,13 @@ "am", "ar", "as", - "az-latn", + "az", "be", "bg", "bn", - "bs-latn", + "bs", "ca", - "ca-es-valencia", - "chr-cher", + "chr", "cs", "cy", "da", @@ -350,7 +334,7 @@ "gd", "gl", "gu", - "ha-latn", + "ha", "he", "hi", "hr", @@ -367,7 +351,7 @@ "kn", "ko", "kok", - "ku-arab", + "ku", "ky", "lb", "lo", @@ -376,7 +360,7 @@ "mi", "mk", "ml", - "mn-Cyrl-MN", + "mn", "mr", "ms", "mt", @@ -386,29 +370,26 @@ "nn", "nso", "or", - "pa-arab", - "pa-guru", + "pa", "pl", "prs", - "pt-BR", - "pt-PT", + "pt", "quc", "quz", "ro", "ru", "rw", - "sd-arab", + "sd", "si", "sk", 
"sl", "sq", - "sr-cyrl", - "sr-latn", + "sr", "sv", "sw", "ta", "te", - "tg-cyrl", + "tg", "th", "ti", "tk", @@ -418,13 +399,12 @@ "ug", "uk", "ur", - "uz-latn", + "uz", "vi", "wo", "xh", "yo", - "zh-Hans", - "zh-Hant", + "zh", "zu" ], "dailymotion": [ diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 4c037de85..ff89de199 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -131,21 +131,24 @@ def _fetch_supported_languages(resp): lang_tags = set() dom = html.fromstring(resp.text) - lang_links = eval_xpath(dom, '//div[@id="language-section"]//li') - for _li in lang_links: + # Selector to get items from "Display language" + ui_lang_links = eval_xpath(dom, '//div[@id="language-section"]//li') + + for _li in ui_lang_links: href = eval_xpath(_li, './/@href')[0] (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href) query = parse_qs(query, keep_blank_values=True) - # fmt: off - setlang = query.get('setlang', [None, ])[0] - # example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN'] - lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip - # fmt: on + # The 'language:xx' query string in the request function (above) does + # only support the 2 letter language codes from the "Display languages" + # list. 
Examples of items from the "Display languages" not supported in + # the query string: + # 'mn-Cyrl-MN', 'chr-cher', 'zh-Hans', 'ha-latn', 'ca-es-valencia' - tag = lang + '-' + nation if nation else lang - lang_tags.add(tag) + setlang = query.get('setlang', [None, ])[0] + lang = setlang.split('-')[0] + lang_tags.add(lang) return list(lang_tags) diff --git a/searx/languages.py b/searx/languages.py index 8ffff9c1d..9987a6812 100644 --- a/searx/languages.py +++ b/searx/languages.py @@ -49,8 +49,6 @@ language_codes = ( ('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'), ('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'), ('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'), - ('pt', 'Português', '', 'Portuguese', '\U0001f310'), - ('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'), ('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'), ('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'), ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),