From 66b7be09656798b745f95ff0ce7669db014b3b38 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Mon, 4 Oct 2021 22:44:58 -0700 Subject: [PATCH] [fix] fix match_language issue to make zh-TW match to zh-Hant-TW pybabel separates locales with underscores but we use hyphens everywhere babel doesn't directly touch --- searx/utils.py | 26 ++++++++++++++++++++++---- tests/unit/test_utils.py | 8 ++++++++ tests/unit/test_webapp.py | 4 ++-- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/searx/utils.py b/searx/utils.py index 4c2c9e429..163892e93 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -369,6 +369,16 @@ def _get_lang_to_lc_dict(lang_list): return value +# babel's get_global contains all sorts of miscellaneous locale and territory related data +# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py +def _get_from_babel(lang_code, key): + match = get_global(key).get(lang_code.replace('-', '_')) + # for some keys, such as territory_aliases, match may be a list + if isinstance(match, str): + return match.replace('_', '-') + return match + + def _match_language(lang_code, lang_list=[], custom_aliases={}): # pylint: disable=W0102 """auxiliary function to match lang_code in lang_list""" # replace language code with a custom alias if necessary @@ -379,9 +389,11 @@ def _match_language(lang_code, lang_list=[], custom_aliases={}): # pylint: disa return lang_code # try to get the most likely country for this language - subtags = get_global('likely_subtags').get(lang_code) + subtags = _get_from_babel(lang_code, 'likely_subtags') if subtags: - subtag_parts = subtags.split('_') + if subtags in lang_list: + return subtags + subtag_parts = subtags.split('-') new_code = subtag_parts[0] + '-' + subtag_parts[-1] if new_code in custom_aliases: new_code = custom_aliases[new_code] @@ -402,16 +414,22 @@ def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US locale_parts = locale_code.split('-') lang_code = locale_parts[0] + # if locale_code has script, try matching without it + if len(locale_parts) > 2: + language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases) + if language: + return language + # try to get language using an equivalent country code if len(locale_parts) > 1: - country_alias = get_global('territory_aliases').get(locale_parts[-1]) + country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases') if country_alias: language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases) if language: return language # try to get language using an equivalent language code - alias = get_global('language_aliases').get(lang_code) + alias = _get_from_babel(lang_code, 'language_aliases') if alias: language = _match_language(alias, lang_list, custom_aliases) if language: diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index e9c247382..bea28c0cc 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -92,6 +92,14 @@ class TestUtils(SearxTestCase): self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback') self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp') + # handle script tags + self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN') + self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW') + self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN') + self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW') + self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN') + self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW') + aliases = {'en-GB': 'en-UK', 'he': 'iw'} # guess country diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index c2d57f80c..30b2839a8 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -211,12 +211,12 @@ class ViewsTestCase(SearxTestCase): result = self.app.get('/preferences', headers={'Accept-Language': 'zh-tw;q=0.8'}) self.assertEqual(result.status_code, 200) self.assertIn( - b'