searx.utils.detect_language : add model from fastlangid

The fastlangid project provides a FastText model that achieves higher accuracy for the Japanese, Korean and Chinese languages. This commit includes the model from that project and changes searx.utils.detect_language to rely on it as a second stage. The model is lazily loaded when the base model detects Japanese, Korean or Chinese. https://github.com/currentslab/fastlangid
2024-01-01 19:24:07 +01:00 · 2023-09-17 09:31:47 +00:00 · 2023-09-17 09:31:47 +00:00 · 931e1f6503
commit 931e1f6503
parent 7dfcc3386e
2 changed files with 51 additions and 13 deletions
--- a/searx/data/model_s.ftz
+++ b/searx/data/model_s.ftz
--- a/searx/utils.py
+++ b/searx/utils.py
@ -4,6 +4,7 @@
 """Utility functions for the engines

 """
+import collections
 import re
 import importlib
 import importlib.util
@ -54,9 +55,16 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
 _XPATH_CACHE: Dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}

-_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
+# Cache of loaded fasttext models, keyed by the model file name inside data_dir.
+_FASTTEXT_MODEL: Dict[str, "fasttext.FastText._FastText"] = {}
 """fasttext model to predict laguage of a search term"""

+# Base-model predictions that are re-checked with model_s.ftz when their
+# probability is below _FASTTEXT_MODEL_S_THRESHOLD.
+_FASTTEXT_UNCERTAIN_SETS = frozenset(('zh', 'ko', 'ja'))
+# Labels of model_s.ftz that are accepted when refining a confident 'zh' prediction.
+_FASTTEXT_CHINESE_FAMILY_CODES = frozenset(('zh-hant', 'zh-hans', 'zh-yue'))
+# Probability the base model must reach for its zh / ko / ja prediction to be trusted.
+_FASTTEXT_MODEL_S_THRESHOLD = 0.93
+
+# One prediction: the language label (without the '__label__' prefix) and its probability.
+FastTextResult = collections.namedtuple('FastTextResult', 'language, prob')
+
+
 SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
 """Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""

@ -570,15 +578,48 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
    return default


-def _get_fasttext_model() -> "fasttext.FastText._FastText":
-    global _FASTTEXT_MODEL  # pylint: disable=global-statement
-    if _FASTTEXT_MODEL is None:
+def _get_fasttext_model(model_name: str) -> "fasttext.FastText._FastText":
+    """Return the fasttext model stored in the file ``data_dir / model_name``.
+
+    The model is loaded the first time its name is requested and kept in
+    ``_FASTTEXT_MODEL``, so later calls with the same name reuse the loaded
+    instance.
+    """
+    if model_name not in _FASTTEXT_MODEL:
        import fasttext  # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None
-        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
-    return _FASTTEXT_MODEL
+        _FASTTEXT_MODEL[model_name] = fasttext.load_model(str(data_dir / model_name))
+    return _FASTTEXT_MODEL[model_name]
+
+
+def _fast_text_predict(model: str, text: str, k: int, threshold: float):
+    """Run the fasttext model named ``model`` on ``text``.
+
+    ``k`` and ``threshold`` are passed through to the model's ``predict``
+    method.  The result is a list of :py:obj:`FastTextResult`, one per
+    returned label, with the ``__label__`` prefix stripped from the language.
+    An empty list is returned when the prediction is not a 2-tuple.
+    """
+    prediction = _get_fasttext_model(model).predict(text, k=k, threshold=threshold)
+    if not (isinstance(prediction, tuple) and len(prediction) == 2):
+        return []
+
+    labels, probabilities = prediction
+    results = []
+    for label, probability in zip(labels, probabilities):
+        results.append(FastTextResult(language=label.split('__label__')[1], prob=probability))
+    return results
+
+
+def _inner_detect_language(text: str, threshold: float = 0.3) -> Optional[str]:
+    """Detect the language of ``text`` with a two-stage fasttext lookup.
+
+    The base model ``lid.176.ftz`` gives a first prediction.  When that
+    prediction is Chinese, Japanese or Korean, the ``model_s.ftz`` model from
+    fastlangid (https://github.com/currentslab/fastlangid, Apache-2.0
+    license) is consulted as a second stage:
+
+    - if the base model's probability is below ``_FASTTEXT_MODEL_S_THRESHOLD``,
+      the best prediction of ``model_s.ftz`` replaces it;
+    - if the base model confidently predicts ``zh``, ``model_s.ftz`` is used to
+      pick one of ``_FASTTEXT_CHINESE_FAMILY_CODES``.
+
+    Returns the language code, or ``None`` when no prediction passes
+    ``threshold``.
+    """
+    # Line breaks are replaced by a space (not removed) so that the last word
+    # of a line and the first word of the next one are not glued together.
+    text = text.replace("\n", " ")
+    predictions = _fast_text_predict("lid.176.ftz", text, k=1, threshold=threshold)
+    if not predictions:
+        return None
+
+    first_result = predictions[0]
+
+    if first_result.language in _FASTTEXT_UNCERTAIN_SETS and first_result.prob < _FASTTEXT_MODEL_S_THRESHOLD:
+        # lid.176.ftz usually confuses Chinese, Korean and Japanese words:
+        # when it is not sure enough, model_s.ftz decides.
+        refined = _fast_text_predict("model_s.ftz", text, k=1, threshold=threshold)
+        return refined[0].language if refined else None
+
+    if first_result.language == 'zh' and first_result.prob >= _FASTTEXT_MODEL_S_THRESHOLD:
+        # Confident Chinese prediction: find out which Chinese family it belongs to.
+        # threshold=0 so that all of the k=10 candidates are returned and filtered here.
+        for lang, prob in _fast_text_predict("model_s.ftz", text, k=10, threshold=0):
+            if lang in _FASTTEXT_CHINESE_FAMILY_CODES and prob > threshold:
+                return lang
+        return 'zh' if first_result.prob > threshold else None
+
+    return first_result.language


 def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
@ -643,13 +684,10 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
    """
    if not isinstance(text, str):
        raise ValueError('text must a str')
-    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
-    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
-        language = r[0][0].split('__label__')[1]
-        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
-            return None
-        return language
-    return None
+    language = _inner_detect_language(text, threshold=threshold)
+    if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
+        return None
+    return language


 def js_variable_to_python(js_variable):