searx.utils.detect_language : add model from fastlangid

The fastlangid project provides a FastText model that achieves higher accuracy
for the Japanese, Korean and Chinese languages.

This commit includes the model from that project
and changes searx.utils.detect_language to rely on that model
as a second stage.

The model is lazily loaded when the base model detects Japanese, Korean or Chinese.

https://github.com/currentslab/fastlangid
This commit is contained in:
Alexandre Flament 2023-09-17 09:31:47 +00:00
parent 7dfcc3386e
commit 931e1f6503
2 changed files with 51 additions and 13 deletions

BIN
searx/data/model_s.ftz Normal file

Binary file not shown.

View File

@ -4,6 +4,7 @@
"""Utility functions for the engines
"""
import collections
import re
import importlib
import importlib.util
@ -54,9 +55,16 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
_XPATH_CACHE: Dict[str, XPath] = {}
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
_FASTTEXT_MODEL: Dict[str, "fasttext.FastText._FastText"] = {}
"""fasttext model to predict laguage of a search term"""
_FASTTEXT_UNCERTAIN_SETS = frozenset(('zh', 'ko', 'ja'))
_FASTTEXT_CHINESE_FAMILY_CODES = frozenset(('zh-hant', 'zh-hans', 'zh-yue'))
_FASTTEXT_MODEL_S_THRESHOLD = 0.93
FastTextResult = collections.namedtuple('FastTextResult', 'language, prob')
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
@ -570,15 +578,48 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
return default
def _get_fasttext_model() -> "fasttext.FastText._FastText":
global _FASTTEXT_MODEL # pylint: disable=global-statement
if _FASTTEXT_MODEL is None:
def _get_fasttext_model(model_name: str) -> "fasttext.FastText._FastText":
if model_name not in _FASTTEXT_MODEL:
import fasttext # pylint: disable=import-outside-toplevel
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
fasttext.FastText.eprint = lambda x: None
_FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
return _FASTTEXT_MODEL
_FASTTEXT_MODEL[model_name] = fasttext.load_model(str(data_dir / model_name))
return _FASTTEXT_MODEL[model_name]
def _fast_text_predict(model: str, text: str, k: int, threshold: float):
    """Run a fasttext prediction and return it as a list of :py:obj:`FastTextResult`.

    :param model: name of the model file, passed to ``_get_fasttext_model``.
    :param text: text handed to the model's ``predict`` method.
    :param k: forwarded to ``predict`` as ``k``.
    :param threshold: forwarded to ``predict`` as ``threshold``.
    :return: one :py:obj:`FastTextResult` per (label, probability) pair, with the
        ``__label__`` prefix stripped from the label.  An empty list when
        ``predict`` does not return a 2-tuple.
    """
    prediction = _get_fasttext_model(model).predict(text, k=k, threshold=threshold)
    if not isinstance(prediction, tuple) or len(prediction) != 2:
        return []

    labels, probabilities = prediction
    results = []
    for label, prob in zip(labels, probabilities):
        # labels look like "__label__<language>": keep only the part after the prefix
        language = label.split('__label__')[1]
        results.append(FastTextResult(language=language, prob=prob))
    return results
def _inner_detect_language(text: str, threshold: float = 0.3) -> Optional[str]:
    """Detect the language of ``text`` with a two-stage fasttext lookup.

    The first stage asks the ``lid.176.ftz`` model for its single best guess.
    The ``model_s.ftz`` model (from fastlangid) is used as a second stage when:

    - the first guess is in ``_FASTTEXT_UNCERTAIN_SETS`` with a probability below
      ``_FASTTEXT_MODEL_S_THRESHOLD``: the best guess of ``model_s.ftz`` is
      returned instead (``None`` if it has no guess above ``threshold``);
    - the first guess is ``zh`` with a probability of at least
      ``_FASTTEXT_MODEL_S_THRESHOLD``: the first of up to 10 ``model_s.ftz``
      results that is in ``_FASTTEXT_CHINESE_FAMILY_CODES`` with a probability
      above ``threshold`` is returned; otherwise the plain ``zh`` is kept.

    :param text: text to analyse; line breaks are replaced by spaces.
    :param threshold: minimum probability for a prediction to be accepted.
    :return: the detected language code, or ``None`` when nothing was detected.
    """
    # The text is sent as a single line.  Line breaks become spaces (as the
    # previous implementation of detect_language did) so that the words on
    # both sides of a line break are not glued together.
    text = text.replace("\n", " ")
    base_results = _fast_text_predict("lid.176.ftz", text, k=1, threshold=threshold)
    if not base_results:
        return None

    first_result = base_results[0]

    # fastlang
    # https://github.com/currentslab/fastlangid
    # Licence Apache-2.0 license
    if first_result.language in _FASTTEXT_UNCERTAIN_SETS and first_result.prob < _FASTTEXT_MODEL_S_THRESHOLD:
        # lid.176.ftz models usually confuse chinese, korean, japanese words
        # if the model is not so sure we pass to our model to reduce down the uncertainty
        refined_results = _fast_text_predict("model_s.ftz", text, k=1, threshold=threshold)
        return refined_results[0].language if refined_results else None

    if first_result.language == 'zh' and first_result.prob >= _FASTTEXT_MODEL_S_THRESHOLD:
        # predict chinese: now we want to know which chinese family it belongs
        for lang, prob in _fast_text_predict("model_s.ftz", text, k=10, threshold=0):
            if lang in _FASTTEXT_CHINESE_FAMILY_CODES and prob > threshold:
                return lang
        # no chinese family matched: keep the first stage answer
        return 'zh' if first_result.prob > threshold else None

    return first_result.language
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
@ -643,13 +684,10 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
"""
if not isinstance(text, str):
raise ValueError('text must a str')
r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
language = r[0][0].split('__label__')[1]
if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
return None
return language
return None
language = _inner_detect_language(text, threshold=threshold)
if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
return None
return language
def js_variable_to_python(js_variable):