mirror of https://github.com/searxng/searxng.git
Replace langdetect with fasttext
This commit is contained in:
parent
a6d870d5cf
commit
1f8f8c1e91
|
@ -11,7 +11,6 @@ httpx[http2]==0.21.2
|
||||||
Brotli==1.0.9
|
Brotli==1.0.9
|
||||||
uvloop==0.17.0
|
uvloop==0.17.0
|
||||||
httpx-socks[asyncio]==0.7.2
|
httpx-socks[asyncio]==0.7.2
|
||||||
langdetect==1.0.9
|
|
||||||
setproctitle==1.3.2
|
setproctitle==1.3.2
|
||||||
redis==4.3.5
|
redis==4.3.5
|
||||||
markdown-it-py==2.1.0
|
markdown-it-py==2.1.0
|
||||||
|
|
|
@ -0,0 +1,98 @@
|
||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
# lint: pylint
|
||||||
|
"""Plugin to detect the search language from the search query.
|
||||||
|
|
||||||
|
The language detection is done by using the fastText_ library (`python
|
||||||
|
fasttext`_). fastText_ distributes the `language identification model`_, for
|
||||||
|
reference:
|
||||||
|
|
||||||
|
- `FastText.zip: Compressing text classification models`_
|
||||||
|
- `Bag of Tricks for Efficient Text Classification`_
|
||||||
|
|
||||||
|
The `language identification model`_ supports the language codes (ISO-639-3)::
|
||||||
|
|
||||||
|
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
|
||||||
|
ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
|
||||||
|
fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
|
||||||
|
is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
|
||||||
|
mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
|
||||||
|
no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
|
||||||
|
sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
|
||||||
|
vi vls vo wa war wuu xal xmf yi yo yue zh
|
||||||
|
|
||||||
|
The `language identification model`_ is harmonized with the SearXNG's language
|
||||||
|
(locale) model. General conditions of SearXNG's locale model are:
|
||||||
|
|
||||||
|
a. SearXNG's locale of a query is passed to the
|
||||||
|
:py:obj:`searx.locales.get_engine_locale` to get a language and/or region
|
||||||
|
code that is used by an engine.
|
||||||
|
|
||||||
|
b. SearXNG and most of the engines do not support all the languages from
|
||||||
|
the language model and there might also be a discrepancy in the ISO-639-3 and
|
||||||
|
ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).
|
||||||
|
Furthermore, in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to
|
||||||
|
``zh_Hant`` (``zh_Hans``).
|
||||||
|
|
||||||
|
Conclusion: This plugin only auto-detects the languages a user can select in
|
||||||
|
the language menu (:py:obj:`supported_langs`).
|
||||||
|
|
||||||
|
SearXNG's locale of a query comes from (*highest wins*):
|
||||||
|
|
||||||
|
1. The ``Accept-Language`` header from user's HTTP client.
|
||||||
|
2. The user selects a locale in the preferences.
|
||||||
|
3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``)
|
||||||
|
4. This plugin is activated in the preferences and the locale (only the language
|
||||||
|
code / no region code) comes from fastText's language detection.
|
||||||
|
|
||||||
|
Conclusion: There is a conflict between the language selected by the user and
|
||||||
|
the language from language detection of this plugin. For example, the user
|
||||||
|
explicitly selects the German locale via the search syntax to search for a term
|
||||||
|
that is identified as an English term (try ``:de-DE thermomix``, for example).
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
To SearXNG maintainers; please take into account: under some circumstances
|
||||||
|
the auto-detection of the language of this plugin could be detrimental to
|
||||||
|
users' expectations. It's not recommended to activate this plugin by
|
||||||
|
default. It should always be the user's decision whether to activate this
|
||||||
|
plugin or not.
|
||||||
|
|
||||||
|
.. _fastText: https://fasttext.cc/
|
||||||
|
.. _python fasttext: https://pypi.org/project/fasttext/
|
||||||
|
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
|
||||||
|
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
|
||||||
|
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from flask_babel import gettext
|
||||||
|
import babel
|
||||||
|
|
||||||
|
from searx.utils import detect_language
|
||||||
|
from searx.languages import language_codes
|
||||||
|
|
||||||
|
|
||||||
|
name = gettext('Autodetect search language')
|
||||||
|
description = gettext('Automatically detect the query search language and switch to it.')
|
||||||
|
preference_section = 'general'
|
||||||
|
default_on = False
|
||||||
|
|
||||||
|
supported_langs = set()
|
||||||
|
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
|
||||||
|
|
||||||
|
|
||||||
|
def pre_search(request, search):  # pylint: disable=unused-argument
    """Switch the search query to the language detected from the query text.

    The query string is passed to :py:obj:`searx.utils.detect_language` with
    ``min_probability=0``.  Only when the detected language is one of
    :py:obj:`supported_langs` the query's ``lang`` is replaced and the query's
    ``locale`` is set to the matching babel locale (if babel knows it).

    The function returns ``True`` in every case.
    """
    query = search.search_query
    detected = detect_language(query.query, min_probability=0)

    # nothing detected (None) or a language the user can't select: keep the query as it is
    if detected not in supported_langs:
        return True

    query.lang = detected
    try:
        query.locale = babel.Locale.parse(detected)
    except babel.core.UnknownLocaleError:
        # babel has no locale for this language code: the lang stays set, the locale is left untouched
        pass
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def init(app, settings):  # pylint: disable=unused-argument
    """Fill :py:obj:`supported_langs` from :py:obj:`searx.languages.language_codes`.

    From the first item of each entry only the part before the first ``-`` is
    added to the set.  The function always returns ``True``.
    """
    supported_langs.update(entry[0].split('-')[0] for entry in language_codes)
    return True
|
|
@ -10,12 +10,10 @@ from timeit import default_timer
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from langdetect import detect_langs
|
|
||||||
from langdetect.lang_detect_exception import LangDetectException
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from searx import network, logger
|
from searx import network, logger
|
||||||
from searx.utils import gen_useragent
|
from searx.utils import gen_useragent, detect_language
|
||||||
from searx.results import ResultContainer
|
from searx.results import ResultContainer
|
||||||
from searx.search.models import SearchQuery, EngineRef
|
from searx.search.models import SearchQuery, EngineRef
|
||||||
from searx.search.processors import EngineProcessor
|
from searx.search.processors import EngineProcessor
|
||||||
|
@ -208,14 +206,10 @@ class ResultContainerTests:
|
||||||
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
|
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
|
||||||
|
|
||||||
def _add_language(self, text: str) -> typing.Optional[str]:
|
def _add_language(self, text: str) -> typing.Optional[str]:
|
||||||
try:
|
langStr = detect_language(text)
|
||||||
r = detect_langs(str(text)) # pylint: disable=E1101
|
if langStr:
|
||||||
except LangDetectException:
|
self.languages.add(langStr)
|
||||||
return None
|
self.test_results.add_language(langStr)
|
||||||
|
|
||||||
if len(r) > 0 and r[0].prob > 0.95:
|
|
||||||
self.languages.add(r[0].lang)
|
|
||||||
self.test_results.add_language(r[0].lang)
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _check_result(self, result):
|
def _check_result(self, result):
|
||||||
|
|
|
@ -15,6 +15,7 @@ from os.path import splitext, join
|
||||||
from random import choice
|
from random import choice
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
import fasttext
|
||||||
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
||||||
|
@ -22,7 +23,7 @@ from babel.core import get_global
|
||||||
|
|
||||||
|
|
||||||
from searx import settings
|
from searx import settings
|
||||||
from searx.data import USER_AGENTS
|
from searx.data import USER_AGENTS, data_dir
|
||||||
from searx.version import VERSION_TAG
|
from searx.version import VERSION_TAG
|
||||||
from searx.languages import language_codes
|
from searx.languages import language_codes
|
||||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
||||||
|
@ -50,6 +51,12 @@ _STORAGE_UNIT_VALUE: Dict[str, int] = {
|
||||||
_XPATH_CACHE: Dict[str, XPath] = {}
|
_XPATH_CACHE: Dict[str, XPath] = {}
|
||||||
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
|
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
|
||||||
|
|
||||||
|
_FASTTEXT_MODEL: Optional[fasttext.FastText._FastText] = None
|
||||||
|
"""fasttext model to predict language of a search term"""
|
||||||
|
|
||||||
|
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
|
||||||
|
fasttext.FastText.eprint = lambda x: None
|
||||||
|
|
||||||
|
|
||||||
class _NotSetClass: # pylint: disable=too-few-public-methods
|
class _NotSetClass: # pylint: disable=too-few-public-methods
|
||||||
"""Internal class for this module, do not create instance of this class.
|
"""Internal class for this module, do not create instance of this class.
|
||||||
|
@ -621,3 +628,20 @@ def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index:
|
||||||
# to record xpath_spec
|
# to record xpath_spec
|
||||||
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
|
raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _get_fasttext_model() -> fasttext.FastText._FastText:
    """Return the fastText model kept in ``_FASTTEXT_MODEL``.

    On the first call the model file ``lid.176.ftz`` is loaded from
    ``data_dir`` and stored in the module global, later calls return the
    stored object.
    """
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is not None:
        return _FASTTEXT_MODEL

    model_file = data_dir / 'lid.176.ftz'
    _FASTTEXT_MODEL = fasttext.load_model(str(model_file))
    return _FASTTEXT_MODEL
|
||||||
|
|
||||||
|
|
||||||
|
def detect_language(text: str, threshold: float = 0.3, min_probability: float = 0.5) -> Optional[str]:
    """Detect the language of ``text`` with the fastText language identification model.

    Reference: https://fasttext.cc/docs/en/language-identification.html

    :param text: text to classify; new lines are replaced by spaces before
        the text is handed over to the model.
    :param threshold: forwarded as ``threshold`` to the model's ``predict``.
    :param min_probability: the probability of the best label must be strictly
        greater than this value, otherwise nothing is returned.
    :return: the best label with its ``__label__`` prefix removed (e.g.
        ``'en'``) or ``None`` when there is no sufficiently probable label.
    :raises ValueError: when ``text`` is not a :py:class:`str`.
    """
    if not isinstance(text, str):
        raise ValueError('text must be a str')

    # only the single best label is requested (k=1)
    prediction = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)

    # be defensive about the shape of the result: a (labels, probabilities) pair is expected
    if not (isinstance(prediction, tuple) and len(prediction) == 2):
        return None
    labels, probabilities = prediction
    if len(labels) > 0 and len(probabilities) > 0 and probabilities[0] > min_probability:
        return labels[0].split('__label__')[1]
    return None
|
||||||
|
|
|
@ -17,14 +17,11 @@ from os.path import join
|
||||||
|
|
||||||
from lxml.html import fromstring
|
from lxml.html import fromstring
|
||||||
|
|
||||||
from langdetect import detect_langs
|
|
||||||
from langdetect.lang_detect_exception import LangDetectException
|
|
||||||
|
|
||||||
from searx.engines import wikidata, set_loggers
|
from searx.engines import wikidata, set_loggers
|
||||||
from searx.utils import extract_text, match_language
|
from searx.utils import extract_text, match_language
|
||||||
from searx.locales import LOCALE_NAMES, locales_initialize
|
from searx.locales import LOCALE_NAMES, locales_initialize
|
||||||
from searx import searx_dir
|
from searx import searx_dir
|
||||||
from searx.utils import gen_useragent
|
from searx.utils import gen_useragent, detect_language
|
||||||
import searx.search
|
import searx.search
|
||||||
import searx.network
|
import searx.network
|
||||||
|
|
||||||
|
@ -117,17 +114,6 @@ def get_wikipedia_summary(lang, pageid):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def detect_language(text):
|
|
||||||
try:
|
|
||||||
r = detect_langs(str(text)) # pylint: disable=E1101
|
|
||||||
except LangDetectException:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if len(r) > 0 and r[0].prob > 0.95:
|
|
||||||
return r[0].lang
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_website_description(url, lang1, lang2=None):
|
def get_website_description(url, lang1, lang2=None):
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': gen_useragent(),
|
'User-Agent': gen_useragent(),
|
||||||
|
|
|
@ -232,3 +232,25 @@ class TestXPathUtils(SearxTestCase):
|
||||||
with self.assertRaises(SearxEngineXPathException) as context:
|
with self.assertRaises(SearxEngineXPathException) as context:
|
||||||
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
|
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
|
||||||
self.assertEqual(context.exception.message, 'the result is not a list')
|
self.assertEqual(context.exception.message, 'the result is not a list')
|
||||||
|
|
||||||
|
def test_detect_language(self):
    """Check ``utils.detect_language`` with plain, empty, mixed and invalid input."""
    # The first sample contains a new line: make sure new lines are not an
    # issue, fasttext's predict() does not accept them.
    expected_languages = (
        ('The quick brown fox jumps over\nthe lazy dog', 'en'),
        ('いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす', 'ja'),
        ('Pijamalı hasta yağız şoföre çabucak güvendi.', 'tr'),
    )
    for sample, language in expected_languages:
        self.assertEqual(utils.detect_language(sample), language)

    # empty text and mixed languages --> None
    for sample in ('', 'The いろはにほへと Pijamalı'):
        self.assertIsNone(utils.detect_language(sample))

    # anything but a str is rejected
    with self.assertRaises(ValueError):
        utils.detect_language(None)
|
||||||
|
|
Loading…
Reference in New Issue