[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in wikipedia: SearXNG's locales zh-TW and zh-HK are now using language `zh-classical` from wikipedia (and not `zh`). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2022-10-28 19:12:59 +02:00 · 2022-10-28 19:12:59 +02:00 · 858aa3e604
commit 858aa3e604
parent e0a6ca96cc
5 changed files with 275 additions and 2789 deletions
--- a/docs/src/searx.engines.wikipedia.rst
+++ b/docs/src/searx.engines.wikipedia.rst
@ -0,0 +1,27 @@
 .. _wikimedia engines:
 =========
 Wikimedia
 =========
 .. contents:: Contents
   :depth: 2
   :local:
   :backlinks: entry
 .. _wikipedia engine:
 Wikipedia
 =========
 .. automodule:: searx.engines.wikipedia
  :members:
 .. _wikidata engine:
 Wikidata
 =========
 .. automodule:: searx.engines.wikidata
  :members:
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@ -143,14 +143,31 @@ def qwant(query, sxng_locale):
    return results
-def wikipedia(query, lang):
+def wikipedia(query, sxng_locale):
-    # wikipedia autocompleter
+    """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
-    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
+    results = []
    eng_traits = engines['wikipedia'].traits
    wiki_lang = eng_traits.get_language(sxng_locale, 'en')
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
-    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
+    url = 'https://{wiki_netloc}/w/api.php?{args}'
-    if len(resp) > 1:
+    args = urlencode(
-        return resp[1]
+        {
-    return []
+            'action': 'opensearch',
            'format': 'json',
            'formatversion': '2',
            'search': query,
            'namespace': '0',
            'limit': '10',
        }
    )
    resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
    if resp.ok:
        data = resp.json()
        if len(data) > 1:
            results = data[1]
    return results
 def yandex(query, _lang):
--- a/searx/data/engine_traits.json
+++ b/searx/data/engine_traits.json
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@ -1,9 +1,12 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Wikidata
+"""This module implements the Wikidata engine.  Some implementations are shared
 from :ref:`wikipedia engine`.
 """
 # pylint: disable=missing-class-docstring
 from typing import TYPE_CHECKING
 from hashlib import md5
 from urllib.parse import urlencode, unquote
 from json import loads
@ -13,13 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
 from searx.data import WIKIDATA_UNITS
 from searx.network import post, get
-from searx.utils import match_language, searx_useragent, get_string_replaces_function
+from searx.utils import searx_useragent, get_string_replaces_function
 from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
-from searx.engines.wikipedia import (  # pylint: disable=unused-import
+from searx.engines.wikipedia import fetch_traits as _fetch_traits
-    fetch_traits,
+from searx.enginelib.traits import EngineTraits
-    _fetch_supported_languages,
+
-    supported_languages_url,
+if TYPE_CHECKING:
-)
+    import logging
    logger: logging.Logger
 traits: EngineTraits
 # about
 about = {
@ -155,33 +162,35 @@ def send_wikidata_query(query, method='GET'):
 def request(query, params):
-    language = params['language'].split('-')[0]
+
-    if language == 'all':
+    # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
-        language = 'en'
+    # mapped to zh
-    else:
+    sxng_lang = params['searxng_locale'].split('-')[0]
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+    language = traits.get_language(sxng_lang, 'en')
    query, attributes = get_query(query, language)
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
    params['method'] = 'POST'
    params['url'] = SPARQL_ENDPOINT_URL
    params['data'] = {'query': query}
    params['headers'] = get_headers()
    params['language'] = language
    params['attributes'] = attributes
    return params
 def response(resp):
    results = []
    jsonresponse = loads(resp.content.decode())
-    language = resp.search_params['language'].lower()
+    language = resp.search_params['language']
    attributes = resp.search_params['attributes']
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
    seen_entities = set()
    for result in jsonresponse.get('results', {}).get('bindings', []):
        attribute_result = {key: value['value'] for key, value in result.items()}
        entity_url = attribute_result['item']
@ -757,3 +766,15 @@ def init(engine_settings=None):  # pylint: disable=unused-argument
        lang = result['name']['xml:lang']
        entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
        WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
 def fetch_traits(engine_traits: EngineTraits):
    """Fill ``engine_traits`` for Wikidata, based on the Wikipedia traits.

    The languages are evaluated by :py:obj:`wikipedia.fetch_traits
    <searx.engines.wikipedia.fetch_traits>`.  Afterwards the ``zh_Hans`` entry
    is removed from the languages (zh-classical is not supported by wikidata)
    and the ``wiki_netloc`` custom field is reset to an empty mapping.

    :param engine_traits: traits object that is modified in place; nothing is
        returned.
    """
    # delegate to the Wikipedia engine's fetch_traits (imported as _fetch_traits)
    _fetch_traits(engine_traits)
    # wikidata does not support zh-classical (zh_Hans)
    # NOTE(review): the Wikipedia engine's lang_map maps 'zh-classical' to
    # 'zh-Hant', while 'zh_Hans' is inserted there as an alias for 'zh' --
    # confirm that 'zh_Hans' is really the key that should be dropped here.
    # pop() is called without a default; presumably this raises if the key is
    # missing -- verify against the type of ``languages``.
    engine_traits.languages.pop('zh_Hans')
    # wikidata does not have net-locations for the languages
    engine_traits.custom['wiki_netloc'] = {}
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@ -1,16 +1,26 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
+# lint: pylint
- Wikipedia (Web)
+"""This module implements the Wikipedia engine.  Some of these implementations
 are shared by other engines:
 - :ref:`wikidata engine`
 The list of supported languages is fetched from the article linked by
 :py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
 does not support one Wikipedia for all the languages, but there is one Wikipedia
 for every language (:py:obj:`fetch_traits`).
 """
-from urllib.parse import quote
+import urllib.parse
-from json import loads
+import babel
 from lxml import html
-from searx.utils import match_language, searx_useragent
+
 from searx import network
 from searx.locales import language_tag
 from searx.enginelib.traits import EngineTraits
-engine_traits: EngineTraits
+traits: EngineTraits
 # about
 about = {
@ -22,32 +32,40 @@ about = {
    "results": 'JSON',
 }
 send_accept_language_header = True
-# search-url
+wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
-search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
+"""The *editing depth* of Wikipedia is one of several possible rough indicators
-supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
+of the encyclopedia's collaborative quality, showing how frequently its articles
-language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
+are updated.  The measurement of depth was introduced after some limitations of
 the classic measurement of article count were realized.
 """
 # example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
 rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
 """`wikipedia rest_v1 summary API`_: The summary response includes an extract of
 the first paragraph of the page in plain text and HTML as well as the type of
 page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
 and link previews in the apps.
 .. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
 """
 # set language in base_url
 def url_lang(lang):
    """Return the Wikipedia language code to use for the locale ``lang``.

    Only the part of ``lang`` before the first ``-`` is inspected.  ``'en'`` is
    returned when that prefix is ``all`` or when it is found neither in
    ``supported_languages`` nor in ``language_aliases``; otherwise the prefix
    of the tag picked by ``match_language`` is returned.
    """
    prefix = lang.split('-')[0]

    # 'all' means "no specific language" --> fall back to English
    if prefix == 'all':
        return 'en'

    # languages unknown to the engine also fall back to English
    if prefix not in supported_languages and prefix not in language_aliases:
        return 'en'

    matched = match_language(lang, supported_languages, language_aliases)
    return matched.split('-')[0]
 # do search-request
 def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        query = query.title()
-    language = url_lang(params['language'])
+    engine_language = traits.get_language(params['searxng_locale'], 'en')
-    params['url'] = search_url.format(title=quote(query), language=language)
+    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
    title = urllib.parse.quote(query)
    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
    params['headers']['User-Agent'] = searx_useragent()
    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2
@ -56,13 +74,14 @@ def request(query, params):
 # get response from search-request
 def response(resp):
    results = []
    if resp.status_code == 404:
        return []
    if resp.status_code == 400:
        try:
-            api_result = loads(resp.text)
+            api_result = resp.json()
-        except:
+        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
@ -73,18 +92,12 @@ def response(resp):
    network.raise_for_httperror(resp)
-    results = []
+    api_result = resp.json()
    api_result = loads(resp.text)
    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []
    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
-    results.append({'url': wikipedia_link, 'title': title})
+    if api_result.get('type') == 'standard':
        results.append(
            {
                'infobox': title,
@ -98,27 +111,6 @@ def response(resp):
    return results
 # get supported languages from their site
 def _fetch_supported_languages(resp):
    """Parse the list of Wikipedias from ``resp`` into a language mapping.

    Every ``<table>`` whose class contains ``sortable`` is scanned row by row
    (the first row is skipped as header).  A row contributes an entry
    ``{code: {"name": ..., "english_name": ...}}`` when its article count is
    at least 100.

    :param resp: HTTP response whose ``text`` is the HTML page to parse.
    :return: dict mapping the language code to its name information.
    """
    languages = {}
    document = html.fromstring(resp.text)

    for table in document.xpath('//table[contains(@class,"sortable")]'):
        # the first <tr> of a table is the header row
        for row in table.xpath('.//tr')[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            name = cells[1].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            article_count = int(cells[4].xpath('./a')[0].text.replace(',', ''))

            # exclude languages with too few articles
            if article_count < 100:
                continue
            languages[code] = {"name": name, "english_name": english_name}

    return languages
 # Nonstandard language codes
 #
 # These Wikipedias use language codes that do not conform to the ISO 639
@ -135,104 +127,57 @@ lang_map = {
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    #'roa-tara: – invented code used for the Tarantino Wikipedia (again, roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
    #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
 }
 unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
-    'bh',  # Bhojpuri
+    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
-    'bi',  # Bislama
+    'bpy',  # Bishnupriya Manipuri is unknown by babel
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
-    'ltg',  # Latgalian
+    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    'sco',  # Scots
    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
    'sh',  # Serbo-Croatian
    'simple',  # simple english is not known as a natural language different to english (babel)
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
 ]
 def fetch_traits(engine_traits: EngineTraits):
-    """Fetch languages from Wikipedia"""
+    """Fetch languages from Wikipedia.
    # pylint: disable=import-outside-toplevel
-    engine_traits.data_type = 'supported_languages'  # deprecated
+    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:
-    import babel
+    .. code:: python
    from searx.locales import language_tag
-    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
+       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }
    """
    engine_traits.custom['wiki_netloc'] = {}
    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'
    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
@ -242,34 +187,31 @@ def fetch_traits(engine_traits: EngineTraits):
        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]
        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')
-        if articles < 1000:
+        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', '').replace(',', ''))
        if articles < 10000:
            # exclude languages with too few articles
            continue
-        # depth: rough indicator of a Wikipedia’s quality, showing how
+        if int(depth) < 20:
-        #        frequently its articles are updated.
+            # Rough indicator of a Wikipedia’s quality, showing how frequently
-        if depth == '':
+            # its articles are updated.
            if users < 1000:
                # depth is not calculated --> at least 1000 user should registered
                continue
        elif int(depth) < 20:
            continue
-        eng_tag = cols[3]
+        eng_tag = cols[2]
        wiki_url = row.xpath('./td[3]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)
        if eng_tag in unknown_langs:
            continue
        try:
-            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
+            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
-            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
+            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue
        conflict = engine_traits.languages.get(sxng_tag)
@ -277,6 +219,6 @@ def fetch_traits(engine_traits: EngineTraits):
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag
-    engine_traits.languages['zh_Hans'] = 'zh'
+        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc