[mod] wikipedia & wikidata: upgrade to data_type: traits_v1

BTW this fixes an issue in wikipedia: SearXNG's locales zh-TW and zh-HK now use
the language `zh-classical` from wikipedia (and not `zh`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent e0a6ca96cc
commit 858aa3e604

5 changed files with 275 additions and 2789 deletions
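The zh-TW / zh-HK fix comes from the new traits_v1 lookup: a SearXNG locale is first resolved to a Wikipedia language, and that language to the netloc of the matching Wikipedia. Below is a reduced, self-contained sketch of that resolution; the plain dicts and the resolve() helper are stand-ins (not SearXNG code) for EngineTraits.get_language() and the wiki_netloc custom field shown in the diffs that follow, with values taken from this commit.

    # Reduced illustration, not SearXNG code: plain dicts stand in for
    # EngineTraits.languages and EngineTraits.custom['wiki_netloc'].
    languages = {        # SearXNG language tag --> Wikipedia language code
        'zh': 'zh',
        'zh_Hans': 'zh',
        'zh_Hant': 'zh-classical',
    }
    wiki_netloc = {      # Wikipedia language code --> netloc of that Wikipedia
        'zh': 'zh.wikipedia.org',
        'zh-classical': 'zh-classical.wikipedia.org',
    }

    def resolve(sxng_locale: str) -> str:
        """Map a SearXNG locale to a Wikipedia netloc (simplified lookup)."""
        # zh-TW and zh-HK use traditional script --> zh_Hant
        by_region = {'zh-TW': 'zh_Hant', 'zh-HK': 'zh_Hant', 'zh-CN': 'zh_Hans'}
        lang = languages.get(by_region.get(sxng_locale, sxng_locale.split('-')[0]), 'en')
        return wiki_netloc.get(lang, 'en.wikipedia.org')

    print(resolve('zh-TW'))  # zh-classical.wikipedia.org (before this commit: zh.wikipedia.org)
    print(resolve('zh'))     # zh.wikipedia.org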
				
			
		
							
								
								
									
docs/src/searx.engines.wikipedia.rst (new file, 27 lines)

@@ -0,0 +1,27 @@
.. _wikimedia engines:

=========
Wikimedia
=========

.. contents:: Contents
   :depth: 2
   :local:
   :backlinks: entry


.. _wikipedia engine:

Wikipedia
=========

.. automodule:: searx.engines.wikipedia
  :members:

.. _wikidata engine:

Wikidata
=========

.. automodule:: searx.engines.wikidata
  :members:
searx/autocomplete.py

@@ -143,14 +143,31 @@ def qwant(query, sxng_locale):
    return results


def wikipedia(query, lang):
    # wikipedia autocompleter
    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
def wikipedia(query, sxng_locale):
    """Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
    results = []
    eng_traits = engines['wikipedia'].traits
    wiki_lang = eng_traits.get_language(sxng_locale, 'en')
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')

    resp = loads(get(url.format(urlencode(dict(search=query)))).text)
    if len(resp) > 1:
        return resp[1]
    return []
    url = 'https://{wiki_netloc}/w/api.php?{args}'
    args = urlencode(
        {
            'action': 'opensearch',
            'format': 'json',
            'formatversion': '2',
            'search': query,
            'namespace': '0',
            'limit': '10',
        }
    )
    resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
    if resp.ok:
        data = resp.json()
        if len(data) > 1:
            results = data[1]

    return results


def yandex(query, _lang):
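For illustration, a stand-alone sketch (not part of the commit) of the opensearch URL the rewritten autocompleter builds; the netloc value is an assumption, standing in for the traits.custom['wiki_netloc'] lookup for a zh-TW locale.

    # Stand-alone sketch of the URL built by the new wikipedia() autocompleter;
    # the netloc is an assumed value resolved from traits.custom['wiki_netloc']
    # for a zh-TW locale.
    from urllib.parse import urlencode

    wiki_netloc = 'zh-classical.wikipedia.org'
    args = urlencode({
        'action': 'opensearch',
        'format': 'json',
        'formatversion': '2',
        'search': '日',
        'namespace': '0',
        'limit': '10',
    })
    url = 'https://{wiki_netloc}/w/api.php?{args}'.format(wiki_netloc=wiki_netloc, args=args)
    print(url)
    # https://zh-classical.wikipedia.org/w/api.php?action=opensearch&format=json&formatversion=2&search=%E6%97%A5&namespace=0&limit=10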
File diff suppressed because it is too large
searx/engines/wikidata.py

@@ -1,9 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Wikidata
"""This module implements the Wikidata engine.  Some implementations are shared
from :ref:`wikipedia engine`.

"""
# pylint: disable=missing-class-docstring

from typing import TYPE_CHECKING
from hashlib import md5
from urllib.parse import urlencode, unquote
from json import loads
@@ -13,13 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_

from searx.data import WIKIDATA_UNITS
from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import (  # pylint: disable=unused-import
    fetch_traits,
    _fetch_supported_languages,
    supported_languages_url,
)
from searx.engines.wikipedia import fetch_traits as _fetch_traits
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
@@ -155,33 +162,35 @@ def send_wikidata_query(query, method='GET'):


def request(query, params):
    language = params['language'].split('-')[0]
    if language == 'all':
        language = 'en'
    else:
        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]

    # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
    # mapped to zh
    sxng_lang = params['searxng_locale'].split('-')[0]
    language = traits.get_language(sxng_lang, 'en')

    query, attributes = get_query(query, language)
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))

    params['method'] = 'POST'
    params['url'] = SPARQL_ENDPOINT_URL
    params['data'] = {'query': query}
    params['headers'] = get_headers()

    params['language'] = language
    params['attributes'] = attributes

    return params


def response(resp):

    results = []
    jsonresponse = loads(resp.content.decode())

    language = resp.search_params['language'].lower()
    language = resp.search_params['language']
    attributes = resp.search_params['attributes']
    logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))

    seen_entities = set()

    for result in jsonresponse.get('results', {}).get('bindings', []):
        attribute_result = {key: value['value'] for key, value in result.items()}
        entity_url = attribute_result['item']
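Note that, unlike the Wikipedia engine, only the language part of the SearXNG locale is used here, so the regional variants collapse to plain zh (as the comment in the hunk above says). A minimal stand-alone illustration, with a plain dict standing in for the traits lookup:

    # Minimal illustration, not SearXNG code: zh-TW / zh-HK / zh-CN all end up as 'zh'.
    wikidata_languages = {'zh': 'zh', 'en': 'en'}    # stand-in for the traits data

    for sxng_locale in ('zh-TW', 'zh-HK', 'zh-CN', 'zh'):
        sxng_lang = sxng_locale.split('-')[0]        # always 'zh' here
        language = wikidata_languages.get(sxng_lang, 'en')
        print(sxng_locale, '-->', language)          # every variant prints 'zh'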
@@ -757,3 +766,15 @@ def init(engine_settings=None):  # pylint: disable=unused-argument
        lang = result['name']['xml:lang']
        entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
        WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()


def fetch_traits(engine_traits: EngineTraits):
    """Use languages evaluated from :py:obj:`wikipedia.fetch_traits
    <searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what
    is not supported by wikidata."""

    _fetch_traits(engine_traits)
    # wikidata does not support zh-classical (zh_Hans)
    engine_traits.languages.pop('zh_Hans')
    # wikidata does not have net-locations for the languages
    engine_traits.custom['wiki_netloc'] = {}
searx/engines/wikipedia.py

@@ -1,16 +1,26 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Wikipedia (Web)
# lint: pylint
"""This module implements the Wikipedia engine.  Some of this implementations
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is fetched from the article linked by
:py:obj:`wikipedia_article_depth`.  Unlike traditional search engines, wikipedia
does not support one Wikipedia for all the languages, but there is one Wikipedia
for every language (:py:obj:`fetch_traits`).
"""

from urllib.parse import quote
from json import loads
import urllib.parse
import babel

from lxml import html
from searx.utils import match_language, searx_useragent

from searx import network
from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits

engine_traits: EngineTraits
traits: EngineTraits

# about
about = {
@@ -22,32 +32,40 @@ about = {
    "results": 'JSON',
}


send_accept_language_header = True

# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated.  The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
and link previews in the apps.

.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

"""


# set language in base_url
def url_lang(lang):
    lang_pre = lang.split('-')[0]
    if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
        return 'en'
    return match_language(lang, supported_languages, language_aliases).split('-')[0]


# do search-request
def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        query = query.title()

    language = url_lang(params['language'])
    params['url'] = search_url.format(title=quote(query), language=language)
    engine_language = traits.get_language(params['searxng_locale'], 'en')
    wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
    title = urllib.parse.quote(query)

    # '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
    # '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['headers']['User-Agent'] = searx_useragent()
    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2
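To make the two bang examples in the comments above concrete, here is a small stand-alone sketch (not part of the commit) of the summary URL request() assembles; the netloc values are assumptions taken from the wiki_netloc examples elsewhere in this commit.

    # Stand-alone sketch of the rest_v1 summary URL assembled by the new request();
    # the netloc values are assumptions taken from the wiki_netloc examples in this commit.
    import urllib.parse

    rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
    title = urllib.parse.quote('日')

    print(rest_v1_summary_url.format(wiki_netloc='zh-classical.wikipedia.org', title=title))
    # https://zh-classical.wikipedia.org/api/rest_v1/page/summary/%E6%97%A5
    print(rest_v1_summary_url.format(wiki_netloc='zh.wikipedia.org', title=title))
    # https://zh.wikipedia.org/api/rest_v1/page/summary/%E6%97%A5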
@@ -56,13 +74,14 @@ def request(query, params):

# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []

    if resp.status_code == 400:
        try:
            api_result = loads(resp.text)
        except:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
@@ -73,52 +92,25 @@ def response(resp):

    network.raise_for_httperror(resp)

    results = []
    api_result = loads(resp.text)

    # skip disambiguation pages
    if api_result.get('type') != 'standard':
        return []

    api_result = resp.json()
    title = api_result['title']
    wikipedia_link = api_result['content_urls']['desktop']['page']
    results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    results.append({'url': wikipedia_link, 'title': title})

    results.append(
        {
            'infobox': title,
            'id': wikipedia_link,
            'content': api_result.get('extract', ''),
            'img_src': api_result.get('thumbnail', {}).get('source'),
            'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
        }
    )
    if api_result.get('type') == 'standard':
        results.append(
            {
                'infobox': title,
                'id': wikipedia_link,
                'content': api_result.get('extract', ''),
                'img_src': api_result.get('thumbnail', {}).get('source'),
                'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
            }
        )

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = {}
    dom = html.fromstring(resp.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[1].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
            # exclude languages with too few articles
            if articles >= 100:
                supported_languages[code] = {"name": name, "english_name": english_name}

    return supported_languages


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
@@ -135,104 +127,57 @@ lang_map = {
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    #'roa-tara: – invented code used for the Tarantino Wikipedia (again, roa is the standard code for the large family of Romance languages that the Tarantino dialect falls within)
    #'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
    'zh-classical': 'zh-Hant',  # babel maps classical to zh-Hans (for whatever reason)
}

unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'bh',  # Bhojpuri
    'bi',  # Bislama
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'be-tarask',  # Belarusian variant / Belarusian is already covered by 'be'
    'bpy',  # Bishnupriya Manipuri is unknown by babel
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
    'ltg',  # Latgalian
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    'sco',  # Scots
    'sco',  # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic (https://gd.wikipedia.org) is known by babel
    'sco',  # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
    'sh',  # Serbo-Croatian
    'simple',  # simple english is not know as a natural language different to english (babel)
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
]


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia"""
    # pylint: disable=import-outside-toplevel
    """Fetch languages from Wikipedia.

    engine_traits.data_type = 'supported_languages'  # deprecated
    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``).  Here is a reduced example:

    import babel
    from searx.locales import language_tag
    .. code:: python

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }

    """

    engine_traits.custom['wiki_netloc'] = {}

    # insert alias to map from a region like zh-CN to a language zh_Hans
    engine_traits.languages['zh_Hans'] = 'zh'

    resp = network.get(wikipedia_article_depth)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
@@ -242,34 +187,31 @@ def fetch_traits(engine_traits: EngineTraits):
        cols = row.xpath('./td')
        if not cols:
            continue

        cols = [c.text_content().strip() for c in cols]
        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')

        if articles < 1000:
        depth = float(cols[3].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', '').replace(',', ''))

        if articles < 10000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia’s quality, showing how
        #        frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> at least 1000 user should registered
                continue
        elif int(depth) < 20:
        if int(depth) < 20:
            # Rough indicator of a Wikipedia’s quality, showing how frequently
            # its articles are updated.
            continue

        eng_tag = cols[3]
        eng_tag = cols[2]
        wiki_url = row.xpath('./td[3]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
            print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
@@ -277,6 +219,6 @@ def fetch_traits(engine_traits: EngineTraits):
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag

    engine_traits.languages['zh_Hans'] = 'zh'
        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
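Finally, a reduced stand-alone sketch of the per-row mapping fetch_traits() builds from the List of Wikipedias table; language_tag() below is a simplified stand-in for searx.locales.language_tag, and the two sample rows reuse the gsw/als and zh-classical values from the docstring example above.

    # Reduced sketch, not SearXNG code: per table row, the Wikipedia language code
    # (eng_tag) is normalized with babel and stored in two mappings.
    import babel

    def language_tag(locale: babel.Locale) -> str:
        # simplified stand-in for searx.locales.language_tag
        return locale.language if not locale.script else '%s_%s' % (locale.language, locale.script)

    lang_map = {'zh-classical': 'zh-Hant', 'als': 'gsw'}  # reduced lang_map
    languages, wiki_netloc = {}, {}

    for eng_tag, netloc in [('zh-classical', 'zh-classical.wikipedia.org'), ('als', 'als.wikipedia.org')]:
        sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        languages[sxng_tag] = eng_tag
        wiki_netloc[eng_tag] = netloc

    print(languages)    # {'zh_Hant': 'zh-classical', 'gsw': 'als'}
    print(wiki_netloc)  # {'zh-classical': 'zh-classical.wikipedia.org', 'als': 'als.wikipedia.org'}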