forked from zaclys/searxng
		
	[mod] Wikipedia: fetch engine traits (data_type: supported_languages)
Implements a fetch_traits function for the Wikipedia engines. .. note:: Does not include migration of the request method from 'supported_languages' to 'traits' (EngineTraits) object! Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									f78f908383
								
							
						
					
					
						commit
						7daf4f95ef
					
				
					 3 changed files with 391 additions and 6 deletions
				
			
		| 
						 | 
				
			
			@ -5121,7 +5121,116 @@
 | 
			
		|||
    "all_locale": null,
 | 
			
		||||
    "custom": {},
 | 
			
		||||
    "data_type": "supported_languages",
 | 
			
		||||
    "languages": {},
 | 
			
		||||
    "languages": {
 | 
			
		||||
      "af": "af",
 | 
			
		||||
      "ak": "tw",
 | 
			
		||||
      "am": "am",
 | 
			
		||||
      "ar": "ar",
 | 
			
		||||
      "as": "as",
 | 
			
		||||
      "az": "az",
 | 
			
		||||
      "be": "be",
 | 
			
		||||
      "bg": "bg",
 | 
			
		||||
      "bn": "bn",
 | 
			
		||||
      "bo": "bo",
 | 
			
		||||
      "bs": "bs",
 | 
			
		||||
      "ca": "ca",
 | 
			
		||||
      "chr": "chr",
 | 
			
		||||
      "ckb": "ckb",
 | 
			
		||||
      "cs": "cs",
 | 
			
		||||
      "da": "da",
 | 
			
		||||
      "de": "de",
 | 
			
		||||
      "dsb": "dsb",
 | 
			
		||||
      "el": "el",
 | 
			
		||||
      "en": "en",
 | 
			
		||||
      "es": "es",
 | 
			
		||||
      "et": "et",
 | 
			
		||||
      "fa": "fa",
 | 
			
		||||
      "fi": "fi",
 | 
			
		||||
      "fil": "tl",
 | 
			
		||||
      "fo": "fo",
 | 
			
		||||
      "fr": "fr",
 | 
			
		||||
      "fur": "fur",
 | 
			
		||||
      "fy": "fy",
 | 
			
		||||
      "gl": "gl",
 | 
			
		||||
      "gsw": "als",
 | 
			
		||||
      "gu": "gu",
 | 
			
		||||
      "gv": "gv",
 | 
			
		||||
      "haw": "haw",
 | 
			
		||||
      "he": "he",
 | 
			
		||||
      "hi": "hi",
 | 
			
		||||
      "hsb": "hsb",
 | 
			
		||||
      "hu": "hu",
 | 
			
		||||
      "hy": "hy",
 | 
			
		||||
      "id": "id",
 | 
			
		||||
      "is": "is",
 | 
			
		||||
      "it": "it",
 | 
			
		||||
      "ja": "ja",
 | 
			
		||||
      "jv": "jv",
 | 
			
		||||
      "ka": "ka",
 | 
			
		||||
      "km": "km",
 | 
			
		||||
      "kn": "kn",
 | 
			
		||||
      "ko": "ko",
 | 
			
		||||
      "ks": "ks",
 | 
			
		||||
      "ksh": "ksh",
 | 
			
		||||
      "kw": "kw",
 | 
			
		||||
      "lb": "lb",
 | 
			
		||||
      "lg": "lg",
 | 
			
		||||
      "ln": "ln",
 | 
			
		||||
      "lo": "lo",
 | 
			
		||||
      "lt": "lt",
 | 
			
		||||
      "lv": "lv",
 | 
			
		||||
      "mai": "mai",
 | 
			
		||||
      "mk": "mk",
 | 
			
		||||
      "ml": "ml",
 | 
			
		||||
      "mn": "mn",
 | 
			
		||||
      "mr": "mr",
 | 
			
		||||
      "ms": "ms",
 | 
			
		||||
      "mt": "mt",
 | 
			
		||||
      "nds": "nds-nl",
 | 
			
		||||
      "ne": "ne",
 | 
			
		||||
      "no": "no",
 | 
			
		||||
      "om": "om",
 | 
			
		||||
      "or": "or",
 | 
			
		||||
      "os": "os",
 | 
			
		||||
      "pa": "pa",
 | 
			
		||||
      "pl": "pl",
 | 
			
		||||
      "ps": "ps",
 | 
			
		||||
      "pt": "pt",
 | 
			
		||||
      "qu": "qu",
 | 
			
		||||
      "rm": "rm",
 | 
			
		||||
      "ro": "ro",
 | 
			
		||||
      "ru": "ru",
 | 
			
		||||
      "rw": "rw",
 | 
			
		||||
      "sa": "sa",
 | 
			
		||||
      "sah": "sah",
 | 
			
		||||
      "sd": "sd",
 | 
			
		||||
      "se": "se",
 | 
			
		||||
      "shi": "shi",
 | 
			
		||||
      "si": "si",
 | 
			
		||||
      "sk": "sk",
 | 
			
		||||
      "sl": "sl",
 | 
			
		||||
      "smn": "smn",
 | 
			
		||||
      "so": "so",
 | 
			
		||||
      "sq": "sq",
 | 
			
		||||
      "sr": "sr",
 | 
			
		||||
      "ta": "ta",
 | 
			
		||||
      "te": "te",
 | 
			
		||||
      "th": "th",
 | 
			
		||||
      "tk": "tk",
 | 
			
		||||
      "to": "to",
 | 
			
		||||
      "tr": "tr",
 | 
			
		||||
      "ug": "ug",
 | 
			
		||||
      "uk": "uk",
 | 
			
		||||
      "ur": "ur",
 | 
			
		||||
      "uz": "uz",
 | 
			
		||||
      "vi": "vi",
 | 
			
		||||
      "wo": "wo",
 | 
			
		||||
      "xh": "xh",
 | 
			
		||||
      "yi": "yi",
 | 
			
		||||
      "zh": "zh",
 | 
			
		||||
      "zh_Hans": "zh",
 | 
			
		||||
      "zh_Hant": "zh-classical"
 | 
			
		||||
    },
 | 
			
		||||
    "regions": {},
 | 
			
		||||
    "supported_languages": {
 | 
			
		||||
      "ab": {
 | 
			
		||||
| 
						 | 
				
			
			@ -6402,7 +6511,116 @@
 | 
			
		|||
    "all_locale": null,
 | 
			
		||||
    "custom": {},
 | 
			
		||||
    "data_type": "supported_languages",
 | 
			
		||||
    "languages": {},
 | 
			
		||||
    "languages": {
 | 
			
		||||
      "af": "af",
 | 
			
		||||
      "ak": "tw",
 | 
			
		||||
      "am": "am",
 | 
			
		||||
      "ar": "ar",
 | 
			
		||||
      "as": "as",
 | 
			
		||||
      "az": "az",
 | 
			
		||||
      "be": "be",
 | 
			
		||||
      "bg": "bg",
 | 
			
		||||
      "bn": "bn",
 | 
			
		||||
      "bo": "bo",
 | 
			
		||||
      "bs": "bs",
 | 
			
		||||
      "ca": "ca",
 | 
			
		||||
      "chr": "chr",
 | 
			
		||||
      "ckb": "ckb",
 | 
			
		||||
      "cs": "cs",
 | 
			
		||||
      "da": "da",
 | 
			
		||||
      "de": "de",
 | 
			
		||||
      "dsb": "dsb",
 | 
			
		||||
      "el": "el",
 | 
			
		||||
      "en": "en",
 | 
			
		||||
      "es": "es",
 | 
			
		||||
      "et": "et",
 | 
			
		||||
      "fa": "fa",
 | 
			
		||||
      "fi": "fi",
 | 
			
		||||
      "fil": "tl",
 | 
			
		||||
      "fo": "fo",
 | 
			
		||||
      "fr": "fr",
 | 
			
		||||
      "fur": "fur",
 | 
			
		||||
      "fy": "fy",
 | 
			
		||||
      "gl": "gl",
 | 
			
		||||
      "gsw": "als",
 | 
			
		||||
      "gu": "gu",
 | 
			
		||||
      "gv": "gv",
 | 
			
		||||
      "haw": "haw",
 | 
			
		||||
      "he": "he",
 | 
			
		||||
      "hi": "hi",
 | 
			
		||||
      "hsb": "hsb",
 | 
			
		||||
      "hu": "hu",
 | 
			
		||||
      "hy": "hy",
 | 
			
		||||
      "id": "id",
 | 
			
		||||
      "is": "is",
 | 
			
		||||
      "it": "it",
 | 
			
		||||
      "ja": "ja",
 | 
			
		||||
      "jv": "jv",
 | 
			
		||||
      "ka": "ka",
 | 
			
		||||
      "km": "km",
 | 
			
		||||
      "kn": "kn",
 | 
			
		||||
      "ko": "ko",
 | 
			
		||||
      "ks": "ks",
 | 
			
		||||
      "ksh": "ksh",
 | 
			
		||||
      "kw": "kw",
 | 
			
		||||
      "lb": "lb",
 | 
			
		||||
      "lg": "lg",
 | 
			
		||||
      "ln": "ln",
 | 
			
		||||
      "lo": "lo",
 | 
			
		||||
      "lt": "lt",
 | 
			
		||||
      "lv": "lv",
 | 
			
		||||
      "mai": "mai",
 | 
			
		||||
      "mk": "mk",
 | 
			
		||||
      "ml": "ml",
 | 
			
		||||
      "mn": "mn",
 | 
			
		||||
      "mr": "mr",
 | 
			
		||||
      "ms": "ms",
 | 
			
		||||
      "mt": "mt",
 | 
			
		||||
      "nds": "nds-nl",
 | 
			
		||||
      "ne": "ne",
 | 
			
		||||
      "no": "no",
 | 
			
		||||
      "om": "om",
 | 
			
		||||
      "or": "or",
 | 
			
		||||
      "os": "os",
 | 
			
		||||
      "pa": "pa",
 | 
			
		||||
      "pl": "pl",
 | 
			
		||||
      "ps": "ps",
 | 
			
		||||
      "pt": "pt",
 | 
			
		||||
      "qu": "qu",
 | 
			
		||||
      "rm": "rm",
 | 
			
		||||
      "ro": "ro",
 | 
			
		||||
      "ru": "ru",
 | 
			
		||||
      "rw": "rw",
 | 
			
		||||
      "sa": "sa",
 | 
			
		||||
      "sah": "sah",
 | 
			
		||||
      "sd": "sd",
 | 
			
		||||
      "se": "se",
 | 
			
		||||
      "shi": "shi",
 | 
			
		||||
      "si": "si",
 | 
			
		||||
      "sk": "sk",
 | 
			
		||||
      "sl": "sl",
 | 
			
		||||
      "smn": "smn",
 | 
			
		||||
      "so": "so",
 | 
			
		||||
      "sq": "sq",
 | 
			
		||||
      "sr": "sr",
 | 
			
		||||
      "ta": "ta",
 | 
			
		||||
      "te": "te",
 | 
			
		||||
      "th": "th",
 | 
			
		||||
      "tk": "tk",
 | 
			
		||||
      "to": "to",
 | 
			
		||||
      "tr": "tr",
 | 
			
		||||
      "ug": "ug",
 | 
			
		||||
      "uk": "uk",
 | 
			
		||||
      "ur": "ur",
 | 
			
		||||
      "uz": "uz",
 | 
			
		||||
      "vi": "vi",
 | 
			
		||||
      "wo": "wo",
 | 
			
		||||
      "xh": "xh",
 | 
			
		||||
      "yi": "yi",
 | 
			
		||||
      "zh": "zh",
 | 
			
		||||
      "zh_Hans": "zh",
 | 
			
		||||
      "zh_Hant": "zh-classical"
 | 
			
		||||
    },
 | 
			
		||||
    "regions": {},
 | 
			
		||||
    "supported_languages": {
 | 
			
		||||
      "ab": {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -16,6 +16,7 @@ from searx.network import post, get
 | 
			
		|||
from searx.utils import match_language, searx_useragent, get_string_replaces_function
 | 
			
		||||
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
 | 
			
		||||
from searx.engines.wikipedia import (  # pylint: disable=unused-import
 | 
			
		||||
    fetch_traits,
 | 
			
		||||
    _fetch_supported_languages,
 | 
			
		||||
    supported_languages_url,
 | 
			
		||||
)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -5,9 +5,12 @@
 | 
			
		|||
 | 
			
		||||
from urllib.parse import quote
 | 
			
		||||
from json import loads
 | 
			
		||||
from lxml.html import fromstring
 | 
			
		||||
from lxml import html
 | 
			
		||||
from searx.utils import match_language, searx_useragent
 | 
			
		||||
from searx.network import raise_for_httperror
 | 
			
		||||
from searx import network
 | 
			
		||||
from searx.enginelib.traits import EngineTraits
 | 
			
		||||
 | 
			
		||||
engine_traits: EngineTraits
 | 
			
		||||
 | 
			
		||||
# about
 | 
			
		||||
about = {
 | 
			
		||||
| 
						 | 
				
			
			@ -68,7 +71,7 @@ def response(resp):
 | 
			
		|||
            ):
 | 
			
		||||
                return []
 | 
			
		||||
 | 
			
		||||
    raise_for_httperror(resp)
 | 
			
		||||
    network.raise_for_httperror(resp)
 | 
			
		||||
 | 
			
		||||
    results = []
 | 
			
		||||
    api_result = loads(resp.text)
 | 
			
		||||
| 
						 | 
				
			
			@ -98,7 +101,7 @@ def response(resp):
 | 
			
		|||
# get supported languages from their site
 | 
			
		||||
def _fetch_supported_languages(resp):
 | 
			
		||||
    supported_languages = {}
 | 
			
		||||
    dom = fromstring(resp.text)
 | 
			
		||||
    dom = html.fromstring(resp.text)
 | 
			
		||||
    tables = dom.xpath('//table[contains(@class,"sortable")]')
 | 
			
		||||
    for table in tables:
 | 
			
		||||
        # exclude header row
 | 
			
		||||
| 
						 | 
				
			
			@ -114,3 +117,166 @@ def _fetch_supported_languages(resp):
 | 
			
		|||
                supported_languages[code] = {"name": name, "english_name": english_name}
 | 
			
		||||
 | 
			
		||||
    return supported_languages
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).  The map
# translates such a Wikipedia code into the identifier that fetch_traits
# passes to ``babel.Locale.parse``.

lang_map = {
    'be-tarask': 'bel',
    'ak': 'aka',
    'als': 'gsw',
    'bat-smg': 'sgs',
    'cbk-zam': 'cbk',
    'fiu-vro': 'vro',
    'map-bms': 'map',
    'nrm': 'nrf',
    'roa-rup': 'rup',
    'nds-nl': 'nds',
    # 'roa-tara': invented code used for the Tarantino Wikipedia (roa is the
    #     standard code for the large family of Romance languages that the
    #     Tarantino dialect falls within); no mapping is defined for it.
    # 'simple': invented code used for the Simple English Wikipedia (not the
    #     official IETF code en-simple); no mapping is defined for it.
    'zh-classical': 'zh_Hant',
    'zh-min-nan': 'nan',
    'zh-yue': 'yue',
    'an': 'arg',
}

# Wikipedia language codes that fetch_traits skips before it consults
# lang_map or babel.  Each code is listed exactly once.
unknown_langs = [
    'ab',  # Abkhazian
    'alt',  # Southern Altai
    'an',  # Aragonese
    'ang',  # Anglo-Saxon
    'arc',  # Aramaic
    'ary',  # Moroccan Arabic
    'av',  # Avar
    'ba',  # Bashkir
    'be-tarask',
    'bar',  # Bavarian
    'bcl',  # Central Bicolano
    'bh',  # Bhojpuri
    'bi',  # Bislama
    'bjn',  # Banjar
    'blk',  # Pa'O
    'bpy',  # Bishnupriya Manipuri
    'bxr',  # Buryat
    'cbk-zam',  # Zamboanga Chavacano
    'co',  # Corsican
    'cu',  # Old Church Slavonic
    'dty',  # Doteli
    'dv',  # Divehi
    'ext',  # Extremaduran
    'fj',  # Fijian
    'frp',  # Franco-Provençal
    'gan',  # Gan
    'gom',  # Goan Konkani
    'hif',  # Fiji Hindi
    'ilo',  # Ilokano
    'inh',  # Ingush
    'jbo',  # Lojban
    'kaa',  # Karakalpak
    'kbd',  # Kabardian Circassian
    'kg',  # Kongo
    'koi',  # Komi-Permyak
    'krc',  # Karachay-Balkar
    'kv',  # Komi
    'lad',  # Ladino
    'lbe',  # Lak
    'lez',  # Lezgian
    'li',  # Limburgish
    'ltg',  # Latgalian
    'mdf',  # Moksha
    'mnw',  # Mon
    'mwl',  # Mirandese
    'myv',  # Erzya
    'na',  # Nauruan
    'nah',  # Nahuatl
    'nov',  # Novial
    'nrm',  # Norman
    'pag',  # Pangasinan
    'pam',  # Kapampangan
    'pap',  # Papiamentu
    'pdc',  # Pennsylvania German
    'pfl',  # Palatinate German
    'roa-rup',  # Aromanian
    # Scots (https://sco.wikipedia.org) is not known by babel, Scottish Gaelic
    # (https://gd.wikipedia.org) is known by babel
    'sco',  # Scots
    'sh',  # Serbo-Croatian
    # Simple English is not known (by babel) as a natural language different
    # from English
    'simple',
    'sm',  # Samoan
    'srn',  # Sranan
    'stq',  # Saterland Frisian
    'szy',  # Sakizaya
    'tcy',  # Tulu
    'tet',  # Tetum
    'tpi',  # Tok Pisin
    'trv',  # Seediq
    'ty',  # Tahitian
    'tyv',  # Tuvan
    'udm',  # Udmurt
    'vep',  # Vepsian
    'vls',  # West Flemish
    'vo',  # Volapük
    'wa',  # Walloon
    'xal',  # Kalmyk
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia.

    Downloads the *List of Wikipedias* page from meta.wikimedia.org, walks the
    rows of its sortable tables and stores one entry per accepted language in
    ``engine_traits.languages`` (SearXNG language tag --> Wikipedia code).

    A table row is skipped when

    - it has fewer than 12 cells (the cells 1, 3, 4, 8 and 11 are read),
    - it lists fewer than 1000 articles,
    - its depth is below 20 or, when no depth is given, it lists fewer than
      1000 users,
    - its code is in ``unknown_langs`` or cannot be parsed by babel,
    - the resulting SearXNG tag is already taken by an earlier row.

    If the HTTP response is not OK an error is printed and no language is
    added.

    :param engine_traits: traits object, updated in place.
    """
    # pylint: disable=import-outside-toplevel

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.locales import language_tag

    resp = network.get('https://meta.wikimedia.org/wiki/List_of_Wikipedias')
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")
        # do not try to read a language table out of an error page
        return

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = [c.text_content().strip() for c in row.xpath('./td')]
        if len(cols) < 12:
            # rows without <td> cells and rows too short to carry the
            # article / user / depth cells read below
            continue

        articles = int(cols[4].replace(',', '').replace('-', '0'))
        users = int(cols[8].replace(',', '').replace('-', '0'))
        depth = cols[11].strip('-')

        if articles < 1000:
            # exclude languages with too few articles
            continue

        # depth: rough indicator of a Wikipedia’s quality, showing how
        #        frequently its articles are updated.
        if depth == '':
            if users < 1000:
                # depth is not calculated --> at least 1000 users should be
                # registered
                continue
        elif int(depth) < 20:
            continue

        eng_tag = cols[3]

        if eng_tag in unknown_langs:
            continue

        try:
            sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag)))
        except (babel.UnknownLocaleError, ValueError):
            # UnknownLocaleError: babel has no data for this locale;
            # ValueError: the code is not a well-formed locale identifier
            # (e.g. an unmapped code containing a hyphen)
            print("ERROR: %s -> %s is unknown by babel" % (cols[1], eng_tag))
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            # first row wins; report only if a different code claims the tag
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag

    engine_traits.languages['zh_Hans'] = 'zh'
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue