Merge pull request #2316 from return42/fix-2314-upd-desc

[fix] searxng_extra/update/update_engine_descriptions.py
This commit is contained in:
Markus Heiser 2023-04-15 16:10:53 +02:00 committed by GitHub
commit 5c8d56e73a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 1001 additions and 784 deletions

File diff suppressed because it is too large Load Diff

View File

@ -3480,6 +3480,7 @@
"es-US": "es_US", "es-US": "es_US",
"es-UY": "es_UY", "es-UY": "es_UY",
"es-VE": "es_VE", "es-VE": "es_VE",
"et-EE": "et_EE",
"fi-FI": "fi_FI", "fi-FI": "fi_FI",
"fil-PH": "fil_PH", "fil-PH": "fil_PH",
"fr-BE": "fr_BE", "fr-BE": "fr_BE",
@ -3487,6 +3488,7 @@
"fr-CH": "fr_CH", "fr-CH": "fr_CH",
"fr-FR": "fr_FR", "fr-FR": "fr_FR",
"hi-IN": "hi_IN", "hi-IN": "hi_IN",
"hu-HU": "hu_HU",
"id-ID": "id_ID", "id-ID": "id_ID",
"it-CH": "it_CH", "it-CH": "it_CH",
"it-IT": "it_IT", "it-IT": "it_IT",
@ -3514,6 +3516,7 @@
"wikidata": { "wikidata": {
"all_locale": null, "all_locale": null,
"custom": { "custom": {
"WIKIPEDIA_LANGUAGES": [],
"wiki_netloc": {} "wiki_netloc": {}
}, },
"data_type": "traits_v1", "data_type": "traits_v1",
@ -3556,6 +3559,7 @@
"ja": "ja", "ja": "ja",
"jv": "jv", "jv": "jv",
"ka": "ka", "ka": "ka",
"km": "km",
"kn": "kn", "kn": "kn",
"ko": "ko", "ko": "ko",
"lb": "lb", "lb": "lb",
@ -3566,8 +3570,8 @@
"ml": "ml", "ml": "ml",
"mn": "mn", "mn": "mn",
"mr": "mr", "mr": "mr",
"nb": "no",
"ne": "ne", "ne": "ne",
"no": "no",
"or": "or", "or": "or",
"os": "os", "os": "os",
"pa": "pa", "pa": "pa",
@ -3595,13 +3599,345 @@
"vi": "vi", "vi": "vi",
"yi": "yi", "yi": "yi",
"zh": "zh", "zh": "zh",
"zh_Hant": "zh-classical" "zh_Hans": "zh",
"zh_Hant": "zh"
}, },
"regions": {} "regions": {
"zh-CN": "zh",
"zh-HK": "zh",
"zh-MO": "zh",
"zh-MY": "zh",
"zh-SG": "zh",
"zh-TW": "zh",
"zh-classical": "zh-classical"
}
}, },
"wikipedia": { "wikipedia": {
"all_locale": null, "all_locale": null,
"custom": { "custom": {
"WIKIPEDIA_LANGUAGES": [
"ab",
"ace",
"ady",
"af",
"ak",
"als",
"alt",
"am",
"ami",
"an",
"ang",
"anp",
"ar",
"arc",
"ary",
"arz",
"as",
"ast",
"atj",
"av",
"avk",
"awa",
"ay",
"az",
"azb",
"ba",
"ban",
"bar",
"bat-smg",
"bcl",
"be",
"be-tarask",
"bg",
"bh",
"bi",
"bjn",
"blk",
"bm",
"bn",
"bo",
"bpy",
"br",
"bs",
"bug",
"bxr",
"ca",
"cbk-zam",
"cdo",
"ce",
"ceb",
"ch",
"chr",
"chy",
"ckb",
"co",
"cr",
"crh",
"cs",
"csb",
"cu",
"cv",
"cy",
"da",
"dag",
"de",
"din",
"diq",
"dsb",
"dty",
"dv",
"dz",
"ee",
"el",
"eml",
"en",
"eo",
"es",
"et",
"eu",
"ext",
"fa",
"ff",
"fi",
"fiu-vro",
"fj",
"fo",
"fr",
"frp",
"frr",
"fur",
"fy",
"ga",
"gag",
"gan",
"gcr",
"gd",
"gl",
"glk",
"gn",
"gom",
"gor",
"got",
"gu",
"guc",
"gur",
"guw",
"gv",
"ha",
"hak",
"haw",
"he",
"hi",
"hif",
"hr",
"hsb",
"ht",
"hu",
"hy",
"hyw",
"ia",
"id",
"ie",
"ig",
"ik",
"ilo",
"inh",
"io",
"is",
"it",
"iu",
"ja",
"jam",
"jbo",
"jv",
"ka",
"kaa",
"kab",
"kbd",
"kbp",
"kcg",
"kg",
"ki",
"kk",
"kl",
"km",
"kn",
"ko",
"koi",
"krc",
"ks",
"ksh",
"ku",
"kv",
"kw",
"ky",
"la",
"lad",
"lb",
"lbe",
"lez",
"lfn",
"lg",
"li",
"lij",
"lld",
"lmo",
"ln",
"lo",
"lt",
"ltg",
"lv",
"mad",
"mai",
"map-bms",
"mdf",
"mg",
"mhr",
"mi",
"min",
"mk",
"ml",
"mn",
"mni",
"mnw",
"mr",
"mrj",
"ms",
"mt",
"mwl",
"my",
"myv",
"mzn",
"na",
"nah",
"nap",
"nds",
"nds-nl",
"ne",
"new",
"nia",
"nl",
"nn",
"no",
"nov",
"nqo",
"nrm",
"nso",
"nv",
"ny",
"oc",
"olo",
"om",
"or",
"os",
"pa",
"pag",
"pam",
"pap",
"pcd",
"pcm",
"pdc",
"pfl",
"pi",
"pih",
"pl",
"pms",
"pnb",
"pnt",
"ps",
"pt",
"pwn",
"qu",
"rm",
"rmy",
"rn",
"ro",
"roa-rup",
"roa-tara",
"ru",
"rue",
"rw",
"sa",
"sah",
"sat",
"sc",
"scn",
"sco",
"sd",
"se",
"sg",
"sh",
"shi",
"shn",
"si",
"simple",
"sk",
"skr",
"sl",
"sm",
"smn",
"sn",
"so",
"sq",
"sr",
"srn",
"ss",
"st",
"stq",
"su",
"sv",
"sw",
"szl",
"szy",
"ta",
"tay",
"tcy",
"te",
"tet",
"tg",
"th",
"ti",
"tk",
"tl",
"tn",
"to",
"tpi",
"tr",
"trv",
"ts",
"tt",
"tum",
"tw",
"ty",
"tyv",
"udm",
"ug",
"uk",
"ur",
"uz",
"ve",
"vec",
"vep",
"vi",
"vls",
"vo",
"wa",
"war",
"wo",
"wuu",
"xal",
"xh",
"xmf",
"yi",
"yo",
"za",
"zea",
"zh",
"zh-classical",
"zh-min-nan",
"zh-yue",
"zu"
],
"wiki_netloc": { "wiki_netloc": {
"af": "af.wikipedia.org", "af": "af.wikipedia.org",
"als": "als.wikipedia.org", "als": "als.wikipedia.org",
@ -3640,6 +3976,7 @@
"ja": "ja.wikipedia.org", "ja": "ja.wikipedia.org",
"jv": "jv.wikipedia.org", "jv": "jv.wikipedia.org",
"ka": "ka.wikipedia.org", "ka": "ka.wikipedia.org",
"km": "km.wikipedia.org",
"kn": "kn.wikipedia.org", "kn": "kn.wikipedia.org",
"ko": "ko.wikipedia.org", "ko": "ko.wikipedia.org",
"lb": "lb.wikipedia.org", "lb": "lb.wikipedia.org",
@ -3679,8 +4016,7 @@
"uz": "uz.wikipedia.org", "uz": "uz.wikipedia.org",
"vi": "vi.wikipedia.org", "vi": "vi.wikipedia.org",
"yi": "yi.wikipedia.org", "yi": "yi.wikipedia.org",
"zh": "zh.wikipedia.org", "zh": "zh.wikipedia.org"
"zh-classical": "zh-classical.wikipedia.org"
} }
}, },
"data_type": "traits_v1", "data_type": "traits_v1",
@ -3723,6 +4059,7 @@
"ja": "ja", "ja": "ja",
"jv": "jv", "jv": "jv",
"ka": "ka", "ka": "ka",
"km": "km",
"kn": "kn", "kn": "kn",
"ko": "ko", "ko": "ko",
"lb": "lb", "lb": "lb",
@ -3733,8 +4070,8 @@
"ml": "ml", "ml": "ml",
"mn": "mn", "mn": "mn",
"mr": "mr", "mr": "mr",
"nb": "no",
"ne": "ne", "ne": "ne",
"no": "no",
"or": "or", "or": "or",
"os": "os", "os": "os",
"pa": "pa", "pa": "pa",
@ -3763,9 +4100,17 @@
"yi": "yi", "yi": "yi",
"zh": "zh", "zh": "zh",
"zh_Hans": "zh", "zh_Hans": "zh",
"zh_Hant": "zh-classical" "zh_Hant": "zh"
}, },
"regions": {} "regions": {
"zh-CN": "zh",
"zh-HK": "zh",
"zh-MO": "zh",
"zh-MY": "zh",
"zh-SG": "zh",
"zh-TW": "zh",
"zh-classical": "zh-classical"
}
}, },
"yahoo": { "yahoo": {
"all_locale": "any", "all_locale": "any",

View File

@ -13,7 +13,7 @@ used.
from __future__ import annotations from __future__ import annotations
import json import json
import dataclasses import dataclasses
from typing import Dict, Union, Callable, Optional, TYPE_CHECKING from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING
from typing_extensions import Literal, Self from typing_extensions import Literal, Self
from searx import locales from searx import locales
@ -81,7 +81,7 @@ class EngineTraits:
"""Data type, default is 'traits_v1'. """Data type, default is 'traits_v1'.
""" """
custom: Dict[str, Dict] = dataclasses.field(default_factory=dict) custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict)
"""A place to store engine's custom traits, not related to the SearXNG core """A place to store engine's custom traits, not related to the SearXNG core
""" """

View File

@ -18,7 +18,10 @@ from searx.data import WIKIDATA_UNITS
from searx.network import post, get from searx.network import post, get
from searx.utils import searx_useragent, get_string_replaces_function from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import fetch_traits as _fetch_traits from searx.engines.wikipedia import (
fetch_wikimedia_traits,
get_wiki_params,
)
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING: if TYPE_CHECKING:
@ -165,17 +168,15 @@ def request(query, params):
# wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN # wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
# mapped to zh # mapped to zh
sxng_lang = params['searxng_locale'].split('-')[0] eng_tag, _wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
language = traits.get_language(sxng_lang, 'en') query, attributes = get_query(query, eng_tag)
logger.debug("request --> language %s // len(attributes): %s", eng_tag, len(attributes))
query, attributes = get_query(query, language)
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
params['method'] = 'POST' params['method'] = 'POST'
params['url'] = SPARQL_ENDPOINT_URL params['url'] = SPARQL_ENDPOINT_URL
params['data'] = {'query': query} params['data'] = {'query': query}
params['headers'] = get_headers() params['headers'] = get_headers()
params['language'] = language params['language'] = eng_tag
params['attributes'] = attributes params['attributes'] = attributes
return params return params
@ -769,12 +770,16 @@ def init(engine_settings=None): # pylint: disable=unused-argument
def fetch_traits(engine_traits: EngineTraits): def fetch_traits(engine_traits: EngineTraits):
"""Use languages evaluated from :py:obj:`wikipedia.fetch_traits """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits
<searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what <searx.engines.wikipedia.fetch_wikimedia_traits>` and removes
is not supported by wikidata."""
_fetch_traits(engine_traits) - ``traits.custom['wiki_netloc']``: wikidata does not have net-locations for
# wikidata does not support zh-classical (zh_Hans) the languages and the list of all
engine_traits.languages.pop('zh_Hans')
# wikidata does not have net-locations for the languages - ``traits.custom['WIKIPEDIA_LANGUAGES']``: not used in the wikipedia engine
"""
fetch_wikimedia_traits(engine_traits)
engine_traits.custom['wiki_netloc'] = {} engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []

View File

@ -5,10 +5,54 @@ are shared by other engines:
- :ref:`wikidata engine` - :ref:`wikidata engine`
The list of supported languages is fetched from the article linked by The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from
:py:obj:`wikipedia_article_depth`. Unlike traditional search engines, wikipedia the article linked by :py:obj:`list_of_wikipedias`.
does not support one Wikipedia for all the languages, but there is one Wikipedia
for every language (:py:obj:`fetch_traits`). Unlike traditional search engines, wikipedia does not support one Wikipedia for
all languages, but there is one Wikipedia for each supported language. Some of
these Wikipedias have a LanguageConverter_ enabled
(:py:obj:`rest_v1_summary_url`).
A LanguageConverter_ (LC) is a system based on language variants that
automatically converts the content of a page into a different variant. A variant
is mostly the same language in a different script.
- `Wikipedias in multiple writing systems`_
- `Automatic conversion between traditional and simplified Chinese characters`_
PR-2554_:
The Wikipedia link returned by the API is still the same in all cases
(`https://zh.wikipedia.org/wiki/出租車`_) but if your browser's
``Accept-Language`` is set to any of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK``
or another variant, Wikipedia's LC automatically returns the desired script in
its web page.
- You can test the API here: https://reqbin.com/gesg2kvx
.. _https://zh.wikipedia.org/wiki/出租車:
https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A
To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses
:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants` in the
:py:obj:`fetch_wikimedia_traits` function.
To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese
options:
- ``!wp 出租車 :zh`` should show 出租車
- ``!wp 出租車 :zh-CN`` should show 出租车
- ``!wp 出租車 :zh-TW`` should show 計程車
- ``!wp 出租車 :zh-HK`` should show 的士
- ``!wp 出租車 :zh-SG`` should show 德士
.. _LanguageConverter:
https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
.. _Wikipedias in multiple writing systems:
https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems
.. _Automatic conversion between traditional and simplified Chinese characters:
https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters
.. _PR-2554: https://github.com/searx/searx/pull/2554
""" """
import urllib.parse import urllib.parse
@ -16,8 +60,9 @@ import babel
from lxml import html from lxml import html
from searx import utils
from searx import network from searx import network
from searx.locales import language_tag from searx import locales
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
traits: EngineTraits traits: EngineTraits
@ -33,6 +78,12 @@ about = {
} }
send_accept_language_header = True send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""
list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""
wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth' wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators """The *editing depth* of Wikipedia is one of several possible rough indicators
@ -41,29 +92,68 @@ are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized. the classic measurement of article count were realized.
""" """
# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}' rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of """
the first paragraph of the page in plain text and HTML as well as the type of `wikipedia rest_v1 summary API`_:
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web The summary response includes an extract of the first paragraph of the page in
and link previews in the apps. plain text and HTML as well as the type of page. This is useful for page
previews (fka. Hovercards, aka. Popups) on the web and link previews in the
apps.
.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_ HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
The desired language variant code for wikis where LanguageConverter_ is
enabled.
.. _wikipedia rest_v1 summary API:
https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
""" """
wiki_lc_locale_variants = {
"zh": (
"zh-CN",
"zh-HK",
"zh-MO",
"zh-MY",
"zh-SG",
"zh-TW",
),
"zh-classical": ("zh-classical",),
}
"""Mapping rule of the LanguageConverter_ to map a language and its variants to
a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC
Chinese`_.
.. _LC Chinese:
https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese
"""
wikipedia_script_variants = {
"zh": (
"zh_Hant",
"zh_Hans",
)
}
def get_wiki_params(sxng_locale, eng_traits):
"""Returns the Wikipedia language tag and the netloc that fits to the
``sxng_locale``. To support LanguageConverter_ this function rates a locale
(region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`).
"""
eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en'))
wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org')
return eng_tag, wiki_netloc
def request(query, params): def request(query, params):
"""Assemble a request (`wikipedia rest_v1 summary API`_).""" """Assemble a request (`wikipedia rest_v1 summary API`_)."""
if query.islower(): if query.islower():
query = query.title() query = query.title()
engine_language = traits.get_language(params['searxng_locale'], 'en') _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
title = urllib.parse.quote(query) title = urllib.parse.quote(query)
# '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
# '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title) params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
params['raise_for_httperror'] = False params['raise_for_httperror'] = False
@ -93,7 +183,7 @@ def response(resp):
network.raise_for_httperror(resp) network.raise_for_httperror(resp)
api_result = resp.json() api_result = resp.json()
title = api_result['title'] title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
wikipedia_link = api_result['content_urls']['desktop']['page'] wikipedia_link = api_result['content_urls']['desktop']['page']
results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')}) results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
@ -116,7 +206,9 @@ def response(resp):
# These Wikipedias use language codes that do not conform to the ISO 639 # These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays). # standard (which is how wiki subdomains are chosen nowadays).
lang_map = { lang_map = locales.LOCALE_BEST_MATCH.copy()
lang_map.update(
{
'be-tarask': 'bel', 'be-tarask': 'bel',
'ak': 'aka', 'ak': 'aka',
'als': 'gsw', 'als': 'gsw',
@ -124,6 +216,7 @@ lang_map = {
'cbk-zam': 'cbk', 'cbk-zam': 'cbk',
'fiu-vro': 'vro', 'fiu-vro': 'vro',
'map-bms': 'map', 'map-bms': 'map',
'no': 'nb-NO',
'nrm': 'nrf', 'nrm': 'nrf',
'roa-rup': 'rup', 'roa-rup': 'rup',
'nds-nl': 'nds', 'nds-nl': 'nds',
@ -131,29 +224,20 @@ lang_map = {
'zh-min-nan': 'nan', 'zh-min-nan': 'nan',
'zh-yue': 'yue', 'zh-yue': 'yue',
'an': 'arg', 'an': 'arg',
'zh-classical': 'zh-Hant', # babel maps classical to zh-Hans (for whatever reason) }
} )
unknown_langs = [
'an', # Aragonese
'ba', # Bashkir
'bar', # Bavarian
'bcl', # Central Bicolano
'be-tarask', # Belarusian variant / Belarusian is already covered by 'be'
'bpy', # Bishnupriya Manipuri is unknown by babel
'hif', # Fiji Hindi
'ilo', # Ilokano
'li', # Limburgish
'sco', # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
'sh', # Serbo-Croatian
'simple', # simple english is not know as a natural language different to english (babel)
'vo', # Volapük
'wa', # Walloon
]
def fetch_traits(engine_traits: EngineTraits): def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Wikipedia. fetch_wikimedia_traits(engine_traits)
print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES']))
def fetch_wikimedia_traits(engine_traits: EngineTraits):
"""Fetch languages from Wikipedia. Not all languages from the
:py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those
known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal
:py:obj:`editing depth <wikipedia_article_depth>`.
The location of the Wikipedia address of a language is mapped in a The location of the Wikipedia address of a language is mapped in a
:py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>` :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
@ -169,15 +253,21 @@ def fetch_traits(engine_traits: EngineTraits):
"zh": "zh.wikipedia.org", "zh": "zh.wikipedia.org",
"zh-classical": "zh-classical.wikipedia.org" "zh-classical": "zh-classical.wikipedia.org"
} }
""" """
# pylint: disable=too-many-branches
engine_traits.custom['wiki_netloc'] = {} engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []
# insert alias to map from a region like zh-CN to a language zh_Hans # insert alias to map from a script or region to a wikipedia variant
engine_traits.languages['zh_Hans'] = 'zh'
resp = network.get(wikipedia_article_depth) for eng_tag, sxng_tag_list in wikipedia_script_variants.items():
for sxng_tag in sxng_tag_list:
engine_traits.languages[sxng_tag] = eng_tag
for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items():
for sxng_tag in sxng_tag_list:
engine_traits.regions[sxng_tag] = eng_tag
resp = network.get(list_of_wikipedias)
if not resp.ok: if not resp.ok:
print("ERROR: response from Wikipedia is not OK.") print("ERROR: response from Wikipedia is not OK.")
@ -189,29 +279,30 @@ def fetch_traits(engine_traits: EngineTraits):
continue continue
cols = [c.text_content().strip() for c in cols] cols = [c.text_content().strip() for c in cols]
depth = float(cols[3].replace('-', '0').replace(',', '')) depth = float(cols[11].replace('-', '0').replace(',', ''))
articles = int(cols[4].replace(',', '').replace(',', '')) articles = int(cols[4].replace(',', '').replace(',', ''))
eng_tag = cols[3]
wiki_url = row.xpath('./td[4]/a/@href')[0]
wiki_url = urllib.parse.urlparse(wiki_url)
try:
sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
except babel.UnknownLocaleError:
# print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
continue
finally:
engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag)
if sxng_tag not in locales.LOCALE_NAMES:
if articles < 10000: if articles < 10000:
# exclude languages with too few articles # exclude languages with too few articles
continue continue
if int(depth) < 20: if int(depth) < 20:
# Rough indicator of a Wikipedias quality, showing how frequently # Rough indicator of a Wikipedias quality, showing how
# its articles are updated. # frequently its articles are updated.
continue
eng_tag = cols[2]
wiki_url = row.xpath('./td[3]/a/@href')[0]
wiki_url = urllib.parse.urlparse(wiki_url)
if eng_tag in unknown_langs:
continue
try:
sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
except babel.UnknownLocaleError:
print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
continue continue
conflict = engine_traits.languages.get(sxng_tag) conflict = engine_traits.languages.get(sxng_tag)
@ -222,3 +313,5 @@ def fetch_traits(engine_traits: EngineTraits):
engine_traits.languages[sxng_tag] = eng_tag engine_traits.languages[sxng_tag] = eng_tag
engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc
engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort()

View File

@ -75,6 +75,7 @@ class OnlineProcessor(EngineProcessor):
) )
params['headers']['Accept-Language'] = ac_lang params['headers']['Accept-Language'] = ac_lang
self.logger.debug('HTTP Accept-Language: %s', params['headers'].get('Accept-Language', ''))
return params return params
def _send_http_request(self, params): def _send_http_request(self, params):

View File

@ -18,7 +18,7 @@ from os.path import join
from lxml.html import fromstring from lxml.html import fromstring
from searx.engines import wikidata, set_loggers from searx.engines import wikidata, set_loggers
from searx.utils import extract_text from searx.utils import extract_text, searx_useragent
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir from searx import searx_dir
from searx.utils import gen_useragent, detect_language from searx.utils import gen_useragent, detect_language
@ -28,8 +28,12 @@ import searx.network
set_loggers(wikidata, 'wikidata') set_loggers(wikidata, 'wikidata')
locales_initialize() locales_initialize()
# you can run the query in https://query.wikidata.org
# replace %IDS% by Wikidata entities separated by spaces with the prefix wd:
# for example wd:Q182496 wd:Q1540899
# replace %LANGUAGES_SPARQL% by languages
SPARQL_WIKIPEDIA_ARTICLE = """ SPARQL_WIKIPEDIA_ARTICLE = """
SELECT DISTINCT ?item ?name SELECT DISTINCT ?item ?name ?article ?lang
WHERE { WHERE {
hint:Query hint:optimizer "None". hint:Query hint:optimizer "None".
VALUES ?item { %IDS% } VALUES ?item { %IDS% }
@ -40,6 +44,7 @@ WHERE {
FILTER(?lang in (%LANGUAGES_SPARQL%)) . FILTER(?lang in (%LANGUAGES_SPARQL%)) .
FILTER (!CONTAINS(?name, ':')) . FILTER (!CONTAINS(?name, ':')) .
} }
ORDER BY ?item ?lang
""" """
SPARQL_DESCRIPTION = """ SPARQL_DESCRIPTION = """
@ -69,10 +74,11 @@ SKIP_ENGINE_SOURCE = [
# fmt: on # fmt: on
] ]
LANGUAGES = LOCALE_NAMES.keys() WIKIPEDIA_LANGUAGES = {}
WIKIPEDIA_LANGUAGES = {'language': 'wikipedia_language'}
LANGUAGES_SPARQL = '' LANGUAGES_SPARQL = ''
IDS = None IDS = None
WIKIPEDIA_LANGUAGE_VARIANTS = {'zh_Hant': 'zh-tw'}
descriptions = {} descriptions = {}
wd_to_engine_name = {} wd_to_engine_name = {}
@ -102,16 +108,31 @@ def update_description(engine_name, lang, description, source, replace=True):
descriptions[engine_name][lang] = [description, source] descriptions[engine_name][lang] = [description, source]
def get_wikipedia_summary(lang, pageid): def get_wikipedia_summary(wikipedia_url, searxng_locale):
params = {'language': lang.replace('_', '-'), 'headers': {}} # get the REST API URL from the HTML URL
searx.engines.engines['wikipedia'].request(pageid, params)
# Headers
headers = {'User-Agent': searx_useragent()}
if searxng_locale in WIKIPEDIA_LANGUAGE_VARIANTS:
headers['Accept-Language'] = WIKIPEDIA_LANGUAGE_VARIANTS.get(searxng_locale)
# URL path : from HTML URL to REST API URL
parsed_url = urlparse(wikipedia_url)
# remove the /wiki/ prefix
article_name = parsed_url.path.split('/wiki/')[1]
# article_name is already encoded but not the / which is required for the REST API call
encoded_article_name = article_name.replace('/', '%2F')
path = '/api/rest_v1/page/summary/' + encoded_article_name
wikipedia_rest_url = parsed_url._replace(path=path).geturl()
try: try:
response = searx.network.get(params['url'], headers=params['headers'], timeout=10) response = searx.network.get(wikipedia_rest_url, headers=headers, timeout=10)
response.raise_for_status() response.raise_for_status()
except Exception as e: # pylint: disable=broad-except
print(" ", wikipedia_url, e)
return None
api_result = json.loads(response.text) api_result = json.loads(response.text)
return api_result.get('extract') return api_result.get('extract')
except Exception: # pylint: disable=broad-except
return None
def get_website_description(url, lang1, lang2=None): def get_website_description(url, lang1, lang2=None):
@ -154,11 +175,25 @@ def get_website_description(url, lang1, lang2=None):
def initialize(): def initialize():
global IDS, WIKIPEDIA_LANGUAGES, LANGUAGES_SPARQL global IDS, LANGUAGES_SPARQL
searx.search.initialize() searx.search.initialize()
wikipedia_engine = searx.engines.engines['wikipedia'] wikipedia_engine = searx.engines.engines['wikipedia']
WIKIPEDIA_LANGUAGES = {language: wikipedia_engine.url_lang(language.replace('_', '-')) for language in LANGUAGES}
WIKIPEDIA_LANGUAGES['nb_NO'] = 'no' locale2lang = {'nl-BE': 'nl'}
for sxng_ui_lang in LOCALE_NAMES:
sxng_ui_alias = locale2lang.get(sxng_ui_lang, sxng_ui_lang)
wiki_lang = None
if sxng_ui_alias in wikipedia_engine.traits.custom['WIKIPEDIA_LANGUAGES']:
wiki_lang = sxng_ui_alias
if not wiki_lang:
wiki_lang = wikipedia_engine.traits.get_language(sxng_ui_alias)
if not wiki_lang:
print(f"WIKIPEDIA_LANGUAGES missing {sxng_ui_lang}")
continue
WIKIPEDIA_LANGUAGES[sxng_ui_lang] = wiki_lang
LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values())) LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
for engine_name, engine in searx.engines.engines.items(): for engine_name, engine in searx.engines.engines.items():
descriptions[engine_name] = {} descriptions[engine_name] = {}
@ -170,6 +205,7 @@ def initialize():
def fetch_wikidata_descriptions(): def fetch_wikidata_descriptions():
print('Fetching wikidata descriptions')
searx.network.set_timeout_for_thread(60) searx.network.set_timeout_for_thread(60)
result = wikidata.send_wikidata_query( result = wikidata.send_wikidata_query(
SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL) SPARQL_DESCRIPTION.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
@ -178,14 +214,20 @@ def fetch_wikidata_descriptions():
for binding in result['results']['bindings']: for binding in result['results']['bindings']:
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
wikidata_lang = binding['itemDescription']['xml:lang'] wikidata_lang = binding['itemDescription']['xml:lang']
description = binding['itemDescription']['value'] desc = binding['itemDescription']['value']
for engine_name in wd_to_engine_name[wikidata_id]: for engine_name in wd_to_engine_name[wikidata_id]:
for lang in LANGUAGES: for searxng_locale in LOCALE_NAMES:
if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang: if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
update_description(engine_name, lang, description, 'wikidata') continue
print(
f" engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
f"/ len(wikidata_desc): {len(desc)}",
)
update_description(engine_name, searxng_locale, desc, 'wikidata')
def fetch_wikipedia_descriptions(): def fetch_wikipedia_descriptions():
print('Fetching wikipedia descriptions')
result = wikidata.send_wikidata_query( result = wikidata.send_wikidata_query(
SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL) SPARQL_WIKIPEDIA_ARTICLE.replace('%IDS%', IDS).replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
) )
@ -193,12 +235,19 @@ def fetch_wikipedia_descriptions():
for binding in result['results']['bindings']: for binding in result['results']['bindings']:
wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '') wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
wikidata_lang = binding['name']['xml:lang'] wikidata_lang = binding['name']['xml:lang']
pageid = binding['name']['value'] wikipedia_url = binding['article']['value'] # for example the URL https://de.wikipedia.org/wiki/PubMed
for engine_name in wd_to_engine_name[wikidata_id]: for engine_name in wd_to_engine_name[wikidata_id]:
for lang in LANGUAGES: for searxng_locale in LOCALE_NAMES:
if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang: if WIKIPEDIA_LANGUAGES[searxng_locale] != wikidata_lang:
description = get_wikipedia_summary(lang, pageid) continue
update_description(engine_name, lang, description, 'wikipedia') desc = get_wikipedia_summary(wikipedia_url, searxng_locale)
if not desc:
continue
print(
f" engine: {engine_name:20} / wikidata_lang: {wikidata_lang:5}",
f"/ len(wikipedia_desc): {len(desc)}",
)
update_description(engine_name, searxng_locale, desc, 'wikipedia')
def normalize_url(url): def normalize_url(url):
@ -209,41 +258,60 @@ def normalize_url(url):
def fetch_website_description(engine_name, website): def fetch_website_description(engine_name, website):
print(f"- fetch website descr: {engine_name} / {website}")
default_lang, default_description = get_website_description(website, None, None) default_lang, default_description = get_website_description(website, None, None)
if default_lang is None or default_description is None: if default_lang is None or default_description is None:
# the front page can't be fetched: skip this engine # the front page can't be fetched: skip this engine
return return
wikipedia_languages_r = {V: K for K, V in WIKIPEDIA_LANGUAGES.items()} # to specify an order in where the most common languages are in front of the
# language list ..
languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr'] languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
languages = languages + [l for l in LANGUAGES if l not in languages] languages = languages + [l for l in LOCALE_NAMES if l not in languages]
previous_matched_lang = None previous_matched_lang = None
previous_count = 0 previous_count = 0
for lang in languages: for lang in languages:
if lang not in descriptions[engine_name]:
if lang in descriptions[engine_name]:
continue
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang]) fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
if fetched_lang is None or desc is None: if fetched_lang is None or desc is None:
continue continue
matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
if matched_lang is None:
fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
if matched_lang is not None:
update_description(engine_name, matched_lang, desc, website, replace=False)
# check if desc changed with the different lang values # check if desc changed with the different lang values
if matched_lang == previous_matched_lang:
if fetched_lang == previous_matched_lang:
previous_count += 1 previous_count += 1
if previous_count == 6: if previous_count == 6:
# the website has returned the same description for 6 different languages in Accept-Language header # the website has returned the same description for 6 different languages in Accept-Language header
# stop now # stop now
break break
else: else:
previous_matched_lang = matched_lang previous_matched_lang = fetched_lang
previous_count = 0 previous_count = 0
# Don't trust the value of fetched_lang: some websites return
# inappropriate values, for example bing-images::
#
# requested lang: zh-Hans-CN / fetched lang: ceb / desc: 查看根据您的兴趣量身定制的提要
#
# The lang ceb is "Cebuano" but the description is given in zh-Hans-CN
print(
f" engine: {engine_name:20} / requested lang:{lang:7}"
f" / fetched lang: {fetched_lang:7} / len(desc): {len(desc)}"
)
matched_lang = match_locale(fetched_lang, LOCALE_NAMES.keys(), fallback=lang)
update_description(engine_name, matched_lang, desc, website, replace=False)
def fetch_website_descriptions(): def fetch_website_descriptions():
print('Fetching website descriptions')
for engine_name, engine in searx.engines.engines.items(): for engine_name, engine in searx.engines.engines.items():
website = getattr(engine, "about", {}).get('website') website = getattr(engine, "about", {}).get('website')
if website is None and hasattr(engine, "search_url"): if website is None and hasattr(engine, "search_url"):
@ -289,11 +357,8 @@ def get_output():
def main(): def main():
initialize() initialize()
print('Fetching wikidata descriptions')
fetch_wikidata_descriptions() fetch_wikidata_descriptions()
print('Fetching wikipedia descriptions')
fetch_wikipedia_descriptions() fetch_wikipedia_descriptions()
print('Fetching website descriptions')
fetch_website_descriptions() fetch_website_descriptions()
output = get_output() output = get_output()