From 96a2eec3b5e20afda89d9e3a81b09ca1612dc186 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 7 Jan 2023 16:04:19 +0100 Subject: [PATCH] [mod] Archlinux Wiki: improved request API & upgrade to data_type: traits_v1 re-implementation of the Archlinux Wiki: - fetch_traits(): fetch languages, wiki URLs and title arguments - add content field to the result list - add documentation Wikis from wiki.archlinux.fr, wiki.archlinux.ro, archtr.org/wiki no longer exist (they have been merged into the main wiki). Signed-off-by: Markus Heiser --- docs/src/searx.engine.archlinux.rst | 9 ++ searx/data/engine_traits.json | 53 +++++++ searx/engines/archlinux.py | 223 ++++++++++++++-------------- searx/sxng_locales.py | 1 + 4 files changed, 177 insertions(+), 109 deletions(-) create mode 100644 docs/src/searx.engine.archlinux.rst diff --git a/docs/src/searx.engine.archlinux.rst b/docs/src/searx.engine.archlinux.rst new file mode 100644 index 000000000..be48b1859 --- /dev/null +++ b/docs/src/searx.engine.archlinux.rst @@ -0,0 +1,9 @@ +.. _archlinux engine: + +========== +Arch Linux +========== + +.. 
automodule:: searx.engines.archlinux + :members: + diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json index f0e6ef045..8f416a636 100644 --- a/searx/data/engine_traits.json +++ b/searx/data/engine_traits.json @@ -1,4 +1,57 @@ { + "arch linux wiki": { + "all_locale": null, + "custom": { + "title": { + "de": "Spezial:Suche", + "fa": "\u0648\u06cc\u0698\u0647:\u062c\u0633\u062a\u062c\u0648", + "ja": "\u7279\u5225:\u691c\u7d22", + "zh": "Special:\u641c\u7d22" + }, + "wiki_netloc": { + "de": "wiki.archlinux.de", + "fa": "wiki.archusers.ir", + "ja": "wiki.archlinux.jp", + "zh": "wiki.archlinuxcn.org" + } + }, + "data_type": "traits_v1", + "languages": { + "ar": "\u0627\u0644\u0639\u0631\u0628\u064a\u0629", + "bg": "\u0411\u044a\u043b\u0433\u0430\u0440\u0441\u043a\u0438", + "bs": "Bosanski", + "cs": "\u010ce\u0161tina", + "da": "Dansk", + "de": "Deutsch", + "el": "\u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac", + "en": "English", + "es": "Espa\u00f1ol", + "fa": "\u0641\u0627\u0631\u0633\u06cc", + "fi": "Suomi", + "fr": "Fran\u00e7ais", + "he": "\u05e2\u05d1\u05e8\u05d9\u05ea", + "hr": "Hrvatski", + "hu": "Magyar", + "id": "Bahasa Indonesia", + "it": "Italiano", + "ja": "\u65e5\u672c\u8a9e", + "ko": "\ud55c\uad6d\uc5b4", + "lt": "Lietuvi\u0173", + "nl": "Nederlands", + "pl": "Polski", + "pt": "Portugu\u00eas", + "ru": "\u0420\u0443\u0441\u0441\u043a\u0438\u0439", + "sk": "Sloven\u010dina", + "sr": "\u0421\u0440\u043f\u0441\u043a\u0438 / srpski", + "sv": "Svenska", + "th": "\u0e44\u0e17\u0e22", + "tr": "T\u00fcrk\u00e7e", + "uk": "\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430", + "zh": "\u4e2d\u6587\uff08\u7e41\u9ad4\uff09" + }, + "regions": {}, + "supported_languages": {} + }, "bing": { "all_locale": null, "custom": {}, diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index b5e426107..56c3b447f 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -1,15 +1,32 @@ # SPDX-License-Identifier: 
AGPL-3.0-or-later +# lint: pylint """ - Arch Linux Wiki +Arch Linux Wiki +~~~~~~~~~~~~~~~ + +This implementation does not use a official API: Mediawiki provides API, but +Arch Wiki blocks access to it. - API: Mediawiki provides API, but Arch Wiki blocks access to it """ -from urllib.parse import urlencode, urljoin -from lxml import html +from typing import TYPE_CHECKING +from urllib.parse import urlencode, urljoin, urlparse +import lxml +import babel + +from searx import network from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +from searx.enginelib.traits import EngineTraits +from searx.locales import language_tag + +if TYPE_CHECKING: + import logging + + logger: logging.Logger + +traits: EngineTraits + -# about about = { "website": 'https://wiki.archlinux.org/', "wikidata_id": 'Q101445877', @@ -22,125 +39,113 @@ about = { # engine dependent config categories = ['it', 'software wikis'] paging = True -base_url = 'https://wiki.archlinux.org' - -# xpath queries -xpath_results = '//ul[@class="mw-search-results"]/li' -xpath_link = './/div[@class="mw-search-result-heading"]/a' +main_wiki = 'wiki.archlinux.org' -# cut 'en' from 'en-US', 'de' from 'de-CH', and so on -def locale_to_lang_code(locale): - if locale.find('-') >= 0: - locale = locale.split('-')[0] - return locale - - -# wikis for some languages were moved off from the main site, we need to make -# requests to correct URLs to be able to get results in those languages -lang_urls = { - # fmt: off - 'all': { - 'base': 'https://wiki.archlinux.org', - 'search': '/index.php?title=Special:Search&offset={offset}&{query}' - }, - 'de': { - 'base': 'https://wiki.archlinux.de', - 'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}' - }, - 'fr': { - 'base': 'https://wiki.archlinux.fr', - 'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}' - }, - 'ja': { - 'base': 'https://wiki.archlinuxjp.org', - 'search': '/index.php?title=特別:検索&offset={offset}&{query}' - }, - 'ro': 
{ - 'base': 'http://wiki.archlinux.ro', - 'search': '/index.php?title=Special:Căutare&offset={offset}&{query}' - }, - 'tr': { - 'base': 'http://archtr.org/wiki', - 'search': '/index.php?title=Özel:Ara&offset={offset}&{query}' - } - # fmt: on -} - - -# get base & search URLs for selected language -def get_lang_urls(language): - if language in lang_urls: - return lang_urls[language] - return lang_urls['all'] - - -# Language names to build search requests for -# those languages which are hosted on the main site. -main_langs = { - 'ar': 'العربية', - 'bg': 'Български', - 'cs': 'Česky', - 'da': 'Dansk', - 'el': 'Ελληνικά', - 'es': 'Español', - 'he': 'עברית', - 'hr': 'Hrvatski', - 'hu': 'Magyar', - 'it': 'Italiano', - 'ko': '한국어', - 'lt': 'Lietuviškai', - 'nl': 'Nederlands', - 'pl': 'Polski', - 'pt': 'Português', - 'ru': 'Русский', - 'sl': 'Slovenský', - 'th': 'ไทย', - 'uk': 'Українська', - 'zh': '简体中文', -} -supported_languages = dict(lang_urls, **main_langs) - - -# do search-request def request(query, params): - # translate the locale (e.g. 'en-US') to language code ('en') - language = locale_to_lang_code(params['language']) - # if our language is hosted on the main site, we need to add its name - # to the query in order to narrow the results to that language - if language in main_langs: - query += ' (' + main_langs[language] + ')' - - # prepare the request parameters - query = urlencode({'search': query}) + sxng_lang = params['searxng_locale'].split('-')[0] + netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) + title = traits.custom['title'].get(sxng_lang, 'Special:Search') + base_url = 'https://' + netloc + '/index.php?' 
offset = (params['pageno'] - 1) * 20 - # get request URLs for our language of choice - urls = get_lang_urls(language) - search_url = urls['base'] + urls['search'] + if netloc == main_wiki: + eng_lang: str = traits.get_language(sxng_lang, 'English') + query += ' (' + eng_lang + ')' + elif netloc == 'wiki.archlinuxcn.org': + base_url = 'https://' + netloc + '/wzh/index.php?' - params['url'] = search_url.format(query=query, offset=offset) + args = { + 'search': query, + 'title': title, + 'limit': 20, + 'offset': offset, + 'profile': 'default', + } + params['url'] = base_url + urlencode(args) return params -# get response from search-request def response(resp): - # get the base URL for the language in which request was made - language = locale_to_lang_code(resp.search_params['language']) - base_url = get_lang_urls(language)['base'] results = [] + dom = lxml.html.fromstring(resp.text) - dom = html.fromstring(resp.text) + # get the base URL for the language in which request was made + sxng_lang = resp.search_params['searxng_locale'].split('-')[0] + netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) + base_url = 'https://' + netloc + '/index.php?' - # parse results - for result in eval_xpath_list(dom, xpath_results): - link = eval_xpath_getindex(result, xpath_link, 0) - href = urljoin(base_url, link.attrib.get('href')) - title = extract_text(link) - - results.append({'url': href, 'title': title}) + for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'): + link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0) + content = extract_text(result.xpath('.//div[@class="searchresult"]')) + results.append( + { + 'url': urljoin(base_url, link.get('href')), + 'title': extract_text(link), + 'content': content, + } + ) return results + + +def fetch_traits(engine_traits: EngineTraits): + """Fetch languages from Archlinix-Wiki. 
The location of the Wiki address of a + language is mapped in a :py:obj:`custom field + ` (``wiki_netloc``). Depending + on the location, the ``title`` argument in the request is translated. + + .. code:: python + + "custom": { + "wiki_netloc": { + "de": "wiki.archlinux.de", + # ... + "zh": "wiki.archlinuxcn.org" + } + "title": { + "de": "Spezial:Suche", + # ... + "zh": "Special:\u641c\u7d22" + }, + }, + + """ + + engine_traits.custom['wiki_netloc'] = {} + engine_traits.custom['title'] = {} + + title_map = { + 'de': 'Spezial:Suche', + 'fa': 'ویژه:جستجو', + 'ja': '特別:検索', + 'zh': 'Special:搜索', + } + + resp = network.get('https://wiki.archlinux.org/') + if not resp.ok: + print("ERROR: response from wiki.archlinix.org is not OK.") + + dom = lxml.html.fromstring(resp.text) + for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"): + + sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-')) + # zh_Hans --> zh + sxng_tag = sxng_tag.split('_')[0] + + netloc = urlparse(a.get('href')).netloc + if netloc != 'wiki.archlinux.org': + title = title_map.get(sxng_tag) + if not title: + print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag)) + continue + engine_traits.custom['wiki_netloc'][sxng_tag] = netloc + engine_traits.custom['title'][sxng_tag] = title + + eng_tag = extract_text(eval_xpath_list(a, ".//span")) + engine_traits.languages[sxng_tag] = eng_tag + + engine_traits.languages['en'] = 'English' diff --git a/searx/sxng_locales.py b/searx/sxng_locales.py index b6ae85848..eedf664b0 100644 --- a/searx/sxng_locales.py +++ b/searx/sxng_locales.py @@ -81,6 +81,7 @@ sxng_locales = ( ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), ('sk', 'Slovenčina', '', 'Slovak', '\U0001f310'), ('sl', 'Slovenščina', '', 'Slovenian', '\U0001f310'), + ('sr', 'Српски', '', 'Serbian', '\U0001f310'), ('sv', 'Svenska', '', 'Swedish', '\U0001f310'), ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), ('th', 'ไทย', '', 
'Thai', '\U0001f310'),