[mod] engine: wikimedia - improve results, add additional settings & doc

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-08-04 16:54:22 +02:00 committed by Markus Heiser
parent 7d8c20c80d
commit db522cf76d
4 changed files with 154 additions and 67 deletions

View File

@ -40,6 +40,12 @@ Online Engines
demo/demo_online demo/demo_online
xpath xpath
mediawiki
.. toctree::
:maxdepth: 1
:glob:
online/* online/*
.. _offline engines: .. _offline engines:

View File

@ -0,0 +1,13 @@
.. _mediawiki engine:
================
MediaWiki Engine
================
.. contents::
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.engines.mediawiki
:members:

View File

@ -1,18 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" # lint: pylint
General mediawiki-engine (Web) """The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
""" the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
endpoints that follow this pattern::
from string import Formatter https://{base_url}/w/api.php?action=query&list=search&format=json
.. note::
In its current state, this engine is implemented to parse JSON results
(`format=json`_) from a search query (`list=search`_). If you need other
``action`` and ``list`` types, ask the SearXNG developers to extend the
implementation according to your needs.
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
Configuration
=============
Request:
- :py:obj:`base_url`
- :py:obj:`search_type`
- :py:obj:`srenablerewrites`
- :py:obj:`srsort`
- :py:obj:`srprop`
Implementations
===============
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime
from urllib.parse import urlencode, quote from urllib.parse import urlencode, quote
from searx.utils import html_to_text from searx.utils import html_to_text
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about # about
about = { about = {
"website": None, "website": None,
"wikidata_id": None, "wikidata_id": None,
"official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search', "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
"use_official_api": True, "use_official_api": True,
"require_api_key": False, "require_api_key": False,
"results": 'JSON', "results": 'JSON',
@ -21,73 +62,119 @@ about = {
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
paging = True paging = True
number_of_results = 1 number_of_results = 5
search_type = 'nearmatch' # possible values: title, text, nearmatch
# search-url search_type: str = 'nearmatch'
base_url = 'https://{language}.wikipedia.org/' """Which type of search to perform. One of the following values: ``nearmatch``,
search_postfix = ( ``text`` or ``title``.
'w/api.php?action=query'
'&list=search' See ``srwhat`` argument in `list=search`_ documentation.
'&{query}' """
'&format=json'
'&sroffset={offset}' srenablerewrites: bool = True
'&srlimit={limit}' """Enable internal query rewriting (Type: boolean). Some search backends can
'&srwhat={searchtype}' rewrite the query into another which is thought to provide better results, for
) instance by correcting spelling errors.
See ``srenablerewrites`` argument in `list=search`_ documentation.
"""
srsort: str = 'relevance'
"""Set the sort order of returned results. One of the following values:
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
``none``, ``random``, ``relevance``, ``user_random``.
See ``srsort`` argument in `list=search`_ documentation.
"""
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
"""Which properties to return.
See ``srprop`` argument in `list=search`_ documentation.
"""
base_url: str = 'https://{language}.wikipedia.org/'
"""Base URL of the Wikimedia wiki.
``{language}``:
ISO 639-1 language code (en, de, fr ..) of the search language.
"""
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
"""The longhand version of MediaWiki time strings."""
# do search-request
def request(query, params): def request(query, params):
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(
query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type
)
format_strings = list(Formatter().parse(base_url))
if params['language'] == 'all':
language = 'en'
else:
language = params['language'].split('-')[0]
# format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
if any(x[1] == 'language' for x in format_strings):
string_args['language'] = language
# write search-language back to params, required in response # write search-language back to params, required in response
params['language'] = language
search_url = base_url + search_postfix if params['language'] == 'all':
params['language'] = 'en'
else:
params['language'] = params['language'].split('-')[0]
params['url'] = search_url.format(**string_args) if base_url.endswith('/'):
api_url = base_url + 'w/api.php?'
else:
api_url = base_url + '/w/api.php?'
api_url = api_url.format(language=params['language'])
offset = (params['pageno'] - 1) * number_of_results
args = {
'action': 'query',
'list': 'search',
'format': 'json',
'srsearch': query,
'sroffset': offset,
'srlimit': number_of_results,
'srwhat': search_type,
'srprop': srprop,
'srsort': srsort,
}
if srenablerewrites:
args['srenablerewrites'] = '1'
params['url'] = api_url + urlencode(args)
return params return params
# get response from search-request # get response from search-request
def response(resp): def response(resp):
results = []
results = []
search_results = resp.json() search_results = resp.json()
# return empty array if there are no results # return empty array if there are no results
if not search_results.get('query', {}).get('search'): if not search_results.get('query', {}).get('search'):
return [] return []
# parse results
for result in search_results['query']['search']: for result in search_results['query']['search']:
if result.get('snippet', '').startswith('#REDIRECT'): if result.get('snippet', '').startswith('#REDIRECT'):
continue continue
url = (
base_url.format(language=resp.search_params['language'])
+ 'wiki/'
+ quote(result['title'].replace(' ', '_').encode())
)
# append result title = result['title']
results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))}) sectiontitle = result.get('sectiontitle')
content = html_to_text(result.get('snippet', ''))
metadata = html_to_text(result.get('categorysnippet', ''))
timestamp = result.get('timestamp')
url = (
base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
)
if sectiontitle:
# in case of sectiontitle create a link to the section in the wiki page
url += '#' + quote(sectiontitle.replace(' ', '_').encode())
title += ' / ' + sectiontitle
item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
if timestamp:
item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
results.append(item)
# return results # return results
return results return results

View File

@ -667,11 +667,6 @@ engines:
shortcut: fsd shortcut: fsd
categories: [it, software wikis] categories: [it, software wikis]
base_url: https://directory.fsf.org/ base_url: https://directory.fsf.org/
number_of_results: 5
# what part of a page matches the query string: title, text, nearmatch
# * title - query matches title
# * text - query matches the text of page
# * nearmatch - nearmatch in title
search_type: title search_type: title
timeout: 5.0 timeout: 5.0
disabled: true disabled: true
@ -1449,13 +1444,6 @@ engines:
engine: twitter engine: twitter
disabled: true disabled: true
# maybe in a fun category
# - name: uncyclopedia
# engine: mediawiki
# shortcut: unc
# base_url: https://uncyclopedia.wikia.com/
# number_of_results: 5
# tmp suspended - too slow, too many errors # tmp suspended - too slow, too many errors
# - name: urbandictionary # - name: urbandictionary
# engine : xpath # engine : xpath
@ -1534,7 +1522,6 @@ engines:
shortcut: wb shortcut: wb
categories: general categories: general
base_url: "https://{language}.wikibooks.org/" base_url: "https://{language}.wikibooks.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
about: about:
@ -1546,9 +1533,9 @@ engines:
shortcut: wn shortcut: wn
categories: news categories: news
base_url: "https://{language}.wikinews.org/" base_url: "https://{language}.wikinews.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
srsort: create_timestamp_desc
about: about:
website: https://www.wikinews.org/ website: https://www.wikinews.org/
wikidata_id: Q964 wikidata_id: Q964
@ -1558,7 +1545,6 @@ engines:
shortcut: wq shortcut: wq
categories: general categories: general
base_url: "https://{language}.wikiquote.org/" base_url: "https://{language}.wikiquote.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
additional_tests: additional_tests:
@ -1572,7 +1558,6 @@ engines:
shortcut: ws shortcut: ws
categories: general categories: general
base_url: "https://{language}.wikisource.org/" base_url: "https://{language}.wikisource.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
about: about:
@ -1584,7 +1569,6 @@ engines:
shortcut: wsp shortcut: wsp
categories: [general, science] categories: [general, science]
base_url: "https://species.wikimedia.org/" base_url: "https://species.wikimedia.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
about: about:
@ -1596,7 +1580,6 @@ engines:
shortcut: wt shortcut: wt
categories: [dictionaries] categories: [dictionaries]
base_url: "https://{language}.wiktionary.org/" base_url: "https://{language}.wiktionary.org/"
number_of_results: 5
search_type: text search_type: text
about: about:
website: https://www.wiktionary.org/ website: https://www.wiktionary.org/
@ -1607,7 +1590,6 @@ engines:
shortcut: wv shortcut: wv
categories: general categories: general
base_url: "https://{language}.wikiversity.org/" base_url: "https://{language}.wikiversity.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
about: about:
@ -1619,7 +1601,6 @@ engines:
shortcut: wy shortcut: wy
categories: general categories: general
base_url: "https://{language}.wikivoyage.org/" base_url: "https://{language}.wikivoyage.org/"
number_of_results: 5
search_type: text search_type: text
disabled: true disabled: true
about: about: