mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	[mod] engine: wikimedia - improve results, add additional settings & doc
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									7d8c20c80d
								
							
						
					
					
						commit
						db522cf76d
					
				
					 4 changed files with 154 additions and 67 deletions
				
			
		| 
						 | 
				
			
			@ -40,6 +40,12 @@ Online Engines
 | 
			
		|||
 | 
			
		||||
   demo/demo_online
 | 
			
		||||
   xpath
 | 
			
		||||
   mediawiki
 | 
			
		||||
 | 
			
		||||
.. toctree::
 | 
			
		||||
   :maxdepth: 1
 | 
			
		||||
   :glob:
 | 
			
		||||
 | 
			
		||||
   online/*
 | 
			
		||||
 | 
			
		||||
.. _offline engines:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										13
									
								
								docs/dev/engines/mediawiki.rst
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								docs/dev/engines/mediawiki.rst
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,13 @@
 | 
			
		|||
.. _mediawiki engine:
 | 
			
		||||
 | 
			
		||||
================
 | 
			
		||||
MediaWiki Engine
 | 
			
		||||
================
 | 
			
		||||
 | 
			
		||||
.. contents::
 | 
			
		||||
   :depth: 2
 | 
			
		||||
   :local:
 | 
			
		||||
   :backlinks: entry
 | 
			
		||||
 | 
			
		||||
.. automodule:: searx.engines.mediawiki
 | 
			
		||||
  :members:
 | 
			
		||||
| 
						 | 
				
			
			@ -1,18 +1,59 @@
 | 
			
		|||
# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
			
		||||
"""
 | 
			
		||||
 General mediawiki-engine (Web)
 | 
			
		||||
"""
 | 
			
		||||
# lint: pylint
 | 
			
		||||
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
 | 
			
		||||
the `MediaWiki Action API`_.  For a `query action`_ all Wikimedia wikis have
 | 
			
		||||
endpoints that follow this pattern::
 | 
			
		||||
 | 
			
		||||
from string import Formatter
 | 
			
		||||
    https://{base_url}/w/api.php?action=query&list=search&format=json
 | 
			
		||||
 | 
			
		||||
.. note::
 | 
			
		||||
 | 
			
		||||
   In its current state, this engine is implemented to parse JSON result
 | 
			
		||||
   (`format=json`_) from a search query (`list=search`_).  If you need other
 | 
			
		||||
   ``action`` and ``list`` types ask SearXNG developers to extend the
 | 
			
		||||
   implementation according to your needs.
 | 
			
		||||
 | 
			
		||||
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
 | 
			
		||||
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
 | 
			
		||||
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
 | 
			
		||||
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
 | 
			
		||||
 | 
			
		||||
Configuration
 | 
			
		||||
=============
 | 
			
		||||
 | 
			
		||||
Request:
 | 
			
		||||
 | 
			
		||||
- :py:obj:`base_url`
 | 
			
		||||
- :py:obj:`search_type`
 | 
			
		||||
- :py:obj:`srenablerewrites`
 | 
			
		||||
- :py:obj:`srsort`
 | 
			
		||||
- :py:obj:`srprop`
 | 
			
		||||
 | 
			
		||||
Implementations
 | 
			
		||||
===============
 | 
			
		||||
 | 
			
		||||
"""
 | 
			
		||||
from __future__ import annotations
 | 
			
		||||
from typing import TYPE_CHECKING
 | 
			
		||||
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from urllib.parse import urlencode, quote
 | 
			
		||||
 | 
			
		||||
from searx.utils import html_to_text
 | 
			
		||||
from searx.enginelib.traits import EngineTraits
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    import logging
 | 
			
		||||
 | 
			
		||||
    logger: logging.Logger
 | 
			
		||||
 | 
			
		||||
traits: EngineTraits
 | 
			
		||||
 | 
			
		||||
# about
 | 
			
		||||
about = {
 | 
			
		||||
    "website": None,
 | 
			
		||||
    "wikidata_id": None,
 | 
			
		||||
    "official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search',
 | 
			
		||||
    "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
 | 
			
		||||
    "use_official_api": True,
 | 
			
		||||
    "require_api_key": False,
 | 
			
		||||
    "results": 'JSON',
 | 
			
		||||
| 
						 | 
				
			
			@ -21,73 +62,119 @@ about = {
 | 
			
		|||
# engine dependent config
 | 
			
		||||
categories = ['general']
 | 
			
		||||
paging = True
 | 
			
		||||
number_of_results = 1
 | 
			
		||||
search_type = 'nearmatch'  # possible values: title, text, nearmatch
 | 
			
		||||
number_of_results = 5
 | 
			
		||||
 | 
			
		||||
# search-url
 | 
			
		||||
base_url = 'https://{language}.wikipedia.org/'
 | 
			
		||||
search_postfix = (
 | 
			
		||||
    'w/api.php?action=query'
 | 
			
		||||
    '&list=search'
 | 
			
		||||
    '&{query}'
 | 
			
		||||
    '&format=json'
 | 
			
		||||
    '&sroffset={offset}'
 | 
			
		||||
    '&srlimit={limit}'
 | 
			
		||||
    '&srwhat={searchtype}'
 | 
			
		||||
)
 | 
			
		||||
search_type: str = 'nearmatch'
 | 
			
		||||
"""Which type of search to perform.  One of the following values: ``nearmatch``,
 | 
			
		||||
``text`` or ``title``.
 | 
			
		||||
 | 
			
		||||
See ``srwhat`` argument in `list=search`_ documentation.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
srenablerewrites: bool = True
 | 
			
		||||
"""Enable internal query rewriting (Type: boolean).  Some search backends can
 | 
			
		||||
rewrite the query into another which is thought to provide better results, for
 | 
			
		||||
instance by correcting spelling errors.
 | 
			
		||||
 | 
			
		||||
See ``srenablerewrites`` argument in `list=search`_ documentation.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
srsort: str = 'relevance'
 | 
			
		||||
"""Set the sort order of returned results.  One of the following values:
 | 
			
		||||
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
 | 
			
		||||
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
 | 
			
		||||
``none``, ``random``, ``relevance``, ``user_random``.
 | 
			
		||||
 | 
			
		||||
See ``srsort`` argument in `list=search`_ documentation.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
 | 
			
		||||
"""Which properties to return.
 | 
			
		||||
 | 
			
		||||
See ``srprop`` argument in `list=search`_ documentation.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
base_url: str = 'https://{language}.wikipedia.org/'
 | 
			
		||||
"""Base URL of the Wikimedia wiki.
 | 
			
		||||
 | 
			
		||||
``{language}``:
 | 
			
		||||
  ISO 639-1 language code (en, de, fr ..) of the search language.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
 | 
			
		||||
"""The longhand version of MediaWiki time strings."""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# do search-request
 | 
			
		||||
def request(query, params):
 | 
			
		||||
    offset = (params['pageno'] - 1) * number_of_results
 | 
			
		||||
 | 
			
		||||
    string_args = dict(
 | 
			
		||||
        query=urlencode({'srsearch': query}), offset=offset, limit=number_of_results, searchtype=search_type
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    format_strings = list(Formatter().parse(base_url))
 | 
			
		||||
 | 
			
		||||
    if params['language'] == 'all':
 | 
			
		||||
        language = 'en'
 | 
			
		||||
    else:
 | 
			
		||||
        language = params['language'].split('-')[0]
 | 
			
		||||
 | 
			
		||||
    # format_string [('https://', 'language', '', None), ('.wikipedia.org/', None, None, None)]
 | 
			
		||||
    if any(x[1] == 'language' for x in format_strings):
 | 
			
		||||
        string_args['language'] = language
 | 
			
		||||
 | 
			
		||||
    # write search-language back to params, required in response
 | 
			
		||||
    params['language'] = language
 | 
			
		||||
 | 
			
		||||
    search_url = base_url + search_postfix
 | 
			
		||||
    if params['language'] == 'all':
 | 
			
		||||
        params['language'] = 'en'
 | 
			
		||||
    else:
 | 
			
		||||
        params['language'] = params['language'].split('-')[0]
 | 
			
		||||
 | 
			
		||||
    params['url'] = search_url.format(**string_args)
 | 
			
		||||
    if base_url.endswith('/'):
 | 
			
		||||
        api_url = base_url + 'w/api.php?'
 | 
			
		||||
    else:
 | 
			
		||||
        api_url = base_url + '/w/api.php?'
 | 
			
		||||
    api_url = api_url.format(language=params['language'])
 | 
			
		||||
 | 
			
		||||
    offset = (params['pageno'] - 1) * number_of_results
 | 
			
		||||
 | 
			
		||||
    args = {
 | 
			
		||||
        'action': 'query',
 | 
			
		||||
        'list': 'search',
 | 
			
		||||
        'format': 'json',
 | 
			
		||||
        'srsearch': query,
 | 
			
		||||
        'sroffset': offset,
 | 
			
		||||
        'srlimit': number_of_results,
 | 
			
		||||
        'srwhat': search_type,
 | 
			
		||||
        'srprop': srprop,
 | 
			
		||||
        'srsort': srsort,
 | 
			
		||||
    }
 | 
			
		||||
    if srenablerewrites:
 | 
			
		||||
        args['srenablerewrites'] = '1'
 | 
			
		||||
 | 
			
		||||
    params['url'] = api_url + urlencode(args)
 | 
			
		||||
    return params
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# get response from search-request
 | 
			
		||||
def response(resp):
 | 
			
		||||
    results = []
 | 
			
		||||
 | 
			
		||||
    results = []
 | 
			
		||||
    search_results = resp.json()
 | 
			
		||||
 | 
			
		||||
    # return empty array if there are no results
 | 
			
		||||
    if not search_results.get('query', {}).get('search'):
 | 
			
		||||
        return []
 | 
			
		||||
 | 
			
		||||
    # parse results
 | 
			
		||||
    for result in search_results['query']['search']:
 | 
			
		||||
 | 
			
		||||
        if result.get('snippet', '').startswith('#REDIRECT'):
 | 
			
		||||
            continue
 | 
			
		||||
        url = (
 | 
			
		||||
            base_url.format(language=resp.search_params['language'])
 | 
			
		||||
            + 'wiki/'
 | 
			
		||||
            + quote(result['title'].replace(' ', '_').encode())
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # append result
 | 
			
		||||
        results.append({'url': url, 'title': result['title'], 'content': html_to_text(result.get('snippet', ''))})
 | 
			
		||||
        title = result['title']
 | 
			
		||||
        sectiontitle = result.get('sectiontitle')
 | 
			
		||||
        content = html_to_text(result.get('snippet', ''))
 | 
			
		||||
        metadata = html_to_text(result.get('categorysnippet', ''))
 | 
			
		||||
        timestamp = result.get('timestamp')
 | 
			
		||||
 | 
			
		||||
        url = (
 | 
			
		||||
            base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
 | 
			
		||||
        )
 | 
			
		||||
        if sectiontitle:
 | 
			
		||||
            # in case of sectiontitle create a link to the section in the wiki page
 | 
			
		||||
            url += '#' + quote(sectiontitle.replace(' ', '_').encode())
 | 
			
		||||
            title += ' / ' + sectiontitle
 | 
			
		||||
 | 
			
		||||
        item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
 | 
			
		||||
 | 
			
		||||
        if timestamp:
 | 
			
		||||
            item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
 | 
			
		||||
 | 
			
		||||
        results.append(item)
 | 
			
		||||
 | 
			
		||||
    # return results
 | 
			
		||||
    return results
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -667,11 +667,6 @@ engines:
 | 
			
		|||
    shortcut: fsd
 | 
			
		||||
    categories: [it, software wikis]
 | 
			
		||||
    base_url: https://directory.fsf.org/
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    # what part of a page matches the query string: title, text, nearmatch
 | 
			
		||||
    # * title     - query matches title
 | 
			
		||||
    # * text      - query matches the text of page
 | 
			
		||||
    # * nearmatch - nearmatch in title
 | 
			
		||||
    search_type: title
 | 
			
		||||
    timeout: 5.0
 | 
			
		||||
    disabled: true
 | 
			
		||||
| 
						 | 
				
			
			@ -1449,13 +1444,6 @@ engines:
 | 
			
		|||
    engine: twitter
 | 
			
		||||
    disabled: true
 | 
			
		||||
 | 
			
		||||
  # maybe in a fun category
 | 
			
		||||
  #  - name: uncyclopedia
 | 
			
		||||
  #    engine: mediawiki
 | 
			
		||||
  #    shortcut: unc
 | 
			
		||||
  #    base_url: https://uncyclopedia.wikia.com/
 | 
			
		||||
  #    number_of_results: 5
 | 
			
		||||
 | 
			
		||||
  # tmp suspended - too slow, too many errors
 | 
			
		||||
  #  - name: urbandictionary
 | 
			
		||||
  #    engine      : xpath
 | 
			
		||||
| 
						 | 
				
			
			@ -1534,7 +1522,6 @@ engines:
 | 
			
		|||
    shortcut: wb
 | 
			
		||||
    categories: general
 | 
			
		||||
    base_url: "https://{language}.wikibooks.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    about:
 | 
			
		||||
| 
						 | 
				
			
			@ -1546,9 +1533,9 @@ engines:
 | 
			
		|||
    shortcut: wn
 | 
			
		||||
    categories: news
 | 
			
		||||
    base_url: "https://{language}.wikinews.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    srsort: create_timestamp_desc
 | 
			
		||||
    about:
 | 
			
		||||
      website: https://www.wikinews.org/
 | 
			
		||||
      wikidata_id: Q964
 | 
			
		||||
| 
						 | 
				
			
			@ -1558,7 +1545,6 @@ engines:
 | 
			
		|||
    shortcut: wq
 | 
			
		||||
    categories: general
 | 
			
		||||
    base_url: "https://{language}.wikiquote.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    additional_tests:
 | 
			
		||||
| 
						 | 
				
			
			@ -1572,7 +1558,6 @@ engines:
 | 
			
		|||
    shortcut: ws
 | 
			
		||||
    categories: general
 | 
			
		||||
    base_url: "https://{language}.wikisource.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    about:
 | 
			
		||||
| 
						 | 
				
			
			@ -1584,7 +1569,6 @@ engines:
 | 
			
		|||
    shortcut: wsp
 | 
			
		||||
    categories: [general, science]
 | 
			
		||||
    base_url: "https://species.wikimedia.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    about:
 | 
			
		||||
| 
						 | 
				
			
			@ -1596,7 +1580,6 @@ engines:
 | 
			
		|||
    shortcut: wt
 | 
			
		||||
    categories: [dictionaries]
 | 
			
		||||
    base_url: "https://{language}.wiktionary.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    about:
 | 
			
		||||
      website: https://www.wiktionary.org/
 | 
			
		||||
| 
						 | 
				
			
			@ -1607,7 +1590,6 @@ engines:
 | 
			
		|||
    shortcut: wv
 | 
			
		||||
    categories: general
 | 
			
		||||
    base_url: "https://{language}.wikiversity.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    about:
 | 
			
		||||
| 
						 | 
				
			
			@ -1619,7 +1601,6 @@ engines:
 | 
			
		|||
    shortcut: wy
 | 
			
		||||
    categories: general
 | 
			
		||||
    base_url: "https://{language}.wikivoyage.org/"
 | 
			
		||||
    number_of_results: 5
 | 
			
		||||
    search_type: text
 | 
			
		||||
    disabled: true
 | 
			
		||||
    about:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue