forked from zaclys/searxng
5d74bf3820
The general category is the category that is searched by default. From a privacy standpoint it doesn't make sense to send all general queries to specialized search engines that cannot deal with those queries anyway.
77 lines
2.3 KiB
Python
77 lines
2.3 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""
|
|
Duden
|
|
"""
|
|
|
|
import re
|
|
from urllib.parse import quote, urljoin
|
|
from lxml import html
|
|
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
|
|
|
|
# about
|
|
about = {
|
|
"website": 'https://www.duden.de',
|
|
"wikidata_id": 'Q73624591',
|
|
"official_api_documentation": None,
|
|
"use_official_api": False,
|
|
"require_api_key": False,
|
|
"results": 'HTML',
|
|
"language": 'de',
|
|
}
|
|
|
|
categories = ['dictionaries']
|
|
paging = True
|
|
|
|
# search-url
|
|
base_url = 'https://www.duden.de/'
|
|
search_url = base_url + 'suchen/dudenonline/{query}?search_api_fulltext=&page={offset}'
|
|
|
|
|
|
def request(query, params):
|
|
'''pre-request callback
|
|
params<dict>:
|
|
method : POST/GET
|
|
headers : {}
|
|
data : {} # if method == POST
|
|
url : ''
|
|
category: 'search category'
|
|
pageno : 1 # number of the requested page
|
|
'''
|
|
|
|
offset = params['pageno'] - 1
|
|
if offset == 0:
|
|
search_url_fmt = base_url + 'suchen/dudenonline/{query}'
|
|
params['url'] = search_url_fmt.format(query=quote(query))
|
|
else:
|
|
params['url'] = search_url.format(offset=offset, query=quote(query))
|
|
# after the last page of results, spelling corrections are returned after a HTTP redirect
|
|
# whatever the page number is
|
|
params['soft_max_redirects'] = 1
|
|
return params
|
|
|
|
|
|
def response(resp):
|
|
'''post-response callback
|
|
resp: requests response object
|
|
'''
|
|
results = []
|
|
|
|
dom = html.fromstring(resp.text)
|
|
|
|
number_of_results_element = eval_xpath_getindex(
|
|
dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()', 0, default=None
|
|
)
|
|
if number_of_results_element is not None:
|
|
number_of_results_string = re.sub('[^0-9]', '', number_of_results_element)
|
|
results.append({'number_of_results': int(number_of_results_string)})
|
|
|
|
for result in eval_xpath_list(dom, '//section[not(contains(@class, "essay"))]'):
|
|
url = eval_xpath_getindex(result, './/h2/a', 0).get('href')
|
|
url = urljoin(base_url, url)
|
|
title = eval_xpath(result, 'string(.//h2/a)').strip()
|
|
content = extract_text(eval_xpath(result, './/p'))
|
|
# append result
|
|
results.append({'url': url, 'title': title, 'content': content})
|
|
|
|
return results
|