[feat] implementation of presearch engine

This commit is contained in:
Bnyro 2023-09-14 13:31:54 +02:00 committed by Markus Heiser
parent 99fb565b39
commit 23582aac5c
2 changed files with 132 additions and 0 deletions

102
searx/engines/presearch.py Normal file
View File

@ -0,0 +1,102 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Presearch (general, images, videos, news)
"""
from urllib.parse import urlencode
from searx.network import get
from searx.utils import gen_useragent, html_to_text
# Engine metadata.  The Wikidata key is spelled ``wikidata_id`` (the original
# ``wikidiata_id`` typo meant the ID was stored under an unrecognised key).
about = {
    "website": "https://presearch.io",
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

# Engine capabilities.
paging = True
time_range_support = True

# Defaults describe the plain web search variant of this engine.
categories = ["general", "web"]  # general, images, videos, news
search_type = "search"  # must be any of "search", "images", "videos", "news"

base_url = "https://presearch.com"

# Maps the numeric safe-search level to the value sent in the
# ``use_safe_search`` cookie; levels 1 and 2 both enable filtering.
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
def _get_request_id(query, page, time_range, safesearch):
    """Fetch the HTML search page and return the search ID embedded in it.

    The page for ``search_type`` is requested with the query, page number and
    (if set) time range as URL arguments; the safe-search level is passed in
    the ``use_safe_search`` cookie.  The ID is read from the
    ``window.searchId = ...`` assignment in the returned markup.

    :param query: search terms.
    :param page: page number to request.
    :param time_range: time range filter, omitted from the URL when falsy.
    :param safesearch: key into ``safesearch_map``.
    :returns: the search ID as a string, or ``None`` when the page contains
        no (non-empty) ``window.searchId`` assignment.
    """
    args = {
        "q": query,
        "page": page,
    }
    if time_range:
        args["time_range"] = time_range

    url = f"{base_url}/{search_type}?{urlencode(args)}"
    headers = {
        'User-Agent': gen_useragent(),
        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
    }
    resp_text = get(url, headers=headers).text

    _, marker, tail = resp_text.partition("window.searchId = ")
    if not marker:
        return None

    # Cut the value at the end of the line and at the terminating semicolon
    # instead of blindly dropping the last character: that keeps the result
    # correct with CRLF line endings, trailing whitespace, a missing ';' or
    # further statements on the same (minified) line.
    raw_value = tail.partition("\n")[0].partition(";")[0]
    search_id = raw_value.strip().strip("\"'")
    return search_id or None
def _is_valid_img_src(url):
# in some cases, the image url is a base64 encoded string, which has to be skipped
return "https://" in url
def request(query, params):
    """Prepare the request for the JSON results of *query*.

    A search ID is obtained first via ``_get_request_id`` (using the page
    number, time range and safe-search level from *params*); *params* is then
    updated in place to ask the ``/results`` endpoint for JSON and returned.
    """
    search_id = _get_request_id(
        query,
        page=params["pageno"],
        time_range=params["time_range"],
        safesearch=params["safesearch"],
    )

    request_headers = params["headers"]
    request_headers["Accept"] = "application/json"
    params["url"] = f"{base_url}/results?id={search_id}"
    return params
def response(resp):
    """Convert the JSON reply of the ``/results`` endpoint into result dicts.

    For ``search_type == "search"`` the items are read from
    ``results.standardResults``; for every other search type they are read
    from the top-level key named after the search type.

    Items without a link are skipped, as are image items whose ``image``
    field is not an ``https://`` link (e.g. base64 encoded data).  Missing
    optional fields no longer abort the whole response with a ``KeyError``.

    :param resp: HTTP response object providing ``json()``.
    :returns: list of result dicts.
    """
    data = resp.json()

    if search_type == "search":
        items = (data.get('results') or {}).get('standardResults') or []
    else:
        items = data.get(search_type) or []

    results = []
    for item in items:
        link = item.get('link')
        if not link:
            # a result without a target URL cannot be displayed
            continue

        result = {
            'url': link,
            'title': item.get('title') or '',
            'content': html_to_text(item.get('description') or ''),
        }

        if search_type == "images":
            img_src = item.get('image') or ''
            if not _is_valid_img_src(img_src):
                continue
            result['template'] = 'images.html'
            result['img_src'] = img_src
            thumbnail = item.get('thumbnail') or ''
            if _is_valid_img_src(thumbnail):
                # image results carry their preview in ``thumbnail_src``
                # (``thumbnail`` is the key used by video results)
                result['thumbnail_src'] = thumbnail

        elif search_type == "videos":
            result['template'] = 'videos.html'
            thumbnail = item.get('image') or ''
            if _is_valid_img_src(thumbnail):
                result['thumbnail'] = thumbnail
            duration = item.get('duration')
            result['duration'] = duration
            result['length'] = duration

        results.append(result)

    return results

View File

@ -1290,6 +1290,36 @@ engines:
# query_str: 'SELECT * from my_table WHERE my_column = %(query)s'
# shortcut : psql
# Presearch: four entries share the same engine module and differ only in
# ``search_type`` (search, images, videos, news), category and shortcut.
# Only the general web search is enabled by default.
- name: presearch
engine: presearch
search_type: search
categories: [general, web]
shortcut: ps
# image search; disabled by default
- name: presearch images
engine: presearch
search_type: images
categories: [images, web]
timeout: 4.0
shortcut: psimg
disabled: true
# video search; disabled by default
- name: presearch videos
engine: presearch
search_type: videos
categories: [videos, web]
timeout: 4.0
shortcut: psvid
disabled: true
# news search; disabled by default
- name: presearch news
engine: presearch
search_type: news
categories: [news, web]
timeout: 4.0
shortcut: psnews
disabled: true
- name: pub.dev
engine: xpath
shortcut: pd