mirror of https://github.com/searxng/searxng.git
[mod] improve implementation of presearch engine
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
23582aac5c
commit
44392bd436
|
@ -1,6 +1,20 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Presearch (general, images, videos, news)
|
||||
|
||||
.. hint::
|
||||
|
||||
The results in the video category are most often links to pages that contain
|
||||
a video, for instance many links from presearch's video category link
|
||||
content from facebook (aka Meta) or Twitter (aka X). Since these are not
|
||||
real links to video streams SearXNG can't use the video template for this and
|
||||
if SearXNG can't use this template, then the user doesn't want to see these
|
||||
hits in the videos category.
|
||||
|
||||
TL;DR; by default presearch's video category is placed into categories::
|
||||
|
||||
categories: [general, web]
|
||||
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
|
@ -19,12 +33,18 @@ paging = True
|
|||
time_range_support = True
|
||||
categories = ["general", "web"] # general, images, videos, news
|
||||
|
||||
search_type = "search" # must be any of "search", "images", "videos", "news"
|
||||
search_type = "search"
|
||||
"""must be any of ``search``, ``images``, ``videos``, ``news``"""
|
||||
|
||||
base_url = "https://presearch.com"
|
||||
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
|
||||
|
||||
|
||||
def init(_):
    """Validate the engine configuration.

    The single positional argument is ignored.

    :raises ValueError: if the module-level ``search_type`` is not one of
        ``search``, ``images``, ``videos`` or ``news``.
    """
    if search_type in ('search', 'images', 'videos', 'news'):
        return
    raise ValueError(f'presearch search_type: {search_type}')
|
||||
|
||||
|
||||
def _get_request_id(query, page, time_range, safesearch):
|
||||
args = {
|
||||
"q": query,
|
||||
|
@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
|
|||
'User-Agent': gen_useragent(),
|
||||
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
|
||||
}
|
||||
resp_text = get(url, headers=headers).text
|
||||
resp_text = get(url, headers=headers).text # type: ignore
|
||||
|
||||
for line in resp_text.split("\n"):
|
||||
if "window.searchId = " in line:
|
||||
|
@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch):
|
|||
return None
|
||||
|
||||
|
||||
def _is_valid_img_src(url):
|
||||
# in some cases, the image url is a base64 encoded string, which has to be skipped
|
||||
return "https://" in url
|
||||
|
||||
|
||||
def request(query, params):
|
||||
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
|
||||
|
||||
|
@ -61,42 +76,105 @@ def request(query, params):
|
|||
return params
|
||||
|
||||
|
||||
def response(resp):
|
||||
def _strip_leading_strings(text):
|
||||
for x in ['wikipedia', 'google']:
|
||||
if text.lower().endswith(x):
|
||||
text = text[: -len(x)]
|
||||
return text.strip()
|
||||
|
||||
|
||||
def parse_search_query(json_results):
    """Build the result list for the ``search`` type.

    Reads three optional sections of *json_results* (a dict):

    - ``specialSections.topStoriesCompact.data``: one result per story,
    - ``standardResults``: one result per hit,
    - ``infoSection.data``: a single infobox result.

    Optional fields (``image``, ``description``, ``subtitle``) may be missing;
    ``link`` and ``title`` are required on every item.

    :returns: list of result dicts, empty if none of the sections is present.
    """
    results = []

    top_stories = json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', [])
    for item in top_stories:
        results.append(
            {
                'url': item['link'],
                'title': item['title'],
                'img_src': item.get('image'),
                'content': '',
                'metadata': item.get('source'),
            }
        )

    for item in json_results.get('standardResults', []):
        results.append(
            {
                'url': item['link'],
                'title': item['title'],
                'content': html_to_text(item.get('description') or ''),
            }
        )

    info = json_results.get('infoSection', {}).get('data')
    if info:
        attributes = []
        for item in info.get('about', []):
            label, sep, value = html_to_text(item).partition(':')
            if not sep:
                # No "label: value" separator; skip this entry instead of
                # aborting the whole response with a ValueError.
                continue
            attributes.append({'label': label, 'value': _strip_leading_strings(value)})

        content = []
        for item in (info.get('subtitle'), info.get('description')):
            if not item:
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results
|
||||
|
||||
|
||||
def response(resp):
    """Convert a Presearch JSON response into a list of result dicts.

    *resp* must provide a ``json()`` method returning the decoded payload.
    Which section of the payload is read depends on the module-level
    ``search_type``:

    - ``search``: ``results`` is handed to :py:func:`parse_search_query`,
    - ``images``: items of ``images`` rendered with the ``images.html`` template,
    - ``videos``: items of ``videos`` as plain results,
    - ``news``: items of ``news`` as plain results.

    A missing section yields an empty list instead of a ``KeyError``.
    """
    results = []
    json_resp = resp.json()

    if search_type == 'search':
        results = parse_search_query(json_resp.get('results') or {})

    elif search_type == 'images':
        for item in json_resp.get('images') or []:
            results.append(
                {
                    'template': 'images.html',
                    'title': item['title'],
                    'url': item['link'],
                    'img_src': item['image'],
                    'thumbnail_src': item.get('thumbnail'),
                }
            )

    elif search_type == 'videos':
        # The results in the video category are most often links to pages that contain
        # a video and not to a video stream --> SearXNG can't use the video template.

        for item in json_resp.get('videos') or []:
            metadata = [x for x in (item.get('description'), item.get('duration')) if x]
            results.append(
                {
                    'title': item['title'],
                    'url': item['link'],
                    'content': '',
                    'metadata': ' / '.join(metadata),
                    'img_src': item.get('image'),
                }
            )

    elif search_type == 'news':
        for item in json_resp.get('news') or []:
            metadata = [x for x in (item.get('source'), item.get('time')) if x]
            results.append(
                {
                    'title': item['title'],
                    'url': item['link'],
                    'content': item.get('description') or '',
                    'metadata': ' / '.join(metadata),
                    'img_src': item.get('image'),
                }
            )

    return results
|
||||
|
|
|
@ -1295,6 +1295,7 @@ engines:
|
|||
search_type: search
|
||||
categories: [general, web]
|
||||
shortcut: ps
|
||||
disabled: true
|
||||
|
||||
- name: presearch images
|
||||
engine: presearch
|
||||
|
@ -1307,7 +1308,7 @@ engines:
|
|||
- name: presearch videos
|
||||
engine: presearch
|
||||
search_type: videos
|
||||
categories: [videos, web]
|
||||
categories: [general, web]
|
||||
timeout: 4.0
|
||||
shortcut: psvid
|
||||
disabled: true
|
||||
|
|
Loading…
Reference in New Issue