mirror of https://github.com/searxng/searxng.git
[mod] improve implementation of presearch engine
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
23582aac5c
commit
44392bd436
|
@ -1,6 +1,20 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
# lint: pylint
|
# lint: pylint
|
||||||
"""Presearch (general, images, videos, news)
|
"""Presearch (general, images, videos, news)
|
||||||
|
|
||||||
|
.. hint::
|
||||||
|
|
||||||
|
The results in the video category are most often links to pages that contain
|
||||||
|
a video; for instance, many links from presearch's video category link
|
||||||
|
content from Facebook (aka Meta) or Twitter (aka X).  Since these are not
|
||||||
|
real links to video streams, SearXNG can't use the video template for them, and
|
||||||
|
if SearXNG can't use this template, then the user doesn't want to see these
|
||||||
|
hits in the videos category.
|
||||||
|
|
||||||
|
TL;DR: by default, presearch's video category is placed into categories::
|
||||||
|
|
||||||
|
categories: [general, web]
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
@ -19,12 +33,18 @@ paging = True
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
categories = ["general", "web"] # general, images, videos, news
|
categories = ["general", "web"] # general, images, videos, news
|
||||||
|
|
||||||
search_type = "search" # must be any of "search", "images", "videos", "news"
|
search_type = "search"
|
||||||
|
"""must be any of ``search``, ``images``, ``videos``, ``news``"""
|
||||||
|
|
||||||
base_url = "https://presearch.com"
|
base_url = "https://presearch.com"
|
||||||
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
|
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
|
||||||
|
|
||||||
|
|
||||||
|
def init(_):
|
||||||
|
if search_type not in ['search', 'images', 'videos', 'news']:
|
||||||
|
raise ValueError(f'presearch search_type: {search_type}')
|
||||||
|
|
||||||
|
|
||||||
def _get_request_id(query, page, time_range, safesearch):
|
def _get_request_id(query, page, time_range, safesearch):
|
||||||
args = {
|
args = {
|
||||||
"q": query,
|
"q": query,
|
||||||
|
@ -38,7 +58,7 @@ def _get_request_id(query, page, time_range, safesearch):
|
||||||
'User-Agent': gen_useragent(),
|
'User-Agent': gen_useragent(),
|
||||||
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
|
'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
|
||||||
}
|
}
|
||||||
resp_text = get(url, headers=headers).text
|
resp_text = get(url, headers=headers).text # type: ignore
|
||||||
|
|
||||||
for line in resp_text.split("\n"):
|
for line in resp_text.split("\n"):
|
||||||
if "window.searchId = " in line:
|
if "window.searchId = " in line:
|
||||||
|
@ -47,11 +67,6 @@ def _get_request_id(query, page, time_range, safesearch):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _is_valid_img_src(url):
|
|
||||||
# in some cases, the image url is a base64 encoded string, which has to be skipped
|
|
||||||
return "https://" in url
|
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
|
request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
|
||||||
|
|
||||||
|
@ -61,42 +76,105 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def _strip_leading_strings(text):
|
||||||
|
for x in ['wikipedia', 'google']:
|
||||||
|
if text.lower().endswith(x):
|
||||||
|
text = text[: -len(x)]
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_search_query(json_results):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
json = resp.json()
|
for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
|
||||||
|
|
||||||
json_results = []
|
|
||||||
if search_type == "search":
|
|
||||||
json_results = json['results'].get('standardResults', [])
|
|
||||||
else:
|
|
||||||
json_results = json.get(search_type, [])
|
|
||||||
|
|
||||||
for json_result in json_results:
|
|
||||||
result = {
|
result = {
|
||||||
'url': json_result['link'],
|
'url': item['link'],
|
||||||
'title': json_result['title'],
|
'title': item['title'],
|
||||||
'content': html_to_text(json_result.get('description', '')),
|
'img_src': item['image'],
|
||||||
|
'content': '',
|
||||||
|
'metadata': item.get('source'),
|
||||||
}
|
}
|
||||||
if search_type == "images":
|
|
||||||
result['template'] = 'images.html'
|
|
||||||
|
|
||||||
if not _is_valid_img_src(json_result['image']):
|
|
||||||
continue
|
|
||||||
|
|
||||||
result['img_src'] = json_result['image']
|
|
||||||
if _is_valid_img_src(json_result['thumbnail']):
|
|
||||||
result['thumbnail'] = json_result['thumbnail']
|
|
||||||
|
|
||||||
elif search_type == "videos":
|
|
||||||
result['template'] = 'videos.html'
|
|
||||||
|
|
||||||
if _is_valid_img_src(json_result['image']):
|
|
||||||
result['thumbnail'] = json_result['image']
|
|
||||||
|
|
||||||
result['duration'] = json_result['duration']
|
|
||||||
result['length'] = json_result['duration']
|
|
||||||
|
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
|
for item in json_results.get('standardResults', []):
|
||||||
|
result = {
|
||||||
|
'url': item['link'],
|
||||||
|
'title': item['title'],
|
||||||
|
'content': html_to_text(item['description']),
|
||||||
|
}
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
info = json_results.get('infoSection', {}).get('data')
|
||||||
|
if info:
|
||||||
|
attributes = []
|
||||||
|
for item in info.get('about', []):
|
||||||
|
label, value = html_to_text(item).split(':', 1)
|
||||||
|
value = _strip_leading_strings(value)
|
||||||
|
attributes.append({'label': label, 'value': value})
|
||||||
|
content = []
|
||||||
|
for item in [info['subtitle'], info['description']]:
|
||||||
|
item = _strip_leading_strings(html_to_text(item))
|
||||||
|
if item:
|
||||||
|
content.append(item)
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
'infobox': info['title'],
|
||||||
|
'id': info['title'],
|
||||||
|
'img_src': info.get('image'),
|
||||||
|
'content': ' | '.join(content),
|
||||||
|
'attributes': attributes,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def response(resp):
|
||||||
|
results = []
|
||||||
|
json_resp = resp.json()
|
||||||
|
|
||||||
|
if search_type == 'search':
|
||||||
|
results = parse_search_query(json_resp['results'])
|
||||||
|
|
||||||
|
elif search_type == 'images':
|
||||||
|
for item in json_resp['images']:
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
'template': 'images.html',
|
||||||
|
'title': item['title'],
|
||||||
|
'url': item['link'],
|
||||||
|
'img_src': item['image'],
|
||||||
|
'thumbnail_src': item['thumbnail'],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
elif search_type == 'videos':
|
||||||
|
# The results in the video category are most often links to pages that contain
|
||||||
|
# a video and not to a video stream --> SearXNG can't use the video template.
|
||||||
|
|
||||||
|
for item in json_resp['videos']:
|
||||||
|
metadata = [x for x in [item.get('description'), item.get('duration')] if x]
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
'title': item['title'],
|
||||||
|
'url': item['link'],
|
||||||
|
'content': '',
|
||||||
|
'metadata': ' / '.join(metadata),
|
||||||
|
'img_src': item.get('image'),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
elif search_type == 'news':
|
||||||
|
for item in json_resp['news']:
|
||||||
|
metadata = [x for x in [item.get('source'), item.get('time')] if x]
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
'title': item['title'],
|
||||||
|
'url': item['link'],
|
||||||
|
'content': item['description'],
|
||||||
|
'metadata': ' / '.join(metadata),
|
||||||
|
'img_src': item.get('image'),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -1295,6 +1295,7 @@ engines:
|
||||||
search_type: search
|
search_type: search
|
||||||
categories: [general, web]
|
categories: [general, web]
|
||||||
shortcut: ps
|
shortcut: ps
|
||||||
|
disabled: true
|
||||||
|
|
||||||
- name: presearch images
|
- name: presearch images
|
||||||
engine: presearch
|
engine: presearch
|
||||||
|
@ -1307,7 +1308,7 @@ engines:
|
||||||
- name: presearch videos
|
- name: presearch videos
|
||||||
engine: presearch
|
engine: presearch
|
||||||
search_type: videos
|
search_type: videos
|
||||||
categories: [videos, web]
|
categories: [general, web]
|
||||||
timeout: 4.0
|
timeout: 4.0
|
||||||
shortcut: psvid
|
shortcut: psvid
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
Loading…
Reference in New Issue