[feat] implementation of presearch engine

2024-01-01 19:24:07 +01:00 · 2023-09-14 13:31:54 +02:00 · 2023-09-14 13:31:54 +02:00 · 23582aac5c
commit 23582aac5c
parent 99fb565b39
2 changed files with 132 additions and 0 deletions
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@ -0,0 +1,102 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Presearch (general, images, videos, news)
+"""
+
+from urllib.parse import urlencode
+from searx.network import get
+from searx.utils import gen_useragent, html_to_text
+
+# Metadata describing the upstream service this engine talks to.
+about = {
+    "website": "https://presearch.io",
+    "wikidata_id": "Q7240905",
+    "official_api_documentation": "https://docs.presearch.io/nodes/api",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "JSON",
+}
+# The page number is forwarded to Presearch as the ``page`` URL argument.
+paging = True
+# A selected time range is forwarded as the ``time_range`` URL argument.
+time_range_support = True
+categories = ["general", "web"]  # general, images, videos, news
+
+# Selects the URL path that is queried and the JSON key the results are read from.
+search_type = "search"  # must be any of "search", "images", "videos", "news"
+
+base_url = "https://presearch.com"
+# Maps the safesearch level (0, 1, 2) to the value of the ``use_safe_search`` cookie.
+safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
+
+
+def _get_request_id(query, page, time_range, safesearch):
+    """Fetch the HTML search page and return the search id embedded in it.
+
+    The page is requested from ``{base_url}/{search_type}`` with the query, the
+    page number and (when set) the time range as URL arguments.  The safe-search
+    preference is sent in the ``use_safe_search`` cookie.  The id is taken from
+    the first line of the response that contains ``window.searchId = ``.
+
+    :param query: search terms, sent as ``q``
+    :param page: page number, sent as ``page``
+    :param time_range: sent as ``time_range`` when truthy, otherwise omitted
+    :param safesearch: key into ``safesearch_map``
+    :return: the search id as a string, or ``None`` when no line of the
+        response contains the marker
+    """
+    args = {
+        "q": query,
+        "page": page,
+    }
+    if time_range:
+        args["time_range"] = time_range
+
+    url = f"{base_url}/{search_type}?{urlencode(args)}"
+    headers = {
+        'User-Agent': gen_useragent(),
+        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
+    }
+    resp_text = get(url, headers=headers).text
+
+    marker = "window.searchId = "
+    # splitlines() also consumes "\r\n", so a CRLF response leaves no stray "\r"
+    for line in resp_text.splitlines():
+        _, found, value = line.partition(marker)
+        if not found:
+            continue
+        # Expected shape is `window.searchId = "<id>";`.  Strip surrounding
+        # whitespace and the statement terminator explicitly instead of
+        # blindly dropping the last character, then remove the quotes.
+        return value.strip().rstrip(";").rstrip().replace('"', "")
+
+    return None
+
+
+def _is_valid_img_src(url):
+    """Return ``True`` when ``url`` contains the text ``https://``.
+
+    Some image fields hold a base64 encoded string instead of a link; those
+    have to be skipped by the caller.
+    """
+    https_pos = url.find("https://")
+    return https_pos != -1
+
+
+def request(query, params):
+    """Prepare the request for the JSON results of ``query``.
+
+    A search id is first obtained through ``_get_request_id`` using the page
+    number, time range and safesearch level found in ``params``.  The id is
+    then placed in the ``id`` argument of the ``/results`` URL, and JSON is
+    requested through the ``Accept`` header.
+
+    Note: when ``_get_request_id`` returns ``None`` the value is formatted into
+    the URL unchanged.
+
+    :param query: search terms
+    :param params: request parameters; ``headers`` and ``url`` are modified in place
+    :return: the same ``params`` object
+    """
+    search_id = _get_request_id(
+        query,
+        params["pageno"],
+        params["time_range"],
+        params["safesearch"],
+    )
+
+    request_headers = params["headers"]
+    request_headers["Accept"] = "application/json"
+
+    results_url = f"{base_url}/results?id={search_id}"
+    params["url"] = results_url
+
+    return params
+
+
+def response(resp):
+    """Convert the JSON answer of the ``/results`` endpoint into result dicts.
+
+    For ``search_type == "search"`` the entries are read from
+    ``results.standardResults``; for every other search type they are read from
+    the top-level key named after ``search_type``.  Each entry must provide
+    ``link`` and ``title``; ``description`` is optional and is converted with
+    ``html_to_text``.
+
+    * images: entries whose ``image`` fails ``_is_valid_img_src`` are dropped;
+      ``thumbnail`` is only copied when it passes the same check.
+    * videos: ``image`` is used as thumbnail when it passes the check, and
+      ``duration`` is stored under both ``duration`` and ``length``.
+
+    :param resp: response object offering a ``json()`` method
+    :return: list of result dicts
+    """
+    payload = resp.json()
+
+    if search_type == "search":
+        entries = payload['results'].get('standardResults', [])
+    else:
+        entries = payload.get(search_type, [])
+
+    collected = []
+    for entry in entries:
+        item = {
+            'url': entry['link'],
+            'title': entry['title'],
+            'content': html_to_text(entry.get('description', '')),
+        }
+
+        if search_type == "images":
+            image_url = entry['image']
+            # base64 encoded images cannot be linked, skip the whole entry
+            if not _is_valid_img_src(image_url):
+                continue
+
+            item['template'] = 'images.html'
+            item['img_src'] = image_url
+
+            thumbnail_url = entry['thumbnail']
+            if _is_valid_img_src(thumbnail_url):
+                item['thumbnail'] = thumbnail_url
+
+        elif search_type == "videos":
+            item['template'] = 'videos.html'
+
+            preview_url = entry['image']
+            if _is_valid_img_src(preview_url):
+                item['thumbnail'] = preview_url
+
+            video_duration = entry['duration']
+            item['duration'] = video_duration
+            item['length'] = video_duration
+
+        collected.append(item)
+
+    return collected
--- a/searx/settings.yml
+++ b/searx/settings.yml
@ -1290,6 +1290,36 @@ engines:
  #    query_str: 'SELECT * from my_table WHERE my_column = %(query)s'
  #    shortcut : psql

+  - name: presearch
+    engine: presearch
+    search_type: search
+    categories: [general, web]
+    shortcut: ps
+
+  - name: presearch images
+    engine: presearch
+    search_type: images
+    categories: [images, web]
+    timeout: 4.0
+    shortcut: psimg
+    disabled: true
+
+  - name: presearch videos
+    engine: presearch
+    search_type: videos
+    categories: [videos, web]
+    timeout: 4.0
+    shortcut: psvid
+    disabled: true
+
+  - name: presearch news
+    engine: presearch
+    search_type: news
+    categories: [news, web]
+    timeout: 4.0
+    shortcut: psnews
+    disabled: true
+
  - name: pub.dev
    engine: xpath
    shortcut: pd