diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
new file mode 100644
index 000000000..c41cf3b37
--- /dev/null
+++ b/searx/engines/presearch.py
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Presearch (general, images, videos, news)
+
+A search is answered in two steps: the search page is fetched to read the
+``window.searchId`` value it contains, then the results are requested as JSON
+from ``/results?id=<searchId>``.
+"""
+
+import re
+from urllib.parse import urlencode
+
+from searx.network import get
+from searx.utils import gen_useragent, html_to_text
+
+about = {
+    "website": "https://presearch.io",
+    "wikidata_id": "Q7240905",
+    "official_api_documentation": "https://docs.presearch.io/nodes/api",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "JSON",
+}
+paging = True
+time_range_support = True
+categories = ["general", "web"]  # general, images, videos, news
+
+search_type = "search"  # must be any of "search", "images", "videos", "news"
+
+base_url = "https://presearch.com"
+safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
+
+# matches e.g. ``window.searchId = "abc123";`` and captures ``abc123``
+_SEARCH_ID_RE = re.compile(r'window\.searchId\s*=\s*["\']?([^"\';\s]+)')
+
+
+def _get_request_id(query, page, time_range, safesearch):
+    """Fetch the search page and return the search id found in it.
+
+    Returns ``None`` when the page contains no ``window.searchId`` assignment.
+    """
+    args = {
+        "q": query,
+        "page": page,
+    }
+    if time_range:
+        args["time_range"] = time_range
+
+    url = f"{base_url}/{search_type}?{urlencode(args)}"
+    headers = {
+        'User-Agent': gen_useragent(),
+        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch]}",
+    }
+    resp_text = get(url, headers=headers).text
+
+    # a regex (instead of slicing the line) tolerates "\r\n" line endings and
+    # any code that follows the assignment on the same line
+    match = _SEARCH_ID_RE.search(resp_text)
+    return match.group(1) if match else None
+
+
+def _is_valid_img_src(url):
+    """Return whether ``url`` is a string containing ``https://``."""
+    # in some cases, the image url is a base64 encoded string, which has to be skipped
+    return isinstance(url, str) and "https://" in url
+
+
+def request(query, params):
+    """Look up the search id and point ``params["url"]`` at the JSON results."""
+    request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
+    if request_id is None:
+        # without a search id there is nothing to fetch: leave the URL as it is
+        # rather than requesting ``/results?id=None``
+        return params
+
+    params["headers"]["Accept"] = "application/json"
+    params["url"] = f"{base_url}/results?id={request_id}"
+
+    return params
+
+
+def response(resp):
+    """Build the list of result dicts from the JSON body of ``resp``."""
+    json_resp = resp.json()
+
+    if search_type == "search":
+        json_results = json_resp.get('results', {}).get('standardResults', [])
+    else:
+        json_results = json_resp.get(search_type, [])
+
+    results = []
+    for json_result in json_results:
+        result = {
+            'url': json_result['link'],
+            'title': json_result['title'],
+            'content': html_to_text(json_result.get('description', '')),
+        }
+        if search_type == "images":
+            # an image result without a usable image URL is skipped
+            if not _is_valid_img_src(json_result.get('image')):
+                continue
+
+            result['template'] = 'images.html'
+            result['img_src'] = json_result['image']
+            if _is_valid_img_src(json_result.get('thumbnail')):
+                result['thumbnail'] = json_result['thumbnail']
+
+        elif search_type == "videos":
+            result['template'] = 'videos.html'
+
+            if _is_valid_img_src(json_result.get('image')):
+                result['thumbnail'] = json_result['image']
+
+            duration = json_result.get('duration')
+            if duration is not None:
+                result['duration'] = duration
+                result['length'] = duration
+
+        results.append(result)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 4437fb3a9..0edf01762 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1290,6 +1290,36 @@ engines:
   #   query_str: 'SELECT * from my_table WHERE my_column = %(query)s'
   #   shortcut : psql
 
+  - name: presearch
+    engine: presearch
+    search_type: search
+    categories: [general, web]
+    shortcut: ps
+
+  - name: presearch images
+    engine: presearch
+    search_type: images
+    categories: [images, web]
+    timeout: 4.0
+    shortcut: psimg
+    disabled: true
+
+  - name: presearch videos
+    engine: presearch
+    search_type: videos
+    categories: [videos, web]
+    timeout: 4.0
+    shortcut: psvid
+    disabled: true
+
+  - name: presearch news
+    engine: presearch
+    search_type: news
+    categories: [news, web]
+    timeout: 4.0
+    shortcut: psnews
+    disabled: true
+
   - name: pub.dev
     engine: xpath
     shortcut: pd