From 43dc9eb7d64ddcdc07639129b9dbbc96c92d3caa Mon Sep 17 00:00:00 2001
From: Allen <64094914+allendema@users.noreply.github.com>
Date: Mon, 30 May 2022 00:53:26 +0200
Subject: [PATCH 1/2] [enh] Initial Petalsearch Images support

Upstream example query:
https://petalsearch.com/search?query=test&channel=image&ps=50&pn=1&region=de-de&ss_mode=off&ss_type=normal

Depending on locale it will internally use some/all results from other engines.
See:
https://seirdy.one/posts/2021/03/10/search-engines-with-own-indexes/#general-indexing-search-engines
---
 searx/engines/petal_images.py | 97 ++++++++++++++++++++++++++++++++++++
 searx/settings.yml            |  6 +++
 2 files changed, 103 insertions(+)
 create mode 100644 searx/engines/petal_images.py

diff --git a/searx/engines/petal_images.py b/searx/engines/petal_images.py
new file mode 100644
index 000000000..88853c1bd
--- /dev/null
+++ b/searx/engines/petal_images.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Petalsearch Images
+
+"""
+
+from json import loads
+from urllib.parse import urlencode
+from datetime import datetime
+
+from lxml import html
+
+from searx.utils import extract_text
+
+about = {
+    "website": 'https://petalsearch.com/',
+    "wikidata_id": 'Q104399280',
+    "official_api_documentation": False,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+categories = ['images']
+paging = True
+time_range_support = False
+
+safesearch = True
+safesearch_table = {0: 'off', 1: 'moderate', 2: 'on'}
+
+base_url = 'https://petalsearch.com/'
+search_string = 'search?{query}&channel=image&ps=50&pn={page}&region={lang}&ss_mode={safesearch}&ss_type=normal'
+
+
+def request(query, params):
+    """Build the Petalsearch image search URL and store it in ``params['url']``."""
+
+    search_path = search_string.format(
+        query=urlencode({'query': query}),
+        page=params['pageno'],
+        lang=params['language'].lower(),
+        safesearch=safesearch_table[params['safesearch']],
+    )
+
+    params['url'] = base_url + search_path
+
+    return params
+
+
+def response(resp):
+    """Return image results parsed from the JSON found via ``.//script[3]``."""
+    results = []
+
+    tree = html.fromstring(resp.text)
+    root = tree.findall('.//script[3]')
+
+    # Text content of the matched <script> element(s)
+    json_content = extract_text(root)
+
+    # Parse the extracted text as JSON
+    data = loads(json_content)
+
+    for result in data['newImages']:
+        url = result['url']
+        title = result['title']
+        thumbnail_src = result['image']
+
+        # .get() yields None when 'extrainfo' is absent; fall back to an empty dict
+        pic_dict = result.get('extrainfo') or {}
+
+        date_from_api = pic_dict.get('publish_time')
+        width = pic_dict.get('width')
+        height = pic_dict.get('height')
+        img_src = pic_dict.get('real_url')
+
+        # Continue if img_src is missing or empty
+        if not img_src:
+            continue
+
+        item = {
+            'template': 'images.html',
+            'url': url,
+            'title': title,
+            'img_src': img_src,
+            'thumbnail_src': thumbnail_src,
+            'width': width,
+            'height': height,
+        }
+
+        # Get and convert published date; only set it when this result has
+        # one, so a date from a previous result is never reused
+        if date_from_api is not None:
+            item['publishedDate'] = datetime.fromtimestamp(int(date_from_api))
+
+        # Append results
+        results.append(item)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index ae1291e08..8a97bbcdc 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1720,6 +1720,12 @@ engines:
       require_api_key: false
       results: HTML
 
+  - name: petalsearch images
+    engine: petal_images
+    shortcut: ptsi
+    disabled: true
+    timeout: 3.0
+
   - name: petalsearch news
     shortcut: ptsn
     categories: news

From 7e95d6bb79ccf36f33ac3e287022f396721d472a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9on=20Tiek=C3=B6tter?=
Date: Tue, 31 May 2022 21:07:30 +0200
Subject: [PATCH 2/2] [mod] image proxy: allow binary/octet-stream mime type

The Petal Search Images engine sends the thumbnails with the binary/octet-stream mime type.
---
 searx/webapp.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/searx/webapp.py b/searx/webapp.py
index 86de88407..fa5ca0605 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -1163,7 +1163,9 @@ def image_proxy():
                 return '', resp.status_code
             return '', 400
 
-        if not resp.headers.get('Content-Type', '').startswith('image/'):
+        if not resp.headers.get('Content-Type', '').startswith('image/') and not resp.headers.get(
+            'Content-Type', ''
+        ).startswith('binary/octet-stream'):
             logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
             return '', 400
 