From 5be55e3309761842e070f48580a519499cfc8ceb Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 21 Aug 2024 08:19:54 +0200 Subject: [PATCH 1/5] [fix] unit tests: fix load / unload engines & fix messages - https://github.com/searxng/searxng/pull/3746#issuecomment-2300965005 - https://github.com/searxng/searxng/issues/2988#issuecomment-2226929084 Signed-off-by: Markus Heiser --- tests/unit/test_engines_init.py | 1 + tests/unit/test_search.py | 6 ++++++ tests/unit/test_webapp.py | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/tests/unit/test_engines_init.py b/tests/unit/test_engines_init.py index 4872a1b1b..e2445160a 100644 --- a/tests/unit/test_engines_init.py +++ b/tests/unit/test_engines_init.py @@ -10,6 +10,7 @@ class TestEnginesInit(SearxTestCase): # pylint: disable=missing-class-docstring def tearDownClass(cls): settings['outgoing']['using_tor_proxy'] = False settings['outgoing']['extra_proxy_timeout'] = 0 + engines.load_engines([]) def test_initialize_engines_default(self): engine_list = [ diff --git a/tests/unit/test_search.py b/tests/unit/test_search.py index b85c90c68..a60089aef 100644 --- a/tests/unit/test_search.py +++ b/tests/unit/test_search.py @@ -2,6 +2,7 @@ # pylint: disable=missing-module-docstring, invalid-name from copy import copy +import logging import searx.search from searx.search import SearchQuery, EngineRef @@ -46,8 +47,13 @@ class SearchQueryTestCase(SearxTestCase): # pylint: disable=missing-class-docst class SearchTestCase(SearxTestCase): # pylint: disable=missing-class-docstring def setUp(self): + log = logging.getLogger("searx") + log_lev = log.level + log.setLevel(logging.ERROR) from searx import webapp # pylint: disable=import-outside-toplevel + log.setLevel(log_lev) + self.app = webapp.app @classmethod diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index 868645e17..7c6e1ef82 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: 
AGPL-3.0-or-later # pylint: disable=missing-module-docstring +import logging import json from urllib.parse import ParseResult from mock import Mock @@ -20,8 +21,13 @@ class ViewsTestCase(SearxTestCase): # pylint: disable=missing-class-docstring, self.setattr4test(searx.search.processors, 'initialize_processor', dummy) + log = logging.getLogger("searx") + log_lev = log.level + log.setLevel(logging.ERROR) from searx import webapp # pylint: disable=import-outside-toplevel + log.setLevel(log_lev) + webapp.app.config['TESTING'] = True # to get better error messages self.app = webapp.app.test_client() From 5276219b9d790baeeb505813bb76d0dffa1d2d51 Mon Sep 17 00:00:00 2001 From: Grant Lanham Date: Mon, 19 Aug 2024 23:02:06 -0400 Subject: [PATCH 2/5] Fix tineye engine url, datetime parsing, and minor refactor Changes made to tineye engine: 1. Importing logging if TYPE_CHECKING is enabled 2. Remove unnecessary try-catch around json parsing the response, as this masked the original error and had no immediate benefit 3. Improve error handling explicitly for status code 422 and 400 upfront, deferring json_parsing only for these status codes and successful status codes 4. Unit test all new applicable changes to ensure compatibility --- searx/engines/tineye.py | 57 +++++++++++---------- tests/unit/test_tineye.py | 102 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 29 deletions(-) create mode 100644 tests/unit/test_tineye.py diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py index 196c89a2b..c35799c69 100644 --- a/searx/engines/tineye.py +++ b/searx/engines/tineye.py @@ -14,10 +14,16 @@ billion images `[tineye.com] `_. 
""" +from typing import TYPE_CHECKING from urllib.parse import urlencode from datetime import datetime from flask_babel import gettext +if TYPE_CHECKING: + import logging + + logger = logging.getLogger() + about = { "website": 'https://tineye.com', "wikidata_id": 'Q2382535', @@ -34,7 +40,7 @@ categories = ['general'] paging = True safesearch = False base_url = 'https://tineye.com' -search_string = '/result_json/?page={page}&{query}' +search_string = '/api/v1/result_json/?page={page}&{query}' FORMAT_NOT_SUPPORTED = gettext( "Could not read that image url. This may be due to an unsupported file" @@ -120,7 +126,7 @@ def parse_tineye_match(match_json): crawl_date = backlink_json.get("crawl_date") if crawl_date: - crawl_date = datetime.fromisoformat(crawl_date[:-3]) + crawl_date = datetime.strptime(crawl_date, '%Y-%m-%d') else: crawl_date = datetime.min @@ -150,29 +156,15 @@ def parse_tineye_match(match_json): def response(resp): """Parse HTTP response from TinEye.""" - results = [] - try: + # handle the 422 client side errors, and the possible 400 status code error + if resp.status_code in (400, 422): json_data = resp.json() - except Exception as exc: # pylint: disable=broad-except - msg = "can't parse JSON response // %s" % exc - logger.error(msg) - json_data = {'error': msg} - - # handle error codes from Tineye - - if resp.is_error: - if resp.status_code in (400, 422): - - message = 'HTTP status: %s' % resp.status_code - error = json_data.get('error') - s_key = json_data.get('suggestions', {}).get('key', '') - - if error and s_key: - message = "%s (%s)" % (error, s_key) - elif error: - message = error + suggestions = json_data.get('suggestions', {}) + message = f'HTTP Status Code: {resp.status_code}' + if resp.status_code == 422: + s_key = suggestions.get('key', '') if s_key == "Invalid image URL": # test https://docs.searxng.org/_static/searxng-wordmark.svg message = FORMAT_NOT_SUPPORTED @@ -182,16 +174,23 @@ def response(resp): elif s_key == 'Download Error': # 
test https://notexists message = DOWNLOAD_ERROR + else: + logger.warning("Unknown suggestion key encountered: %s", s_key) + else: # 400 + description = suggestions.get('description') + if isinstance(description, list): + message = ','.join(description) - # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 - # results.append({'answer': message}) - logger.error(message) + # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 + # results.append({'answer': message}) + logger.error(message) + return [] - return results + # Raise for all other responses + resp.raise_for_status() - resp.raise_for_status() - - # append results from matches + results = [] + json_data = resp.json() for match_json in json_data['matches']: diff --git a/tests/unit/test_tineye.py b/tests/unit/test_tineye.py new file mode 100644 index 000000000..0530b4c5e --- /dev/null +++ b/tests/unit/test_tineye.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# pylint: disable=missing-module-docstring + + +from datetime import datetime +from unittest.mock import Mock +from requests import HTTPError +from searx.engines import load_engines, tineye +from tests import SearxTestCase + + +class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring + + def setUp(self): + load_engines([{'name': 'tineye', 'engine': 'tineye', 'shortcut': 'tin', 'timeout': 9.0, 'disabled': True}]) + + def tearDown(self): + load_engines([]) + + def test_status_code_raises(self): + response = Mock() + response.status_code = 401 + response.raise_for_status.side_effect = HTTPError() + self.assertRaises(HTTPError, lambda: tineye.response(response)) + + def test_returns_empty_list_for_422(self): + response = Mock() + response.json.return_value = {} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + with self.assertLogs(tineye.logger) as _dev_null: + results = tineye.response(response) + self.assertEqual(0, len(results)) + + def 
test_logs_format_for_422(self): + response = Mock() + response.json.return_value = {"suggestions": {"key": "Invalid image URL"}} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(tineye.FORMAT_NOT_SUPPORTED, ','.join(assert_logs_context.output)) + + def test_logs_signature_for_422(self): + response = Mock() + response.json.return_value = {"suggestions": {"key": "NO_SIGNATURE_ERROR"}} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(tineye.NO_SIGNATURE_ERROR, ','.join(assert_logs_context.output)) + + def test_logs_download_for_422(self): + response = Mock() + response.json.return_value = {"suggestions": {"key": "Download Error"}} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(tineye.DOWNLOAD_ERROR, ','.join(assert_logs_context.output)) + + def test_empty_list_for_400(self): + response = Mock() + response.json.return_value = {} + response.status_code = 400 + response.raise_for_status.side_effect = HTTPError() + with self.assertLogs(tineye.logger) as _dev_null: + results = tineye.response(response) + self.assertEqual(0, len(results)) + + def test_logs_description_for_400(self): + description = 'There was a problem with that request. Error ID: ad5fc955-a934-43c1-8187-f9a61d301645' + response = Mock() + response.json.return_value = {"suggestions": {"description": [description], "title": "Oops! 
We're sorry!"}} + response.status_code = 400 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(description, ','.join(assert_logs_context.output)) + + def test_crawl_date_parses(self): + date_str = '2020-05-25' + date = datetime.strptime(date_str, '%Y-%m-%d') + response = Mock() + response.json.return_value = { + 'matches': [ + { + 'backlinks': [ + { + 'crawl_date': date_str, + } + ] + } + ] + } + response.status_code = 200 + results = tineye.response(response) + self.assertEqual(date, results[0]['publishedDate']) From e45b771ffaeeb41a22fa17690b27be98b01d14cc Mon Sep 17 00:00:00 2001 From: Austin-Olacsi <138650713+Austin-Olacsi@users.noreply.github.com> Date: Sun, 11 Aug 2024 21:38:01 -0600 Subject: [PATCH 3/5] [feat] engine: implementation of yandex (web, images) It's set to inactive in settings.yml because of CAPTCHA. You need to remove that from the settings.yml to get in use. 
Closes: https://github.com/searxng/searxng/issues/961 --- searx/engines/yandex.py | 133 ++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 16 +++++ 2 files changed, 149 insertions(+) create mode 100644 searx/engines/yandex.py diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py new file mode 100644 index 000000000..2c6984fdc --- /dev/null +++ b/searx/engines/yandex.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Yandex (Web, images)""" + +from json import loads +from urllib.parse import urlencode +from html import unescape +from lxml import html +from searx.exceptions import SearxEngineCaptchaException +from searx.utils import humanize_bytes, eval_xpath, eval_xpath_list, extract_text, extr + + +# Engine metadata +about = { + "website": 'https://yandex.com/', + "wikidata_id": 'Q5281', + "official_api_documentation": "?", + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# Engine configuration +categories = [] +paging = True +search_type = "" + +# Search URL +base_url_web = 'https://yandex.com/search/site/' +base_url_images = 'https://yandex.com/images/search' + +results_xpath = '//li[contains(@class, "serp-item")]' +url_xpath = './/a[@class="b-serp-item__title-link"]/@href' +title_xpath = './/h3[@class="b-serp-item__title"]/a[@class="b-serp-item__title-link"]/span' +content_xpath = './/div[@class="b-serp-item__content"]//div[@class="b-serp-item__text"]' + + +def catch_bad_response(resp): + if resp.url.path.startswith('/showcaptcha'): + raise SearxEngineCaptchaException() + + +def request(query, params): + query_params_web = { + "tmpl_version": "releases", + "text": query, + "web": "1", + "frame": "1", + "searchid": "3131712", + } + + query_params_images = { + "text": query, + "uinfo": "sw-1920-sh-1080-ww-1125-wh-999", + } + + if params['pageno'] > 1: + query_params_web.update({"p": params["pageno"] - 1}) + query_params_images.update({"p": params["pageno"] - 1}) + + params["cookies"] = 
{'cookie': "yp=1716337604.sp.family%3A0#1685406411.szm.1:1920x1080:1920x999"} + + if search_type == 'web': + params['url'] = f"{base_url_web}?{urlencode(query_params_web)}" + elif search_type == 'images': + params['url'] = f"{base_url_images}?{urlencode(query_params_images)}" + + return params + + +def response(resp): + if search_type == 'web': + + catch_bad_response(resp) + + dom = html.fromstring(resp.text) + + results = [] + + for result in eval_xpath_list(dom, results_xpath): + results.append( + { + 'url': extract_text(eval_xpath(result, url_xpath)), + 'title': extract_text(eval_xpath(result, title_xpath)), + 'content': extract_text(eval_xpath(result, content_xpath)), + } + ) + + return results + + if search_type == 'images': + + catch_bad_response(resp) + + html_data = html.fromstring(resp.text) + html_sample = unescape(html.tostring(html_data, encoding='unicode')) + + content_between_tags = extr( + html_sample, '{"location":"/images/search/', 'advRsyaSearchColumn":null}}', default="fail" + ) + json_data = '{"location":"/images/search/' + content_between_tags + 'advRsyaSearchColumn":null}}' + + if content_between_tags == "fail": + content_between_tags = extr(html_sample, '{"location":"/images/search/', 'false}}}') + json_data = '{"location":"/images/search/' + content_between_tags + 'false}}}' + + json_resp = loads(json_data) + + results = [] + for _, item_data in json_resp['initialState']['serpList']['items']['entities'].items(): + title = item_data['snippet']['title'] + source = item_data['snippet']['url'] + thumb = item_data['image'] + fullsize_image = item_data['viewerData']['dups'][0]['url'] + height = item_data['viewerData']['dups'][0]['h'] + width = item_data['viewerData']['dups'][0]['w'] + filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes'] + humanized_filesize = humanize_bytes(filesize) + + results.append( + { + 'title': title, + 'url': source, + 'img_src': fullsize_image, + 'filesize': humanized_filesize, + 'thumbnail_src': thumb, + 
'template': 'images.html', + 'resolution': f'{width} x {height}', + } + ) + + return results + + return [] diff --git a/searx/settings.yml b/searx/settings.yml index b3c7f5ffe..a1701d009 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1814,6 +1814,22 @@ engines: engine: unsplash shortcut: us + - name: yandex + engine: yandex + categories: general + search_type: web + shortcut: yd + disabled: true + inactive: true + + - name: yandex images + engine: yandex + categories: images + search_type: images + shortcut: ydi + disabled: true + inactive: true + - name: yandex music engine: yandex_music shortcut: ydm From fe6bac5a08b8dfc8d91478f5ed78bd584ec9c147 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 8 Aug 2024 11:46:54 +0200 Subject: [PATCH 4/5] [fix] pip install -e: legacy editable install (setup.py develop) is deprecated From [1]: There is now a standardized mechanism [2] for an installer like pip to request an editable install of a project. pip is transitioning to using this standard only instead of invoking the deprecated `setup.py develop` command. 
For backward compatibility, we can use switches: --use-pep517 https://pip.pypa.io/en/stable/cli/pip_install/#cmdoption-use-pep517 --no-build-isolation https://pip.pypa.io/en/stable/cli/pip_install/#cmdoption-no-build-isolation - [1] https://github.com/pypa/pip/issues/11457 - [2] https://peps.python.org/pep-0660/ Closes: https://github.com/searxng/searxng/issues/3701 Signed-off-by: Markus Heiser --- docs/build-templates/searxng.rst | 2 +- docs/dev/makefile.rst | 4 ++-- manage | 4 ++-- utils/lib.sh | 4 ++-- utils/searxng.sh | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/build-templates/searxng.rst b/docs/build-templates/searxng.rst index 14b385468..bc5d3e8fc 100644 --- a/docs/build-templates/searxng.rst +++ b/docs/build-templates/searxng.rst @@ -123,7 +123,7 @@ ${fedora_build} # jump to SearXNG's working tree and install SearXNG into virtualenv (${SERVICE_USER})$ cd \"$SEARXNG_SRC\" - (${SERVICE_USER})$ pip install -e . + (${SERVICE_USER})$ pip install --use-pep517 --no-build-isolation -e . .. END manage.sh update_packages diff --git a/docs/dev/makefile.rst b/docs/dev/makefile.rst index 3c3b2bf3b..383113bae 100644 --- a/docs/dev/makefile.rst +++ b/docs/dev/makefile.rst @@ -61,7 +61,7 @@ working tree and release a ``make install`` to get a virtualenv with a $ make install PYENV [virtualenv] installing ./requirements*.txt into local/py3 ... - PYENV [install] pip install -e 'searx[test]' + PYENV [install] pip install --use-pep517 --no-build-isolation -e 'searx[test]' ... Successfully installed searxng-2023.7.19+a446dea1b @@ -78,7 +78,7 @@ the check fails if you edit the requirements listed in ... PYENV [virtualenv] installing ./requirements*.txt into local/py3 ... - PYENV [install] pip install -e 'searx[test]' + PYENV [install] pip install --use-pep517 --no-build-isolation -e 'searx[test]' ... 
Successfully installed searxng-2023.7.19+a446dea1b diff --git a/manage b/manage index 2a88a4c5c..7edcb1f5a 100755 --- a/manage +++ b/manage @@ -299,8 +299,8 @@ pyenv.install() { ( set -e pyenv - build_msg PYENV "[install] pip install -e 'searx${PY_SETUP_EXTRAS}'" - "${PY_ENV_BIN}/python" -m pip install -e ".${PY_SETUP_EXTRAS}" + build_msg PYENV "[install] pip install --use-pep517 --no-build-isolation -e 'searx${PY_SETUP_EXTRAS}'" + "${PY_ENV_BIN}/python" -m pip install --use-pep517 --no-build-isolation -e ".${PY_SETUP_EXTRAS}" ) local exit_val=$? if [ ! $exit_val -eq 0 ]; then diff --git a/utils/lib.sh b/utils/lib.sh index e527fa1b6..b932b875c 100755 --- a/utils/lib.sh +++ b/utils/lib.sh @@ -663,8 +663,8 @@ pyenv.install() { pyenv fi for i in ${PYOBJECTS}; do - build_msg PYENV "[install] pip install -e '$i${PY_SETUP_EXTRAS}'" - "${PY_ENV_BIN}/python" -m pip install -e "$i${PY_SETUP_EXTRAS}" + build_msg PYENV "[install] pip install --use-pep517 --no-build-isolation -e '$i${PY_SETUP_EXTRAS}'" + "${PY_ENV_BIN}/python" -m pip install --use-pep517 --no-build-isolation -e "$i${PY_SETUP_EXTRAS}" done fi pyenv.install.OK diff --git a/utils/searxng.sh b/utils/searxng.sh index ea6a467d2..c15c18218 100755 --- a/utils/searxng.sh +++ b/utils/searxng.sh @@ -501,7 +501,7 @@ pip install -U setuptools pip install -U wheel pip install -U pyyaml cd ${SEARXNG_SRC} -pip install -e . +pip install --use-pep517 --no-build-isolation -e . EOF } @@ -569,7 +569,7 @@ pip install -U pip pip install -U setuptools pip install -U wheel pip install -U pyyaml -pip install -U -e . +pip install -U --use-pep517 --no-build-isolation -e . 
EOF rst_para "update instance's settings.yml from ${SEARXNG_SETTINGS_PATH}" DEFAULT_SELECT=2 \ From 2033f30c8df9e6af23876345785ec28ba982d5e5 Mon Sep 17 00:00:00 2001 From: Dennis ten Hoove Date: Fri, 16 Aug 2024 05:14:25 +0200 Subject: [PATCH 5/5] [docs] improve Hostname plugin documentation --- searx/plugins/hostnames.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/searx/plugins/hostnames.py b/searx/plugins/hostnames.py index 770b00e15..6519452db 100644 --- a/searx/plugins/hostnames.py +++ b/searx/plugins/hostnames.py @@ -1,7 +1,19 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=too-many-branches -"""In addition to rewriting/replace reslut URLs, the *hoostnames* plugin offers -other features. +""" +.. attention:: + + The **"Hostname replace"** plugin has been replace by **"Hostnames + plugin"**, see :pull:`3463` & :pull:`3552`. + +The **Hostnames plugin** can be enabled by adding it to the +``enabled_plugins`` **list** in the ``setting.yml`` like so. + + .. code:: yaml + + enabled_plugins: + - 'Hostnames plugin' + ... - ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be replaced by other hostnames.