diff --git a/docs/build-templates/searxng.rst b/docs/build-templates/searxng.rst index 14b385468..bc5d3e8fc 100644 --- a/docs/build-templates/searxng.rst +++ b/docs/build-templates/searxng.rst @@ -123,7 +123,7 @@ ${fedora_build} # jump to SearXNG's working tree and install SearXNG into virtualenv (${SERVICE_USER})$ cd \"$SEARXNG_SRC\" - (${SERVICE_USER})$ pip install -e . + (${SERVICE_USER})$ pip install --use-pep517 --no-build-isolation -e . .. END manage.sh update_packages diff --git a/docs/dev/makefile.rst b/docs/dev/makefile.rst index 3c3b2bf3b..383113bae 100644 --- a/docs/dev/makefile.rst +++ b/docs/dev/makefile.rst @@ -61,7 +61,7 @@ working tree and release a ``make install`` to get a virtualenv with a $ make install PYENV [virtualenv] installing ./requirements*.txt into local/py3 ... - PYENV [install] pip install -e 'searx[test]' + PYENV [install] pip install --use-pep517 --no-build-isolation -e 'searx[test]' ... Successfully installed searxng-2023.7.19+a446dea1b @@ -78,7 +78,7 @@ the check fails if you edit the requirements listed in ... PYENV [virtualenv] installing ./requirements*.txt into local/py3 ... - PYENV [install] pip install -e 'searx[test]' + PYENV [install] pip install --use-pep517 --no-build-isolation -e 'searx[test]' ... Successfully installed searxng-2023.7.19+a446dea1b diff --git a/manage b/manage index a279307ce..9c659682d 100755 --- a/manage +++ b/manage @@ -300,8 +300,8 @@ pyenv.install() { ( set -e pyenv - build_msg PYENV "[install] pip install -e 'searx${PY_SETUP_EXTRAS}'" - "${PY_ENV_BIN}/python" -m pip install -e ".${PY_SETUP_EXTRAS}" + build_msg PYENV "[install] pip install --use-pep517 --no-build-isolation -e 'searx${PY_SETUP_EXTRAS}'" + "${PY_ENV_BIN}/python" -m pip install --use-pep517 --no-build-isolation -e ".${PY_SETUP_EXTRAS}" ) local exit_val=$? if [ ! $exit_val -eq 0 ]; then diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py index 196c89a2b..c35799c69 100644 --- a/searx/engines/tineye.py +++ b/searx/engines/tineye.py @@ -14,10 +14,16 @@ billion images `[tineye.com] `_. """ +from typing import TYPE_CHECKING from urllib.parse import urlencode from datetime import datetime from flask_babel import gettext +if TYPE_CHECKING: + import logging + + logger = logging.getLogger() + about = { "website": 'https://tineye.com', "wikidata_id": 'Q2382535', @@ -34,7 +40,7 @@ categories = ['general'] paging = True safesearch = False base_url = 'https://tineye.com' -search_string = '/result_json/?page={page}&{query}' +search_string = '/api/v1/result_json/?page={page}&{query}' FORMAT_NOT_SUPPORTED = gettext( "Could not read that image url. This may be due to an unsupported file" @@ -120,7 +126,7 @@ def parse_tineye_match(match_json): crawl_date = backlink_json.get("crawl_date") if crawl_date: - crawl_date = datetime.fromisoformat(crawl_date[:-3]) + crawl_date = datetime.strptime(crawl_date, '%Y-%m-%d') else: crawl_date = datetime.min @@ -150,29 +156,15 @@ def parse_tineye_match(match_json): def response(resp): """Parse HTTP response from TinEye.""" - results = [] - try: + # handle the 422 client side errors, and the possible 400 status code error + if resp.status_code in (400, 422): json_data = resp.json() - except Exception as exc: # pylint: disable=broad-except - msg = "can't parse JSON response // %s" % exc - logger.error(msg) - json_data = {'error': msg} - - # handle error codes from Tineye - - if resp.is_error: - if resp.status_code in (400, 422): - - message = 'HTTP status: %s' % resp.status_code - error = json_data.get('error') - s_key = json_data.get('suggestions', {}).get('key', '') - - if error and s_key: - message = "%s (%s)" % (error, s_key) - elif error: - message = error + suggestions = json_data.get('suggestions', {}) + message = f'HTTP Status Code: {resp.status_code}' + if resp.status_code == 422: + s_key = suggestions.get('key', '') if s_key == "Invalid image URL": # test https://docs.searxng.org/_static/searxng-wordmark.svg message = FORMAT_NOT_SUPPORTED @@ -182,16 +174,23 @@ def response(resp): elif s_key == 'Download Error': # test https://notexists message = DOWNLOAD_ERROR + else: + logger.warning("Unknown suggestion key encountered: %s", s_key) + else: # 400 + description = suggestions.get('description') + if isinstance(description, list): + message = ','.join(description) - # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 - # results.append({'answer': message}) - logger.error(message) + # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 + # results.append({'answer': message}) + logger.error(message) + return [] - return results + # Raise for all other responses + resp.raise_for_status() - resp.raise_for_status() - - # append results from matches + results = [] + json_data = resp.json() for match_json in json_data['matches']: diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py new file mode 100644 index 000000000..2c6984fdc --- /dev/null +++ b/searx/engines/yandex.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Yandex (Web, images)""" + +from json import loads +from urllib.parse import urlencode +from html import unescape +from lxml import html +from searx.exceptions import SearxEngineCaptchaException +from searx.utils import humanize_bytes, eval_xpath, eval_xpath_list, extract_text, extr + + +# Engine metadata +about = { + "website": 'https://yandex.com/', + "wikidata_id": 'Q5281', + "official_api_documentation": "?", + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# Engine configuration +categories = [] +paging = True +search_type = "" + +# Search URL +base_url_web = 'https://yandex.com/search/site/' +base_url_images = 'https://yandex.com/images/search' + +results_xpath = '//li[contains(@class, "serp-item")]' +url_xpath = './/a[@class="b-serp-item__title-link"]/@href' +title_xpath = './/h3[@class="b-serp-item__title"]/a[@class="b-serp-item__title-link"]/span' +content_xpath = './/div[@class="b-serp-item__content"]//div[@class="b-serp-item__text"]' + + +def catch_bad_response(resp): + if resp.url.path.startswith('/showcaptcha'): + raise SearxEngineCaptchaException() + + +def request(query, params): + query_params_web = { + "tmpl_version": "releases", + "text": query, + "web": "1", + "frame": "1", + "searchid": "3131712", + } + + query_params_images = { + "text": query, + "uinfo": "sw-1920-sh-1080-ww-1125-wh-999", + } + + if params['pageno'] > 1: + query_params_web.update({"p": params["pageno"] - 1}) + query_params_images.update({"p": params["pageno"] - 1}) + + params["cookies"] = {'cookie': "yp=1716337604.sp.family%3A0#1685406411.szm.1:1920x1080:1920x999"} + + if search_type == 'web': + params['url'] = f"{base_url_web}?{urlencode(query_params_web)}" + elif search_type == 'images': + params['url'] = f"{base_url_images}?{urlencode(query_params_images)}" + + return params + + +def response(resp): + if search_type == 'web': + + catch_bad_response(resp) + + dom = html.fromstring(resp.text) + + results = [] + + for result in eval_xpath_list(dom, results_xpath): + results.append( + { + 'url': extract_text(eval_xpath(result, url_xpath)), + 'title': extract_text(eval_xpath(result, title_xpath)), + 'content': extract_text(eval_xpath(result, content_xpath)), + } + ) + + return results + + if search_type == 'images': + + catch_bad_response(resp) + + html_data = html.fromstring(resp.text) + html_sample = unescape(html.tostring(html_data, encoding='unicode')) + + content_between_tags = extr( + html_sample, '{"location":"/images/search/', 'advRsyaSearchColumn":null}}', default="fail" + ) + json_data = '{"location":"/images/search/' + content_between_tags + 'advRsyaSearchColumn":null}}' + + if content_between_tags == "fail": + content_between_tags = extr(html_sample, '{"location":"/images/search/', 'false}}}') + json_data = '{"location":"/images/search/' + content_between_tags + 'false}}}' + + json_resp = loads(json_data) + + results = [] + for _, item_data in json_resp['initialState']['serpList']['items']['entities'].items(): + title = item_data['snippet']['title'] + source = item_data['snippet']['url'] + thumb = item_data['image'] + fullsize_image = item_data['viewerData']['dups'][0]['url'] + height = item_data['viewerData']['dups'][0]['h'] + width = item_data['viewerData']['dups'][0]['w'] + filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes'] + humanized_filesize = humanize_bytes(filesize) + + results.append( + { + 'title': title, + 'url': source, + 'img_src': fullsize_image, + 'filesize': humanized_filesize, + 'thumbnail_src': thumb, + 'template': 'images.html', + 'resolution': f'{width} x {height}', + } + ) + + return results + + return [] diff --git a/searx/plugins/hostnames.py b/searx/plugins/hostnames.py index 770b00e15..6519452db 100644 --- a/searx/plugins/hostnames.py +++ b/searx/plugins/hostnames.py @@ -1,7 +1,19 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=too-many-branches -"""In addition to rewriting/replace reslut URLs, the *hoostnames* plugin offers -other features. +""" +.. attention:: + + The **"Hostname replace"** plugin has been replace by **"Hostnames + plugin"**, see :pull:`3463` & :pull:`3552`. + +The **Hostnames plugin** can be enabled by adding it to the +``enabled_plugins`` **list** in the ``setting.yml`` like so. + + .. code:: yaml + + enabled_plugins: + - 'Hostnames plugin' + ... - ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be replaced by other hostnames. diff --git a/searx/settings.yml b/searx/settings.yml index b3c7f5ffe..a1701d009 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1814,6 +1814,22 @@ engines: engine: unsplash shortcut: us + - name: yandex + engine: yandex + categories: general + search_type: web + shortcut: yd + disabled: true + inactive: true + + - name: yandex images + engine: yandex + categories: images + search_type: images + shortcut: ydi + disabled: true + inactive: true + - name: yandex music engine: yandex_music shortcut: ydm diff --git a/tests/unit/test_engines_init.py b/tests/unit/test_engines_init.py index 4872a1b1b..e2445160a 100644 --- a/tests/unit/test_engines_init.py +++ b/tests/unit/test_engines_init.py @@ -10,6 +10,7 @@ class TestEnginesInit(SearxTestCase): # pylint: disable=missing-class-docstring def tearDownClass(cls): settings['outgoing']['using_tor_proxy'] = False settings['outgoing']['extra_proxy_timeout'] = 0 + engines.load_engines([]) def test_initialize_engines_default(self): engine_list = [ diff --git a/tests/unit/test_search.py b/tests/unit/test_search.py index b85c90c68..a60089aef 100644 --- a/tests/unit/test_search.py +++ b/tests/unit/test_search.py @@ -2,6 +2,7 @@ # pylint: disable=missing-module-docstring, invalid-name from copy import copy +import logging import searx.search from searx.search import SearchQuery, EngineRef @@ -46,8 +47,13 @@ class SearchQueryTestCase(SearxTestCase): # pylint: disable=missing-class-docst class SearchTestCase(SearxTestCase): # pylint: disable=missing-class-docstring def setUp(self): + log = logging.getLogger("searx") + log_lev = log.level + log.setLevel(logging.ERROR) from searx import webapp # pylint: disable=import-outside-toplevel + log.setLevel(log_lev) + self.app = webapp.app @classmethod diff --git a/tests/unit/test_tineye.py b/tests/unit/test_tineye.py new file mode 100644 index 000000000..0530b4c5e --- /dev/null +++ b/tests/unit/test_tineye.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# pylint: disable=missing-module-docstring + + +from datetime import datetime +from unittest.mock import Mock +from requests import HTTPError +from searx.engines import load_engines, tineye +from tests import SearxTestCase + + +class TinEyeTests(SearxTestCase): # pylint: disable=missing-class-docstring + + def setUp(self): + load_engines([{'name': 'tineye', 'engine': 'tineye', 'shortcut': 'tin', 'timeout': 9.0, 'disabled': True}]) + + def tearDown(self): + load_engines([]) + + def test_status_code_raises(self): + response = Mock() + response.status_code = 401 + response.raise_for_status.side_effect = HTTPError() + self.assertRaises(HTTPError, lambda: tineye.response(response)) + + def test_returns_empty_list_for_422(self): + response = Mock() + response.json.return_value = {} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + with self.assertLogs(tineye.logger) as _dev_null: + results = tineye.response(response) + self.assertEqual(0, len(results)) + + def test_logs_format_for_422(self): + response = Mock() + response.json.return_value = {"suggestions": {"key": "Invalid image URL"}} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(tineye.FORMAT_NOT_SUPPORTED, ','.join(assert_logs_context.output)) + + def test_logs_signature_for_422(self): + response = Mock() + response.json.return_value = {"suggestions": {"key": "NO_SIGNATURE_ERROR"}} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(tineye.NO_SIGNATURE_ERROR, ','.join(assert_logs_context.output)) + + def test_logs_download_for_422(self): + response = Mock() + response.json.return_value = {"suggestions": {"key": "Download Error"}} + response.status_code = 422 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(tineye.DOWNLOAD_ERROR, ','.join(assert_logs_context.output)) + + def test_empty_list_for_400(self): + response = Mock() + response.json.return_value = {} + response.status_code = 400 + response.raise_for_status.side_effect = HTTPError() + with self.assertLogs(tineye.logger) as _dev_null: + results = tineye.response(response) + self.assertEqual(0, len(results)) + + def test_logs_description_for_400(self): + description = 'There was a problem with that request. Error ID: ad5fc955-a934-43c1-8187-f9a61d301645' + response = Mock() + response.json.return_value = {"suggestions": {"description": [description], "title": "Oops! We're sorry!"}} + response.status_code = 400 + response.raise_for_status.side_effect = HTTPError() + + with self.assertLogs(tineye.logger) as assert_logs_context: + tineye.response(response) + self.assertIn(description, ','.join(assert_logs_context.output)) + + def test_crawl_date_parses(self): + date_str = '2020-05-25' + date = datetime.strptime(date_str, '%Y-%m-%d') + response = Mock() + response.json.return_value = { + 'matches': [ + { + 'backlinks': [ + { + 'crawl_date': date_str, + } + ] + } + ] + } + response.status_code = 200 + results = tineye.response(response) + self.assertEqual(date, results[0]['publishedDate']) diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index 868645e17..7c6e1ef82 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=missing-module-docstring +import logging import json from urllib.parse import ParseResult from mock import Mock @@ -20,8 +21,13 @@ class ViewsTestCase(SearxTestCase): # pylint: disable=missing-class-docstring, self.setattr4test(searx.search.processors, 'initialize_processor', dummy) + log = logging.getLogger("searx") + log_lev = log.level + log.setLevel(logging.ERROR) from searx import webapp # pylint: disable=import-outside-toplevel + log.setLevel(log_lev) + webapp.app.config['TESTING'] = True # to get better error messages self.app = webapp.app.test_client() diff --git a/utils/lib.sh b/utils/lib.sh index e527fa1b6..b932b875c 100755 --- a/utils/lib.sh +++ b/utils/lib.sh @@ -663,8 +663,8 @@ pyenv.install() { pyenv fi for i in ${PYOBJECTS}; do - build_msg PYENV "[install] pip install -e '$i${PY_SETUP_EXTRAS}'" - "${PY_ENV_BIN}/python" -m pip install -e "$i${PY_SETUP_EXTRAS}" + build_msg PYENV "[install] pip install --use-pep517 --no-build-isolation -e '$i${PY_SETUP_EXTRAS}'" + "${PY_ENV_BIN}/python" -m pip install --use-pep517 --no-build-isolation -e "$i${PY_SETUP_EXTRAS}" done fi pyenv.install.OK diff --git a/utils/searxng.sh b/utils/searxng.sh index ea6a467d2..c15c18218 100755 --- a/utils/searxng.sh +++ b/utils/searxng.sh @@ -501,7 +501,7 @@ pip install -U setuptools pip install -U wheel pip install -U pyyaml cd ${SEARXNG_SRC} -pip install -e . +pip install --use-pep517 --no-build-isolation -e . EOF } @@ -569,7 +569,7 @@ pip install -U pip pip install -U setuptools pip install -U wheel pip install -U pyyaml -pip install -U -e . +pip install -U --use-pep517 --no-build-isolation -e . EOF rst_para "update instance's settings.yml from ${SEARXNG_SETTINGS_PATH}" DEFAULT_SELECT=2 \