mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
Sync upstream
This commit is contained in:
commit
c4fc0609d3
13 changed files with 315 additions and 40 deletions
|
@ -123,7 +123,7 @@ ${fedora_build}
|
|||
|
||||
# jump to SearXNG's working tree and install SearXNG into virtualenv
|
||||
(${SERVICE_USER})$ cd \"$SEARXNG_SRC\"
|
||||
(${SERVICE_USER})$ pip install -e .
|
||||
(${SERVICE_USER})$ pip install --use-pep517 --no-build-isolation -e .
|
||||
|
||||
|
||||
.. END manage.sh update_packages
|
||||
|
|
|
@ -61,7 +61,7 @@ working tree and release a ``make install`` to get a virtualenv with a
|
|||
$ make install
|
||||
PYENV [virtualenv] installing ./requirements*.txt into local/py3
|
||||
...
|
||||
PYENV [install] pip install -e 'searx[test]'
|
||||
PYENV [install] pip install --use-pep517 --no-build-isolation -e 'searx[test]'
|
||||
...
|
||||
Successfully installed searxng-2023.7.19+a446dea1b
|
||||
|
||||
|
@ -78,7 +78,7 @@ the check fails if you edit the requirements listed in
|
|||
...
|
||||
PYENV [virtualenv] installing ./requirements*.txt into local/py3
|
||||
...
|
||||
PYENV [install] pip install -e 'searx[test]'
|
||||
PYENV [install] pip install --use-pep517 --no-build-isolation -e 'searx[test]'
|
||||
...
|
||||
Successfully installed searxng-2023.7.19+a446dea1b
|
||||
|
||||
|
|
4
manage
4
manage
|
@ -300,8 +300,8 @@ pyenv.install() {
|
|||
|
||||
( set -e
|
||||
pyenv
|
||||
build_msg PYENV "[install] pip install -e 'searx${PY_SETUP_EXTRAS}'"
|
||||
"${PY_ENV_BIN}/python" -m pip install -e ".${PY_SETUP_EXTRAS}"
|
||||
build_msg PYENV "[install] pip install --use-pep517 --no-build-isolation -e 'searx${PY_SETUP_EXTRAS}'"
|
||||
"${PY_ENV_BIN}/python" -m pip install --use-pep517 --no-build-isolation -e ".${PY_SETUP_EXTRAS}"
|
||||
)
|
||||
local exit_val=$?
|
||||
if [ ! $exit_val -eq 0 ]; then
|
||||
|
|
|
@ -14,10 +14,16 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
|
|||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
from flask_babel import gettext
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
about = {
|
||||
"website": 'https://tineye.com',
|
||||
"wikidata_id": 'Q2382535',
|
||||
|
@ -34,7 +40,7 @@ categories = ['general']
|
|||
paging = True
|
||||
safesearch = False
|
||||
base_url = 'https://tineye.com'
|
||||
search_string = '/result_json/?page={page}&{query}'
|
||||
search_string = '/api/v1/result_json/?page={page}&{query}'
|
||||
|
||||
FORMAT_NOT_SUPPORTED = gettext(
|
||||
"Could not read that image url. This may be due to an unsupported file"
|
||||
|
@ -120,7 +126,7 @@ def parse_tineye_match(match_json):
|
|||
|
||||
crawl_date = backlink_json.get("crawl_date")
|
||||
if crawl_date:
|
||||
crawl_date = datetime.fromisoformat(crawl_date[:-3])
|
||||
crawl_date = datetime.strptime(crawl_date, '%Y-%m-%d')
|
||||
else:
|
||||
crawl_date = datetime.min
|
||||
|
||||
|
@ -150,29 +156,15 @@ def parse_tineye_match(match_json):
|
|||
|
||||
def response(resp):
|
||||
"""Parse HTTP response from TinEye."""
|
||||
results = []
|
||||
|
||||
try:
|
||||
# handle the 422 client side errors, and the possible 400 status code error
|
||||
if resp.status_code in (400, 422):
|
||||
json_data = resp.json()
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
msg = "can't parse JSON response // %s" % exc
|
||||
logger.error(msg)
|
||||
json_data = {'error': msg}
|
||||
|
||||
# handle error codes from Tineye
|
||||
|
||||
if resp.is_error:
|
||||
if resp.status_code in (400, 422):
|
||||
|
||||
message = 'HTTP status: %s' % resp.status_code
|
||||
error = json_data.get('error')
|
||||
s_key = json_data.get('suggestions', {}).get('key', '')
|
||||
|
||||
if error and s_key:
|
||||
message = "%s (%s)" % (error, s_key)
|
||||
elif error:
|
||||
message = error
|
||||
suggestions = json_data.get('suggestions', {})
|
||||
message = f'HTTP Status Code: {resp.status_code}'
|
||||
|
||||
if resp.status_code == 422:
|
||||
s_key = suggestions.get('key', '')
|
||||
if s_key == "Invalid image URL":
|
||||
# test https://docs.searxng.org/_static/searxng-wordmark.svg
|
||||
message = FORMAT_NOT_SUPPORTED
|
||||
|
@ -182,16 +174,23 @@ def response(resp):
|
|||
elif s_key == 'Download Error':
|
||||
# test https://notexists
|
||||
message = DOWNLOAD_ERROR
|
||||
else:
|
||||
logger.warning("Unknown suggestion key encountered: %s", s_key)
|
||||
else: # 400
|
||||
description = suggestions.get('description')
|
||||
if isinstance(description, list):
|
||||
message = ','.join(description)
|
||||
|
||||
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
|
||||
# results.append({'answer': message})
|
||||
logger.error(message)
|
||||
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
|
||||
# results.append({'answer': message})
|
||||
logger.error(message)
|
||||
return []
|
||||
|
||||
return results
|
||||
# Raise for all other responses
|
||||
resp.raise_for_status()
|
||||
|
||||
resp.raise_for_status()
|
||||
|
||||
# append results from matches
|
||||
results = []
|
||||
json_data = resp.json()
|
||||
|
||||
for match_json in json_data['matches']:
|
||||
|
||||
|
|
133
searx/engines/yandex.py
Normal file
133
searx/engines/yandex.py
Normal file
|
@ -0,0 +1,133 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Yandex (Web, images)"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from html import unescape
|
||||
from lxml import html
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.utils import humanize_bytes, eval_xpath, eval_xpath_list, extract_text, extr
|
||||
|
||||
|
||||
# Engine metadata
|
||||
about = {
|
||||
"website": 'https://yandex.com/',
|
||||
"wikidata_id": 'Q5281',
|
||||
"official_api_documentation": "?",
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'HTML',
|
||||
}
|
||||
|
||||
# Engine configuration
|
||||
categories = []
|
||||
paging = True
|
||||
search_type = ""
|
||||
|
||||
# Search URL
|
||||
base_url_web = 'https://yandex.com/search/site/'
|
||||
base_url_images = 'https://yandex.com/images/search'
|
||||
|
||||
results_xpath = '//li[contains(@class, "serp-item")]'
|
||||
url_xpath = './/a[@class="b-serp-item__title-link"]/@href'
|
||||
title_xpath = './/h3[@class="b-serp-item__title"]/a[@class="b-serp-item__title-link"]/span'
|
||||
content_xpath = './/div[@class="b-serp-item__content"]//div[@class="b-serp-item__text"]'
|
||||
|
||||
|
||||
def catch_bad_response(resp):
    """Raise a CAPTCHA exception when the response URL points at ``/showcaptcha``.

    :param resp: HTTP response object; only ``resp.url.path`` is inspected.
    :raises SearxEngineCaptchaException: if the path of the response URL
        starts with ``/showcaptcha``.
    """
    captcha_prefix = '/showcaptcha'
    response_path = resp.url.path

    # Guard clause: anything that is not the captcha path is left untouched.
    if not response_path.startswith(captcha_prefix):
        return

    raise SearxEngineCaptchaException()
|
||||
|
||||
|
||||
def request(query, params):
    """Populate *params* with the request URL and cookie for a Yandex search.

    The module-level ``search_type`` selects the endpoint: ``'web'`` uses
    ``base_url_web``, ``'images'`` uses ``base_url_images``.  For any other
    value no ``url`` entry is written, only the cookie.

    :param query: search terms, sent as the ``text`` URL argument.
    :param params: request parameters; ``params['pageno']`` is read and
        ``params['cookies']`` / ``params['url']`` are written.
    :returns: the same *params* dict, mutated in place.
    """
    # The ``p`` argument is zero-based and is only sent from the second page on.
    paging_args = {"p": params["pageno"] - 1} if params['pageno'] > 1 else {}

    # NOTE(review): the complete ``yp=...`` string is stored under a cookie
    # literally named ``cookie`` -- confirm that this is what is intended.
    params["cookies"] = {'cookie': "yp=1716337604.sp.family%3A0#1685406411.szm.1:1920x1080:1920x999"}

    if search_type == 'web':
        web_args = {
            "tmpl_version": "releases",
            "text": query,
            "web": "1",
            "frame": "1",
            "searchid": "3131712",
            **paging_args,
        }
        params['url'] = f"{base_url_web}?{urlencode(web_args)}"
    elif search_type == 'images':
        image_args = {
            "text": query,
            "uinfo": "sw-1920-sh-1080-ww-1125-wh-999",
            **paging_args,
        }
        params['url'] = f"{base_url_images}?{urlencode(image_args)}"

    return params
|
||||
|
||||
|
||||
def _parse_web_results(resp):
    """Build the result list of a Yandex web search from the HTML in ``resp.text``."""
    dom = html.fromstring(resp.text)
    return [
        {
            'url': extract_text(eval_xpath(result, url_xpath)),
            'title': extract_text(eval_xpath(result, title_xpath)),
            'content': extract_text(eval_xpath(result, content_xpath)),
        }
        for result in eval_xpath_list(dom, results_xpath)
    ]


def _load_images_state(resp):
    """Cut the JSON object embedded in the image search page out of the HTML and decode it.

    The object is located by its opening text and one of two known closing
    texts.  The second closing text is only tried when the first one is not
    found.
    """
    html_data = html.fromstring(resp.text)
    html_sample = unescape(html.tostring(html_data, encoding='unicode'))

    start_marker = '{"location":"/images/search/'
    end_marker = 'advRsyaSearchColumn":null}}'

    # "fail" is the sentinel returned when the first closing text is missing.
    content_between_tags = extr(html_sample, start_marker, end_marker, default="fail")
    if content_between_tags == "fail":
        end_marker = 'false}}}'
        content_between_tags = extr(html_sample, start_marker, end_marker)

    # extr() strips the markers, so they are glued back on to get valid JSON.
    return loads(start_marker + content_between_tags + end_marker)


def _parse_images_results(resp):
    """Build the result list of a Yandex image search from the embedded JSON."""
    json_resp = _load_images_state(resp)

    results = []
    # Only the entity payloads are needed, the keys of the mapping are unused.
    for item_data in json_resp['initialState']['serpList']['items']['entities'].values():
        snippet = item_data['snippet']
        # The first entry of ``dups`` supplies the full-size image and its metadata.
        first_dup = item_data['viewerData']['dups'][0]

        results.append(
            {
                'title': snippet['title'],
                'url': snippet['url'],
                'img_src': first_dup['url'],
                'filesize': humanize_bytes(first_dup['fileSizeInBytes']),
                'thumbnail_src': item_data['image'],
                'template': 'images.html',
                'resolution': f"{first_dup['w']} x {first_dup['h']}",
            }
        )

    return results


def response(resp):
    """Parse the HTTP response of a Yandex search into a list of result dicts.

    The module-level ``search_type`` decides how the page is parsed:
    ``'web'`` evaluates the result XPaths, ``'images'`` decodes the JSON
    embedded in the page.  Any other value yields an empty list without
    looking at the response.

    :param resp: HTTP response object; ``resp.url.path`` and ``resp.text`` are read.
    :returns: list of result dicts (empty for an unknown ``search_type``).
    :raises SearxEngineCaptchaException: if the response is the captcha page
        (see ``catch_bad_response``).
    """
    if search_type == 'web':
        catch_bad_response(resp)
        return _parse_web_results(resp)

    if search_type == 'images':
        catch_bad_response(resp)
        return _parse_images_results(resp)

    return []
|
|
@ -1,7 +1,19 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=too-many-branches
|
||||
"""In addition to rewriting/replacing result URLs, the *hostnames* plugin offers
|
||||
other features.
|
||||
"""
|
||||
.. attention::
|
||||
|
||||
The **"Hostname replace"** plugin has been replaced by **"Hostnames
|
||||
plugin"**, see :pull:`3463` & :pull:`3552`.
|
||||
|
||||
The **Hostnames plugin** can be enabled by adding it to the
|
||||
``enabled_plugins`` **list** in the ``settings.yml`` like so.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
enabled_plugins:
|
||||
- 'Hostnames plugin'
|
||||
...
|
||||
|
||||
- ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
|
||||
replaced by other hostnames.
|
||||
|
|
|
@ -1814,6 +1814,22 @@ engines:
|
|||
engine: unsplash
|
||||
shortcut: us
|
||||
|
||||
- name: yandex
|
||||
engine: yandex
|
||||
categories: general
|
||||
search_type: web
|
||||
shortcut: yd
|
||||
disabled: true
|
||||
inactive: true
|
||||
|
||||
- name: yandex images
|
||||
engine: yandex
|
||||
categories: images
|
||||
search_type: images
|
||||
shortcut: ydi
|
||||
disabled: true
|
||||
inactive: true
|
||||
|
||||
- name: yandex music
|
||||
engine: yandex_music
|
||||
shortcut: ydm
|
||||
|
|
|
@ -10,6 +10,7 @@ class TestEnginesInit(SearxTestCase): # pylint: disable=missing-class-docstring
|
|||
def tearDownClass(cls):
|
||||
settings['outgoing']['using_tor_proxy'] = False
|
||||
settings['outgoing']['extra_proxy_timeout'] = 0
|
||||
engines.load_engines([])
|
||||
|
||||
def test_initialize_engines_default(self):
|
||||
engine_list = [
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# pylint: disable=missing-module-docstring, invalid-name
|
||||
|
||||
from copy import copy
|
||||
import logging
|
||||
|
||||
import searx.search
|
||||
from searx.search import SearchQuery, EngineRef
|
||||
|
@ -46,8 +47,13 @@ class SearchQueryTestCase(SearxTestCase): # pylint: disable=missing-class-docst
|
|||
class SearchTestCase(SearxTestCase): # pylint: disable=missing-class-docstring
|
||||
def setUp(self):
    """Import the web application and expose its Flask app as ``self.app``.

    While ``searx.webapp`` is imported the level of the ``searx`` logger is
    set to ``ERROR`` (presumably to keep import-time log output out of the
    test run); the previous level is put back afterwards.
    """
    log = logging.getLogger("searx")
    log_lev = log.level
    log.setLevel(logging.ERROR)
    try:
        from searx import webapp  # pylint: disable=import-outside-toplevel
    finally:
        # Restore the level even if the import fails, so a broken import
        # does not leave the logger muted for every following test.
        log.setLevel(log_lev)

    self.app = webapp.app
|
||||
|
||||
@classmethod
|
||||
|
|
102
tests/unit/test_tineye.py
Normal file
102
tests/unit/test_tineye.py
Normal file
|
@ -0,0 +1,102 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=missing-module-docstring
|
||||
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import Mock
|
||||
from requests import HTTPError
|
||||
from searx.engines import load_engines, tineye
|
||||
from tests import SearxTestCase
|
||||
|
||||
|
||||
class TinEyeTests(SearxTestCase):  # pylint: disable=missing-class-docstring
    # Unit tests for ``tineye.response``: error status handling (400, 422,
    # other codes), the messages that get logged, and crawl date parsing.

    def setUp(self):
        engine_settings = {'name': 'tineye', 'engine': 'tineye', 'shortcut': 'tin', 'timeout': 9.0, 'disabled': True}
        load_engines([engine_settings])

    def tearDown(self):
        load_engines([])

    @staticmethod
    def _failing_response(status_code, payload=None):
        """Return a mocked response whose ``raise_for_status()`` raises ``HTTPError``.

        ``payload`` becomes the return value of ``json()`` when it is given.
        """
        mocked = Mock()
        if payload is not None:
            mocked.json.return_value = payload
        mocked.status_code = status_code
        mocked.raise_for_status.side_effect = HTTPError()
        return mocked

    def _assert_logged(self, mocked, expected):
        """Run ``tineye.response`` and check that *expected* shows up in the log output."""
        with self.assertLogs(tineye.logger) as captured:
            tineye.response(mocked)
            self.assertIn(expected, ','.join(captured.output))

    def test_status_code_raises(self):
        mocked = self._failing_response(401)
        with self.assertRaises(HTTPError):
            tineye.response(mocked)

    def test_returns_empty_list_for_422(self):
        mocked = self._failing_response(422, {})
        with self.assertLogs(tineye.logger) as _dev_null:
            results = tineye.response(mocked)
            self.assertEqual(0, len(results))

    def test_logs_format_for_422(self):
        mocked = self._failing_response(422, {"suggestions": {"key": "Invalid image URL"}})
        self._assert_logged(mocked, tineye.FORMAT_NOT_SUPPORTED)

    def test_logs_signature_for_422(self):
        mocked = self._failing_response(422, {"suggestions": {"key": "NO_SIGNATURE_ERROR"}})
        self._assert_logged(mocked, tineye.NO_SIGNATURE_ERROR)

    def test_logs_download_for_422(self):
        mocked = self._failing_response(422, {"suggestions": {"key": "Download Error"}})
        self._assert_logged(mocked, tineye.DOWNLOAD_ERROR)

    def test_empty_list_for_400(self):
        mocked = self._failing_response(400, {})
        with self.assertLogs(tineye.logger) as _dev_null:
            results = tineye.response(mocked)
            self.assertEqual(0, len(results))

    def test_logs_description_for_400(self):
        description = 'There was a problem with that request. Error ID: ad5fc955-a934-43c1-8187-f9a61d301645'
        payload = {"suggestions": {"description": [description], "title": "Oops! We're sorry!"}}
        mocked = self._failing_response(400, payload)
        self._assert_logged(mocked, description)

    def test_crawl_date_parses(self):
        date_str = '2020-05-25'
        date = datetime.strptime(date_str, '%Y-%m-%d')

        backlink = {'crawl_date': date_str}
        match = {'backlinks': [backlink]}

        mocked = Mock()
        mocked.json.return_value = {'matches': [match]}
        mocked.status_code = 200

        results = tineye.response(mocked)
        self.assertEqual(date, results[0]['publishedDate'])
|
|
@ -1,6 +1,7 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=missing-module-docstring
|
||||
|
||||
import logging
|
||||
import json
|
||||
from urllib.parse import ParseResult
|
||||
from mock import Mock
|
||||
|
@ -20,8 +21,13 @@ class ViewsTestCase(SearxTestCase): # pylint: disable=missing-class-docstring,
|
|||
|
||||
self.setattr4test(searx.search.processors, 'initialize_processor', dummy)
|
||||
|
||||
log = logging.getLogger("searx")
|
||||
log_lev = log.level
|
||||
log.setLevel(logging.ERROR)
|
||||
from searx import webapp # pylint: disable=import-outside-toplevel
|
||||
|
||||
log.setLevel(log_lev)
|
||||
|
||||
webapp.app.config['TESTING'] = True # to get better error messages
|
||||
self.app = webapp.app.test_client()
|
||||
|
||||
|
|
|
@ -663,8 +663,8 @@ pyenv.install() {
|
|||
pyenv
|
||||
fi
|
||||
for i in ${PYOBJECTS}; do
|
||||
build_msg PYENV "[install] pip install -e '$i${PY_SETUP_EXTRAS}'"
|
||||
"${PY_ENV_BIN}/python" -m pip install -e "$i${PY_SETUP_EXTRAS}"
|
||||
build_msg PYENV "[install] pip install --use-pep517 --no-build-isolation -e '$i${PY_SETUP_EXTRAS}'"
|
||||
"${PY_ENV_BIN}/python" -m pip install --use-pep517 --no-build-isolation -e "$i${PY_SETUP_EXTRAS}"
|
||||
done
|
||||
fi
|
||||
pyenv.install.OK
|
||||
|
|
|
@ -501,7 +501,7 @@ pip install -U setuptools
|
|||
pip install -U wheel
|
||||
pip install -U pyyaml
|
||||
cd ${SEARXNG_SRC}
|
||||
pip install -e .
|
||||
pip install --use-pep517 --no-build-isolation -e .
|
||||
EOF
|
||||
}
|
||||
|
||||
|
@ -569,7 +569,7 @@ pip install -U pip
|
|||
pip install -U setuptools
|
||||
pip install -U wheel
|
||||
pip install -U pyyaml
|
||||
pip install -U -e .
|
||||
pip install -U --use-pep517 --no-build-isolation -e .
|
||||
EOF
|
||||
rst_para "update instance's settings.yml from ${SEARXNG_SETTINGS_PATH}"
|
||||
DEFAULT_SELECT=2 \
|
||||
|
|
Loading…
Add table
Reference in a new issue