mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
Sync upstream
This commit is contained in:
commit
c4fc0609d3
13 changed files with 315 additions and 40 deletions
|
|
@ -14,10 +14,16 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
|
|||
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlencode
|
||||
from datetime import datetime
|
||||
from flask_babel import gettext
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
about = {
|
||||
"website": 'https://tineye.com',
|
||||
"wikidata_id": 'Q2382535',
|
||||
|
|
@ -34,7 +40,7 @@ categories = ['general']
|
|||
paging = True
|
||||
safesearch = False
|
||||
base_url = 'https://tineye.com'
|
||||
search_string = '/result_json/?page={page}&{query}'
|
||||
search_string = '/api/v1/result_json/?page={page}&{query}'
|
||||
|
||||
FORMAT_NOT_SUPPORTED = gettext(
|
||||
"Could not read that image url. This may be due to an unsupported file"
|
||||
|
|
@ -120,7 +126,7 @@ def parse_tineye_match(match_json):
|
|||
|
||||
crawl_date = backlink_json.get("crawl_date")
|
||||
if crawl_date:
|
||||
crawl_date = datetime.fromisoformat(crawl_date[:-3])
|
||||
crawl_date = datetime.strptime(crawl_date, '%Y-%m-%d')
|
||||
else:
|
||||
crawl_date = datetime.min
|
||||
|
||||
|
|
@ -150,29 +156,15 @@ def parse_tineye_match(match_json):
|
|||
|
||||
def response(resp):
|
||||
"""Parse HTTP response from TinEye."""
|
||||
results = []
|
||||
|
||||
try:
|
||||
# handle the 422 client side errors, and the possible 400 status code error
|
||||
if resp.status_code in (400, 422):
|
||||
json_data = resp.json()
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
msg = "can't parse JSON response // %s" % exc
|
||||
logger.error(msg)
|
||||
json_data = {'error': msg}
|
||||
|
||||
# handle error codes from Tineye
|
||||
|
||||
if resp.is_error:
|
||||
if resp.status_code in (400, 422):
|
||||
|
||||
message = 'HTTP status: %s' % resp.status_code
|
||||
error = json_data.get('error')
|
||||
s_key = json_data.get('suggestions', {}).get('key', '')
|
||||
|
||||
if error and s_key:
|
||||
message = "%s (%s)" % (error, s_key)
|
||||
elif error:
|
||||
message = error
|
||||
suggestions = json_data.get('suggestions', {})
|
||||
message = f'HTTP Status Code: {resp.status_code}'
|
||||
|
||||
if resp.status_code == 422:
|
||||
s_key = suggestions.get('key', '')
|
||||
if s_key == "Invalid image URL":
|
||||
# test https://docs.searxng.org/_static/searxng-wordmark.svg
|
||||
message = FORMAT_NOT_SUPPORTED
|
||||
|
|
@ -182,16 +174,23 @@ def response(resp):
|
|||
elif s_key == 'Download Error':
|
||||
# test https://notexists
|
||||
message = DOWNLOAD_ERROR
|
||||
else:
|
||||
logger.warning("Unknown suggestion key encountered: %s", s_key)
|
||||
else: # 400
|
||||
description = suggestions.get('description')
|
||||
if isinstance(description, list):
|
||||
message = ','.join(description)
|
||||
|
||||
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
|
||||
# results.append({'answer': message})
|
||||
logger.error(message)
|
||||
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
|
||||
# results.append({'answer': message})
|
||||
logger.error(message)
|
||||
return []
|
||||
|
||||
return results
|
||||
# Raise for all other responses
|
||||
resp.raise_for_status()
|
||||
|
||||
resp.raise_for_status()
|
||||
|
||||
# append results from matches
|
||||
results = []
|
||||
json_data = resp.json()
|
||||
|
||||
for match_json in json_data['matches']:
|
||||
|
||||
|
|
|
|||
133
searx/engines/yandex.py
Normal file
133
searx/engines/yandex.py
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Yandex (Web, images)"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
from html import unescape
|
||||
from lxml import html
|
||||
from searx.exceptions import SearxEngineCaptchaException
|
||||
from searx.utils import humanize_bytes, eval_xpath, eval_xpath_list, extract_text, extr
|
||||
|
||||
|
||||
# Metadata describing the Yandex engine for the SearXNG UI / docs.
about = {
    "website": 'https://yandex.com/',
    "wikidata_id": 'Q5281',
    "official_api_documentation": "?",
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# Engine configuration: `search_type` is set per-engine in settings
# (expected values: "web" or "images").
categories = []
paging = True
search_type = ""

# Endpoints for the two supported search types.
base_url_web = 'https://yandex.com/search/site/'
base_url_images = 'https://yandex.com/images/search'

# XPath selectors used to pick results out of the web-search HTML.
results_xpath = '//li[contains(@class, "serp-item")]'
url_xpath = './/a[@class="b-serp-item__title-link"]/@href'
title_xpath = './/h3[@class="b-serp-item__title"]/a[@class="b-serp-item__title-link"]/span'
content_xpath = './/div[@class="b-serp-item__content"]//div[@class="b-serp-item__text"]'
|
||||
|
||||
|
||||
def catch_bad_response(resp):
    """Abort result parsing when Yandex answers with its bot-check page.

    Yandex redirects suspected bots to a ``/showcaptcha...`` URL; in that
    case raise :class:`SearxEngineCaptchaException` instead of parsing.
    """
    path = resp.url.path
    if not path.startswith('/showcaptcha'):
        return
    raise SearxEngineCaptchaException()
|
||||
|
||||
|
||||
def request(query, params):
    """Assemble the outgoing Yandex request.

    Builds ``params['url']`` for the configured module-level ``search_type``
    ("web" or "images") and attaches the preference cookie, then returns the
    mutated ``params`` dict.
    """
    web_args = {
        "tmpl_version": "releases",
        "text": query,
        "web": "1",
        "frame": "1",
        "searchid": "3131712",
    }
    image_args = {
        "text": query,
        "uinfo": "sw-1920-sh-1080-ww-1125-wh-999",
    }

    # Yandex paging is zero-based; the "p" parameter is only sent past page 1.
    page = params['pageno']
    if page > 1:
        web_args["p"] = page - 1
        image_args["p"] = page - 1

    # NOTE(review): opaque preference cookie captured from a browser session —
    # presumably encodes family-filter and screen metrics; verify if it expires.
    params["cookies"] = {'cookie': "yp=1716337604.sp.family%3A0#1685406411.szm.1:1920x1080:1920x999"}

    if search_type == 'web':
        params['url'] = f"{base_url_web}?{urlencode(web_args)}"
    elif search_type == 'images':
        params['url'] = f"{base_url_images}?{urlencode(image_args)}"

    return params
|
||||
|
||||
|
||||
def response(resp):
    """Parse a Yandex HTTP response into SearXNG results.

    Dispatches on the module-level ``search_type``: web results are scraped
    from the HTML via XPath, image results are recovered from a JSON blob
    embedded in the page. Returns an empty list for any other configuration.
    """
    if search_type == 'web':
        catch_bad_response(resp)
        dom = html.fromstring(resp.text)
        return [
            {
                'url': extract_text(eval_xpath(node, url_xpath)),
                'title': extract_text(eval_xpath(node, title_xpath)),
                'content': extract_text(eval_xpath(node, content_xpath)),
            }
            for node in eval_xpath_list(dom, results_xpath)
        ]

    if search_type == 'images':
        catch_bad_response(resp)

        tree = html.fromstring(resp.text)
        page_text = unescape(html.tostring(tree, encoding='unicode'))

        # The image metadata lives in a JSON blob embedded in the page.
        # Recover it by slicing between known start/end markers; Yandex serves
        # two layout variants, hence the fallback marker pair below.
        inner = extr(
            page_text, '{"location":"/images/search/', 'advRsyaSearchColumn":null}}', default="fail"
        )
        json_data = '{"location":"/images/search/' + inner + 'advRsyaSearchColumn":null}}'
        if inner == "fail":
            inner = extr(page_text, '{"location":"/images/search/', 'false}}}')
            json_data = '{"location":"/images/search/' + inner + 'false}}}'

        json_resp = loads(json_data)

        results = []
        for item in json_resp['initialState']['serpList']['items']['entities'].values():
            # assumes dups[0] is the full-size variant — TODO confirm against live data
            best = item['viewerData']['dups'][0]
            results.append(
                {
                    'title': item['snippet']['title'],
                    'url': item['snippet']['url'],
                    'img_src': best['url'],
                    'filesize': humanize_bytes(best['fileSizeInBytes']),
                    'thumbnail_src': item['image'],
                    'template': 'images.html',
                    'resolution': f"{best['w']} x {best['h']}",
                }
            )
        return results

    return []
|
||||
|
|
@ -1,7 +1,19 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pylint: disable=too-many-branches
|
||||
"""In addition to rewriting/replacing result URLs, the *hostnames* plugin offers
other features.
|
||||
"""
|
||||
.. attention::
|
||||
|
||||
The **"Hostname replace"** plugin has been replaced by **"Hostnames
plugin"**, see :pull:`3463` & :pull:`3552`.
|
||||
|
||||
The **Hostnames plugin** can be enabled by adding it to the
|
||||
``enabled_plugins`` **list** in the ``settings.yml`` like so.
|
||||
|
||||
.. code:: yaml
|
||||
|
||||
enabled_plugins:
|
||||
- 'Hostnames plugin'
|
||||
...
|
||||
|
||||
- ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
|
||||
replaced by other hostnames.
|
||||
|
|
|
|||
|
|
@ -1814,6 +1814,22 @@ engines:
|
|||
engine: unsplash
|
||||
shortcut: us
|
||||
|
||||
- name: yandex
|
||||
engine: yandex
|
||||
categories: general
|
||||
search_type: web
|
||||
shortcut: yd
|
||||
disabled: true
|
||||
inactive: true
|
||||
|
||||
- name: yandex images
|
||||
engine: yandex
|
||||
categories: images
|
||||
search_type: images
|
||||
shortcut: ydi
|
||||
disabled: true
|
||||
inactive: true
|
||||
|
||||
- name: yandex music
|
||||
engine: yandex_music
|
||||
shortcut: ydm
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue