forked from zaclys/searxng
[fix] engine tineye: handle 422 response of not supported img format
Closes: https://github.com/searxng/searxng/issues/1449 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
50d714d829
commit
1540891561
|
@ -17,6 +17,7 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
|
||||||
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from flask_babel import gettext
|
||||||
|
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://tineye.com',
|
"website": 'https://tineye.com',
|
||||||
|
@ -28,20 +29,41 @@ about = {
|
||||||
}
|
}
|
||||||
|
|
||||||
engine_type = 'online_url_search'
|
engine_type = 'online_url_search'
|
||||||
|
""":py:obj:`searx.search.processors.online_url_search`"""
|
||||||
|
|
||||||
categories = ['general']
|
categories = ['general']
|
||||||
paging = True
|
paging = True
|
||||||
safesearch = False
|
safesearch = False
|
||||||
base_url = 'https://tineye.com'
|
base_url = 'https://tineye.com'
|
||||||
search_string = '/result_json/?page={page}&{query}'
|
search_string = '/result_json/?page={page}&{query}'
|
||||||
|
|
||||||
|
FORMAT_NOT_SUPPORTED = gettext(
    "Could not read that image url. "
    "This may be due to an unsupported file format. "
    "TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
)
"""Translated message used when TinEye answers with the suggestion key
``Invalid image URL``."""

NO_SIGNATURE_ERROR = gettext(
    "The image is too simple to find matches. "
    "TinEye requires a basic level of visual detail to successfully identify matches."
)
"""Translated message used when TinEye answers with the suggestion key
``NO_SIGNATURE_ERROR``."""

DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
"""Translated message used when TinEye answers with the suggestion key
``Download Error``."""
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
|
"""Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`."""
|
||||||
|
|
||||||
|
params['raise_for_httperror'] = False
|
||||||
|
|
||||||
if params['search_urls']['data:image']:
|
if params['search_urls']['data:image']:
|
||||||
query = params['search_urls']['data:image']
|
query = params['search_urls']['data:image']
|
||||||
elif params['search_urls']['http']:
|
elif params['search_urls']['http']:
|
||||||
query = params['search_urls']['http']
|
query = params['search_urls']['http']
|
||||||
|
|
||||||
|
logger.debug("query URL: %s", query)
|
||||||
query = urlencode({'url': query})
|
query = urlencode({'url': query})
|
||||||
|
|
||||||
# see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
|
# see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
|
||||||
|
@ -59,45 +81,145 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_crawl_date(crawl_date):
    """Convert a TinEye ``crawl_date`` value into a :py:obj:`datetime`.

    Returns :py:obj:`datetime.min` when the value is missing, is not a string
    or can not be parsed, so that one odd backlink never aborts the parsing of
    the whole response.
    """
    if not crawl_date:
        return datetime.min
    try:
        # The last three characters are dropped before parsing.
        # NOTE(review): presumably trailing sub-second digits -- confirm
        # against a live TinEye response.
        return datetime.fromisoformat(crawl_date[:-3])
    except (TypeError, ValueError):
        return datetime.min


def parse_tineye_match(match_json):
    """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
    object.

    Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__

    - `image_url`, link to the result image.
    - `domain`, domain this result was found on.
    - `score`, a number (0 to 100) that indicates how closely the images match.
    - `width`, image width in pixels.
    - `height`, image height in pixels.
    - `size`, image area in pixels.
    - `image_format`, image format (taken from the `format` field of the JSON).
    - `filesize`, image size in bytes.
    - `overlay`, overlay URL.
    - `tags`, whether this match belongs to a collection or stock domain.

    - `backlinks`, a list of Backlink objects pointing to the original websites
      and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
      <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):

      - `url`, the image URL to the image.
      - `backlink`, the original website URL.
      - `crawl_date`, the date the image was crawled (:py:obj:`datetime`,
        :py:obj:`datetime.min` if unknown or unparsable).
      - `image_name`, value of the `image_name` field of the backlink.

    Fields missing in the JSON are returned as ``None``; `backlinks` is always
    a list (possibly empty).
    """

    # HINT: there exists an alternative backlink dict in the domains list / e.g.::
    #
    #     match_json['domains'][0]['backlinks']

    backlinks = []

    # ``backlinks`` may be absent or ``null`` in the JSON: treat both as empty
    for backlink_json in match_json.get("backlinks") or []:
        if not isinstance(backlink_json, dict):
            continue

        backlinks.append(
            {
                'url': backlink_json.get("url"),
                'backlink': backlink_json.get("backlink"),
                'crawl_date': _parse_crawl_date(backlink_json.get("crawl_date")),
                'image_name': backlink_json.get("image_name"),
            }
        )

    return {
        'image_url': match_json.get("image_url"),
        'domain': match_json.get("domain"),
        'score': match_json.get("score"),
        'width': match_json.get("width"),
        'height': match_json.get("height"),
        'size': match_json.get("size"),
        'image_format': match_json.get("format"),
        'filesize': match_json.get("filesize"),
        'overlay': match_json.get("overlay"),
        'tags': match_json.get("tags"),
        'backlinks': backlinks,
    }
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
    """Parse HTTP response from TinEye.

    :param resp: HTTP response object; this function uses its ``json()``,
        ``is_error``, ``status_code`` and ``raise_for_status()`` members.
    :returns: list of image results (``images.html`` template), followed by a
        ``{'number_of_results': ...}`` item when TinEye reports a non-zero
        ``num_matches``.  For a 400 / 422 error response the error is logged
        and an empty list is returned.
    :raises: whatever ``resp.raise_for_status()`` raises for any other error
        status.
    """
    results = []

    try:
        json_data = resp.json()
    except Exception as exc:  # pylint: disable=broad-except
        msg = "can't parse JSON response // %s" % exc
        logger.error(msg)
        json_data = {'error': msg}

    # everything below reads the payload with dict methods
    if not isinstance(json_data, dict):
        msg = "unexpected JSON response (%s)" % type(json_data).__name__
        logger.error(msg)
        json_data = {'error': msg}

    # handle error codes from Tineye

    if resp.is_error:
        if resp.status_code in (400, 422):

            message = 'HTTP status: %s' % resp.status_code
            error = json_data.get('error')

            # ``suggestions`` may be absent or ``null``
            suggestions = json_data.get('suggestions')
            s_key = suggestions.get('key', '') if isinstance(suggestions, dict) else ''

            if error and s_key:
                message = "%s (%s)" % (error, s_key)
            elif error:
                message = error

            if s_key == "Invalid image URL":
                # test https://docs.searxng.org/_static/searxng-wordmark.svg
                message = FORMAT_NOT_SUPPORTED
            elif s_key == 'NO_SIGNATURE_ERROR':
                # test https://pngimg.com/uploads/dot/dot_PNG4.png
                message = NO_SIGNATURE_ERROR
            elif s_key == 'Download Error':
                # test https://notexists
                message = DOWNLOAD_ERROR

            # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
            # results.append({'answer': message})
            logger.error(message)

            return results

        resp.raise_for_status()

    # append results from matches

    # ``matches`` is missing when the JSON could not be parsed (see above)
    for match_json in json_data.get('matches') or []:

        tineye_match = parse_tineye_match(match_json)
        if not tineye_match['backlinks']:
            continue

        # only the first backlink of a match is turned into a result
        backlink = tineye_match['backlinks'][0]
        results.append(
            {
                'template': 'images.html',
                'url': backlink['backlink'],
                'thumbnail_src': tineye_match['image_url'],
                'source': backlink['url'],
                'title': backlink['image_name'],
                'img_src': backlink['url'],
                'format': tineye_match['image_format'],
                'width': tineye_match['width'],
                # misspelled key of earlier versions, kept so that existing
                # consumers of the result dict keep working
                'widht': tineye_match['width'],
                'height': tineye_match['height'],
                'publishedDate': backlink['crawl_date'],
            }
        )

    # append number of results

    number_of_results = json_data.get('num_matches')
    if number_of_results:
        results.append({'number_of_results': number_of_results})

    return results
|
||||||
|
|
Loading…
Reference in New Issue