From cbc5e13275fe3ea5bbab2c7665432398d2179c24 Mon Sep 17 00:00:00 2001 From: Frank de Lange Date: Sun, 28 Jul 2019 10:42:00 +0200 Subject: [PATCH] [enh] flickr_noapi: use complete JSON data block, add 'content', 'img_format', 'source', etc. (#1571) Fetch complete JSON data block, use legend to extract images. Unquote urlencoded strings. Add image description as 'content'. Add 'img_format' and 'source' data (needs PR #1567 to enable this data to be displayed). Show images which lack ownerid instead of discarding them. --- searx/engines/flickr_noapi.py | 51 +-- tests/unit/engines/test_flickr_noapi.py | 470 +++++++++++++----------- 2 files changed, 277 insertions(+), 244 deletions(-) diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 08f07f7ce..eeee413ec 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -16,8 +16,7 @@ from json import loads from time import time import re from searx.engines import logger -from searx.url_utils import urlencode - +from searx.url_utils import urlencode, unquote logger = logger.getChild('flickr-noapi') @@ -27,7 +26,7 @@ url = 'https://www.flickr.com/' search_url = url + 'search?{query}&page={page}' time_range_url = '&min_upload_date={start}&max_upload_date={end}' photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}' -regex = re.compile(r"\"search-photos-lite-models\",\"photos\":(.*}),\"totalItems\":", re.DOTALL) +modelexport_re = re.compile(r"^\s*modelExport:\s*({.*}),$", re.M) image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's') paging = True @@ -57,40 +56,45 @@ def request(query, params): def response(resp): results = [] - matches = regex.search(resp.text) + matches = modelexport_re.search(resp.text) if matches is None: return results match = matches.group(1) - search_results = loads(match) + model_export = loads(match) - if '_data' not in search_results: - return [] + if 'legend' not in model_export: + return results - photos = search_results['_data'] + legend = model_export['legend'] - for photo in photos: + # handle empty page + if not legend or not legend[0]: + return results - # In paged configuration, the first pages' photos - # are represented by a None object - if photo is None: - continue + for index in legend: + photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])] + author = unquote(photo.get('realname', '')) + source = unquote(photo.get('username', '')) + ' @ Flickr' + title = unquote(photo.get('title', '')) + content = unquote(photo.get('description', '')) img_src = None # From the biggest to the lowest format for image_size in image_sizes: if image_size in photo['sizes']: img_src = photo['sizes'][image_size]['url'] + img_format = 'jpg ' \ + + str(photo['sizes'][image_size]['width']) \ + + 'x' \ + + str(photo['sizes'][image_size]['height']) break if not img_src: logger.debug('cannot find valid image size: {0}'.format(repr(photo))) continue - if 'ownerNsid' not in photo: - continue - # For a bigger thumbnail, keep only the url_z, not the url_n if 'n' in photo['sizes']: thumbnail_src = photo['sizes']['n']['url'] @@ -99,19 +103,20 @@ def response(resp): else: thumbnail_src = img_src - url = build_flickr_url(photo['ownerNsid'], photo['id']) + if 'ownerNsid' not in photo: + # should not happen, disowned photo? Show it anyway + url = img_src + else: + url = build_flickr_url(photo['ownerNsid'], photo['id']) - title = photo.get('title', '') - - author = photo['username'] - - # append result results.append({'url': url, 'title': title, 'img_src': img_src, 'thumbnail_src': thumbnail_src, - 'content': '', + 'content': content, 'author': author, + 'source': source, + 'img_format': img_format, 'template': 'images.html'}) return results diff --git a/tests/unit/engines/test_flickr_noapi.py b/tests/unit/engines/test_flickr_noapi.py index 5f8b069e3..67699f2f0 100644 --- a/tests/unit/engines/test_flickr_noapi.py +++ b/tests/unit/engines/test_flickr_noapi.py @@ -27,116 +27,132 @@ class TestFlickrNoapiEngine(SearxTestCase): self.assertRaises(AttributeError, flickr_noapi.response, '') self.assertRaises(AttributeError, flickr_noapi.response, '[]') - response = mock.Mock(text='"search-photos-lite-models","photos":{},"totalItems":') + response = mock.Mock(text='"modelExport:{"legend":[],"main":{"search-photos-lite-models":[{"photos":{}}]}}') self.assertEqual(flickr_noapi.response(response), []) - response = mock.Mock(text='search-photos-lite-models","photos":{"data": []},"totalItems":') + response = \ + mock.Mock(text='"modelExport:{"legend":[],"main":{"search-photos-lite-models":[{"photos":{"_data":[]}}]}}') self.assertEqual(flickr_noapi.response(response), []) # everthing is ok test json = """ - "search-photos-lite-models","photos": - { - "_data": [ - { - "_flickrModelRegistry": "photo-lite-models", - "title": "This is the title", - "username": "Owner", - "pathAlias": "klink692", - "realname": "Owner", - "license": 0, - "ownerNsid": "59729010@N00", - "canComment": false, - "commentCount": 14, - "faveCount": 21, - "id": "14001294434", - "sizes": { - "c": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_c.jpg", - "width": 541, - "height": 800, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_c.jpg", - "key": "c" - }, - "h": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_761d32237a_h.jpg", - "width": 1081, - "height": 1600, - "url": "//c4.staticflickr.com/8/7246/14001294434_761d32237a_h.jpg", - "key": "h" - }, - "k": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_f145a2c11a_k.jpg", - "width": 1383, - "height": 2048, - "url": "//c4.staticflickr.com/8/7246/14001294434_f145a2c11a_k.jpg", - "key": "k" - }, - "l": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_b.jpg", - "width": 692, - "height": 1024, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_b.jpg", - "key": "l" - }, - "m": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777.jpg", - "width": 338, - "height": 500, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777.jpg", - "key": "m" - }, - "n": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_n.jpg", - "width": 216, - "height": 320, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_n.jpg", - "key": "n" - }, - "q": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_q.jpg", - "width": 150, - "height": 150, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_q.jpg", - "key": "q" - }, - "s": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_m.jpg", - "width": 162, - "height": 240, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_m.jpg", - "key": "s" - }, - "sq": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_s.jpg", - "width": 75, - "height": 75, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_s.jpg", - "key": "sq" - }, - "t": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_t.jpg", - "width": 68, - "height": 100, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_t.jpg", - "key": "t" - }, - "z": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg", - "width": 433, - "height": 640, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg", - "key": "z" + modelExport: { + "legend": [ + [ + "search-photos-lite-models", + "0", + "photos", + "_data", + "0" + ] + ], + "main": { + "search-photos-lite-models": [ + { + "photos": { + "_data": [ + { + "_flickrModelRegistry": "photo-lite-models", + "title": "This%20is%20the%20title", + "username": "Owner", + "pathAlias": "klink692", + "realname": "Owner", + "license": 0, + "ownerNsid": "59729010@N00", + "canComment": false, + "commentCount": 14, + "faveCount": 21, + "id": "14001294434", + "sizes": { + "c": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_c.jpg", + "width": 541, + "height": 800, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_c.jpg", + "key": "c" + }, + "h": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_761d32237a_h.jpg", + "width": 1081, + "height": 1600, + "url": "//c4.staticflickr.com/8/7246/14001294434_761d32237a_h.jpg", + "key": "h" + }, + "k": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_f145a2c11a_k.jpg", + "width": 1383, + "height": 2048, + "url": "//c4.staticflickr.com/8/7246/14001294434_f145a2c11a_k.jpg", + "key": "k" + }, + "l": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_b.jpg", + "width": 692, + "height": 1024, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_b.jpg", + "key": "l" + }, + "m": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777.jpg", + "width": 338, + "height": 500, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777.jpg", + "key": "m" + }, + "n": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_n.jpg", + "width": 216, + "height": 320, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_n.jpg", + "key": "n" + }, + "q": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_q.jpg", + "width": 150, + "height": 150, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_q.jpg", + "key": "q" + }, + "s": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_m.jpg", + "width": 162, + "height": 240, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_m.jpg", + "key": "s" + }, + "sq": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_s.jpg", + "width": 75, + "height": 75, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_s.jpg", + "key": "sq" + }, + "t": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_t.jpg", + "width": 68, + "height": 100, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_t.jpg", + "key": "t" + }, + "z": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg", + "width": 433, + "height": 640, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg", + "key": "z" + } + } + } + ] } } - } - ], - "fetchedStart": true, - "fetchedEnd": false, - "totalItems": "4386039" - },"totalItems": + ] + } + } """ - json = json.replace('\r\n', '').replace('\n', '').replace('\r', '') + # Flickr serves search results in a json block named 'modelExport' buried inside a script tag, + # this json is served as a single line terminating with a comma. + json = ''.join(json.split()) + ',\n' response = mock.Mock(text=json) results = flickr_noapi.response(response) self.assertEqual(type(results), list) @@ -149,37 +165,51 @@ class TestFlickrNoapiEngine(SearxTestCase): # no n size, only the z size json = """ - "search-photos-lite-models","photos": - { - "_data": [ - { - "_flickrModelRegistry": "photo-lite-models", - "title": "This is the title", - "username": "Owner", - "pathAlias": "klink692", - "realname": "Owner", - "license": 0, - "ownerNsid": "59729010@N00", - "canComment": false, - "commentCount": 14, - "faveCount": 21, - "id": "14001294434", - "sizes": { - "z": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg", - "width": 433, - "height": 640, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg", - "key": "z" + modelExport: { + "legend": [ + [ + "search-photos-lite-models", + "0", + "photos", + "_data", + "0" + ] + ], + "main": { + "search-photos-lite-models": [ + { + "photos": { + "_data": [ + { + "_flickrModelRegistry": "photo-lite-models", + "title": "This%20is%20the%20title", + "username": "Owner", + "pathAlias": "klink692", + "realname": "Owner", + "license": 0, + "ownerNsid": "59729010@N00", + "canComment": false, + "commentCount": 14, + "faveCount": 21, + "id": "14001294434", + "sizes": { + "z": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_z.jpg", + "width": 433, + "height": 640, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_z.jpg", + "key": "z" + } + } + } + ] } } - } - ], - "fetchedStart": true, - "fetchedEnd": false, - "totalItems": "4386039" - },"totalItems": + ] + } + } """ + json = ''.join(json.split()) + ',\n' response = mock.Mock(text=json) results = flickr_noapi.response(response) self.assertEqual(type(results), list) @@ -192,37 +222,51 @@ class TestFlickrNoapiEngine(SearxTestCase): # no z or n size json = """ - "search-photos-lite-models","photos": - { - "_data": [ - { - "_flickrModelRegistry": "photo-lite-models", - "title": "This is the title", - "username": "Owner", - "pathAlias": "klink692", - "realname": "Owner", - "license": 0, - "ownerNsid": "59729010@N00", - "canComment": false, - "commentCount": 14, - "faveCount": 21, - "id": "14001294434", - "sizes": { - "o": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_o.jpg", - "width": 433, - "height": 640, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_o.jpg", - "key": "o" + modelExport: { + "legend": [ + [ + "search-photos-lite-models", + "0", + "photos", + "_data", + "0" + ] + ], + "main": { + "search-photos-lite-models": [ + { + "photos": { + "_data": [ + { + "_flickrModelRegistry": "photo-lite-models", + "title": "This%20is%20the%20title", + "username": "Owner", + "pathAlias": "klink692", + "realname": "Owner", + "license": 0, + "ownerNsid": "59729010@N00", + "canComment": false, + "commentCount": 14, + "faveCount": 21, + "id": "14001294434", + "sizes": { + "o": { + "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_o.jpg", + "width": 433, + "height": 640, + "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_o.jpg", + "key": "o" + } + } + } + ] } } - } - ], - "fetchedStart": true, - "fetchedEnd": false, - "totalItems": "4386039" - },"totalItems": + ] + } + } """ + json = ''.join(json.split()) + ',\n' response = mock.Mock(text=json) results = flickr_noapi.response(response) self.assertEqual(type(results), list) @@ -235,30 +279,44 @@ class TestFlickrNoapiEngine(SearxTestCase): # no image test json = """ - "search-photos-lite-models","photos": - { - "_data": [ - { - "_flickrModelRegistry": "photo-lite-models", - "title": "This is the title", - "username": "Owner", - "pathAlias": "klink692", - "realname": "Owner", - "license": 0, - "ownerNsid": "59729010@N00", - "canComment": false, - "commentCount": 14, - "faveCount": 21, - "id": "14001294434", - "sizes": { - } - } + modelExport: { + "legend": [ + [ + "search-photos-lite-models", + "0", + "photos", + "_data", + "0" + ] ], - "fetchedStart": true, - "fetchedEnd": false, - "totalItems": "4386039" - },"totalItems": + "main": { + "search-photos-lite-models": [ + { + "photos": { + "_data": [ + { + "_flickrModelRegistry": "photo-lite-models", + "title": "This is the title", + "username": "Owner", + "pathAlias": "klink692", + "realname": "Owner", + "license": 0, + "ownerNsid": "59729010@N00", + "canComment": false, + "commentCount": 14, + "faveCount": 21, + "id": "14001294434", + "sizes": { + } + } + ] + } + } + ] + } + } """ + json = ''.join(json.split()) + ',\n' response = mock.Mock(text=json) results = flickr_noapi.response(response) self.assertEqual(type(results), list) @@ -266,51 +324,20 @@ class TestFlickrNoapiEngine(SearxTestCase): # null test json = """ - "search-photos-models","photos": - { - "_data": [null], - "fetchedStart": true, - "fetchedEnd": false, - "totalItems": "4386039" - },"totalItems": - """ - response = mock.Mock(text=json) - results = flickr_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 0) - - # no ownerNsid test - json = """ - "search-photos-lite-models","photos": - { - "_data": [ - { - "_flickrModelRegistry": "photo-lite-models", - "title": "This is the title", - "username": "Owner", - "pathAlias": "klink692", - "realname": "Owner", - "license": 0, - "canComment": false, - "commentCount": 14, - "faveCount": 21, - "id": "14001294434", - "sizes": { - "o": { - "displayUrl": "//farm8.staticflickr.com/7246/14001294434_410f653777_o.jpg", - "width": 433, - "height": 640, - "url": "//c4.staticflickr.com/8/7246/14001294434_410f653777_o.jpg", - "key": "o" + modelExport: { + "legend": [null], + "main": { + "search-photos-lite-models": [ + { + "photos": { + "_data": [null] } } - } - ], - "fetchedStart": true, - "fetchedEnd": false, - "totalItems": "4386039" - },"totalItems": + ] + } + } """ + json = ''.join(json.split()) + ',\n' response = mock.Mock(text=json) results = flickr_noapi.response(response) self.assertEqual(type(results), list) @@ -323,6 +350,7 @@ class TestFlickrNoapiEngine(SearxTestCase): "link":"http:\/\/www.flickr.com\/artist\/1217","type":"artist"} ]} """ + json = ''.join(json.split()) + ',\n' response = mock.Mock(text=json) results = flickr_noapi.response(response) self.assertEqual(type(results), list)