From 78edc16e66fccbfb16c522f4453c88a85de61bf4 Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 2 May 2015 11:43:12 +0200
Subject: [PATCH 1/4] [enh] reduce the number of http outgoing connections.
 engines that still use http : gigablast, bing image for thumbnails, 1x and
 dbpedia autocompleter

---
 searx/autocomplete.py                     | 4 ++--
 searx/engines/dailymotion.py              | 3 +++
 searx/engines/deviantart.py               | 8 +++++++-
 searx/engines/digg.py                     | 3 +++
 searx/engines/gigablast.py                | 2 +-
 searx/engines/google_images.py            | 3 +++
 searx/engines/www1x.py                    | 4 ++--
 searx/tests/engines/test_deviantart.py    | 2 +-
 searx/tests/engines/test_google_images.py | 2 +-
 searx/tests/engines/test_www1x.py         | 4 ++--
 10 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/searx/autocomplete.py b/searx/autocomplete.py
index 83e204890..e7361b591 100644
--- a/searx/autocomplete.py
+++ b/searx/autocomplete.py
@@ -111,7 +111,7 @@ def searx_bang(full_query):
 
 
 def dbpedia(query):
-    # dbpedia autocompleter
+    # dbpedia autocompleter, no HTTPS
     autocomplete_url = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?'  # noqa
 
     response = get(autocomplete_url
@@ -139,7 +139,7 @@ def duckduckgo(query):
 
 def google(query):
     # google autocompleter
-    autocomplete_url = 'http://suggestqueries.google.com/complete/search?client=toolbar&'  # noqa
+    autocomplete_url = 'https://suggestqueries.google.com/complete/search?client=toolbar&'  # noqa
 
     response = get(autocomplete_url
                    + urlencode(dict(q=query)))
diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py
index 03b1dbb8b..187639706 100644
--- a/searx/engines/dailymotion.py
+++ b/searx/engines/dailymotion.py
@@ -60,6 +60,9 @@ def response(resp):
         publishedDate = datetime.fromtimestamp(res['created_time'], None)
         embedded = embedded_url.format(videoid=res['id'])
 
+        # http to https
+        thumbnail = thumbnail.replace("http://", "https://")
+
         results.append({'template': 'videos.html',
                         'url': url,
                         'title': title,
diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py
index 4198e8c76..43cd8bbce 100644
--- a/searx/engines/deviantart.py
+++ b/searx/engines/deviantart.py
@@ -22,7 +22,7 @@ paging = True
 
 # search-url
 base_url = 'https://www.deviantart.com/'
-search_url = base_url+'search?offset={offset}&{query}'
+search_url = base_url+'browse/all/?offset={offset}&{query}'
 
 
 # do search-request
@@ -56,6 +56,12 @@ def response(resp):
         thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
         img_src = regex.sub('/', thumbnail_src)
 
+        # http to https, remove domain sharding
+        thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
+        thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
+
+        url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
+
         # append result
         results.append({'url': url,
                         'title': title,
diff --git a/searx/engines/digg.py b/searx/engines/digg.py
index 1b5f2c8e4..5cb4ca8d9 100644
--- a/searx/engines/digg.py
+++ b/searx/engines/digg.py
@@ -58,6 +58,9 @@ def response(resp):
         pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
         publishedDate = parser.parse(pubdate)
 
+        # http to https
+        thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
+
         # append result
         results.append({'url': url,
                         'title': title,
diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 8749c3256..69717db99 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -17,7 +17,7 @@ categories = ['general']
 paging = True
 number_of_results = 5
 
-# search-url
+# search-url, invalid HTTPS certificate
 base_url = 'http://gigablast.com/'
 search_string = 'search?{query}&n={number_of_results}&s={offset}&xml=1&qh=0'
 
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 1c0e62f5c..21ca8946b 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -56,6 +56,9 @@ def response(resp):
             continue
 
         thumbnail_src = result['tbUrl']
+        # http to https
+        thumbnail_src = thumbnail_src.replace("http://", "https://")
+
         # append result
         results.append({'url': href,
                         'title': title,
diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index a68c105ce..df9254f2e 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -19,8 +19,8 @@ import re
 categories = ['images']
 paging = False
 
-# search-url
-base_url = 'http://1x.com'
+# search-url, no HTTPS
+base_url = 'https://1x.com'
 search_url = base_url+'/backend/search.php?{query}'
 
 
diff --git a/searx/tests/engines/test_deviantart.py b/searx/tests/engines/test_deviantart.py
index 9cf68d0b8..78a391334 100644
--- a/searx/tests/engines/test_deviantart.py
+++ b/searx/tests/engines/test_deviantart.py
@@ -75,7 +75,7 @@ class TestDeviantartEngine(SearxTestCase):
         self.assertEqual(results[0]['title'], 'Title of image')
         self.assertEqual(results[0]['url'], 'http://url.of.result/2nd.part.of.url')
         self.assertNotIn('content', results[0])
-        self.assertEqual(results[0]['thumbnail_src'], 'http://url.of.thumbnail')
+        self.assertEqual(results[0]['thumbnail_src'], 'https://url.of.thumbnail')
 
         html = """
 
diff --git a/searx/tests/engines/test_google_images.py b/searx/tests/engines/test_google_images.py
index 32d133334..9bef692d4 100644
--- a/searx/tests/engines/test_google_images.py
+++ b/searx/tests/engines/test_google_images.py
@@ -65,7 +65,7 @@ class TestGoogleImagesEngine(SearxTestCase):
         self.assertEqual(len(results), 1)
         self.assertEqual(results[0]['title'], 'This is the title')
         self.assertEqual(results[0]['url'], 'http://this.is.the.url')
-        self.assertEqual(results[0]['thumbnail_src'], 'http://thumbnail.url')
+        self.assertEqual(results[0]['thumbnail_src'], 'https://thumbnail.url')
         self.assertEqual(results[0]['img_src'], 'http://image.url.jpg')
         self.assertEqual(results[0]['content'], 'test')
 
diff --git a/searx/tests/engines/test_www1x.py b/searx/tests/engines/test_www1x.py
index ab4f282c1..9df8de6bf 100644
--- a/searx/tests/engines/test_www1x.py
+++ b/searx/tests/engines/test_www1x.py
@@ -51,7 +51,7 @@ class TestWww1xEngine(SearxTestCase):
         results = www1x.response(response)
         self.assertEqual(type(results), list)
         self.assertEqual(len(results), 1)
-        self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456')
-        self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
+        self.assertEqual(results[0]['url'], 'https://1x.com/photo/123456')
+        self.assertEqual(results[0]['thumbnail_src'], 'https://1x.com/images/user/testimage-123456.jpg')
         self.assertEqual(results[0]['content'], '')
         self.assertEqual(results[0]['template'], 'images.html')

From e7fd546aae12fa97bdd268b3b9c1d1eac13a1034 Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 2 May 2015 13:02:42 +0200
Subject: [PATCH 2/4] [fix] revert of 1x.com (no valid https certificate)

---
 searx/engines/www1x.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index df9254f2e..0524cc220 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -19,14 +19,15 @@ import re
 categories = ['images']
 paging = False
 
-# search-url, no HTTPS
-base_url = 'https://1x.com'
+# search-url, no HTTPS (there is a valid certificate for https://api2.1x.com/ )
+base_url = 'http://1x.com'
 search_url = base_url+'/backend/search.php?{query}'
 
 
 # do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}))
+    print params['url']
 
     return params
 

From 732ed952dc0f018b16254890491f8784c73e74cb Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 2 May 2015 13:09:18 +0200
Subject: [PATCH 3/4] [fix] 1x.com tests

---
 searx/tests/engines/test_www1x.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/searx/tests/engines/test_www1x.py b/searx/tests/engines/test_www1x.py
index 9df8de6bf..ab4f282c1 100644
--- a/searx/tests/engines/test_www1x.py
+++ b/searx/tests/engines/test_www1x.py
@@ -51,7 +51,7 @@ class TestWww1xEngine(SearxTestCase):
         results = www1x.response(response)
         self.assertEqual(type(results), list)
         self.assertEqual(len(results), 1)
-        self.assertEqual(results[0]['url'], 'https://1x.com/photo/123456')
-        self.assertEqual(results[0]['thumbnail_src'], 'https://1x.com/images/user/testimage-123456.jpg')
+        self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456')
+        self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
         self.assertEqual(results[0]['content'], '')
         self.assertEqual(results[0]['template'], 'images.html')

From 59ee040424ccdef51f3616ed6487522bdaa44f3f Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 2 May 2015 15:28:08 +0200
Subject: [PATCH 4/4] [fix] remove a useless print

---
 searx/engines/www1x.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py
index 0524cc220..61cdaf6b0 100644
--- a/searx/engines/www1x.py
+++ b/searx/engines/www1x.py
@@ -27,7 +27,6 @@ search_url = base_url+'/backend/search.php?{query}'
 # do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}))
-    print params['url']
 
     return params
 
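
Side note on the deviantart hunk in PATCH 1/4: the three re.sub calls rewrite result and thumbnail URLs to HTTPS and collapse the numbered thumbnail shards (th01, th02, ..., fc01, ...) onto a single host, which is how the series reduces the number of outgoing connections for that engine. Below is a minimal standalone sketch of that rewrite; only the regular expressions are taken from the patch, while the helper names and example URLs are made up for illustration.

    import re

    def secure_thumbnail(thumbnail_src):
        # collapse sharded thumbnail hosts (th01..thNN, fc01..fcNN) onto one HTTPS host
        thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
        # any remaining plain-http thumbnail URL becomes https
        return re.sub(r"http://", "https://", thumbnail_src)

    def secure_result_url(url):
        # user galleries live on *.deviantart.com, which is served over HTTPS
        return re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)

    # hypothetical example URLs, not taken from the patch
    print(secure_thumbnail("http://th05.deviantart.net/fs70/200H/example.jpg"))
    # -> https://th01.deviantart.net/fs70/200H/example.jpg
    print(secure_result_url("http://someartist.deviantart.com/art/example-123"))
    # -> https://someartist.deviantart.com/art/example-123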