From 3ca9cad927a158d7a26842ad3f95472f409dcd47 Mon Sep 17 00:00:00 2001
From: marc
Date: Sat, 5 Aug 2017 14:48:07 -0500
Subject: [PATCH] add bing videos engine

---
 searx/engines/bing_videos.py           | 114 +++++++++++++++++++++++++
 searx/settings.yml                     |   4 +
 tests/unit/engines/test_bing_videos.py |  95 +++++++++++++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 searx/engines/bing_videos.py
 create mode 100644 tests/unit/engines/test_bing_videos.py

diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
new file mode 100644
index 000000000..918064c9b
--- /dev/null
+++ b/searx/engines/bing_videos.py
@@ -0,0 +1,114 @@
+"""
+ Bing (Videos)
+
+ @website     https://www.bing.com/videos
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search)
+
+ @using-api   no
+ @results     HTML
+ @stable      no
+ @parse       url, title, content, thumbnail
+"""
+
+from json import loads
+from lxml import html
+from searx.engines.xpath import extract_text
+from searx.url_utils import urlencode
+
+
+categories = ['videos']
+paging = True
+safesearch = True
+time_range_support = True
+number_of_results = 10
+
+search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
+             'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
+time_range_string = '&qft=+filterui:videoage-lt{interval}'
+time_range_dict = {'day': '1440',
+                   'week': '10080',
+                   'month': '43200',
+                   'year': '525600'}
+
+# safesearch definitions
+safesearch_types = {2: 'STRICT',
+                    1: 'DEMOTE',
+                    0: 'OFF'}
+
+
+# do search-request
+def request(query, params):
+    """Set the request url and cookies in params and return params."""
+    # the page offset starts at 1; use number_of_results (not a hard-coded 10)
+    # so it stays in step with the 'count' value sent in the same url
+    offset = (params['pageno'] - 1) * number_of_results + 1
+
+    # safesearch cookie
+    params['cookies']['SRCHHPGUSR'] = \
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+    # language cookie
+    params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
+
+    # query and paging
+    params['url'] = search_url.format(query=urlencode({'q': query}),
+                                      offset=offset,
+                                      number_of_results=number_of_results)
+
+    # time range
+    if params['time_range'] in time_range_dict:
+        params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
+
+    return params
+
+
+# extract the target url of a single result
+def _extract_url(result):
+    """Return the url of a result node, or None if it has none."""
+    payload = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload')
+    if payload:
+        try:
+            return loads(payload[0])['purl']
+        except (ValueError, KeyError, TypeError):
+            # malformed payload: fall back to the plain link below
+            pass
+
+    hrefs = result.xpath('./a/@href')
+    return hrefs[0] if hrefs else None
+
+
+# get response from search-request
+def response(resp):
+    """Parse the returned HTML into a list of video result dicts."""
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    for result in dom.xpath('//div[@class="dg_u"]'):
+        url = _extract_url(result)
+
+        # discard results that do not return an external url
+        # very recent results sometimes don't return the video's url
+        if not url or url.startswith('/videos/search?'):
+            continue
+
+        # an entry without thumbnail is skipped instead of aborting the
+        # whole response with an IndexError
+        thumbnails = result.xpath('.//div[@class="vthumb"]/img/@src')
+        if not thumbnails:
+            continue
+
+        title = extract_text(result.xpath('./a//div[@class="tl"]'))
+        content = extract_text(result.xpath('.//div[@class="pubInfo"]'))
+
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnails[0],
+                        'template': 'videos.html'})
+
+        # first page ignores requested number of results
+        if len(results) >= number_of_results:
+            break
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 7eadb0816..4da96b5bf 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -81,6 +81,10 @@ engines:
     engine : bing_news
     shortcut : bin
 
+  - name : bing videos
+    engine : bing_videos
+    shortcut : biv
+
   - name : bitbucket
     engine : xpath
     paging : True
diff --git a/tests/unit/engines/test_bing_videos.py b/tests/unit/engines/test_bing_videos.py
new file mode 100644
index 000000000..011b5410a
--- /dev/null
+++ b/tests/unit/engines/test_bing_videos.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import bing_videos
+from searx.testing import SearxTestCase
+
+
+class TestBingVideosEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['pageno'] = 1
+        dicto['language'] = 'fr-FR'
+        dicto['safesearch'] = 0
+        dicto['time_range'] = ''
+        params = bing_videos.request(query, dicto)
+        self.assertTrue('url' in params)
+        self.assertTrue(query in params['url'])
+        self.assertTrue('bing.com' in params['url'])
+        self.assertTrue('SRCHHPGUSR' in params['cookies'])
+        self.assertTrue('OFF' in params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('_EDGE_S' in params['cookies'])
+        self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
+
+        dicto['pageno'] = 2
+        dicto['time_range'] = 'day'
+        dicto['safesearch'] = 2
+        params = bing_videos.request(query, dicto)
+        self.assertTrue('first=11' in params['url'])
+        self.assertTrue('1440' in params['url'])
+        self.assertIn('SRCHHPGUSR', params['cookies'])
+        self.assertTrue('STRICT' in params['cookies']['SRCHHPGUSR'])
+
+    def test_response(self):
+        self.assertRaises(AttributeError, bing_videos.response, None)
+        self.assertRaises(AttributeError, bing_videos.response, [])
+        self.assertRaises(AttributeError, bing_videos.response, '')
+        self.assertRaises(AttributeError, bing_videos.response, '[]')
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(bing_videos.response(response), [])
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(bing_videos.response(response), [])
+
+        html = """
+        <div class="dg_u">
+            <a href="/videos/search?q=test_query&amp;view=detail">
+                <div class="sa_wrapper" data-eventpayload='{"purl": "https://url.com/1"}'>
+                    <div class="vthumb"><img src="thumb_1.jpg"></div>
+                    <div class="tl">Title 1</div>
+                    <div class="pubInfo">Content 1</div>
+                </div>
+            </a>
+        </div>
+        """
+        response = mock.Mock(text=html)
+        results = bing_videos.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['title'], 'Title 1')
+        self.assertEqual(results[0]['url'], 'https://url.com/1')
+        self.assertEqual(results[0]['content'], 'Content 1')
+        self.assertEqual(results[0]['thumbnail'], 'thumb_1.jpg')
+
+        html = """
+        <div>
+            <div class="dg_u">
+                <a href="https://url.com/1">
+                    <div class="vthumb"><img src="thumb_1.jpg"></div>
+                    <div class="tl">Title 1</div>
+                    <div class="pubInfo">Content 1</div>
+                </a>
+            </div>
+            <div class="dg_u">
+                <a href="/videos/search?q=test_query&amp;view=detail">
+                    <div class="vthumb"><img src="thumb_2.jpg"></div>
+                    <div class="tl">Title 2</div>
+                    <div class="pubInfo">Content 2</div>
+                </a>
+            </div>
+            <div class="dg_u">
+                <div class="tl">Title 3</div>
+            </div>
+        </div>
+        """
+        response = mock.Mock(text=html)
+        results = bing_videos.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['title'], 'Title 1')
+        self.assertEqual(results[0]['url'], 'https://url.com/1')
+        self.assertEqual(results[0]['content'], 'Content 1')
+        self.assertEqual(results[0]['thumbnail'], 'thumb_1.jpg')