From a524dbb823e88482a762d56ac1ed352641f3f0c3 Mon Sep 17 00:00:00 2001
From: marc
Date: Tue, 10 Oct 2017 16:49:49 -0500
Subject: [PATCH] [fix] language support for bing images and videos

---
 searx/engines/bing_images.py           | 56 +++++++++++++++++++++-----
 searx/engines/bing_videos.py           |  5 ++-
 tests/unit/engines/test_bing_images.py | 12 ++++--
 tests/unit/engines/test_bing_videos.py |  2 +
 4 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py
index 6300c94e4..15679056c 100644
--- a/searx/engines/bing_images.py
+++ b/searx/engines/bing_images.py
@@ -18,7 +18,6 @@
 from lxml import html
 from json import loads
 import re
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
 from searx.url_utils import urlencode
 
 # engine dependent config
@@ -26,6 +25,8 @@ categories = ['images']
 paging = True
 safesearch = True
 time_range_support = True
+language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'
 
 # search-url
 base_url = 'https://www.bing.com/'
@@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
 
 
+# get supported region code
+def get_region_code(lang, lang_list=None):
+    region = None
+    if lang in (lang_list or supported_languages):
+        region = lang
+    elif lang.startswith('no'):
+        region = 'nb-NO'
+    else:
+        # try to get a supported country code with language
+        lang = lang.split('-')[0]
+        for lc in (lang_list or supported_languages):
+            if lang == lc.split('-')[0]:
+                region = lc
+                break
+    if region:
+        return region.lower()
+    else:
+        return 'en-us'
+
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    # required for cookie
-    if params['language'] == 'all':
-        language = 'en-US'
-    else:
-        language = params['language']
-
     search_path = search_string.format(
         query=urlencode({'q': query}),
         offset=offset)
 
+    language = get_region_code(params['language'])
+
     params['cookies']['SRCHHPGUSR'] = \
-        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
-        '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
+
+    params['cookies']['_EDGE_S'] = 'mkt=' + language +\
+        '&ui=' + language + '&F=1'
 
     params['url'] = base_url + search_path
     if params['time_range'] in time_range_dict:
@@ -106,3 +125,22 @@ def response(resp):
 
     # return results
     return results
+
+
+# get supported languages from their site
+def _fetch_supported_languages(resp):
+    supported_languages = []
+    dom = html.fromstring(resp.text)
+
+    regions_xpath = '//div[@id="region-section-content"]' \
+        + '//ul[@class="b_vList"]/li/a/@href'
+
+    regions = dom.xpath(regions_xpath)
+    for region in regions:
+        code = re.search('setmkt=[^\&]+', region).group()[7:]
+        if code == 'nb-NO':
+            code = 'no-NO'
+
+        supported_languages.append(code)
+
+    return supported_languages
diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py
index 918064c9b..bd91bce37 100644
--- a/searx/engines/bing_videos.py
+++ b/searx/engines/bing_videos.py
@@ -12,6 +12,7 @@
 
 from json import loads
 from lxml import html
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
 from searx.engines.xpath import extract_text
 from searx.url_utils import urlencode
 
@@ -21,6 +22,7 @@ paging = True
 safesearch = True
 time_range_support = True
 number_of_results = 10
+language_support = True
 
 search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
              'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
@@ -45,7 +47,8 @@ def request(query, params):
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
 
     # language cookie
-    params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
+    region = get_region_code(params['language'], lang_list=supported_languages)
+    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
 
     # query and paging
     params['url'] = search_url.format(query=urlencode({'q': query}),
diff --git a/tests/unit/engines/test_bing_images.py b/tests/unit/engines/test_bing_images.py
index 287f13499..8b0bdb39d 100644
--- a/tests/unit/engines/test_bing_images.py
+++ b/tests/unit/engines/test_bing_images.py
@@ -8,10 +8,12 @@ from searx.testing import SearxTestCase
 class TestBingImagesEngine(SearxTestCase):
 
     def test_request(self):
+        bing_images.supported_languages = ['fr-FR', 'en-US']
+
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         dicto['safesearch'] = 1
         dicto['time_range'] = ''
         params = bing_images.request(query, dicto)
@@ -19,12 +21,14 @@ class TestBingImagesEngine(SearxTestCase):
         self.assertTrue(query in params['url'])
         self.assertTrue('bing.com' in params['url'])
         self.assertTrue('SRCHHPGUSR' in params['cookies'])
-        self.assertTrue('fr' in params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('DEMOTE' in params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('_EDGE_S' in params['cookies'])
+        self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
 
         dicto['language'] = 'all'
         params = bing_images.request(query, dicto)
-        self.assertIn('SRCHHPGUSR', params['cookies'])
-        self.assertIn('en', params['cookies']['SRCHHPGUSR'])
+        self.assertTrue('_EDGE_S' in params['cookies'])
+        self.assertTrue('en' in params['cookies']['_EDGE_S'])
 
     def test_response(self):
         self.assertRaises(AttributeError, bing_images.response, None)
diff --git a/tests/unit/engines/test_bing_videos.py b/tests/unit/engines/test_bing_videos.py
index 011b5410a..118754b25 100644
--- a/tests/unit/engines/test_bing_videos.py
+++ b/tests/unit/engines/test_bing_videos.py
@@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
 class TestBingVideosEngine(SearxTestCase):
 
     def test_request(self):
+        bing_videos.supported_languages = ['fr-FR', 'en-US']
+
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['pageno'] = 1