Merge pull request #1061 from a01200356/bing

[fix] Language support for Bing Images and Videos
This commit is contained in:
Adam Tauber 2017-10-13 21:36:21 +02:00 committed by GitHub
commit c8a66a090a
11 changed files with 114 additions and 39 deletions

View File

@ -13,7 +13,7 @@ python:
before_install: before_install:
- "export DISPLAY=:99.0" - "export DISPLAY=:99.0"
- "sh -e /etc/init.d/xvfb start" - "sh -e /etc/init.d/xvfb start"
- npm install less less-plugin-clean-css grunt-cli - npm install less@2.7 less-plugin-clean-css grunt-cli
- export PATH=`pwd`/node_modules/.bin:$PATH - export PATH=`pwd`/node_modules/.bin:$PATH
- ./manage.sh install_geckodriver ~/drivers - ./manage.sh install_geckodriver ~/drivers
- export PATH=~/drivers:$PATH - export PATH=~/drivers:$PATH

File diff suppressed because one or more lines are too long

View File

@ -18,7 +18,6 @@
from lxml import html from lxml import html
from json import loads from json import loads
import re import re
from searx.engines.bing import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode from searx.url_utils import urlencode
# engine dependent config # engine dependent config
@ -26,6 +25,8 @@ categories = ['images']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
language_support = True
supported_languages_url = 'https://www.bing.com/account/general'
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'
@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U) _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
# get supported region code
def get_region_code(lang, lang_list=None):
    """Return the lower-cased Bing market code to use for ``lang``.

    :param lang: language code as selected by the user, e.g. ``'fr-FR'``,
        ``'fr'`` or ``'all'``.
    :param lang_list: optional list of supported ``language-COUNTRY`` codes.
        When it is omitted (or empty) the module-level ``supported_languages``
        is used instead.
    :returns: a lower-cased market code; ``'en-us'`` when nothing matches.
    """
    # Norwegian has to be resolved before the membership test below: the
    # fetched language list stores Bing's ``nb-NO`` market under ``no-NO``,
    # so looking ``no-NO`` up first would yield ``no-no`` rather than the
    # ``nb-no`` market this function is meant to send.
    if lang.startswith('no'):
        return 'nb-no'

    # NOTE(review): ``supported_languages`` is not defined in the visible part
    # of this module -- presumably it is set from outside; confirm.
    candidates = lang_list or supported_languages  # noqa

    # exact match on the full code
    if lang in candidates:
        return lang.lower()

    # otherwise take the first supported region sharing the language part
    language = lang.split('-')[0]
    for code in candidates:
        if code.split('-')[0] == language:
            return code.lower()

    return 'en-us'
# do search-request # do search-request
def request(query, params): def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1 offset = (params['pageno'] - 1) * 10 + 1
# required for cookie
if params['language'] == 'all':
language = 'en-US'
else:
language = params['language']
search_path = search_string.format( search_path = search_string.format(
query=urlencode({'q': query}), query=urlencode({'q': query}),
offset=offset) offset=offset)
language = get_region_code(params['language'])
params['cookies']['SRCHHPGUSR'] = \ params['cookies']['SRCHHPGUSR'] = \
'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\ 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
'&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
params['cookies']['_EDGE_S'] = 'mkt=' + language +\
'&ui=' + language + '&F=1'
params['url'] = base_url + search_path params['url'] = base_url + search_path
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
@ -106,3 +125,22 @@ def response(resp):
# return results # return results
return results return results
# get supported languages from their site
def _fetch_supported_languages(resp):
    """Collect the market codes linked from the region settings page.

    :param resp: response object whose ``text`` attribute holds the HTML of
        the page.
    :returns: list of codes taken from the ``setmkt`` parameter of each
        region link, with ``nb-NO`` reported as ``no-NO``.
    """
    dom = html.fromstring(resp.text)
    regions_xpath = '//div[@id="region-section-content"]' \
                    + '//ul[@class="b_vList"]/li/a/@href'

    codes = []
    for href in dom.xpath(regions_xpath):
        # raw string + capture group: no invalid ``\&`` escape and no
        # hard-coded offset to strip the ``setmkt=`` prefix
        match = re.search(r'setmkt=([^&]+)', href)
        if match is None:
            # link without a market parameter: skip it instead of failing
            # on ``None.group()``
            continue
        code = match.group(1)
        # the Norwegian market is stored under the ``no`` language code
        if code == 'nb-NO':
            code = 'no-NO'
        codes.append(code)

    return codes

View File

@ -12,6 +12,7 @@
from json import loads from json import loads
from lxml import html from lxml import html
from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.url_utils import urlencode from searx.url_utils import urlencode
@ -21,6 +22,7 @@ paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
number_of_results = 10 number_of_results = 10
language_support = True
search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\ search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5' 'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
@ -45,7 +47,8 @@ def request(query, params):
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE') 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
# language cookie # language cookie
params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1' region = get_region_code(params['language'], lang_list=supported_languages)
params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
# query and paging # query and paging
params['url'] = search_url.format(query=urlencode({'q': query}), params['url'] = search_url.format(query=urlencode({'q': query}),

View File

@ -134,4 +134,4 @@ def _fetch_supported_languages(resp):
regions_json = loads(response_page) regions_json = loads(response_page)
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
return supported_languages return list(supported_languages)

View File

@ -118,7 +118,7 @@ def _fetch_supported_languages(resp):
dom = fromstring(resp.text) dom = fromstring(resp.text)
options = dom.xpath('//div[@id="regions-popup"]//ul/li/a') options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
for option in options: for option in options:
code = option.xpath('./@data-val')[0] code = option.xpath('./@data-search-language')[0]
if code.startswith('nb-'): if code.startswith('nb-'):
code = code.replace('nb', 'no', 1) code = code.replace('nb', 'no', 1)
supported_languages.append(code) supported_languages.append(code)

View File

@ -5,6 +5,11 @@
language_codes = ( language_codes = (
(u"ar-SA", u"العربية", u"", u"Arabic"), (u"ar-SA", u"العربية", u"", u"Arabic"),
(u"bg-BG", u"Български", u"", u"Bulgarian"), (u"bg-BG", u"Български", u"", u"Bulgarian"),
(u"ca", u"Català", u"", u"Catalan"),
(u"ca-AD", u"Català", u"Andorra", u"Catalan"),
(u"ca-CT", u"Català", u"", u"Catalan"),
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
(u"ca-FR", u"Català", u"França", u"Catalan"),
(u"cs-CZ", u"Čeština", u"", u"Czech"), (u"cs-CZ", u"Čeština", u"", u"Czech"),
(u"da-DK", u"Dansk", u"", u"Danish"), (u"da-DK", u"Dansk", u"", u"Danish"),
(u"de", u"Deutsch", u"", u"German"), (u"de", u"Deutsch", u"", u"German"),
@ -15,9 +20,7 @@ language_codes = (
(u"en", u"English", u"", u"English"), (u"en", u"English", u"", u"English"),
(u"en-AU", u"English", u"Australia", u"English"), (u"en-AU", u"English", u"Australia", u"English"),
(u"en-CA", u"English", u"Canada", u"English"), (u"en-CA", u"English", u"Canada", u"English"),
(u"en-CY", u"English", u"Cyprus", u"English"),
(u"en-GB", u"English", u"United Kingdom", u"English"), (u"en-GB", u"English", u"United Kingdom", u"English"),
(u"en-GD", u"English", u"Grenada", u"English"),
(u"en-ID", u"English", u"Indonesia", u"English"), (u"en-ID", u"English", u"Indonesia", u"English"),
(u"en-IE", u"English", u"Ireland", u"English"), (u"en-IE", u"English", u"Ireland", u"English"),
(u"en-IN", u"English", u"India", u"English"), (u"en-IN", u"English", u"India", u"English"),
@ -28,6 +31,7 @@ language_codes = (
(u"en-US", u"English", u"United States", u"English"), (u"en-US", u"English", u"United States", u"English"),
(u"en-ZA", u"English", u"South Africa", u"English"), (u"en-ZA", u"English", u"South Africa", u"English"),
(u"es", u"Español", u"", u"Spanish"), (u"es", u"Español", u"", u"Spanish"),
(u"es-AD", u"Español", u"Andorra", u"Spanish"),
(u"es-AR", u"Español", u"Argentina", u"Spanish"), (u"es-AR", u"Español", u"Argentina", u"Spanish"),
(u"es-CL", u"Español", u"Chile", u"Spanish"), (u"es-CL", u"Español", u"Chile", u"Spanish"),
(u"es-CO", u"Español", u"Colombia", u"Spanish"), (u"es-CO", u"Español", u"Colombia", u"Spanish"),
@ -38,38 +42,32 @@ language_codes = (
(u"et-EE", u"Eesti", u"", u"Estonian"), (u"et-EE", u"Eesti", u"", u"Estonian"),
(u"fi-FI", u"Suomi", u"", u"Finnish"), (u"fi-FI", u"Suomi", u"", u"Finnish"),
(u"fr", u"Français", u"", u"French"), (u"fr", u"Français", u"", u"French"),
(u"fr-AD", u"Français", u"Andorre", u"French"),
(u"fr-BE", u"Français", u"Belgique", u"French"), (u"fr-BE", u"Français", u"Belgique", u"French"),
(u"fr-CA", u"Français", u"Canada", u"French"), (u"fr-CA", u"Français", u"Canada", u"French"),
(u"fr-CH", u"Français", u"Suisse", u"French"), (u"fr-CH", u"Français", u"Suisse", u"French"),
(u"fr-FR", u"Français", u"France", u"French"), (u"fr-FR", u"Français", u"France", u"French"),
(u"he-IL", u"עברית", u"", u"Hebrew"), (u"he-IL", u"עברית", u"", u"Hebrew"),
(u"hr-HR", u"Hrvatski", u"", u"Croatian"),
(u"hu-HU", u"Magyar", u"", u"Hungarian"), (u"hu-HU", u"Magyar", u"", u"Hungarian"),
(u"id-ID", u"Bahasa Indonesia", u"", u"Indonesian"),
(u"it", u"Italiano", u"", u"Italian"), (u"it", u"Italiano", u"", u"Italian"),
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"), (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
(u"it-IT", u"Italiano", u"Italia", u"Italian"), (u"it-IT", u"Italiano", u"Italia", u"Italian"),
(u"ja-JP", u"日本語", u"", u"Japanese"), (u"ja-JP", u"日本語", u"", u"Japanese"),
(u"ko-KR", u"한국어", u"", u"Korean"), (u"ko-KR", u"한국어", u"", u"Korean"),
(u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
(u"lv-LV", u"Latviešu", u"", u"Latvian"),
(u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
(u"nl", u"Nederlands", u"", u"Dutch"), (u"nl", u"Nederlands", u"", u"Dutch"),
(u"nl-BE", u"Nederlands", u"België", u"Dutch"), (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"), (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
(u"no-NO", u"Norsk", u"", u"Norwegian"), (u"no-NO", u"Norsk", u"", u"Norwegian"),
(u"pl-PL", u"Polski", u"", u"Polish"), (u"pl-PL", u"Polski", u"", u"Polish"),
(u"pt", u"Português", u"", u"Portuguese"), (u"pt", u"Português", u"", u"Portuguese"),
(u"pt-AD", u"Português", u"Andorra", u"Portuguese"),
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"), (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"), (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
(u"ro-RO", u"Română", u"", u"Romanian"), (u"ro-RO", u"Română", u"", u"Romanian"),
(u"ru-RU", u"Русский", u"", u"Russian"), (u"ru-RU", u"Русский", u"", u"Russian"),
(u"sk-SK", u"Slovenčina", u"", u"Slovak"),
(u"sl", u"Slovenščina", u"", u"Slovenian"),
(u"sv-SE", u"Svenska", u"", u"Swedish"), (u"sv-SE", u"Svenska", u"", u"Swedish"),
(u"th-TH", u"ไทย", u"", u"Thai"), (u"th-TH", u"ไทย", u"", u"Thai"),
(u"tr-TR", u"Türkçe", u"", u"Turkish"), (u"tr-TR", u"Türkçe", u"", u"Turkish"),
(u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
(u"zh", u"中文", u"", u"Chinese"), (u"zh", u"中文", u"", u"Chinese"),
(u"zh-CN", u"中文", u"中国", u"Chinese"), (u"zh-CN", u"中文", u"中国", u"Chinese"),
(u"zh-HK", u"中文", u"香港", u"Chinese"), (u"zh-HK", u"中文", u"香港", u"Chinese"),

View File

@ -8,10 +8,12 @@ from searx.testing import SearxTestCase
class TestBingImagesEngine(SearxTestCase): class TestBingImagesEngine(SearxTestCase):
def test_request(self): def test_request(self):
bing_images.supported_languages = ['fr-FR', 'en-US']
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1
dicto['language'] = 'fr_FR' dicto['language'] = 'fr-FR'
dicto['safesearch'] = 1 dicto['safesearch'] = 1
dicto['time_range'] = '' dicto['time_range'] = ''
params = bing_images.request(query, dicto) params = bing_images.request(query, dicto)
@ -19,12 +21,19 @@ class TestBingImagesEngine(SearxTestCase):
self.assertTrue(query in params['url']) self.assertTrue(query in params['url'])
self.assertTrue('bing.com' in params['url']) self.assertTrue('bing.com' in params['url'])
self.assertTrue('SRCHHPGUSR' in params['cookies']) self.assertTrue('SRCHHPGUSR' in params['cookies'])
self.assertTrue('fr' in params['cookies']['SRCHHPGUSR']) self.assertTrue('DEMOTE' in params['cookies']['SRCHHPGUSR'])
self.assertTrue('_EDGE_S' in params['cookies'])
self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
dicto['language'] = 'fr'
params = bing_images.request(query, dicto)
self.assertTrue('_EDGE_S' in params['cookies'])
self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
dicto['language'] = 'all' dicto['language'] = 'all'
params = bing_images.request(query, dicto) params = bing_images.request(query, dicto)
self.assertIn('SRCHHPGUSR', params['cookies']) self.assertTrue('_EDGE_S' in params['cookies'])
self.assertIn('en', params['cookies']['SRCHHPGUSR']) self.assertTrue('en-us' in params['cookies']['_EDGE_S'])
def test_response(self): def test_response(self):
self.assertRaises(AttributeError, bing_images.response, None) self.assertRaises(AttributeError, bing_images.response, None)
@ -82,3 +91,28 @@ class TestBingImagesEngine(SearxTestCase):
self.assertEqual(results[0]['content'], '') self.assertEqual(results[0]['content'], '')
self.assertEqual(results[0]['thumbnail_src'], 'thumb_url') self.assertEqual(results[0]['thumbnail_src'], 'thumb_url')
self.assertEqual(results[0]['img_src'], 'img_url') self.assertEqual(results[0]['img_src'], 'img_url')
def test_fetch_supported_languages(self):
# Check that market codes are extracted from the region links of a page.
# The fixture below has three links carrying a ``setmkt`` parameter, spread
# over two ``b_vList`` lists, one of them being the ``nb-NO`` market.
html = """
<div>
<div id="region-section-content">
<ul class="b_vList">
<li>
<a href="https://bing...&setmkt=de-DE&s...">Germany</a>
<a href="https://bing...&setmkt=nb-NO&s...">Norway</a>
</li>
</ul>
<ul class="b_vList">
<li>
<a href="https://bing...&setmkt=es-AR&s...">Argentina</a>
</li>
</ul>
</div>
</div>
"""
# the mocked response only provides the ``text`` attribute
response = mock.Mock(text=html)
languages = list(bing_images._fetch_supported_languages(response))
# every link yields one code; ``nb-NO`` is expected back as ``no-NO``
self.assertEqual(len(languages), 3)
self.assertIn('de-DE', languages)
self.assertIn('no-NO', languages)
self.assertIn('es-AR', languages)

View File

@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
class TestBingVideosEngine(SearxTestCase): class TestBingVideosEngine(SearxTestCase):
def test_request(self): def test_request(self):
bing_videos.supported_languages = ['fr-FR', 'en-US']
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1

View File

@ -139,9 +139,9 @@ class TestSwisscowsEngine(SearxTestCase):
<div id="regions-popup"> <div id="regions-popup">
<div> <div>
<ul> <ul>
<li><a data-val="browser"></a></li> <li><a data-search-language="browser"></a></li>
<li><a data-val="de-CH"></a></li> <li><a data-search-language="de-CH"></a></li>
<li><a data-val="fr-CH"></a></li> <li><a data-search-language="fr-CH"></a></li>
</ul> </ul>
</div> </div>
</div> </div>

View File

@ -8,13 +8,13 @@
# are written in current directory to avoid overwriting in case something goes wrong. # are written in current directory to avoid overwriting in case something goes wrong.
from requests import get from requests import get
from urllib import urlencode
from lxml.html import fromstring from lxml.html import fromstring
from json import loads, dumps from json import loads, dump
import io import io
from sys import path from sys import path
path.append('../searx') # noqa path.append('../searx') # noqa
from searx import settings from searx import settings
from searx.url_utils import urlencode
from searx.engines import initialize_engines, engines from searx.engines import initialize_engines, engines
# Geonames API for country names. # Geonames API for country names.
@ -70,7 +70,7 @@ def get_country_name(locale):
json = loads(response.text) json = loads(response.text)
content = json.get('geonames', None) content = json.get('geonames', None)
if content is None or len(content) != 1: if content is None or len(content) != 1:
print "No country name found for " + locale[0] + "-" + locale[1] print("No country name found for " + locale[0] + "-" + locale[1])
return '' return ''
return content[0].get('countryName', '') return content[0].get('countryName', '')
@ -84,11 +84,11 @@ def fetch_supported_languages():
try: try:
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
except Exception as e: except Exception as e:
print e print(e)
# write json file # write json file
with io.open(engines_languages_file, "w", encoding="utf-8") as f: with io.open(engines_languages_file, "w", encoding="utf-8") as f:
f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8"))) dump(engines_languages, f, ensure_ascii=False)
# Join all language lists. # Join all language lists.
@ -97,7 +97,7 @@ def join_language_lists():
global languages global languages
# include wikipedia first for more accurate language names # include wikipedia first for more accurate language names
languages = {code: lang for code, lang languages = {code: lang for code, lang
in engines_languages['wikipedia'].iteritems() in engines_languages['wikipedia'].items()
if valid_code(code)} if valid_code(code)}
for engine_name in engines_languages: for engine_name in engines_languages:
@ -121,7 +121,7 @@ def join_language_lists():
# filter list to include only languages supported by most engines # filter list to include only languages supported by most engines
min_supported_engines = int(0.70 * len(engines_languages)) min_supported_engines = int(0.70 * len(engines_languages))
languages = {code: lang for code, lang languages = {code: lang for code, lang
in languages.iteritems() in languages.items()
if len(lang.get('counter', [])) >= min_supported_engines or if len(lang.get('counter', [])) >= min_supported_engines or
len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines} len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
@ -165,7 +165,7 @@ def filter_single_country_languages():
# Write languages.py. # Write languages.py.
def write_languages_file(): def write_languages_file():
new_file = open(languages_file, 'w') new_file = open(languages_file, 'wb')
file_content = '# -*- coding: utf-8 -*-\n'\ file_content = '# -*- coding: utf-8 -*-\n'\
+ '# list of language codes\n'\ + '# list of language codes\n'\
+ '# this file is generated automatically by utils/update_search_languages.py\n'\ + '# this file is generated automatically by utils/update_search_languages.py\n'\