Add language support for more engines.

This commit is contained in:
marc 2016-10-29 21:04:01 -05:00
parent a11230819f
commit a11948c71b
15 changed files with 66 additions and 39 deletions

View File

@ -20,6 +20,24 @@ from datetime import datetime
categories = ['videos'] categories = ['videos']
paging = True paging = True
language_support = True language_support = True
supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
"ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
"cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
"de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
"fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
"gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
"hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
"is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
"km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
"la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
"mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
"nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
"oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
"rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
"sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
"su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
"ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
"uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
# search-url # search-url
# see http://www.dailymotion.com/doc/api/obj-video.html # see http://www.dailymotion.com/doc/api/obj-video.html

View File

@ -16,7 +16,6 @@
from urllib import urlencode from urllib import urlencode
from lxml.html import fromstring from lxml.html import fromstring
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.languages import language_codes
# engine dependent config # engine dependent config
categories = ['general'] categories = ['general']
@ -76,26 +75,7 @@ def request(query, params):
else: else:
# tries to get a country code from language # tries to get a country code from language
locale = locale[0].lower() locale = locale[0].lower()
lang_codes = [x[0] for x in language_codes] for lc in supported_languages:
for lc in lang_codes:
lc = lc.split('-')
if locale == lc[0] and len(lc) == 2:
locale = lc[1].lower() + '-' + lc[0].lower()
break
if locale:
params['url'] = url.format(
query=urlencode({'q': query, 'kl': locale}), offset=offset)
else:
locale = params['language'].split('-')
if len(locale) == 2:
# country code goes first
locale = locale[1].lower() + '-' + locale[0].lower()
else:
# tries to get a country code from language
locale = locale[0].lower()
lang_codes = [x[0] for x in language_codes]
for lc in lang_codes:
lc = lc.split('-') lc = lc.split('-')
if locale == lc[0]: if locale == lc[0]:
locale = lc[1].lower() + '-' + lc[0].lower() locale = lc[1].lower() + '-' + lc[0].lower()

View File

@ -20,6 +20,11 @@ from searx.utils import html_to_text
categories = None categories = None
paging = True paging = True
language_support = True language_support = True
supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
"fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
"el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
"es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
"th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
category_to_keyword = {'general': 'web', category_to_keyword = {'general': 'web',
'images': 'images', 'images': 'images',
@ -46,7 +51,15 @@ def request(query, params):
# add language tag if specified # add language tag if specified
if params['language'] != 'all': if params['language'] != 'all':
params['url'] += '&locale=' + params['language'].lower() locale = params['language'].split('-')
if len(locale) == 2 and params['language'] in supported_languages:
params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
else:
# try to get a country code for language
for lang in supported_languages:
if locale[0] == lang.split('-')[0]:
params['url'] += '&locale=' + lang.replace('-', '_').lower()
break
return params return params

View File

@ -24,6 +24,11 @@ categories = ['general']
# paging = False # paging = False
language_support = True language_support = True
supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
"ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
"el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
"lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
"sv", "tl", "th", "tr", "uk", "vi"]
# search-url # search-url
base_url = 'https://startpage.com/' base_url = 'https://startpage.com/'

View File

@ -18,6 +18,12 @@ import re
categories = ['general', 'images'] categories = ['general', 'images']
paging = True paging = True
language_support = True language_support = True
supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
"es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
"zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
"en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
"en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
"tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
# search-url # search-url
base_url = 'https://swisscows.ch/' base_url = 'https://swisscows.ch/'
@ -35,6 +41,8 @@ def request(query, params):
if params['language'] == 'all': if params['language'] == 'all':
ui_language = 'browser' ui_language = 'browser'
region = 'browser' region = 'browser'
elif params['language'].split('-')[0] == 'no':
region = 'nb-NO'
else: else:
region = params['language'] region = params['language']
ui_language = params['language'].split('-')[0] ui_language = params['language'].split('-')[0]

View File

@ -22,7 +22,9 @@ language_support = True # TODO
default_tld = 'com' default_tld = 'com'
language_map = {'ru': 'ru', language_map = {'ru': 'ru',
'ua': 'uk', 'ua': 'ua',
'be': 'by',
'kk': 'kz',
'tr': 'com.tr'} 'tr': 'com.tr'}
# search-url # search-url

View File

@ -100,7 +100,7 @@ language_codes = (
(u"sa", u"संस्कृतम्", u"", u"Sanskrit"), (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
(u"he-IL", u"עברית", u"", u"Hebrew"), (u"he-IL", u"עברית", u"", u"Hebrew"),
(u"se", u"Sámegiella", u"", u"Northern Sami"), (u"se", u"Sámegiella", u"", u"Northern Sami"),
(u"sd", u"سنڌي، سندھی ، सिन्ध", u"", u"Sindhi"), (u"sd", u"سنڌي ،सिन्ध", u"", u"Sindhi"),
(u"fr-CH", u"Français", u"", u"French"), (u"fr-CH", u"Français", u"", u"French"),
(u"zea", u"Zeêuws", u"", u"Zeelandic"), (u"zea", u"Zeêuws", u"", u"Zeelandic"),
(u"it-CH", u"Italiano", u"", u"Italian"), (u"it-CH", u"Italiano", u"", u"Italian"),
@ -191,6 +191,7 @@ language_codes = (
(u"jam", u"Jamaican Creole English", u"", u"Patois"), (u"jam", u"Jamaican Creole English", u"", u"Patois"),
(u"udm", u"Удмурт кыл", u"", u"Udmurt"), (u"udm", u"Удмурт кыл", u"", u"Udmurt"),
(u"ksh", u"Ripoarisch", u"", u"Ripuarian"), (u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
(u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
(u"ms-MY", u"Bahasa Melayu", u"", u"Malay"), (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
(u"de", u"Deutsch", u"", u"German"), (u"de", u"Deutsch", u"", u"German"),
(u"da", u"Dansk", u"", u"Danish"), (u"da", u"Dansk", u"", u"Danish"),
@ -284,6 +285,7 @@ language_codes = (
(u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"), (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
(u"ca-CT", u"Català", u"", u"Catalan"), (u"ca-CT", u"Català", u"", u"Catalan"),
(u"en-MY", u"English", u"", u"English"), (u"en-MY", u"English", u"", u"English"),
(u"olo", u"Livvi-Karelian", u"", u"Livvinkarjala"),
(u"sv-SE", u"Svenska", u"", u"Swedish"), (u"sv-SE", u"Svenska", u"", u"Swedish"),
(u"de-AT", u"Deutsch", u"", u"German"), (u"de-AT", u"Deutsch", u"", u"German"),
(u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"), (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),

View File

@ -11,7 +11,7 @@ class TestDuckduckgoEngine(SearxTestCase):
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1
dicto['language'] = 'de_CH' dicto['language'] = 'de-CH'
dicto['time_range'] = '' dicto['time_range'] = ''
params = duckduckgo.request(query, dicto) params = duckduckgo.request(query, dicto)
self.assertIn('url', params) self.assertIn('url', params)

View File

@ -21,10 +21,14 @@ class TestDDGDefinitionsEngine(SearxTestCase):
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1
dicto['language'] = 'es'
params = duckduckgo_definitions.request(query, dicto) params = duckduckgo_definitions.request(query, dicto)
self.assertIn('url', params) self.assertIn('url', params)
self.assertIn(query, params['url']) self.assertIn(query, params['url'])
self.assertIn('duckduckgo.com', params['url']) self.assertIn('duckduckgo.com', params['url'])
self.assertIn('headers', params)
self.assertIn('Accept-Language', params['headers'])
self.assertIn('es', params['headers']['Accept-Language'])
def test_response(self): def test_response(self):
self.assertRaises(AttributeError, duckduckgo_definitions.response, None) self.assertRaises(AttributeError, duckduckgo_definitions.response, None)

View File

@ -18,7 +18,7 @@ class TestGoogleEngine(SearxTestCase):
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1
dicto['language'] = 'fr_FR' dicto['language'] = 'fr-FR'
dicto['time_range'] = '' dicto['time_range'] = ''
params = google.request(query, dicto) params = google.request(query, dicto)
self.assertIn('url', params) self.assertIn('url', params)

View File

@ -10,7 +10,7 @@ class TestQwantEngine(SearxTestCase):
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 0 dicto['pageno'] = 0
dicto['language'] = 'fr_FR' dicto['language'] = 'fr-FR'
qwant.categories = [''] qwant.categories = ['']
params = qwant.request(query, dicto) params = qwant.request(query, dicto)
self.assertIn('url', params) self.assertIn('url', params)

View File

@ -10,7 +10,7 @@ class TestSwisscowsEngine(SearxTestCase):
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['pageno'] = 1 dicto['pageno'] = 1
dicto['language'] = 'de_DE' dicto['language'] = 'de-DE'
params = swisscows.request(query, dicto) params = swisscows.request(query, dicto)
self.assertTrue('url' in params) self.assertTrue('url' in params)
self.assertTrue(query in params['url']) self.assertTrue(query in params['url'])

View File

@ -10,7 +10,7 @@ class TestWikipediaEngine(SearxTestCase):
def test_request(self): def test_request(self):
query = 'test_query' query = 'test_query'
dicto = defaultdict(dict) dicto = defaultdict(dict)
dicto['language'] = 'fr_FR' dicto['language'] = 'fr-FR'
params = wikipedia.request(query, dicto) params = wikipedia.request(query, dicto)
self.assertIn('url', params) self.assertIn('url', params)
self.assertIn(query, params['url']) self.assertIn(query, params['url'])

View File

@ -41,7 +41,6 @@ def valid_code(lang_code):
if len(lang_code) > 2 or len(lang_code[0]) > 3: if len(lang_code) > 2 or len(lang_code[0]) > 3:
return False return False
if len(lang_code) == 2 and len(lang_code[1]) > 2: if len(lang_code) == 2 and len(lang_code[1]) > 2:
print lang_code
return False return False
return True return True
@ -62,8 +61,8 @@ def get_wikipedia_languages():
english_name = td[1].xpath('./a')[0].text english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a/b')[0].text.replace(',','')) articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
# exclude languages with few articles and language variants # exclude language variants and languages with few articles
if code not in languages and articles >= 100 and valid_code(code): if code not in languages and articles >= 1000 and valid_code(code):
languages[code] = (name, '', english_name) languages[code] = (name, '', english_name)
@ -90,7 +89,7 @@ def join_language_lists():
# try to get language name # try to get language name
language = languages.get(locale.split('-')[0], None) language = languages.get(locale.split('-')[0], None)
if language == None: if language == None:
# print engine_name + ": " + locale print engine_name + ": " + locale
continue continue
(name, country, english) = language (name, country, english) = language
@ -117,12 +116,8 @@ def write_languages_file():
new_file.close() new_file.close()
def main(): if __name__ == "__main__":
get_wikipedia_languages() get_wikipedia_languages()
get_google_languages() get_google_languages()
join_language_lists() join_language_lists()
write_languages_file() write_languages_file()
if __name__ == "__main__":
main()