forked from zaclys/searxng
Change the language list to only include languages that are supported by a
minimum number of engines. Users can still query less-supported languages through the :lang_code bang.
This commit is contained in:
parent
7388067f15
commit
1175b3906f
File diff suppressed because one or more lines are too long
|
@ -95,8 +95,13 @@ def _fetch_supported_languages(resp):
|
||||||
dom = fromstring(resp.text)
|
dom = fromstring(resp.text)
|
||||||
links = dom.xpath('//span[@id="menu2"]/a')
|
links = dom.xpath('//span[@id="menu2"]/a')
|
||||||
for link in links:
|
for link in links:
|
||||||
code = link.xpath('./@href')[0][-2:]
|
href = link.xpath('./@href')[0].split('lang%3A')
|
||||||
if code != 'xx' and code not in supported_languages:
|
if len(href) == 2:
|
||||||
|
code = href[1].split('_')
|
||||||
|
if len(code) == 2:
|
||||||
|
code = code[0] + '-' + code[1].upper()
|
||||||
|
else:
|
||||||
|
code = code[0]
|
||||||
supported_languages.append(code)
|
supported_languages.append(code)
|
||||||
|
|
||||||
return supported_languages
|
return supported_languages
|
||||||
|
|
|
@ -132,7 +132,7 @@ def _fetch_supported_languages(resp):
|
||||||
english_name = td[1].xpath('./a')[0].text
|
english_name = td[1].xpath('./a')[0].text
|
||||||
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
|
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
|
||||||
# exclude languages with too few articles
|
# exclude languages with too few articles
|
||||||
if articles >= 100000:
|
if articles >= 100:
|
||||||
supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
|
supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
|
||||||
|
|
||||||
return supported_languages
|
return supported_languages
|
||||||
|
|
|
@ -3,28 +3,18 @@
|
||||||
# this file is generated automatically by utils/update_search_languages.py
|
# this file is generated automatically by utils/update_search_languages.py
|
||||||
|
|
||||||
language_codes = (
|
language_codes = (
|
||||||
(u"af", u"Afrikaans", u"", u""),
|
(u"ar-SA", u"العربية", u"", u"Arabic"),
|
||||||
(u"am", u"አማርኛ", u"", u"Amharic"),
|
(u"bg-BG", u"Български", u"", u"Bulgarian"),
|
||||||
(u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
|
|
||||||
(u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
|
|
||||||
(u"be", u"Беларуская", u"", u"Belarusian"),
|
|
||||||
(u"bg-BG", u"Български", u"България", u"Bulgarian"),
|
|
||||||
(u"bn", u"বাংলা", u"", u"Bengali"),
|
|
||||||
(u"br", u"Brezhoneg", u"", u"Breton"),
|
|
||||||
(u"bs", u"Bosnian", u"", u"Bosnian"),
|
|
||||||
(u"ca", u"Català", u"", u"Catalan"),
|
(u"ca", u"Català", u"", u"Catalan"),
|
||||||
(u"ca-CT", u"Català", u"", u"Catalan"),
|
(u"ca-CT", u"Català", u"", u"Catalan"),
|
||||||
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
|
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
|
||||||
(u"ce", u"Нохчийн", u"", u"Chechen"),
|
(u"cs-CZ", u"Čeština", u"", u"Czech"),
|
||||||
(u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
|
(u"da-DK", u"Dansk", u"", u"Danish"),
|
||||||
(u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
|
|
||||||
(u"cy", u"Cymraeg", u"", u"Welsh"),
|
|
||||||
(u"da-DK", u"Dansk", u"Danmark", u"Danish"),
|
|
||||||
(u"de", u"Deutsch", u"", u"German"),
|
(u"de", u"Deutsch", u"", u"German"),
|
||||||
(u"de-AT", u"Deutsch", u"Österreich", u"German"),
|
(u"de-AT", u"Deutsch", u"Österreich", u"German"),
|
||||||
(u"de-CH", u"Deutsch", u"Schweiz", u"German"),
|
(u"de-CH", u"Deutsch", u"Schweiz", u"German"),
|
||||||
(u"de-DE", u"Deutsch", u"Deutschland", u"German"),
|
(u"de-DE", u"Deutsch", u"Deutschland", u"German"),
|
||||||
(u"el-GR", u"Ελληνικά", u"Ελλάδα", u"Greek"),
|
(u"el-GR", u"Ελληνικά", u"", u"Greek"),
|
||||||
(u"en", u"English", u"", u"English"),
|
(u"en", u"English", u"", u"English"),
|
||||||
(u"en-AU", u"English", u"Australia", u"English"),
|
(u"en-AU", u"English", u"Australia", u"English"),
|
||||||
(u"en-CA", u"English", u"Canada", u"English"),
|
(u"en-CA", u"English", u"Canada", u"English"),
|
||||||
|
@ -38,7 +28,6 @@ language_codes = (
|
||||||
(u"en-SG", u"English", u"Singapore", u"English"),
|
(u"en-SG", u"English", u"Singapore", u"English"),
|
||||||
(u"en-US", u"English", u"United States", u"English"),
|
(u"en-US", u"English", u"United States", u"English"),
|
||||||
(u"en-ZA", u"English", u"South Africa", u"English"),
|
(u"en-ZA", u"English", u"South Africa", u"English"),
|
||||||
(u"eo", u"Esperanto", u"", u"Esperanto"),
|
|
||||||
(u"es", u"Español", u"", u"Spanish"),
|
(u"es", u"Español", u"", u"Spanish"),
|
||||||
(u"es-AR", u"Español", u"Argentina", u"Spanish"),
|
(u"es-AR", u"Español", u"Argentina", u"Spanish"),
|
||||||
(u"es-CL", u"Español", u"Chile", u"Spanish"),
|
(u"es-CL", u"Español", u"Chile", u"Spanish"),
|
||||||
|
@ -47,85 +36,43 @@ language_codes = (
|
||||||
(u"es-MX", u"Español", u"México", u"Spanish"),
|
(u"es-MX", u"Español", u"México", u"Spanish"),
|
||||||
(u"es-PE", u"Español", u"Perú", u"Spanish"),
|
(u"es-PE", u"Español", u"Perú", u"Spanish"),
|
||||||
(u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
|
(u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
|
||||||
(u"et-EE", u"Eesti", u"Eesti", u"Estonian"),
|
(u"et-EE", u"Eesti", u"", u"Estonian"),
|
||||||
(u"eu", u"Euskara", u"", u"Basque"),
|
(u"fi-FI", u"Suomi", u"", u"Finnish"),
|
||||||
(u"fa", u"فارسی", u"", u"Persian"),
|
|
||||||
(u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
|
|
||||||
(u"fr", u"Français", u"", u"French"),
|
(u"fr", u"Français", u"", u"French"),
|
||||||
(u"fr-BE", u"Français", u"Belgique", u"French"),
|
(u"fr-BE", u"Français", u"Belgique", u"French"),
|
||||||
(u"fr-CA", u"Français", u"Canada", u"French"),
|
(u"fr-CA", u"Français", u"Canada", u"French"),
|
||||||
(u"fr-CH", u"Français", u"Suisse", u"French"),
|
(u"fr-CH", u"Français", u"Suisse", u"French"),
|
||||||
(u"fr-FR", u"Français", u"France", u"French"),
|
(u"fr-FR", u"Français", u"France", u"French"),
|
||||||
(u"ga", u"Gaeilge", u"", u"Irish"),
|
(u"he-IL", u"עברית", u"", u"Hebrew"),
|
||||||
(u"gl", u"Galego", u"", u"Galician"),
|
(u"hr-HR", u"Hrvatski", u"", u"Croatian"),
|
||||||
(u"gu", u"ગુજરાતી", u"", u"Gujarati"),
|
(u"hu-HU", u"Magyar", u"", u"Hungarian"),
|
||||||
(u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
|
(u"id-ID", u"Bahasa Indonesia", u"", u"Indonesian"),
|
||||||
(u"hi", u"हिन्दी", u"", u"Hindi"),
|
|
||||||
(u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
|
|
||||||
(u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
|
|
||||||
(u"hy", u"Հայերեն", u"", u"Armenian"),
|
|
||||||
(u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
|
|
||||||
(u"is", u"Íslenska", u"", u""),
|
|
||||||
(u"it", u"Italiano", u"", u"Italian"),
|
(u"it", u"Italiano", u"", u"Italian"),
|
||||||
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
|
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
|
||||||
(u"it-IT", u"Italiano", u"Italia", u"Italian"),
|
(u"it-IT", u"Italiano", u"Italia", u"Italian"),
|
||||||
(u"iw", u"עברית", u"", u""),
|
(u"ja-JP", u"日本語", u"", u"Japanese"),
|
||||||
(u"ja-JP", u"日本語", u"日本", u"Japanese"),
|
(u"ko-KR", u"한국어", u"", u"Korean"),
|
||||||
(u"ka", u"ქართული", u"", u"Georgian"),
|
(u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
|
||||||
(u"kk", u"Қазақша", u"", u"Kazakh"),
|
(u"lv-LV", u"Latviešu", u"", u"Latvian"),
|
||||||
(u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
|
|
||||||
(u"ko-KR", u"한국어", u"대한민국", u"Korean"),
|
|
||||||
(u"la", u"Latina", u"", u"Latin"),
|
|
||||||
(u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
|
|
||||||
(u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
|
|
||||||
(u"mi", u"Reo Māori", u"", u"Maori"),
|
|
||||||
(u"min", u"Minangkabau", u"", u"Minangkabau"),
|
|
||||||
(u"mk", u"Македонски", u"", u"Macedonian"),
|
|
||||||
(u"mn", u"Монгол", u"", u"Mongolian"),
|
|
||||||
(u"mr", u"मराठी", u"", u"Marathi"),
|
|
||||||
(u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
|
|
||||||
(u"mt", u"Malti", u"", u"Maltese"),
|
|
||||||
(u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
|
|
||||||
(u"nl", u"Nederlands", u"", u"Dutch"),
|
(u"nl", u"Nederlands", u"", u"Dutch"),
|
||||||
(u"nl-BE", u"Nederlands", u"België", u"Dutch"),
|
(u"nl-BE", u"Nederlands", u"België", u"Dutch"),
|
||||||
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
|
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
|
||||||
(u"nn", u"Nynorsk", u"", u"Norwegian"),
|
(u"no-NO", u"Norsk", u"", u"Norwegian"),
|
||||||
(u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
|
(u"pl-PL", u"Polski", u"", u"Polish"),
|
||||||
(u"oc", u"Occitan", u"", u"Occitan"),
|
|
||||||
(u"or", u"Oriya", u"", u"Oriya"),
|
|
||||||
(u"pa", u"ਪੰਜਾਬੀ", u"", u"Panjabi"),
|
|
||||||
(u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
|
|
||||||
(u"ps", u"Pushto", u"", u"Pushto"),
|
|
||||||
(u"pt", u"Português", u"", u"Portuguese"),
|
(u"pt", u"Português", u"", u"Portuguese"),
|
||||||
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
|
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
|
||||||
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
|
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
|
||||||
(u"ro-RO", u"Română", u"România", u"Romanian"),
|
(u"ro-RO", u"Română", u"", u"Romanian"),
|
||||||
(u"ru-RU", u"Русский", u"Россия", u"Russian"),
|
(u"ru-RU", u"Русский", u"", u"Russian"),
|
||||||
(u"rw", u"Ikinyarwanda", u"", u"Kinyarwanda"),
|
(u"sk-SK", u"Slovenčina", u"", u"Slovak"),
|
||||||
(u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
|
|
||||||
(u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
|
|
||||||
(u"sl", u"Slovenščina", u"", u"Slovenian"),
|
(u"sl", u"Slovenščina", u"", u"Slovenian"),
|
||||||
(u"sr", u"Српски / Srpski", u"", u"Serbian"),
|
(u"sv-SE", u"Svenska", u"", u"Swedish"),
|
||||||
(u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
|
(u"th-TH", u"ไทย", u"", u"Thai"),
|
||||||
(u"sw", u"Kiswahili", u"", u""),
|
(u"tr-TR", u"Türkçe", u"", u"Turkish"),
|
||||||
(u"ta", u"தமிழ்", u"", u"Tamil"),
|
(u"uk-UA", u"Українська", u"", u"Ukrainian"),
|
||||||
(u"th-TH", u"ไทย", u"ไทย", u"Thai"),
|
(u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
|
||||||
(u"ti", u"ትግርኛ", u"", u"Tigrinya"),
|
|
||||||
(u"tl-PH", u"Filipino", u"Pilipinas", u""),
|
|
||||||
(u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
|
|
||||||
(u"tt", u"Татарча", u"", u"Tatar"),
|
|
||||||
(u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
|
|
||||||
(u"ur", u"اردو", u"", u"Urdu"),
|
|
||||||
(u"uz", u"O‘zbek", u"", u"Uzbek"),
|
|
||||||
(u"ve", u"Venda", u"", u"Venda"),
|
|
||||||
(u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
|
|
||||||
(u"vo", u"Volapük", u"", u"Volapük"),
|
|
||||||
(u"wa", u"Walon", u"", u"Walloon"),
|
|
||||||
(u"war", u"Winaray", u"", u"Waray-Waray"),
|
|
||||||
(u"xh", u"Xhosa", u"", u"Xhosa"),
|
|
||||||
(u"zh", u"中文", u"", u"Chinese"),
|
(u"zh", u"中文", u"", u"Chinese"),
|
||||||
(u"zh-CN", u"中文", u"中国", u"Chinese"),
|
(u"zh-CN", u"中文", u"中国", u"Chinese"),
|
||||||
(u"zh-HK", u"中文", u"香港", u"Chinese"),
|
(u"zh-HK", u"中文", u"香港", u"Chinese"),
|
||||||
(u"zh-TW", u"中文", u"台湾", u"Chinese"),
|
(u"zh-TW", u"中文", u"台湾", u"Chinese")
|
||||||
(u"zu", u"Isi-Zulu", u"", u"Zulu")
|
|
||||||
)
|
)
|
||||||
|
|
|
@ -24,6 +24,8 @@ from searx.engines import (
|
||||||
import string
|
import string
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(\-[A-Z]{2})?$')
|
||||||
|
|
||||||
|
|
||||||
class RawTextQuery(object):
|
class RawTextQuery(object):
|
||||||
"""parse raw text query (the value from the html input)"""
|
"""parse raw text query (the value from the html input)"""
|
||||||
|
@ -68,6 +70,11 @@ class RawTextQuery(object):
|
||||||
if query_part[0] == ':':
|
if query_part[0] == ':':
|
||||||
lang = query_part[1:].lower()
|
lang = query_part[1:].lower()
|
||||||
|
|
||||||
|
# user may set a valid, yet not selectable language
|
||||||
|
if VALID_LANGUAGE_CODE.match(lang):
|
||||||
|
self.languages.append(lang)
|
||||||
|
parse_next = True
|
||||||
|
|
||||||
# check if any language-code is equal with
|
# check if any language-code is equal with
|
||||||
# declared language-codes
|
# declared language-codes
|
||||||
for lc in language_codes:
|
for lc in language_codes:
|
||||||
|
|
|
@ -102,10 +102,10 @@ Change search language
|
||||||
Page Should Contain preferences
|
Page Should Contain preferences
|
||||||
Go To http://localhost:11111/preferences
|
Go To http://localhost:11111/preferences
|
||||||
List Selection Should Be language Default language
|
List Selection Should Be language Default language
|
||||||
Select From List language Türkçe (Türkiye) - tr-TR
|
Select From List language Türkçe - tr-TR
|
||||||
Submit Preferences
|
Submit Preferences
|
||||||
Go To http://localhost:11111/preferences
|
Go To http://localhost:11111/preferences
|
||||||
List Selection Should Be language Türkçe (Türkiye) - tr-TR
|
List Selection Should Be language Türkçe - tr-TR
|
||||||
|
|
||||||
Change autocomplete
|
Change autocomplete
|
||||||
Page Should Contain about
|
Page Should Contain about
|
||||||
|
|
|
@ -103,7 +103,9 @@ class TestGigablastEngine(SearxTestCase):
|
||||||
<span id="menu2">
|
<span id="menu2">
|
||||||
<a href="/search?&rxikd=1&qlang=xx"></a>
|
<a href="/search?&rxikd=1&qlang=xx"></a>
|
||||||
<a href="/search?&rxikd=1&qlang=en"></a>
|
<a href="/search?&rxikd=1&qlang=en"></a>
|
||||||
<a href="/search?&rxikd=1&qlang=fr"></a>
|
<a href="/search?&rxikd=1&prepend=gblang%3Aen"></a>
|
||||||
|
<a href="/search?&rxikd=1&qlang=zh_"></a>
|
||||||
|
<a href="/search?&rxikd=1&prepend=gblang%3Azh_tw"></a>
|
||||||
</span>
|
</span>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@ -113,4 +115,4 @@ class TestGigablastEngine(SearxTestCase):
|
||||||
self.assertEqual(type(languages), list)
|
self.assertEqual(type(languages), list)
|
||||||
self.assertEqual(len(languages), 2)
|
self.assertEqual(len(languages), 2)
|
||||||
self.assertIn('en', languages)
|
self.assertIn('en', languages)
|
||||||
self.assertIn('fr', languages)
|
self.assertIn('zh-TW', languages)
|
||||||
|
|
|
@ -25,7 +25,6 @@ engines_languages_file = 'engines_languages.json'
|
||||||
languages_file = 'languages.py'
|
languages_file = 'languages.py'
|
||||||
|
|
||||||
engines_languages = {}
|
engines_languages = {}
|
||||||
languages = {}
|
|
||||||
|
|
||||||
|
|
||||||
# To filter out invalid codes and dialects.
|
# To filter out invalid codes and dialects.
|
||||||
|
@ -93,22 +92,36 @@ def fetch_supported_languages():
|
||||||
# Join all language lists.
|
# Join all language lists.
|
||||||
# Iterate all languages supported by each engine.
|
# Iterate all languages supported by each engine.
|
||||||
def join_language_lists():
|
def join_language_lists():
|
||||||
|
global languages
|
||||||
# include wikipedia first for more accurate language names
|
# include wikipedia first for more accurate language names
|
||||||
languages.update({code: lang for code, lang
|
languages = {code: lang for code, lang
|
||||||
in engines_languages['wikipedia'].iteritems()
|
in engines_languages['wikipedia'].iteritems()
|
||||||
if valid_code(code)})
|
if valid_code(code)}
|
||||||
|
|
||||||
for engine_name in engines_languages:
|
for engine_name in engines_languages:
|
||||||
for locale in engines_languages[engine_name]:
|
for locale in engines_languages[engine_name]:
|
||||||
if not valid_code(locale):
|
if valid_code(locale):
|
||||||
continue
|
# if language is not on list or if it has no name yet
|
||||||
|
if locale not in languages or not languages[locale].get('name'):
|
||||||
|
if isinstance(engines_languages[engine_name], dict):
|
||||||
|
languages[locale] = engines_languages[engine_name][locale]
|
||||||
|
else:
|
||||||
|
languages[locale] = {}
|
||||||
|
|
||||||
# if language is not on list or if it has no name yet
|
# add to counter of engines that support given language
|
||||||
if locale not in languages or not languages[locale].get('name'):
|
lang = locale.split('-')[0]
|
||||||
if isinstance(engines_languages[engine_name], dict):
|
if lang in languages:
|
||||||
languages[locale] = engines_languages[engine_name][locale]
|
if 'counter' not in languages[lang]:
|
||||||
else:
|
languages[lang]['counter'] = [engine_name]
|
||||||
languages[locale] = {}
|
elif engine_name not in languages[lang]['counter']:
|
||||||
|
languages[lang]['counter'].append(engine_name)
|
||||||
|
|
||||||
|
# filter list to include only languages supported by most engines
|
||||||
|
min_supported_engines = int(0.75 * len(engines_languages))
|
||||||
|
languages = {code: lang for code, lang
|
||||||
|
in languages.iteritems()
|
||||||
|
if len(lang.get('counter', [])) >= min_supported_engines or
|
||||||
|
len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
|
||||||
|
|
||||||
# get locales that have no name or country yet
|
# get locales that have no name or country yet
|
||||||
for locale in languages.keys():
|
for locale in languages.keys():
|
||||||
|
@ -134,6 +147,7 @@ def join_language_lists():
|
||||||
# Remove countryless language if language is featured in only one country.
|
# Remove countryless language if language is featured in only one country.
|
||||||
def filter_single_country_languages():
|
def filter_single_country_languages():
|
||||||
prev_lang = None
|
prev_lang = None
|
||||||
|
prev_code = None
|
||||||
for code in sorted(languages):
|
for code in sorted(languages):
|
||||||
lang = code.split('-')[0]
|
lang = code.split('-')[0]
|
||||||
if lang == prev_lang:
|
if lang == prev_lang:
|
||||||
|
@ -141,8 +155,10 @@ def filter_single_country_languages():
|
||||||
else:
|
else:
|
||||||
if prev_lang is not None and countries == 1:
|
if prev_lang is not None and countries == 1:
|
||||||
del languages[prev_lang]
|
del languages[prev_lang]
|
||||||
|
languages[prev_code]['country'] = ''
|
||||||
countries = 0
|
countries = 0
|
||||||
prev_lang = lang
|
prev_lang = lang
|
||||||
|
prev_code = code
|
||||||
|
|
||||||
|
|
||||||
# Write languages.py.
|
# Write languages.py.
|
||||||
|
|
Loading…
Reference in New Issue