forked from zaclys/searxng
[mod] fetch supported languages for several engines
utils/fetch_languages.py fetches the languages supported by each engine and generates engines_languages.json, which lists each engine's supported languages.
This commit is contained in:
parent
92c6e88ad3
commit
f62ce21f50
3256
searx/data/engines_languages.json
Normal file
3256
searx/data/engines_languages.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -20,6 +20,7 @@ from os.path import realpath, dirname
|
||||
import sys
|
||||
from flask_babel import gettext
|
||||
from operator import itemgetter
|
||||
from json import loads
|
||||
from searx import settings
|
||||
from searx import logger
|
||||
from searx.utils import load_module
|
||||
@ -78,6 +79,9 @@ def load_engine(engine_data):
|
||||
if not hasattr(engine, arg_name):
|
||||
setattr(engine, arg_name, arg_value)
|
||||
|
||||
if engine_data['name'] in languages:
|
||||
setattr(engine, 'supported_languages', languages[engine_data['name']])
|
||||
|
||||
# checking required variables
|
||||
for engine_attr in dir(engine):
|
||||
if engine_attr.startswith('_'):
|
||||
@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']:
|
||||
logger.error('No engines found. Edit your settings.yml')
|
||||
exit(2)
|
||||
|
||||
# Per-engine supported languages, keyed by engine name; load_engine() copies
# the matching entry onto each engine module as `supported_languages`.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector.
with open(engine_dir + '/../data/engines_languages.json') as languages_file:
    languages = loads(languages_file.read())
|
||||
|
||||
for engine_data in settings['engines']:
|
||||
engine = load_engine(engine_data)
|
||||
if engine is not None:
|
||||
|
@ -15,12 +15,14 @@
|
||||
|
||||
from urllib import urlencode
|
||||
from lxml import html
|
||||
from requests import get
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
paging = True
|
||||
language_support = True
|
||||
supported_languages_url = 'https://www.bing.com/account/general'
|
||||
|
||||
# search-url
|
||||
base_url = 'https://www.bing.com/'
|
||||
@ -81,3 +83,16 @@ def response(resp):
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Scrape the language codes offered on the engine's settings page.

    Downloads ``supported_languages_url``, parses the body as HTML and reads
    the ``id`` attribute of every ``<input>`` located below the element whose
    id is ``limit-languages``.  Underscores in each id are replaced by
    hyphens.

    Returns:
        list: the resulting code strings, in the order the XPath query
        yields the inputs.

    Raises:
        IndexError: if a matched ``<input>`` has no ``id`` attribute.
    """
    page = html.fromstring(get(supported_languages_url).text)
    checkboxes = page.xpath('//div[@id="limit-languages"]//input')

    # one code per input element
    return [box.xpath('./@id')[0].replace('_', '-') for box in checkboxes]
|
||||
|
@ -19,7 +19,7 @@ from urllib import urlencode
|
||||
from lxml import html
|
||||
from json import loads
|
||||
import re
|
||||
from searx.engines.bing import supported_languages
|
||||
from searx.engines.bing import fetch_supported_languages
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images']
|
||||
|
@ -17,7 +17,7 @@ from datetime import datetime
|
||||
from dateutil import parser
|
||||
from lxml import etree
|
||||
from searx.utils import list_get
|
||||
from searx.engines.bing import supported_languages
|
||||
from searx.engines.bing import fetch_supported_languages
|
||||
|
||||
# engine dependent config
|
||||
categories = ['news']
|
||||
|
@ -15,29 +15,12 @@
|
||||
from urllib import urlencode
|
||||
from json import loads
|
||||
from datetime import datetime
|
||||
from requests import get
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos']
|
||||
paging = True
|
||||
language_support = True
|
||||
supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
|
||||
"ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
|
||||
"cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
|
||||
"de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
|
||||
"fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
|
||||
"gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
|
||||
"hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
|
||||
"is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
|
||||
"km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
|
||||
"la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
|
||||
"mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
|
||||
"nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
|
||||
"oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
|
||||
"rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
|
||||
"sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
|
||||
"su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
|
||||
"ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
|
||||
"uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
|
||||
|
||||
# search-url
|
||||
# see http://www.dailymotion.com/doc/api/obj-video.html
|
||||
@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr
|
||||
embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
|
||||
'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
|
||||
|
||||
supported_languages_url = 'https://api.dailymotion.com/languages'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
@ -92,3 +77,23 @@ def response(resp):
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Download the language list from ``supported_languages_url``.

    The response body is decoded as JSON and its ``'list'`` entries are
    walked.  Every entry contributes one item keyed by its ``'code'``; the
    value is a dict that holds ``'name'`` (taken from ``'native_name'``) and
    ``'english_name'`` (taken from ``'name'``), each only when the source
    value is truthy.

    Returns:
        dict: language code -> dict with the optional keys described above.
    """
    payload = loads(get(supported_languages_url).text)

    languages = {}
    for entry in payload['list']:
        code = entry['code']
        details = languages[code] = {}

        native_name = entry['native_name']
        if native_name:
            details['name'] = native_name

        english_name = entry['name']
        if english_name:
            details['english_name'] = english_name

    return languages
|
||||
|
@ -15,19 +15,15 @@
|
||||
|
||||
from urllib import urlencode
|
||||
from lxml.html import fromstring
|
||||
from requests import get
|
||||
from json import loads
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
paging = True
|
||||
language_support = True
|
||||
supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
|
||||
"es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
|
||||
"el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
|
||||
"kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
|
||||
"es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
|
||||
"sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
|
||||
"th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
|
||||
supported_languages_url = 'https://duckduckgo.com/d2030.js'
|
||||
time_range_support = True
|
||||
|
||||
# search-url
|
||||
@ -65,8 +61,6 @@ def request(query, params):
|
||||
locale = 'xa' + params['language'].split('-')[0]
|
||||
elif params['language'][-2:] == 'GB':
|
||||
locale = 'uk' + params['language'].split('-')[0]
|
||||
elif params['language'] == 'es-419':
|
||||
locale = 'xl-es'
|
||||
else:
|
||||
locale = params['language'].split('-')
|
||||
if len(locale) == 2:
|
||||
@ -120,3 +114,18 @@ def response(resp):
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Extract the region list embedded in the script at ``supported_languages_url``.

    The downloaded text is a JavaScript file; the object that follows the
    ``regions:`` marker is cut out (up to and including the first ``}``) and
    decoded as JSON.  Each key is then rearranged: the part from index 3 on
    comes first, followed by ``-`` and the upper-cased first two characters.
    NOTE(review): this presumably turns ``cc-ll`` region keys into ``ll-CC``
    locale codes -- confirm against the live file.

    Returns:
        list: the rearranged keys.
    """
    marker = 'regions:{'
    script = get(supported_languages_url).text

    # keep the opening brace of the object: skip the marker minus its last char
    object_start = script.find(marker) + len(marker) - 1
    remainder = script[object_start:]
    regions_literal = remainder[:remainder.find('}') + 1]

    regions = loads(regions_literal)
    return [key[3:] + '-' + key[:2].upper() for key in regions.keys()]
|
||||
|
@ -4,7 +4,7 @@ from re import compile, sub
|
||||
from lxml import html
|
||||
from searx.utils import html_to_text
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.engines.duckduckgo import supported_languages
|
||||
from searx.engines.duckduckgo import fetch_supported_languages
|
||||
|
||||
url = 'https://api.duckduckgo.com/'\
|
||||
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'
|
||||
|
@ -14,6 +14,8 @@ from json import loads
|
||||
from random import randint
|
||||
from time import time
|
||||
from urllib import urlencode
|
||||
from requests import get
|
||||
from lxml.html import fromstring
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
@ -40,11 +42,7 @@ url_xpath = './/url'
|
||||
title_xpath = './/title'
|
||||
content_xpath = './/sum'
|
||||
|
||||
supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
|
||||
"nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
|
||||
"th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
|
||||
"hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
|
||||
"hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
|
||||
supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
|
||||
|
||||
|
||||
# do search-request
|
||||
@ -90,3 +88,17 @@ def response(resp):
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Collect language codes from the links on the engine's search page.

    Downloads ``supported_languages_url``, parses it as HTML and inspects the
    ``href`` of every ``<a>`` that is a direct child of the ``<span>`` whose
    id is ``menu2``.  The last two characters of each href are taken as the
    code; the value ``'xx'`` and codes already collected are skipped.

    Returns:
        list: unique two-character codes in first-seen order.

    Raises:
        IndexError: if a matched link has no ``href`` attribute.
    """
    dom = fromstring(get(supported_languages_url).text)

    codes = []
    for anchor in dom.xpath('//span[@id="menu2"]/a'):
        code = anchor.xpath('./@href')[0][-2:]
        if code == 'xx' or code in codes:
            continue
        codes.append(code)

    return codes
|
||||
|
@ -12,6 +12,7 @@ import re
|
||||
from urllib import urlencode
|
||||
from urlparse import urlparse, parse_qsl
|
||||
from lxml import html, etree
|
||||
from requests import get
|
||||
from searx.engines.xpath import extract_text, extract_url
|
||||
from searx.search import logger
|
||||
|
||||
@ -23,20 +24,6 @@ categories = ['general']
|
||||
paging = True
|
||||
language_support = True
|
||||
use_locale_domain = True
|
||||
supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
|
||||
"ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
|
||||
"xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
|
||||
"gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
|
||||
"bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
|
||||
"tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
|
||||
"lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
|
||||
"ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
|
||||
"ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
|
||||
"sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
|
||||
"tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
|
||||
"mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
|
||||
"fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
|
||||
"kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
|
||||
time_range_support = True
|
||||
|
||||
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
|
||||
@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.'
|
||||
maps_path = '/maps'
|
||||
redirect_path = '/url'
|
||||
images_path = '/images'
|
||||
supported_languages_url = 'https://www.google.com/preferences?#languages'
|
||||
|
||||
# specific xpath variables
|
||||
results_xpath = '//div[@class="g"]'
|
||||
@ -373,3 +361,17 @@ def attributes_to_html(attributes):
|
||||
retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
|
||||
retval = retval + '</table>'
|
||||
return retval
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Scrape the interface-language options from ``supported_languages_url``.

    Every ``<option>`` of the ``<select name="hl">`` element is read.  The
    code is the option's ``value`` truncated at the first hyphen, so variants
    that share a prefix end up under one key and the last one seen wins.
    The name is the option text without its final character, title-cased.
    NOTE(review): the dropped final character is presumably trailing
    whitespace -- confirm.

    Returns:
        dict: language code -> ``{"name": <display name>}``.
    """
    page = html.fromstring(get(supported_languages_url).text)

    languages = {}
    for entry in page.xpath('//select[@name="hl"]/option'):
        value = entry.xpath('./@value')[0]
        code = value.split('-')[0]
        label = entry.text[:-1]
        languages[code] = {"name": label.title()}

    return languages
|
||||
|
@ -13,7 +13,7 @@
|
||||
from lxml import html
|
||||
from urllib import urlencode
|
||||
from json import loads
|
||||
from searx.engines.google import supported_languages
|
||||
from searx.engines.google import fetch_supported_languages
|
||||
|
||||
# search-url
|
||||
categories = ['news']
|
||||
|
@ -15,7 +15,6 @@
|
||||
from json import loads
|
||||
from string import Formatter
|
||||
from urllib import urlencode, quote
|
||||
from searx.engines.wikipedia import supported_languages
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
|
@ -20,11 +20,6 @@ from searx.utils import html_to_text
|
||||
categories = None
|
||||
paging = True
|
||||
language_support = True
|
||||
supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
|
||||
"fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
|
||||
"el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
|
||||
"es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
|
||||
"th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
|
||||
|
||||
category_to_keyword = {'general': 'web',
|
||||
'images': 'images',
|
||||
@ -51,15 +46,7 @@ def request(query, params):
|
||||
|
||||
# add language tag if specified
|
||||
if params['language'] != 'all':
|
||||
locale = params['language'].split('-')
|
||||
if len(locale) == 2 and params['language'] in supported_languages:
|
||||
params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
|
||||
else:
|
||||
# try to get a country code for language
|
||||
for lang in supported_languages:
|
||||
if locale[0] == lang.split('-')[0]:
|
||||
params['url'] += '&locale=' + lang.replace('-', '_').lower()
|
||||
break
|
||||
params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
|
||||
|
||||
return params
|
||||
|
||||
|
@ -24,11 +24,6 @@ categories = ['general']
|
||||
|
||||
# paging = False
|
||||
language_support = True
|
||||
supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
|
||||
"ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
|
||||
"el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
|
||||
"lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
|
||||
"sv", "tl", "th", "tr", "uk", "vi"]
|
||||
|
||||
# search-url
|
||||
base_url = 'https://startpage.com/'
|
||||
|
@ -22,7 +22,7 @@ language = ""
|
||||
|
||||
# search-url
|
||||
url = 'http://www.subtitleseeker.com/'
|
||||
search_url = url + 'search/TITLES/{query}&p={pageno}'
|
||||
search_url = url + 'search/TITLES/{query}?p={pageno}'
|
||||
|
||||
# specific xpath variables
|
||||
results_xpath = '//div[@class="boxRows"]'
|
||||
@ -51,7 +51,8 @@ def response(resp):
|
||||
elif resp.search_params['language'] != 'all':
|
||||
search_lang = [lc[3]
|
||||
for lc in language_codes
|
||||
if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
|
||||
if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
|
||||
search_lang = search_lang[0].split(' (')[0]
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath(results_xpath):
|
||||
|
@ -13,17 +13,13 @@
|
||||
from json import loads
|
||||
from urllib import urlencode, unquote
|
||||
import re
|
||||
from requests import get
|
||||
from lxml.html import fromstring
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'images']
|
||||
paging = True
|
||||
language_support = True
|
||||
supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
|
||||
"es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
|
||||
"zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
|
||||
"en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
|
||||
"en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
|
||||
"tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
|
||||
|
||||
# search-url
|
||||
base_url = 'https://swisscows.ch/'
|
||||
@ -114,3 +110,16 @@ def response(resp):
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Scrape the region codes listed on the engine's start page.

    Downloads ``base_url``, parses it as HTML and reads the ``data-val``
    attribute of every link matched by
    ``//div[@id="regions-popup"]//ul/li/a``.

    Returns:
        list: the attribute values, in the order the XPath query yields the
        links.

    Raises:
        IndexError: if a matched link has no ``data-val`` attribute.
    """
    page = fromstring(get(base_url).text)
    region_links = page.xpath('//div[@id="regions-popup"]//ul/li/a')

    return [link.xpath('./@data-val')[0] for link in region_links]
|
||||
|
@ -15,7 +15,7 @@ from searx import logger
|
||||
from searx.poolrequests import get
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.utils import format_date_by_locale
|
||||
from searx.engines.wikipedia import supported_languages
|
||||
from searx.engines.wikipedia import fetch_supported_languages
|
||||
|
||||
from json import loads
|
||||
from lxml.html import fromstring
|
||||
@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
|
||||
|
||||
|
||||
def request(query, params):
|
||||
language = params['language'].split('_')[0]
|
||||
language = params['language'].split('-')[0]
|
||||
if language == 'all':
|
||||
language = 'en'
|
||||
|
||||
@ -72,7 +72,7 @@ def response(resp):
|
||||
html = fromstring(resp.content)
|
||||
wikidata_ids = html.xpath(wikidata_ids_xpath)
|
||||
|
||||
language = resp.search_params['language'].split('_')[0]
|
||||
language = resp.search_params['language'].split('-')[0]
|
||||
if language == 'all':
|
||||
language = 'en'
|
||||
|
||||
|
@ -12,36 +12,9 @@
|
||||
|
||||
from json import loads
|
||||
from urllib import urlencode, quote
|
||||
from requests import get
|
||||
from lxml.html import fromstring
|
||||
|
||||
supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
|
||||
"pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
|
||||
"ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
|
||||
"eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
|
||||
"lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
|
||||
"el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
|
||||
"mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
|
||||
"be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
|
||||
"mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
|
||||
"cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
|
||||
"scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
|
||||
"ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
|
||||
"am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
|
||||
"os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
|
||||
"sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
|
||||
"gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
|
||||
"tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
|
||||
"zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
|
||||
"udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
|
||||
"ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
|
||||
"kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
|
||||
"pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
|
||||
"rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
|
||||
"pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
|
||||
"roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
|
||||
"ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
|
||||
"xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
|
||||
"ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
|
||||
"dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
|
||||
|
||||
# search-url
|
||||
base_url = 'https://{language}.wikipedia.org/'
|
||||
@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
|
||||
'&explaintext'\
|
||||
'&pithumbsize=300'\
|
||||
'&redirects'
|
||||
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
||||
|
||||
|
||||
# set language in base_url
|
||||
@ -142,3 +116,24 @@ def response(resp):
|
||||
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Scrape the list of Wikipedia editions from ``supported_languages_url``.

    Every table whose class contains ``sortable`` is scanned; the first row of
    each table is skipped as the header.  For each remaining row the cells are
    read by position: cell 1 -> English name, cell 2 -> native name,
    cell 3 -> language code, cell 4 -> article count (thousands separators
    removed before conversion to ``int``).  Only editions with at least
    10000 articles are kept.
    NOTE(review): the cell positions mirror the page layout at the time of
    writing -- confirm they still match.

    Returns:
        dict: code -> ``{"name", "english_name", "articles"}``.
    """
    dom = fromstring(get(supported_languages_url).text)

    languages = {}
    for table in dom.xpath('//table[contains(@class,"sortable")]'):
        rows = table.xpath('.//tr')
        # rows[0] is the header row
        for row in rows[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            name = cells[2].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            count_text = cells[4].xpath('./a/b')[0].text
            articles = int(count_text.replace(',', ''))

            if articles < 10000:
                continue
            languages[code] = {"name": name, "english_name": english_name, "articles": articles}

    return languages
|
||||
|
@ -14,16 +14,13 @@
|
||||
from urllib import urlencode
|
||||
from urlparse import unquote
|
||||
from lxml import html
|
||||
from requests import get
|
||||
from searx.engines.xpath import extract_text, extract_url
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
paging = True
|
||||
language_support = True
|
||||
supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
|
||||
"et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
|
||||
"ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
|
||||
"sl", "es", "sv", "th", "tr"]
|
||||
time_range_support = True
|
||||
|
||||
# search-url
|
||||
@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/'
|
||||
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
|
||||
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
|
||||
|
||||
supported_languages_url = 'https://search.yahoo.com/web/advanced'
|
||||
|
||||
# specific xpath variables
|
||||
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
|
||||
url_xpath = './/h3/a/@href'
|
||||
@ -142,3 +141,16 @@ def response(resp):
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
# get supported languages from their site
|
||||
def fetch_supported_languages():
    """Scrape the language codes from the engine's advanced-search form.

    Downloads ``supported_languages_url``, parses it as HTML and reads the
    ``value`` attribute of every input matched by
    ``//div[@id="yschlang"]/span/label/input``.  The first five characters of
    each value are dropped.
    NOTE(review): the dropped prefix is presumably ``lang_`` -- confirm.

    Returns:
        list: the remaining part of each value, in the order the XPath query
        yields the inputs.

    Raises:
        IndexError: if a matched input has no ``value`` attribute.
    """
    page = html.fromstring(get(supported_languages_url).text)
    checkboxes = page.xpath('//div[@id="yschlang"]/span/label/input')

    return [box.xpath('./@value')[0][5:] for box in checkboxes]
|
||||
|
@ -12,7 +12,7 @@
|
||||
from urllib import urlencode
|
||||
from lxml import html
|
||||
from searx.engines.xpath import extract_text, extract_url
|
||||
from searx.engines.yahoo import parse_url, supported_languages
|
||||
from searx.engines.yahoo import parse_url, fetch_supported_languages
|
||||
from datetime import datetime, timedelta
|
||||
import re
|
||||
from dateutil import parser
|
||||
|
@ -4,39 +4,29 @@
|
||||
|
||||
language_codes = (
|
||||
(u"ach", u"Acoli", u"", u""),
|
||||
(u"af", u"Afrikaans", u"", u"Afrikaans"),
|
||||
(u"af", u"Afrikaans", u"", u""),
|
||||
(u"ak", u"Akan", u"", u""),
|
||||
(u"als", u"Alemannisch", u"", u"Alemannic"),
|
||||
(u"am", u"አማርኛ", u"", u"Amharic"),
|
||||
(u"an", u"Aragonés", u"", u"Aragonese"),
|
||||
(u"am", u"አማርኛ", u"", u""),
|
||||
(u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
|
||||
(u"arz", u"مصرى (Maṣri)", u"", u"Egyptian Arabic"),
|
||||
(u"ast", u"Asturianu", u"", u"Asturian"),
|
||||
(u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
|
||||
(u"azb", u"تۆرکجه", u"", u"South Azerbaijani"),
|
||||
(u"ba", u"Башҡорт", u"", u"Bashkir"),
|
||||
(u"ban", u"Balinese", u"", u""),
|
||||
(u"bar", u"Boarisch", u"", u"Bavarian"),
|
||||
(u"be", u"Беларуская", u"", u"Belarusian"),
|
||||
(u"bem", u"Ichibemba", u"", u""),
|
||||
(u"bg-BG", u"Български", u"България", u"Bulgarian"),
|
||||
(u"bn", u"বাংলা", u"", u"Bengali"),
|
||||
(u"bpy", u"ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী", u"", u"Bishnupriya Manipuri"),
|
||||
(u"br", u"Brezhoneg", u"", u"Breton"),
|
||||
(u"bs", u"Bosanski", u"", u"Bosnian"),
|
||||
(u"bug", u"Basa Ugi", u"", u"Buginese"),
|
||||
(u"bn", u"বাংলা", u"", u""),
|
||||
(u"br", u"Brezhoneg", u"", u""),
|
||||
(u"bs", u"Bosanski", u"", u""),
|
||||
(u"ca", u"Català", u"", u"Catalan"),
|
||||
(u"ca-CT", u"Català", u"", u"Catalan"),
|
||||
(u"ca-ES", u"Català", u"Espanya", u"Catalan"),
|
||||
(u"ce", u"Нохчийн", u"", u"Chechen"),
|
||||
(u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
|
||||
(u"chr", u"ᏣᎳᎩ", u"", u""),
|
||||
(u"ckb", u"Soranî / کوردی", u"", u"Sorani"),
|
||||
(u"ckb", u"Central Kurdish", u"", u""),
|
||||
(u"co", u"Corsican", u"", u""),
|
||||
(u"crs", u"Seychellois Creole", u"", u""),
|
||||
(u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
|
||||
(u"cv", u"Чăваш", u"", u"Chuvash"),
|
||||
(u"cy", u"Cymraeg", u"", u"Welsh"),
|
||||
(u"cy", u"Cymraeg", u"", u""),
|
||||
(u"da-DK", u"Dansk", u"Danmark", u"Danish"),
|
||||
(u"de", u"Deutsch", u"", u"German"),
|
||||
(u"de-AT", u"Deutsch", u"Österreich", u"German"),
|
||||
@ -70,148 +60,129 @@ language_codes = (
|
||||
(u"eu", u"Euskara", u"", u"Basque"),
|
||||
(u"fa", u"فارسی", u"", u"Persian"),
|
||||
(u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
|
||||
(u"fo", u"Føroyskt", u"", u"Faroese"),
|
||||
(u"fo", u"Føroyskt", u"", u""),
|
||||
(u"fr", u"Français", u"", u"French"),
|
||||
(u"fr-BE", u"Français", u"Belgique", u"French"),
|
||||
(u"fr-CA", u"Français", u"Canada", u"French"),
|
||||
(u"fr-CH", u"Français", u"Suisse", u"French"),
|
||||
(u"fr-FR", u"Français", u"France", u"French"),
|
||||
(u"fy", u"Frysk", u"", u"West Frisian"),
|
||||
(u"ga", u"Gaeilge", u"", u"Irish"),
|
||||
(u"fy", u"West-Frysk", u"", u""),
|
||||
(u"ga", u"Gaeilge", u"", u""),
|
||||
(u"gaa", u"Ga", u"", u""),
|
||||
(u"gd", u"Gàidhlig", u"", u"Scottish Gaelic"),
|
||||
(u"gd", u"Gàidhlig", u"", u""),
|
||||
(u"gl", u"Galego", u"", u"Galician"),
|
||||
(u"gn", u"Guarani", u"", u""),
|
||||
(u"gu", u"ગુજરાતી", u"", u"Gujarati"),
|
||||
(u"gu", u"ગુજરાતી", u"", u""),
|
||||
(u"ha", u"Hausa", u"", u""),
|
||||
(u"haw", u"ʻŌlelo HawaiʻI", u"", u""),
|
||||
(u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
|
||||
(u"hi", u"हिन्दी", u"", u"Hindi"),
|
||||
(u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
|
||||
(u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
|
||||
(u"ht", u"Krèyol ayisyen", u"", u"Haitian"),
|
||||
(u"ht", u"Haitian Creole", u"", u""),
|
||||
(u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
|
||||
(u"hy", u"Հայերեն", u"", u"Armenian"),
|
||||
(u"ia", u"Interlingua", u"", u"Interlingua"),
|
||||
(u"ia", u"Interlingua", u"", u""),
|
||||
(u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
|
||||
(u"ig", u"Igbo", u"", u""),
|
||||
(u"io", u"Ido", u"", u"Ido"),
|
||||
(u"is", u"Íslenska", u"", u"Icelandic"),
|
||||
(u"is", u"Íslenska", u"", u""),
|
||||
(u"it", u"Italiano", u"", u"Italian"),
|
||||
(u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
|
||||
(u"it-IT", u"Italiano", u"Italia", u"Italian"),
|
||||
(u"iw", u"עברית", u"", u""),
|
||||
(u"ja-JP", u"日本語", u"日本", u"Japanese"),
|
||||
(u"jv", u"Basa Jawa", u"", u"Javanese"),
|
||||
(u"ka", u"ქართული", u"", u"Georgian"),
|
||||
(u"kg", u"Kongo", u"", u""),
|
||||
(u"kk", u"Қазақша", u"", u"Kazakh"),
|
||||
(u"km", u"ខ្មែរ", u"", u""),
|
||||
(u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
|
||||
(u"kn", u"ಕನ್ನಡ", u"", u""),
|
||||
(u"ko-KR", u"한국어", u"대한민국", u"Korean"),
|
||||
(u"kri", u"Krio (Sierra Leone)", u"", u""),
|
||||
(u"ku", u"Kurdî / كوردی", u"", u"Kurdish"),
|
||||
(u"ky", u"Кыргызча", u"", u"Kirghiz"),
|
||||
(u"kri", u"Krio", u"", u""),
|
||||
(u"ky", u"Кыргызча", u"", u""),
|
||||
(u"la", u"Latina", u"", u"Latin"),
|
||||
(u"lb", u"Lëtzebuergesch", u"", u"Luxembourgish"),
|
||||
(u"lg", u"Luganda", u"", u""),
|
||||
(u"li", u"Limburgs", u"", u"Limburgish"),
|
||||
(u"lmo", u"Lumbaart", u"", u"Lombard"),
|
||||
(u"ln", u"Lingála", u"", u""),
|
||||
(u"lo", u"ລາວ", u"", u""),
|
||||
(u"loz", u"Lozi", u"", u""),
|
||||
(u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
|
||||
(u"lua", u"Luba-Lulua", u"", u""),
|
||||
(u"lv-LV", u"Latviešu", u"Latvijas Republika", u"Latvian"),
|
||||
(u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
|
||||
(u"mfe", u"Kreol Morisien", u"", u""),
|
||||
(u"mg", u"Malagasy", u"", u"Malagasy"),
|
||||
(u"mg", u"Malagasy", u"", u""),
|
||||
(u"mi", u"Maori", u"", u""),
|
||||
(u"min", u"Minangkabau", u"", u"Minangkabau"),
|
||||
(u"mk", u"Македонски", u"", u"Macedonian"),
|
||||
(u"ml", u"മലയാളം", u"", u"Malayalam"),
|
||||
(u"mn", u"Монгол", u"", u"Mongolian"),
|
||||
(u"mr", u"मराठी", u"", u"Marathi"),
|
||||
(u"mrj", u"Кырык Мары (Kyryk Mary)", u"", u"Hill Mari"),
|
||||
(u"mk", u"Македонски", u"", u""),
|
||||
(u"ml", u"മലയാളം", u"", u""),
|
||||
(u"mn", u"Монгол", u"", u""),
|
||||
(u"mr", u"मराठी", u"", u""),
|
||||
(u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
|
||||
(u"mt", u"Malti", u"", u""),
|
||||
(u"my", u"မြန်မာဘာသာ", u"", u"Burmese"),
|
||||
(u"mzn", u"مَزِروني", u"", u"Mazandarani"),
|
||||
(u"nah", u"Nāhuatl", u"", u"Nahuatl"),
|
||||
(u"nap", u"Nnapulitano", u"", u"Neapolitan"),
|
||||
(u"nds-nl", u"Plattdüütsch", u"Nedderlannen", u"Low Saxon"),
|
||||
(u"ne", u"नेपाली", u"", u"Nepali"),
|
||||
(u"new", u"नेपाल भाषा", u"", u"Newar"),
|
||||
(u"my", u"ဗမာ", u"", u""),
|
||||
(u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
|
||||
(u"ne", u"नेपाली", u"", u""),
|
||||
(u"nl", u"Nederlands", u"", u"Dutch"),
|
||||
(u"nl-BE", u"Nederlands", u"België", u"Dutch"),
|
||||
(u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
|
||||
(u"nn", u"Nynorsk", u"", u"Norwegian (Nynorsk)"),
|
||||
(u"no-NO", u"Norsk (Bokmål)", u"Norge", u"Norwegian (Bokmål)"),
|
||||
(u"nn", u"Nynorsk", u"", u"Norwegian"),
|
||||
(u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
|
||||
(u"nso", u"Northern Sotho", u"", u""),
|
||||
(u"ny", u"Nyanja", u"", u""),
|
||||
(u"nyn", u"Runyankore", u"", u""),
|
||||
(u"oc", u"Occitan", u"", u"Occitan"),
|
||||
(u"oc", u"Occitan", u"", u""),
|
||||
(u"om", u"Oromoo", u"", u""),
|
||||
(u"or", u"ଓଡ଼ିଆ", u"", u"Oriya"),
|
||||
(u"os", u"Иронау", u"", u"Ossetian"),
|
||||
(u"pa", u"ਪੰਜਾਬੀ", u"", u"Punjabi"),
|
||||
(u"or", u"ଓଡ଼ିଆ", u"", u""),
|
||||
(u"pa", u"ਪੰਜਾਬੀ", u"", u""),
|
||||
(u"pcm", u"Nigerian Pidgin", u"", u""),
|
||||
(u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
|
||||
(u"pms", u"Piemontèis", u"", u"Piedmontese"),
|
||||
(u"pnb", u"شاہ مکھی پنجابی (Shāhmukhī Pañjābī)", u"", u"Western Punjabi"),
|
||||
(u"ps", u"پښتو", u"", u""),
|
||||
(u"pt", u"Português", u"", u"Portuguese"),
|
||||
(u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
|
||||
(u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
|
||||
(u"qu", u"Runa Simi", u"", u"Quechua"),
|
||||
(u"qu", u"Runasimi", u"", u""),
|
||||
(u"rm", u"Rumantsch", u"", u""),
|
||||
(u"rn", u"Ikirundi", u"", u""),
|
||||
(u"ro-RO", u"Română", u"România", u"Romanian"),
|
||||
(u"ru-RU", u"Русский", u"Россия", u"Russian"),
|
||||
(u"rw", u"Kinyarwanda", u"", u""),
|
||||
(u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
|
||||
(u"sah", u"Саха тыла (Saxa Tyla)", u"", u"Sakha"),
|
||||
(u"scn", u"Sicilianu", u"", u"Sicilian"),
|
||||
(u"sco", u"Scots", u"", u"Scots"),
|
||||
(u"sd", u"Sindhi", u"", u""),
|
||||
(u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
|
||||
(u"si", u"සිංහල", u"", u"Sinhalese"),
|
||||
(u"si", u"සිංහල", u"", u""),
|
||||
(u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
|
||||
(u"sl-SI", u"Slovenščina", u"Slovenija", u"Slovenian"),
|
||||
(u"sl", u"Slovenščina", u"", u"Slovenian"),
|
||||
(u"sn", u"Chishona", u"", u""),
|
||||
(u"so", u"Soomaali", u"", u""),
|
||||
(u"sq", u"Shqip", u"", u"Albanian"),
|
||||
(u"sr-ME", u"Српски / Srpski", u"Црна Гора", u"Serbian"),
|
||||
(u"sq", u"Shqip", u"", u""),
|
||||
(u"sr", u"Српски / Srpski", u"", u"Serbian"),
|
||||
(u"st", u"Southern Sotho", u"", u""),
|
||||
(u"su", u"Basa Sunda", u"", u"Sundanese"),
|
||||
(u"su", u"Sundanese", u"", u""),
|
||||
(u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
|
||||
(u"sw", u"Kiswahili", u"", u"Swahili"),
|
||||
(u"ta", u"தமிழ்", u"", u"Tamil"),
|
||||
(u"te", u"తెలుగు", u"", u"Telugu"),
|
||||
(u"tg", u"Тоҷикӣ", u"", u"Tajik"),
|
||||
(u"sw", u"Kiswahili", u"", u""),
|
||||
(u"ta", u"தமிழ்", u"", u""),
|
||||
(u"te", u"తెలుగు", u"", u""),
|
||||
(u"tg", u"Tajik", u"", u""),
|
||||
(u"th-TH", u"ไทย", u"ไทย", u"Thai"),
|
||||
(u"ti", u"ትግርኛ", u"", u""),
|
||||
(u"tk", u"Turkmen", u"", u""),
|
||||
(u"tl-PH", u"Tagalog", u"Pilipinas", u"Tagalog"),
|
||||
(u"tl-PH", u"Filipino", u"Pilipinas", u""),
|
||||
(u"tlh", u"Klingon", u"", u""),
|
||||
(u"tn", u"Tswana", u"", u""),
|
||||
(u"to", u"Lea Fakatonga", u"", u""),
|
||||
(u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
|
||||
(u"tt", u"Tatarça / Татарча", u"", u"Tatar"),
|
||||
(u"tt", u"Tatar", u"", u""),
|
||||
(u"tum", u"Tumbuka", u"", u""),
|
||||
(u"tw", u"Twi", u"", u""),
|
||||
(u"ug", u"ئۇيغۇرچە", u"", u""),
|
||||
(u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
|
||||
(u"ur", u"اردو", u"", u"Urdu"),
|
||||
(u"uz", u"O‘zbek", u"", u"Uzbek"),
|
||||
(u"vec", u"Vèneto", u"", u"Venetian"),
|
||||
(u"ve", u"Venda", u"", u"Venda"),
|
||||
(u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
|
||||
(u"vo", u"Volapük", u"", u"Volapük"),
|
||||
(u"wa", u"Walon", u"", u"Walloon"),
|
||||
(u"war", u"Winaray", u"", u"Waray-Waray"),
|
||||
(u"wo", u"Wolof", u"", u""),
|
||||
(u"xh", u"Xhosa", u"", u""),
|
||||
(u"yi", u"ייִדיש", u"", u"Yiddish"),
|
||||
(u"yo", u"Yorùbá", u"", u"Yoruba"),
|
||||
(u"yi", u"ייִדיש", u"", u""),
|
||||
(u"yo", u"Èdè Yorùbá", u"", u""),
|
||||
(u"zh", u"中文", u"", u"Chinese"),
|
||||
(u"zh-CN", u"中文", u"中国", u"Chinese"),
|
||||
(u"zh-HK", u"中文", u"香港", u"Chinese"),
|
||||
|
@ -514,7 +514,7 @@ def index():
|
||||
answers=result_container.answers,
|
||||
infoboxes=result_container.infoboxes,
|
||||
paging=result_container.paging,
|
||||
current_language=search.lang,
|
||||
current_language=search_query.lang,
|
||||
base_url=get_base_url(),
|
||||
theme=get_current_theme_name(),
|
||||
favicons=global_favicons[themes.index(get_current_theme_name())]
|
||||
|
@ -17,7 +17,7 @@ class TestSubtitleseekerEngine(SearxTestCase):
|
||||
|
||||
def test_response(self):
|
||||
dicto = defaultdict(dict)
|
||||
dicto['language'] = 'fr_FR'
|
||||
dicto['language'] = 'fr-FR'
|
||||
response = mock.Mock(search_params=dicto)
|
||||
|
||||
self.assertRaises(AttributeError, subtitleseeker.response, None)
|
||||
|
@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
|
||||
class TestWikipediaEngine(SearxTestCase):
|
||||
|
||||
def test_request(self):
|
||||
wikipedia.supported_languages = ['fr', 'en']
|
||||
|
||||
query = 'test_query'
|
||||
dicto = defaultdict(dict)
|
||||
dicto['language'] = 'fr-FR'
|
||||
|
164
utils/fetch_languages.py
Normal file
164
utils/fetch_languages.py
Normal file
@ -0,0 +1,164 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This script generates languages.py by joining each engine's supported languages.
|
||||
#
|
||||
# The country names are obtained from http://api.geonames.org which requires registering as a user.
|
||||
#
|
||||
# Output files (engines_languages.json and languages.py)
|
||||
# are written in current directory to avoid overwriting in case something goes wrong.
|
||||
|
||||
from requests import get
|
||||
from urllib import urlencode
|
||||
from lxml.html import fromstring
|
||||
from json import loads, dumps
|
||||
import io
|
||||
from sys import path
|
||||
path.append('../searx') # noqa
|
||||
from searx.engines import engines
|
||||
|
||||
# Geonames API for country names.
|
||||
geonames_user = '' # ADD USER NAME HERE
|
||||
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
|
||||
|
||||
# Output files.
|
||||
engines_languages_file = 'engines_languages.json'
|
||||
languages_file = 'languages.py'
|
||||
|
||||
engines_languages = {}
|
||||
languages = {}
|
||||
|
||||
|
||||
# To filter out invalid codes and dialects.
|
||||
def valid_code(lang_code):
    """Tell whether a language code should be kept.

    Returns False for blacklisted codes (anything starting with "xx",
    "sl-SL", "wt-WT", "jw", or ending in "UK", "XA" or "XL") and for
    dialect-like codes: more than two dash-separated parts, a language
    part longer than three characters, or a country part longer than two.
    Returns True otherwise.
    """
    # sl-SL is technically not invalid, but still a mistake
    rejected_codes = ('sl-SL', 'wt-WT', 'jw')
    rejected_suffixes = ('UK', 'XA', 'XL')

    if lang_code[:2] == 'xx':
        return False
    if lang_code in rejected_codes:
        return False
    if lang_code[-2:] in rejected_suffixes:
        return False

    # filter dialects
    parts = lang_code.split('-')
    if len(parts) > 2:
        return False
    if len(parts[0]) > 3:
        return False

    return not (len(parts) == 2 and len(parts[1]) > 2)
|
||||
|
||||
|
||||
# Get country name in specified language.
|
||||
def get_country_name(locale):
    """Return the country name for a "lang-COUNTRY" locale, in that language.

    Queries the Geonames countryInfoJSON endpoint (``country_names_url``)
    with the language, the country code and ``geonames_user``.

    Returns an empty string when no Geonames user is configured, when
    ``locale`` is not made of exactly two dash-separated parts, or when
    the reply does not contain exactly one entry under "geonames".
    """
    # Compare by value/truthiness: "is ''" tests object identity and only
    # happens to work when the interpreter interns the empty string.
    if not geonames_user:
        return ''

    locale = locale.split('-')
    if len(locale) != 2:
        return ''

    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
                                                         'country': locale[1],
                                                         'username': geonames_user}))
    response = get(url)
    # named "data" so the local does not shadow the json module name
    data = loads(response.text)
    content = data.get('geonames', None)
    if content is None or len(content) != 1:
        print("No country name found for " + locale[0] + "-" + locale[1])
        return ''

    return content[0].get('countryName', '')
|
||||
|
||||
|
||||
# Fetches the supported languages of each engine and writes them to a JSON file.
|
||||
def fetch_supported_languages():
    """Collect every engine's supported languages and dump them as JSON.

    Calls ``fetch_supported_languages()`` on each engine that defines it
    and stores the result in ``engines_languages`` under the engine name.
    An engine whose fetch raises is reported on stdout and left out, so a
    single failure does not abort the run. The collected mapping is then
    written to ``engines_languages_file`` as UTF-8 JSON.
    """
    for engine_name in engines:
        engine = engines[engine_name]
        if not hasattr(engine, 'fetch_supported_languages'):
            continue

        try:
            engines_languages[engine_name] = engine.fetch_supported_languages()
        except Exception as e:
            # best effort: keep going with the remaining engines
            print(e)

    # write json file; the context manager closes the handle even if
    # serializing or writing fails
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        f.write(unicode(dumps(engines_languages, indent=4, ensure_ascii=False, encoding="utf-8")))
|
||||
|
||||
|
||||
# Join all language lists.
|
||||
# Iterate all languages supported by each engine.
|
||||
def join_language_lists():
    """Merge the per-engine language lists into the global ``languages``.

    Steps:
      1. Seed ``languages`` with the valid wikipedia entries that have at
         least 100000 articles (wikipedia goes first for more accurate
         language names). Nothing is seeded when no wikipedia data was
         fetched.
      2. Add every valid locale of every engine. A locale that is missing,
         or present without a name, takes the engine's entry when the
         engine provides a dict whose entry has no "articles" field or at
         least 100000 articles; otherwise it becomes an empty dict.
      3. For each locale still without a name, copy the name and English
         name of its base language (the part before "-") and look up the
         country name; drop the locale when the base language has no name.
    """
    min_articles = 100000

    # include wikipedia first for more accurate language names
    # exclude languages with too few articles
    # (.get: the wikipedia fetch may have failed and left no entry)
    wikipedia_languages = engines_languages.get('wikipedia', {})
    languages.update({code: lang for code, lang
                      in wikipedia_languages.items()
                      if valid_code(code) and lang['articles'] >= min_articles})

    for engine_name in engines_languages:
        engine_languages = engines_languages[engine_name]
        has_details = isinstance(engine_languages, dict)

        for locale in engine_languages:
            if not valid_code(locale):
                continue

            # only fill in languages not on the list or without a name yet
            if locale in languages and languages[locale].get('name'):
                continue

            if has_details \
               and engine_languages[locale].get('articles', float('inf')) >= min_articles:
                languages[locale] = engine_languages[locale]
            else:
                languages[locale] = {}

    # get locales that have no name yet
    # (iterate over a snapshot because entries are deleted inside the loop)
    for locale in list(languages):
        if languages[locale].get('name'):
            continue

        # try to get language and country names
        base_language = languages.get(locale.split('-')[0], {})
        name = base_language.get('name', None)
        if name:
            languages[locale]['name'] = name
            languages[locale]['country'] = get_country_name(locale) or ''
            languages[locale]['english_name'] = base_language.get('english_name', '')
        else:
            # filter out locales with no name
            del languages[locale]
|
||||
|
||||
|
||||
# Remove countryless language if language is featured in only one country.
|
||||
def filter_single_country_languages():
    """Drop a bare language code when it has exactly one regional variant.

    Example: with "sv" and "sv-SE" as the only Swedish entries, "sv" is
    removed and "sv-SE" is kept. Languages with several regional codes,
    or with none, are left untouched.

    Counting is done per language up front, so the outcome does not
    depend on iteration order: the alphabetically last language is
    handled like any other, and a language that only appears with
    regional codes (no bare entry) is simply skipped.
    """
    # number of "lang-COUNTRY" codes seen for each language
    country_counts = {}
    for code in languages:
        parts = code.split('-')
        if len(parts) > 1:
            country_counts[parts[0]] = country_counts.get(parts[0], 0) + 1

    for lang, count in country_counts.items():
        if count == 1 and lang in languages:
            del languages[lang]
|
||||
|
||||
|
||||
# Write languages.py.
|
||||
def write_languages_file():
    """Write ``languages`` to ``languages_file`` as a Python module.

    The module defines ``language_codes``, a tuple with one
    (code, name, country, english_name) entry per language, sorted by
    code. Anything from " (" onwards is cut from the name and the
    English name.

    The content is built completely before the file is opened, so a
    failure while formatting does not leave a truncated file behind, and
    an empty ``languages`` yields a valid empty tuple.
    """
    entries = []
    for code in sorted(languages):
        lang = languages[code]
        entries.append('\n (u"' + code + '"'
                       + ', u"' + lang['name'].split(' (')[0] + '"'
                       + ', u"' + lang.get('country', '') + '"'
                       + ', u"' + lang.get('english_name', '').split(' (')[0] + '")')

    file_content = '# -*- coding: utf-8 -*-\n'
    file_content += '# list of language codes\n'
    file_content += '# this file is generated automatically by utils/fetch_languages.py\n'
    file_content += '\nlanguage_codes = ('
    # joining puts a comma between entries and none after the last one
    file_content += ','.join(entries)
    file_content += '\n)\n'

    with open(languages_file, 'w') as new_file:
        new_file.write(file_content.encode('utf8'))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Each step works on the data produced by the previous one:
    # fetch per-engine lists, merge them, prune, then write languages.py.
    for step in (fetch_supported_languages,
                 join_language_lists,
                 filter_single_country_languages,
                 write_languages_file):
        step()
|
@ -1,169 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This script generates languages.py from
|
||||
# intersecting each engine's supported languages.
|
||||
#
|
||||
# The language's native names are obtained from
|
||||
# Wikipedia and Google's supported languages.
|
||||
#
|
||||
# The country names are obtained from http://api.geonames.org
|
||||
# which requires registering as a user.
|
||||
#
|
||||
# Output file (languages.py) is written in current directory
|
||||
# to avoid overwriting in case something goes wrong.
|
||||
|
||||
from requests import get
|
||||
from urllib import urlencode
|
||||
from lxml.html import fromstring
|
||||
from json import loads
|
||||
from sys import path
|
||||
path.append('../searx')
|
||||
from searx.engines import engines
|
||||
|
||||
# list of names
|
||||
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
||||
google_languages_url = 'https://www.google.com/preferences?#languages'
|
||||
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
|
||||
|
||||
geonames_user = '' # add user name here
|
||||
|
||||
google_json_name = 'google.preferences.langMap'
|
||||
|
||||
languages = {}
|
||||
|
||||
|
||||
# To filter out invalid codes and dialects.
|
||||
def valid_code(lang_code):
    """Tell whether a language code should be kept.

    Returns False for blacklisted codes (anything starting with "xx",
    "sl-SL", "jw", or ending in "UK", "XA" or "XL") and for dialect-like
    codes: more than two dash-separated parts, a language part longer
    than three characters, or a country part longer than two.
    Returns True otherwise.
    """
    # sl-SL is technically not invalid, but still a mistake
    rejected_codes = ('sl-SL', 'jw')
    rejected_suffixes = ('UK', 'XA', 'XL')

    if lang_code[:2] == 'xx':
        return False
    if lang_code in rejected_codes:
        return False
    if lang_code[-2:] in rejected_suffixes:
        return False

    # filter dialects
    parts = lang_code.split('-')
    if len(parts) > 2:
        return False
    if len(parts[0]) > 3:
        return False

    return not (len(parts) == 2 and len(parts[1]) > 2)
|
||||
|
||||
|
||||
# Get country name in specified language.
|
||||
def get_country_name(locale):
    """Return the country name for a "lang-COUNTRY" locale, in that language.

    Queries the Geonames countryInfoJSON endpoint (``country_names_url``)
    with the language, the country code and ``geonames_user``.

    Returns an empty string when no Geonames user is configured, when
    ``locale`` is not made of exactly two dash-separated parts, or when
    the reply does not contain exactly one entry under "geonames" (the
    decoded reply is printed in that case).
    """
    # Compare by value/truthiness: "is ''" tests object identity and only
    # happens to work when the interpreter interns the empty string.
    if not geonames_user:
        return ''

    locale = locale.split('-')
    if len(locale) != 2:
        return ''

    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
                                                         'country': locale[1],
                                                         'username': geonames_user}))
    response = get(url)
    # named "data" so the local does not shadow the json module name
    data = loads(response.text)
    content = data.get('geonames', None)
    if content is None or len(content) != 1:
        print("No country name found for " + locale[0] + "-" + locale[1])
        print(data)
        return ''

    return content[0].get('countryName', '')
|
||||
|
||||
|
||||
# Get language names from Wikipedia.
|
||||
def get_wikipedia_languages():
    """Add language names scraped from ``wiki_languages_url``.

    Walks every row but the first of each table whose class contains
    "sortable". For each row, the code is read from the fourth cell, the
    native name from the third, the English name from the second and the
    article count from the fifth. A (name, '', english_name) tuple is
    stored in ``languages`` for codes that are not known yet, have at
    least 10000 articles and pass ``valid_code``.
    """
    page = fromstring(get(wiki_languages_url).text)

    for table in page.xpath('//table[contains(@class,"sortable")]'):
        # exclude header row
        for row in table.xpath('.//tr')[1:]:
            cells = row.xpath('./td')
            code = cells[3].xpath('./a')[0].text
            native_name = cells[2].xpath('./a')[0].text
            english_name = cells[1].xpath('./a')[0].text
            article_count = int(cells[4].xpath('./a/b')[0].text.replace(',', ''))

            # exclude language variants and languages with few articles
            if code in languages:
                continue
            if article_count < 10000:
                continue
            if not valid_code(code):
                continue

            languages[code] = (native_name, '', english_name)
|
||||
|
||||
|
||||
# Get language names from Google.
|
||||
def get_google_languages():
    """Add language names scraped from ``google_languages_url``.

    Reads every <option> of the <select name="hl"> element. The code is
    the option value up to the first "-"; the name is the option text
    without its last character, title-cased. Codes that are not known
    yet and pass ``valid_code`` are stored in ``languages`` as
    (name, '', '').
    """
    page = fromstring(get(google_languages_url).text)

    for option in page.xpath('//select[@name="hl"]/option'):
        value = option.xpath('./@value')[0]
        code = value.split('-')[0]
        name = option.text[:-1].title()

        if code in languages or not valid_code(code):
            continue

        languages[code] = (name, '', '')
|
||||
|
||||
|
||||
# Join all language lists.
|
||||
# iterate all languages supported by each engine
|
||||
def join_language_lists():
    """Add every engine's supported locales to the global ``languages``.

    Underscores in a locale are replaced by dashes. A locale that is not
    known yet and passes ``valid_code`` inherits the name and English
    name of its base language (the part before "-"), together with the
    country name returned by ``get_country_name``. Locales whose base
    language is unknown are printed with the engine name and skipped.
    """
    for engine_name in engines:
        for locale in engines[engine_name].supported_languages:
            locale = locale.replace('_', '-')
            if locale in languages or not valid_code(locale):
                continue

            # try to get language name
            language = languages.get(locale.split('-')[0], None)
            if language is None:
                print(engine_name + ": " + locale)
                continue

            country = get_country_name(locale)
            languages[locale] = (language[0], country, language[2])
|
||||
|
||||
|
||||
# Remove countryless language if language is featured in only one country.
|
||||
def filter_single_country_languages():
    """Drop a bare language code when it has exactly one regional variant.

    Example: with "sv" and "sv-SE" as the only Swedish entries, "sv" is
    removed and "sv-SE" is kept. Languages with several regional codes,
    or with none, are left untouched.

    Counting is done per language up front, so the outcome does not
    depend on iteration order: the alphabetically last language is
    handled like any other, and a language that only appears with
    regional codes (no bare entry) is simply skipped.
    """
    # number of "lang-COUNTRY" codes seen for each language
    country_counts = {}
    for code in languages:
        parts = code.split('-')
        if len(parts) > 1:
            country_counts[parts[0]] = country_counts.get(parts[0], 0) + 1

    for lang, count in country_counts.items():
        if count == 1 and lang in languages:
            del languages[lang]
|
||||
|
||||
|
||||
# Write languages.py.
|
||||
def write_languages_file():
    """Write ``languages`` to languages.py as a Python module.

    The module defines ``language_codes``, a tuple with one
    (code, name, country, english_name) entry per language, sorted by
    code.

    The content is built completely before the file is opened, so a
    failure while formatting does not leave a truncated file behind, and
    an empty ``languages`` yields a valid empty tuple.
    """
    entries = []
    for code in sorted(languages):
        (name, country, english) = languages[code]
        entries.append('\n (u"' + code + '"'
                       + ', u"' + name + '"'
                       + ', u"' + country + '"'
                       + ', u"' + english + '")')

    file_content = '# -*- coding: utf-8 -*-\n'
    file_content += '# list of language codes\n'
    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
    file_content += '\nlanguage_codes = ('
    # joining puts a comma between entries and none after the last one
    file_content += ','.join(entries)
    file_content += '\n)\n'

    with open('languages.py', 'w') as new_file:
        new_file.write(file_content.encode('utf8'))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the steps in their original order: collect names from Wikipedia
    # and Google, add the engines' locales, prune, then write languages.py.
    for step in (get_wikipedia_languages,
                 get_google_languages,
                 join_language_lists,
                 filter_single_country_languages,
                 write_languages_file):
        step()
|
Loading…
Reference in New Issue
Block a user