Merge branch 'master' into fix-engine-spotify

This commit is contained in:
Markus Heiser 2019-12-29 09:47:06 +01:00 committed by GitHub
commit 36e72a4619
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
244 changed files with 10745 additions and 11499 deletions

View file

@ -27,7 +27,7 @@ from json import loads
from requests import get
from searx import settings
from searx import logger
from searx.utils import load_module, match_language
from searx.utils import load_module, match_language, get_engine_from_settings
logger = logger.getChild('engines')
@ -53,7 +53,8 @@ engine_default_args = {'paging': False,
'disabled': False,
'suspend_end_time': 0,
'continuous_errors': 0,
'time_range_support': False}
'time_range_support': False,
'offline': False}
def load_engine(engine_data):
@ -128,14 +129,16 @@ def load_engine(engine_data):
engine.stats = {
'result_count': 0,
'search_count': 0,
'page_load_time': 0,
'page_load_count': 0,
'engine_time': 0,
'engine_time_count': 0,
'score_count': 0,
'errors': 0
}
if not engine.offline:
engine.stats['page_load_time'] = 0
engine.stats['page_load_count'] = 0
for category_name in engine.categories:
categories.setdefault(category_name, []).append(engine)
@ -173,11 +176,6 @@ def get_engines_stats():
results_num = \
engine.stats['result_count'] / float(engine.stats['search_count'])
if engine.stats['page_load_count'] != 0:
load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
else:
load_times = 0
if engine.stats['engine_time_count'] != 0:
this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa
else:
@ -189,14 +187,19 @@ def get_engines_stats():
else:
score = score_per_result = 0.0
max_pageload = max(load_times, max_pageload)
if not engine.offline:
load_times = 0
if engine.stats['page_load_count'] != 0:
load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
max_pageload = max(load_times, max_pageload)
pageloads.append({'avg': load_times, 'name': engine.name})
max_engine_times = max(this_engine_time, max_engine_times)
max_results = max(results_num, max_results)
max_score = max(score, max_score)
max_score_per_result = max(score_per_result, max_score_per_result)
max_errors = max(max_errors, engine.stats['errors'])
pageloads.append({'avg': load_times, 'name': engine.name})
engine_times.append({'avg': this_engine_time, 'name': engine.name})
results.append({'avg': results_num, 'name': engine.name})
scores.append({'avg': score, 'name': engine.name})
@ -255,7 +258,7 @@ def initialize_engines(engine_list):
load_engines(engine_list)
def engine_init(engine_name, init_fn):
init_fn()
init_fn(get_engine_from_settings(engine_name))
logger.debug('%s engine: Initialized', engine_name)
for engine_name, engine in engines.items():

View file

@ -17,6 +17,7 @@ from searx.url_utils import urlencode
categories = ['science']
paging = True
base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
+ '{query}&start={offset}&max_results={number_of_results}'
@ -29,7 +30,7 @@ def request(query, params):
# basic search
offset = (params['pageno'] - 1) * number_of_results
string_args = dict(query=query,
string_args = dict(query=query.decode('utf-8'),
offset=offset,
number_of_results=number_of_results)

View file

@ -13,10 +13,14 @@
@todo publishedDate
"""
import re
from lxml import html
from searx import logger, utils
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
from searx.utils import match_language, gen_useragent
from searx.utils import match_language, gen_useragent, eval_xpath
logger = logger.getChild('bing engine')
# engine dependent config
categories = ['general']
@ -30,9 +34,13 @@ base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'
def _get_offset_from_pageno(pageno):
return (pageno - 1) * 10 + 1
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
offset = _get_offset_from_pageno(params.get('pageno', 0))
if params['language'] == 'all':
lang = 'EN'
@ -47,29 +55,21 @@ def request(query, params):
params['url'] = base_url + search_path
params['headers']['User-Agent'] = gen_useragent('Windows NT 6.3; WOW64')
return params
# get response from search-request
def response(resp):
results = []
result_len = 0
dom = html.fromstring(resp.text)
try:
results.append({'number_of_results': int(dom.xpath('//span[@class="sb_count"]/text()')[0]
.split()[0].replace(',', ''))})
except:
pass
# parse results
for result in dom.xpath('//div[@class="sa_cc"]'):
link = result.xpath('.//h3/a')[0]
for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
link = eval_xpath(result, './/h3/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(result.xpath('.//p'))
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url,
@ -77,18 +77,35 @@ def response(resp):
'content': content})
# parse results again if nothing is found yet
for result in dom.xpath('//li[@class="b_algo"]'):
link = result.xpath('.//h2/a')[0]
for result in eval_xpath(dom, '//li[@class="b_algo"]'):
link = eval_xpath(result, './/h2/a')[0]
url = link.attrib.get('href')
title = extract_text(link)
content = extract_text(result.xpath('.//p'))
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results
try:
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()'))
result_len_container = utils.to_string(result_len_container)
if "-" in result_len_container:
# Remove the part "from-to" for paginated request ...
result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]
result_len_container = re.sub('[^0-9]', '', result_len_container)
if len(result_len_container) > 0:
result_len = int(result_len_container)
except Exception as e:
logger.debug('result error :\n%s', e)
pass
if _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
return []
results.append({'number_of_results': result_len})
return results
@ -96,9 +113,9 @@ def response(resp):
def _fetch_supported_languages(resp):
supported_languages = []
dom = html.fromstring(resp.text)
options = dom.xpath('//div[@id="limit-languages"]//input')
options = eval_xpath(dom, '//div[@id="limit-languages"]//input')
for option in options:
code = option.xpath('./@id')[0].replace('_', '-')
code = eval_xpath(option, './@id')[0].replace('_', '-')
if code == 'nb':
code = 'no'
supported_languages.append(code)

View file

@ -15,7 +15,7 @@
from json import loads
from datetime import datetime
from searx.url_utils import urlencode
from searx.utils import match_language
from searx.utils import match_language, html_to_text
# engine dependent config
categories = ['videos']
@ -59,7 +59,7 @@ def response(resp):
for res in search_res['list']:
title = res['title']
url = res['url']
content = res['description']
content = html_to_text(res['description'])
thumbnail = res['thumbnail_360_url']
publishedDate = datetime.fromtimestamp(res['created_time'], None)
embedded = embedded_url.format(videoid=res['id'])

View file

@ -24,7 +24,7 @@ time_range_support = True
# search-url
base_url = 'https://www.deviantart.com/'
search_url = base_url + 'browse/all/?offset={offset}&{query}'
search_url = base_url + 'search?page={page}&{query}'
time_range_url = '&order={range}'
time_range_dict = {'day': 11,
@ -37,9 +37,7 @@ def request(query, params):
if params['time_range'] and params['time_range'] not in time_range_dict:
return params
offset = (params['pageno'] - 1) * 24
params['url'] = search_url.format(offset=offset,
params['url'] = search_url.format(page=params['pageno'],
query=urlencode({'q': query}))
if params['time_range'] in time_range_dict:
params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
@ -57,28 +55,27 @@ def response(resp):
dom = html.fromstring(resp.text)
regex = re.compile(r'\/200H\/')
# parse results
for result in dom.xpath('.//span[@class="thumb wide"]'):
link = result.xpath('.//a[@class="torpedo-thumb-link"]')[0]
url = link.attrib.get('href')
title = extract_text(result.xpath('.//span[@class="title"]'))
thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
img_src = regex.sub('/', thumbnail_src)
for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
for result in row.xpath('./div'):
link = result.xpath('.//a[@data-hook="deviation_link"]')[0]
url = link.attrib.get('href')
title = link.attrib.get('title')
thumbnail_src = result.xpath('.//img')[0].attrib.get('src')
img_src = thumbnail_src
# http to https, remove domain sharding
thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
# http to https, remove domain sharding
thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'template': 'images.html'})
# append result
results.append({'url': url,
'title': title,
'img_src': img_src,
'thumbnail_src': thumbnail_src,
'template': 'images.html'})
# return results
return results

View file

@ -11,11 +11,11 @@
import re
from lxml import html
from searx.utils import is_valid_lang
from searx.utils import is_valid_lang, eval_xpath
from searx.url_utils import urljoin
categories = ['general']
url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
weight = 100
parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
@ -47,14 +47,14 @@ def response(resp):
dom = html.fromstring(resp.text)
for k, result in enumerate(dom.xpath(results_xpath)[1:]):
for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
try:
from_result, to_results_raw = result.xpath('./td')
from_result, to_results_raw = eval_xpath(result, './td')
except:
continue
to_results = []
for to_result in to_results_raw.xpath('./p/a'):
for to_result in eval_xpath(to_results_raw, './p/a'):
t = to_result.text_content()
if t.strip():
to_results.append(to_result.text_content())

View file

@ -15,7 +15,8 @@ import string
from dateutil import parser
from json import loads
from lxml import html
from searx.url_utils import quote_plus
from searx.url_utils import urlencode
from datetime import datetime
# engine dependent config
categories = ['news', 'social media']
@ -23,7 +24,7 @@ paging = True
# search-url
base_url = 'https://digg.com/'
search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'
# specific xpath variables
results_xpath = '//article'
@ -38,9 +39,9 @@ digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
offset = (params['pageno'] - 1) * 20
params['url'] = search_url.format(position=offset,
query=quote_plus(query))
query=urlencode({'q': query}))
params['cookies']['frontend.auid'] = ''.join(random.choice(
digg_cookie_chars) for _ in range(22))
return params
@ -52,30 +53,17 @@ def response(resp):
search_result = loads(resp.text)
if 'html' not in search_result or search_result['html'] == '':
return results
dom = html.fromstring(search_result['html'])
# parse results
for result in dom.xpath(results_xpath):
url = result.attrib.get('data-contenturl')
thumbnail = result.xpath('.//img')[0].attrib.get('src')
title = ''.join(result.xpath(title_xpath))
content = ''.join(result.xpath(content_xpath))
pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
publishedDate = parser.parse(pubdate)
# http to https
thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
for result in search_result['mapped']:
published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S")
# append result
results.append({'url': url,
'title': title,
'content': content,
results.append({'url': result['url'],
'title': result['title'],
'content': result['excerpt'],
'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail})
'publishedDate': published,
'thumbnail': result['images']['thumbImage']})
# return results
return results

View file

@ -11,6 +11,7 @@
from lxml.html import fromstring
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.url_utils import urlencode
# engine dependent config
@ -45,16 +46,16 @@ def response(resp):
# parse results
# Quickhits
for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'):
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
try:
res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
except:
continue
if not res_url:
continue
title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
# append result
results.append({'title': title,
@ -62,13 +63,13 @@ def response(resp):
'url': base_url + res_url})
# Search results
for r in doc.xpath('//dl[@class="search_results"]/*'):
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
try:
if r.tag == "dt":
res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
elif r.tag == "dd":
content = extract_text(r.xpath('.'))
content = extract_text(eval_xpath(r, '.'))
# append result
results.append({'title': title,

View file

@ -18,7 +18,7 @@ from json import loads
from searx.engines.xpath import extract_text
from searx.poolrequests import get
from searx.url_utils import urlencode
from searx.utils import match_language
from searx.utils import match_language, eval_xpath
# engine dependent config
categories = ['general']
@ -65,21 +65,36 @@ def get_region_code(lang, lang_list=[]):
def request(query, params):
if params['time_range'] and params['time_range'] not in time_range_dict:
if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict:
return params
offset = (params['pageno'] - 1) * 30
region_code = get_region_code(params['language'], supported_languages)
if region_code:
params['url'] = url.format(
query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
params['url'] = 'https://duckduckgo.com/html/'
if params['pageno'] > 1:
params['method'] = 'POST'
params['data']['q'] = query
params['data']['s'] = offset
params['data']['dc'] = 30
params['data']['nextParams'] = ''
params['data']['v'] = 'l'
params['data']['o'] = 'json'
params['data']['api'] = '/d.js'
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
if region_code:
params['data']['kl'] = region_code
else:
params['url'] = url.format(
query=urlencode({'q': query}), offset=offset, dc_param=offset)
if region_code:
params['url'] = url.format(
query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
else:
params['url'] = url.format(
query=urlencode({'q': query}), offset=offset, dc_param=offset)
if params['time_range'] in time_range_dict:
params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
if params['time_range'] in time_range_dict:
params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
return params
@ -91,17 +106,19 @@ def response(resp):
doc = fromstring(resp.text)
# parse results
for r in doc.xpath(result_xpath):
for i, r in enumerate(eval_xpath(doc, result_xpath)):
if i >= 30:
break
try:
res_url = r.xpath(url_xpath)[-1]
res_url = eval_xpath(r, url_xpath)[-1]
except:
continue
if not res_url:
continue
title = extract_text(r.xpath(title_xpath))
content = extract_text(r.xpath(content_xpath))
title = extract_text(eval_xpath(r, title_xpath))
content = extract_text(eval_xpath(r, content_xpath))
# append result
results.append({'title': title,

View file

@ -1,3 +1,14 @@
"""
DuckDuckGo (definitions)
- `Instant Answer API`_
- `DuckDuckGo query`_
.. _Instant Answer API: https://duckduckgo.com/api
.. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1
"""
import json
from lxml import html
from re import compile
@ -25,7 +36,8 @@ def result_to_text(url, text, htmlResult):
def request(query, params):
params['url'] = url.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases)
params['headers']['Accept-Language'] = language.split('-')[0]
language = language.split('-')[0]
params['headers']['Accept-Language'] = language
return params
@ -43,8 +55,9 @@ def response(resp):
# add answer if there is one
answer = search_res.get('Answer', '')
if answer != '':
results.append({'answer': html_to_text(answer)})
if answer:
if search_res.get('AnswerType', '') not in ['calc']:
results.append({'answer': html_to_text(answer)})
# add infobox
if 'Definition' in search_res:

View file

@ -11,6 +11,7 @@
from lxml import html, etree
import re
from searx.engines.xpath import extract_text
from searx.utils import eval_xpath
from searx.url_utils import quote, urljoin
from searx import logger
@ -52,9 +53,9 @@ def response(resp):
dom = html.fromstring(resp.text)
try:
number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
'//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
)
number_of_results_string =\
re.sub('[^0-9]', '',
eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])
results.append({'number_of_results': int(number_of_results_string)})
@ -62,12 +63,12 @@ def response(resp):
logger.debug("Couldn't read number of results.")
pass
for result in dom.xpath('//section[not(contains(@class, "essay"))]'):
for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
try:
url = result.xpath('.//h2/a')[0].get('href')
url = eval_xpath(result, './/h2/a')[0].get('href')
url = urljoin(base_url, url)
title = result.xpath('string(.//h2/a)').strip()
content = extract_text(result.xpath('.//p'))
title = eval_xpath(result, 'string(.//h2/a)').strip()
content = extract_text(eval_xpath(result, './/p'))
# append result
results.append({'url': url,
'title': title,

View file

@ -18,13 +18,13 @@ categories = ['files']
paging = True
# search-url
base_url = 'https://f-droid.org/'
search_url = base_url + 'repository/browse/?{query}'
base_url = 'https://search.f-droid.org/'
search_url = base_url + '?{query}'
# do search-request
def request(query, params):
query = urlencode({'fdfilter': query, 'fdpage': params['pageno']})
query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
params['url'] = search_url.format(query=query)
return params
@ -35,17 +35,16 @@ def response(resp):
dom = html.fromstring(resp.text)
for app in dom.xpath('//div[@id="appheader"]'):
url = app.xpath('./ancestor::a/@href')[0]
title = app.xpath('./p/span/text()')[0]
img_src = app.xpath('.//img/@src')[0]
for app in dom.xpath('//a[@class="package-header"]'):
app_url = app.xpath('./@href')[0]
app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
app_content = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() \
+ ' - ' + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0]
content = extract_text(app.xpath('./p')[0])
content = content.replace(title, '', 1).strip()
results.append({'url': url,
'title': title,
'content': content,
'img_src': img_src})
results.append({'url': app_url,
'title': app_title,
'content': app_content,
'img_src': app_img_src})
return results

View file

@ -16,7 +16,8 @@ from json import loads
from time import time
import re
from searx.engines import logger
from searx.url_utils import urlencode, unquote
from searx.url_utils import urlencode
from searx.utils import ecma_unescape, html_to_text
logger = logger.getChild('flickr-noapi')
@ -75,11 +76,10 @@ def response(resp):
for index in legend:
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
author = unquote(photo.get('realname', ''))
source = unquote(photo.get('username', '')) + ' @ Flickr'
title = unquote(photo.get('title', ''))
content = unquote(photo.get('description', ''))
author = ecma_unescape(photo.get('realname', ''))
source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
title = ecma_unescape(photo.get('title', ''))
content = html_to_text(ecma_unescape(photo.get('description', '')))
img_src = None
# From the biggest to the lowest format
for image_size in image_sizes:

View file

@ -10,7 +10,10 @@
@parse url, title, content, thumbnail, img_src
"""
from cgi import escape
try:
from cgi import escape
except:
from html import escape
from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urljoin, urlencode

View file

@ -14,7 +14,9 @@ import random
from json import loads
from time import time
from lxml.html import fromstring
from searx.poolrequests import get
from searx.url_utils import urlencode
from searx.utils import eval_xpath
# engine dependent config
categories = ['general']
@ -30,13 +32,9 @@ search_string = 'search?{query}'\
'&c=main'\
'&s={offset}'\
'&format=json'\
'&qh=0'\
'&qlang={lang}'\
'&langcountry={lang}'\
'&ff={safesearch}'\
'&rxiec={rxieu}'\
'&ulse={ulse}'\
'&rand={rxikd}' # current unix timestamp
'&rand={rxikd}'
# specific xpath variables
results_xpath = '//response//result'
url_xpath = './/url'
@ -45,9 +43,26 @@ content_xpath = './/sum'
supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
extra_param = '' # gigablast requires a random extra parameter
# which can be extracted from the source code of the search page
def parse_extra_param(text):
global extra_param
param_lines = [x for x in text.splitlines() if x.startswith('var url=') or x.startswith('url=url+')]
extra_param = ''
for l in param_lines:
extra_param += l.split("'")[1]
extra_param = extra_param.split('&')[-1]
def init(engine_settings=None):
parse_extra_param(get('http://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text)
# do search-request
def request(query, params):
print("EXTRAPARAM:", extra_param)
offset = (params['pageno'] - 1) * number_of_results
if params['language'] == 'all':
@ -66,13 +81,11 @@ def request(query, params):
search_path = search_string.format(query=urlencode({'q': query}),
offset=offset,
number_of_results=number_of_results,
rxikd=int(time() * 1000),
rxieu=random.randint(1000000000, 9999999999),
ulse=random.randint(100000000, 999999999),
lang=language,
rxikd=int(time() * 1000),
safesearch=safesearch)
params['url'] = base_url + search_path
params['url'] = base_url + search_path + '&' + extra_param
return params
@ -82,7 +95,11 @@ def response(resp):
results = []
# parse results
response_json = loads(resp.text)
try:
response_json = loads(resp.text)
except:
parse_extra_param(resp.text)
raise Exception('extra param expired, please reload')
for result in response_json['results']:
# append result
@ -98,9 +115,9 @@ def response(resp):
def _fetch_supported_languages(resp):
supported_languages = []
dom = fromstring(resp.text)
links = dom.xpath('//span[@id="menu2"]/a')
links = eval_xpath(dom, '//span[@id="menu2"]/a')
for link in links:
href = link.xpath('./@href')[0].split('lang%3A')
href = eval_xpath(link, './@href')[0].split('lang%3A')
if len(href) == 2:
code = href[1].split('_')
if len(code) == 2:

View file

@ -14,7 +14,7 @@ from lxml import html, etree
from searx.engines.xpath import extract_text, extract_url
from searx import logger
from searx.url_utils import urlencode, urlparse, parse_qsl
from searx.utils import match_language
from searx.utils import match_language, eval_xpath
logger = logger.getChild('google engine')
@ -107,13 +107,12 @@ images_path = '/images'
supported_languages_url = 'https://www.google.com/preferences?#languages'
# specific xpath variables
results_xpath = '//div[@class="g"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
content_misc_xpath = './/div[@class="f slp"]'
suggestion_xpath = '//p[@class="_Bmc"]'
spelling_suggestion_xpath = '//a[@class="spell"]'
results_xpath = '//div[contains(@class, "ZINbbc")]'
url_xpath = './/div[@class="kCrYT"][1]/a/@href'
title_xpath = './/div[@class="kCrYT"][1]/a/div[1]'
content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]'
suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]'
spelling_suggestion_xpath = '//div[@id="scc"]//a'
# map : detail location
map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
@ -156,7 +155,7 @@ def parse_url(url_string, google_hostname):
# returns extract_text on the first result selected by the xpath or None
def extract_text_from_dom(result, xpath):
r = result.xpath(xpath)
r = eval_xpath(result, xpath)
if len(r) > 0:
return extract_text(r[0])
return None
@ -199,9 +198,6 @@ def request(query, params):
params['headers']['Accept-Language'] = language + ',' + language + '-' + country
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
# Force Internet Explorer 12 user agent to avoid loading the new UI that Searx can't parse
params['headers']['User-Agent'] = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
params['google_hostname'] = google_hostname
return params
@ -226,21 +222,21 @@ def response(resp):
# convert the text to dom
dom = html.fromstring(resp.text)
instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
if instant_answer:
results.append({'answer': u' '.join(instant_answer)})
try:
results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0]
.split()[1].replace(',', ''))
results.append({'number_of_results': results_num})
except:
pass
# parse results
for result in dom.xpath(results_xpath):
for result in eval_xpath(dom, results_xpath):
try:
title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
title = extract_text(eval_xpath(result, title_xpath)[0])
url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname)
parsed_url = urlparse(url, google_hostname)
# map result
@ -249,7 +245,7 @@ def response(resp):
continue
# if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
# print "yooooo"*30
# x = result.xpath(map_near)
# x = eval_xpath(result, map_near)
# if len(x) > 0:
# # map : near the location
# results = results + parse_map_near(parsed_url, x, google_hostname)
@ -273,9 +269,7 @@ def response(resp):
content = extract_text_from_dom(result, content_xpath)
if content is None:
continue
content_misc = extract_text_from_dom(result, content_misc_xpath)
if content_misc is not None:
content = content_misc + "<br />" + content
# append result
results.append({'url': url,
'title': title,
@ -286,11 +280,11 @@ def response(resp):
continue
# parse suggestion
for suggestion in dom.xpath(suggestion_xpath):
for suggestion in eval_xpath(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
for correction in dom.xpath(spelling_suggestion_xpath):
for correction in eval_xpath(dom, spelling_suggestion_xpath):
results.append({'correction': extract_text(correction)})
# return results
@ -299,9 +293,9 @@ def response(resp):
def parse_images(result, google_hostname):
results = []
for image in result.xpath(images_xpath):
url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname)
img_src = extract_text(image.xpath(image_img_src_xpath)[0])
for image in eval_xpath(result, images_xpath):
url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname)
img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0])
# append result
results.append({'url': url,
@ -388,10 +382,10 @@ def attributes_to_html(attributes):
def _fetch_supported_languages(resp):
supported_languages = {}
dom = html.fromstring(resp.text)
options = dom.xpath('//*[@id="langSec"]//input[@name="lr"]')
options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]')
for option in options:
code = option.xpath('./@value')[0].split('_')[-1]
name = option.xpath('./@data-name')[0].title()
code = eval_xpath(option, './@value')[0].split('_')[-1]
name = eval_xpath(option, './@data-name')[0].title()
supported_languages[code] = {"name": name}
return supported_languages

View file

@ -70,11 +70,21 @@ def response(resp):
try:
metadata = loads(result)
img_format = "{0} {1}x{2}".format(metadata['ity'], str(metadata['ow']), str(metadata['oh']))
source = "{0} ({1})".format(metadata['st'], metadata['isu'])
img_format = metadata.get('ity', '')
img_width = metadata.get('ow', '')
img_height = metadata.get('oh', '')
if img_width and img_height:
img_format += " {0}x{1}".format(img_width, img_height)
source = metadata.get('st', '')
source_url = metadata.get('isu', '')
if source_url:
source += " ({0})".format(source_url)
results.append({'url': metadata['ru'],
'title': metadata['pt'],
'content': metadata['s'],
'content': metadata.get('s', ''),
'source': source,
'img_format': img_format,
'thumbnail_src': metadata['tu'],

View file

@ -75,15 +75,17 @@ def response(resp):
# get thumbnails
script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
id = result.xpath('.//div[@class="s"]//img/@id')[0]
thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id,
script)
tmp = []
if len(thumbnails_data) != 0:
tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
thumbnail = ''
if len(tmp) != 0:
thumbnail = tmp[-1]
ids = result.xpath('.//div[@class="s"]//img/@id')
if len(ids) > 0:
thumbnails_data = \
re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
script)
tmp = []
if len(thumbnails_data) != 0:
tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
thumbnail = ''
if len(tmp) != 0:
thumbnail = tmp[-1]
# append result
results.append({'url': url,

100
searx/engines/invidious.py Normal file
View file

@ -0,0 +1,100 @@
# Invidious (Videos)
#
# @website https://invidio.us/
# @provide-api yes (https://github.com/omarroth/invidious/wiki/API)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, content, publishedDate, thumbnail, embedded
from searx.url_utils import quote_plus
from dateutil import parser
import time
# engine dependent config
categories = ["videos", "music"]
paging = True
language_support = True
time_range_support = True
# search-url
base_url = "https://invidio.us/"
# do search-request
def request(query, params):
time_range_dict = {
"day": "today",
"week": "week",
"month": "month",
"year": "year",
}
search_url = base_url + "api/v1/search?q={query}"
params["url"] = search_url.format(
query=quote_plus(query)
) + "&page={pageno}".format(pageno=params["pageno"])
if params["time_range"] in time_range_dict:
params["url"] += "&date={timerange}".format(
timerange=time_range_dict[params["time_range"]]
)
if params["language"] != "all":
lang = params["language"].split("-")
if len(lang) == 2:
params["url"] += "&range={lrange}".format(lrange=lang[1])
return params
# get response from search-request
def response(resp):
    """Parse the invidious JSON search response into searx results.

    :param resp: requests response whose body is the JSON list returned
        by the /api/v1/search endpoint
    :returns: list of result dicts rendered with the videos.html template
    """
    results = []

    search_results = resp.json()
    embedded_url = (
        '<iframe width="540" height="304" '
        + 'data-src="'
        + base_url
        + 'embed/{videoid}" '
        + 'frameborder="0" allowfullscreen></iframe>'
    )

    base_invidious_url = base_url + "watch?v="

    for result in search_results:
        # only "video" entries are rendered; channels/playlists are skipped
        rtype = result.get("type", None)
        if rtype == "video":
            videoid = result.get("videoId", None)
            if not videoid:
                continue

            url = base_invidious_url + videoid
            embedded = embedded_url.format(videoid=videoid)
            thumbs = result.get("videoThumbnails", [])
            # a thumbnail entry may lack the "quality" key; .get avoids KeyError
            thumb = next(
                (th for th in thumbs if th.get("quality") == "sddefault"), None
            )
            if thumb:
                thumbnail = thumb.get("url", "")
            else:
                thumbnail = ""

            # "published" is a unix timestamp (0 when missing)
            publishedDate = parser.parse(
                time.ctime(result.get("published", 0))
            )

            results.append(
                {
                    "url": url,
                    "title": result.get("title", ""),
                    "content": result.get("description", ""),
                    "template": "videos.html",
                    "publishedDate": publishedDate,
                    "embedded": embedded,
                    "thumbnail": thumbnail,
                }
            )

    return results

View file

@ -24,7 +24,7 @@ result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
# do search-request
def request(query, params):
params['url'] = base_url + search_string.format(query=query)
params['url'] = base_url + search_string.format(query=query.decode('utf-8'))
return params

View file

@ -50,6 +50,7 @@ def request(query, params):
language = match_language(params['language'], supported_languages, language_aliases)
params['url'] += '&locale=' + language.replace('-', '_').lower()
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
return params

78
searx/engines/seedpeer.py Normal file
View file

@ -0,0 +1,78 @@
# Seedpeer (Videos, Music, Files)
#
# @website https://seedpeer.me
# @provide-api no (nothing found)
#
# @using-api no
# @results HTML (using search portal)
# @stable yes (HTML can change)
# @parse url, title, content, seed, leech, magnetlink
from lxml import html
from json import loads
from operator import itemgetter
from searx.url_utils import quote, urljoin
from searx.engines.xpath import extract_text
url = 'https://seedpeer.me/'
search_url = url + 'search/{search_term}?page={page_no}'
torrent_file_url = url + 'torrent/{torrent_hash}'
# specific xpath variables
script_xpath = '//script[@type="text/javascript"][not(@src)]'
torrent_xpath = '(//table)[2]/tbody/tr'
link_xpath = '(./td)[1]/a/@href'
age_xpath = '(./td)[2]'
size_xpath = '(./td)[3]'
# do search-request
def request(query, params):
    """Fill in the seedpeer search URL for *query* and the requested page."""
    params['url'] = search_url.format(
        search_term=quote(query),
        page_no=params['pageno'],
    )
    return params
# get response from search-request
def response(resp):
    """Parse the seedpeer search page.

    Torrent metadata (name/seeds/peers/size/hash) lives in a JSON blob
    embedded in an inline <script> element; the table rows only supply
    the age text and the detail-page link.

    :returns: torrent results sorted by seeder count, descending
    """
    results = []
    dom = html.fromstring(resp.text)
    result_rows = dom.xpath(torrent_xpath)

    try:
        script_element = dom.xpath(script_xpath)[0]
        json_string = script_element.text[script_element.text.find('{'):]
        torrents_json = loads(json_string)
    # narrowed from a bare `except:` which would also swallow
    # SystemExit/KeyboardInterrupt; any parse failure means no results
    except Exception:
        return []

    # parse results
    for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']):
        title = torrent_json['name']
        seed = int(torrent_json['seeds'])
        leech = int(torrent_json['peers'])
        size = int(torrent_json['size'])
        torrent_hash = torrent_json['hash']
        torrentfile = torrent_file_url.format(torrent_hash=torrent_hash)
        magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash)

        age = extract_text(torrent_row.xpath(age_xpath))
        link = torrent_row.xpath(link_xpath)[0]
        href = urljoin(url, link)

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': age,
                        'seed': seed,
                        'leech': leech,
                        'filesize': size,
                        'torrentfile': torrentfile,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)

View file

@ -51,7 +51,9 @@ def get_client_id():
if response.ok:
tree = html.fromstring(response.content)
script_tags = tree.xpath("//script[contains(@src, '/assets/app')]")
# script_tags has been moved from /assets/app/ to /assets/ path. I
# found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js
script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]
# extracts valid app_js urls from soundcloud.com content
@ -66,7 +68,7 @@ def get_client_id():
return ""
def init():
def init(engine_settings=None):
global guest_client_id
# api-key
guest_client_id = get_client_id()

View file

@ -15,6 +15,8 @@ from dateutil import parser
from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
from searx.languages import language_codes
from searx.utils import eval_xpath
# engine dependent config
categories = ['general']
@ -22,7 +24,7 @@ categories = ['general']
# (probably the parameter qid), require
# storing of qid's between multiple search-calls
# paging = False
paging = True
language_support = True
# search-url
@ -32,23 +34,32 @@ search_url = base_url + 'do/search'
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct childs of div[@id="results"]
results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
link_xpath = './/h3/a'
content_xpath = './p[@class="search-item__body"]'
results_xpath = '//div[@class="w-gl__result"]'
link_xpath = './/a[@class="w-gl__result-title"]'
content_xpath = './/p[@class="w-gl__description"]'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10
params['url'] = search_url
params['method'] = 'POST'
params['data'] = {'query': query,
'startat': offset}
params['data'] = {
'query': query,
'page': params['pageno'],
'cat': 'web',
'cmd': 'process_search',
'engine0': 'v1all',
}
# set language if specified
if params['language'] != 'all':
params['data']['with_language'] = ('lang_' + params['language'].split('-')[0])
language = 'english'
for lc, _, _, lang in language_codes:
if lc == params['language']:
language = lang
params['data']['language'] = language
params['data']['lui'] = language
return params
@ -60,8 +71,8 @@ def response(resp):
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(results_xpath):
links = result.xpath(link_xpath)
for result in eval_xpath(dom, results_xpath):
links = eval_xpath(result, link_xpath)
if not links:
continue
link = links[0]
@ -77,8 +88,8 @@ def response(resp):
title = extract_text(link)
if result.xpath(content_xpath):
content = extract_text(result.xpath(content_xpath))
if eval_xpath(result, content_xpath):
content = extract_text(eval_xpath(result, content_xpath))
else:
content = ''

View file

@ -16,7 +16,7 @@ from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
from searx.utils import match_language
from searx.utils import match_language, eval_xpath
from json import loads
from lxml.html import fromstring
@ -57,22 +57,6 @@ language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator
calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
# xpath_cache
xpath_cache = {}
def get_xpath(xpath_str):
    """Return a compiled XPath for *xpath_str*, memoized in xpath_cache."""
    compiled = xpath_cache.get(xpath_str)
    if not compiled:
        compiled = etree.XPath(xpath_str)
        xpath_cache[xpath_str] = compiled
    return compiled
def eval_xpath(element, xpath_str):
    """Evaluate *xpath_str* against *element* using the compiled-XPath cache."""
    return get_xpath(xpath_str)(element)
def get_id_cache(result):
id_cache = {}

View file

@ -21,7 +21,8 @@ search_url = base_url + u'w/api.php?'\
'action=query'\
'&format=json'\
'&{query}'\
'&prop=extracts|pageimages'\
'&prop=extracts|pageimages|pageprops'\
'&ppprop=disambiguation'\
'&exintro'\
'&explaintext'\
'&pithumbsize=300'\
@ -79,12 +80,15 @@ def response(resp):
# wikipedia article's unique id
# first valid id is assumed to be the requested article
if 'pages' not in search_result['query']:
return results
for article_id in search_result['query']['pages']:
page = search_result['query']['pages'][article_id]
if int(article_id) > 0:
break
if int(article_id) < 0:
if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
return []
title = page.get('title')
@ -96,6 +100,7 @@ def response(resp):
extract = page.get('extract')
summary = extract_first_paragraph(extract, title, image)
summary = summary.replace('() ', '')
# link to wikipedia article
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \

View file

@ -55,7 +55,7 @@ def obtain_token():
return token
def init():
def init(engine_settings=None):
obtain_token()

View file

@ -11,8 +11,8 @@
"""
from lxml import html
import re
from searx.url_utils import urlencode, urljoin
from searx.engines.xpath import extract_text
# engine dependent config
categories = ['images']
@ -34,41 +34,18 @@ def request(query, params):
def response(resp):
results = []
# get links from result-text
regex = re.compile('(</a>|<a)')
results_parts = re.split(regex, resp.text)
cur_element = ''
# iterate over link parts
for result_part in results_parts:
dom = html.fromstring(resp.text)
for res in dom.xpath('//div[@class="List-item MainListing"]'):
# processed start and end of link
if result_part == '<a':
cur_element = result_part
continue
elif result_part != '</a>':
cur_element += result_part
continue
cur_element += result_part
# fix xml-error
cur_element = cur_element.replace('"></a>', '"/></a>')
dom = html.fromstring(cur_element)
link = dom.xpath('//a')[0]
link = res.xpath('//a')[0]
url = urljoin(base_url, link.attrib.get('href'))
title = link.attrib.get('title', '')
title = extract_text(link)
thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
# TODO: get image with higher resolution
img_src = thumbnail_src
# check if url is showing to a photo
if '/photo/' not in url:
continue
# append result
results.append({'url': url,
'title': title,

View file

@ -1,6 +1,6 @@
from lxml import html
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
from searx.utils import html_to_text
from searx.utils import html_to_text, eval_xpath
from searx.url_utils import unquote, urlencode, urljoin, urlparse
search_url = None
@ -104,15 +104,15 @@ def response(resp):
results = []
dom = html.fromstring(resp.text)
if results_xpath:
for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath), search_url)
title = extract_text(result.xpath(title_xpath))
content = extract_text(result.xpath(content_xpath))
for result in eval_xpath(dom, results_xpath):
url = extract_url(eval_xpath(result, url_xpath), search_url)
title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(eval_xpath(result, content_xpath))
tmp_result = {'url': url, 'title': title, 'content': content}
# add thumbnail if available
if thumbnail_xpath:
thumbnail_xpath_result = result.xpath(thumbnail_xpath)
thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
if len(thumbnail_xpath_result) > 0:
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
@ -120,14 +120,14 @@ def response(resp):
else:
for url, title, content in zip(
(extract_url(x, search_url) for
x in dom.xpath(url_xpath)),
map(extract_text, dom.xpath(title_xpath)),
map(extract_text, dom.xpath(content_xpath))
x in eval_xpath(dom, url_xpath)),
map(extract_text, eval_xpath(dom, title_xpath)),
map(extract_text, eval_xpath(dom, content_xpath))
):
results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath:
return results
for suggestion in dom.xpath(suggestion_xpath):
for suggestion in eval_xpath(dom, suggestion_xpath):
results.append({'suggestion': extract_text(suggestion)})
return results

View file

@ -14,7 +14,7 @@
from lxml import html
from searx.engines.xpath import extract_text, extract_url
from searx.url_utils import unquote, urlencode
from searx.utils import match_language
from searx.utils import match_language, eval_xpath
# engine dependent config
categories = ['general']
@ -109,21 +109,21 @@ def response(resp):
dom = html.fromstring(resp.text)
try:
results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0]
results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0]
.split()[0].replace(',', ''))
results.append({'number_of_results': results_num})
except:
pass
# parse results
for result in dom.xpath(results_xpath):
for result in eval_xpath(dom, results_xpath):
try:
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
title = extract_text(result.xpath(title_xpath)[0])
url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url))
title = extract_text(eval_xpath(result, title_xpath)[0])
except:
continue
content = extract_text(result.xpath(content_xpath)[0])
content = extract_text(eval_xpath(result, content_xpath)[0])
# append result
results.append({'url': url,
@ -131,7 +131,7 @@ def response(resp):
'content': content})
# if no suggestion found, return results
suggestions = dom.xpath(suggestion_xpath)
suggestions = eval_xpath(dom, suggestion_xpath)
if not suggestions:
return results
@ -148,9 +148,9 @@ def response(resp):
def _fetch_supported_languages(resp):
supported_languages = []
dom = html.fromstring(resp.text)
options = dom.xpath('//div[@id="yschlang"]/span/label/input')
options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input')
for option in options:
code_parts = option.xpath('./@value')[0][5:].split('_')
code_parts = eval_xpath(option, './@value')[0][5:].split('_')
if len(code_parts) == 2:
code = code_parts[0] + '-' + code_parts[1].upper()
else:

View file

@ -67,12 +67,8 @@ def response(resp):
if videoid is not None:
url = base_youtube_url + videoid
thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'
title = video.get('title', {}).get('simpleText', videoid)
description_snippet = video.get('descriptionSnippet', {})
if 'runs' in description_snippet:
content = reduce(lambda a, b: a + b.get('text', ''), description_snippet.get('runs'), '')
else:
content = description_snippet.get('simpleText', '')
title = get_text_from_json(video.get('title', {}))
content = get_text_from_json(video.get('descriptionSnippet', {}))
embedded = embedded_url.format(videoid=videoid)
# append result
@ -85,3 +81,10 @@ def response(resp):
# return results
return results
def get_text_from_json(element):
    """Return the plain text of a YouTube JSON text object.

    Such objects carry either a list of 'runs' (text fragments that are
    concatenated) or a single 'simpleText' string; missing pieces yield ''.
    """
    if 'runs' in element:
        return ''.join(run.get('text', '') for run in element['runs'])
    return element.get('simpleText', '')