mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
Merge branch 'master' into fix-engine-spotify
This commit is contained in:
commit
36e72a4619
244 changed files with 10745 additions and 11499 deletions
|
|
@ -27,7 +27,7 @@ from json import loads
|
|||
from requests import get
|
||||
from searx import settings
|
||||
from searx import logger
|
||||
from searx.utils import load_module, match_language
|
||||
from searx.utils import load_module, match_language, get_engine_from_settings
|
||||
|
||||
|
||||
logger = logger.getChild('engines')
|
||||
|
|
@ -53,7 +53,8 @@ engine_default_args = {'paging': False,
|
|||
'disabled': False,
|
||||
'suspend_end_time': 0,
|
||||
'continuous_errors': 0,
|
||||
'time_range_support': False}
|
||||
'time_range_support': False,
|
||||
'offline': False}
|
||||
|
||||
|
||||
def load_engine(engine_data):
|
||||
|
|
@ -128,14 +129,16 @@ def load_engine(engine_data):
|
|||
engine.stats = {
|
||||
'result_count': 0,
|
||||
'search_count': 0,
|
||||
'page_load_time': 0,
|
||||
'page_load_count': 0,
|
||||
'engine_time': 0,
|
||||
'engine_time_count': 0,
|
||||
'score_count': 0,
|
||||
'errors': 0
|
||||
}
|
||||
|
||||
if not engine.offline:
|
||||
engine.stats['page_load_time'] = 0
|
||||
engine.stats['page_load_count'] = 0
|
||||
|
||||
for category_name in engine.categories:
|
||||
categories.setdefault(category_name, []).append(engine)
|
||||
|
||||
|
|
@ -173,11 +176,6 @@ def get_engines_stats():
|
|||
results_num = \
|
||||
engine.stats['result_count'] / float(engine.stats['search_count'])
|
||||
|
||||
if engine.stats['page_load_count'] != 0:
|
||||
load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
|
||||
else:
|
||||
load_times = 0
|
||||
|
||||
if engine.stats['engine_time_count'] != 0:
|
||||
this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count']) # noqa
|
||||
else:
|
||||
|
|
@ -189,14 +187,19 @@ def get_engines_stats():
|
|||
else:
|
||||
score = score_per_result = 0.0
|
||||
|
||||
max_pageload = max(load_times, max_pageload)
|
||||
if not engine.offline:
|
||||
load_times = 0
|
||||
if engine.stats['page_load_count'] != 0:
|
||||
load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count']) # noqa
|
||||
max_pageload = max(load_times, max_pageload)
|
||||
pageloads.append({'avg': load_times, 'name': engine.name})
|
||||
|
||||
max_engine_times = max(this_engine_time, max_engine_times)
|
||||
max_results = max(results_num, max_results)
|
||||
max_score = max(score, max_score)
|
||||
max_score_per_result = max(score_per_result, max_score_per_result)
|
||||
max_errors = max(max_errors, engine.stats['errors'])
|
||||
|
||||
pageloads.append({'avg': load_times, 'name': engine.name})
|
||||
engine_times.append({'avg': this_engine_time, 'name': engine.name})
|
||||
results.append({'avg': results_num, 'name': engine.name})
|
||||
scores.append({'avg': score, 'name': engine.name})
|
||||
|
|
@ -255,7 +258,7 @@ def initialize_engines(engine_list):
|
|||
load_engines(engine_list)
|
||||
|
||||
def engine_init(engine_name, init_fn):
|
||||
init_fn()
|
||||
init_fn(get_engine_from_settings(engine_name))
|
||||
logger.debug('%s engine: Initialized', engine_name)
|
||||
|
||||
for engine_name, engine in engines.items():
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from searx.url_utils import urlencode
|
|||
|
||||
|
||||
categories = ['science']
|
||||
paging = True
|
||||
|
||||
base_url = 'http://export.arxiv.org/api/query?search_query=all:'\
|
||||
+ '{query}&start={offset}&max_results={number_of_results}'
|
||||
|
|
@ -29,7 +30,7 @@ def request(query, params):
|
|||
# basic search
|
||||
offset = (params['pageno'] - 1) * number_of_results
|
||||
|
||||
string_args = dict(query=query,
|
||||
string_args = dict(query=query.decode('utf-8'),
|
||||
offset=offset,
|
||||
number_of_results=number_of_results)
|
||||
|
||||
|
|
|
|||
|
|
@ -13,10 +13,14 @@
|
|||
@todo publishedDate
|
||||
"""
|
||||
|
||||
import re
|
||||
from lxml import html
|
||||
from searx import logger, utils
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.url_utils import urlencode
|
||||
from searx.utils import match_language, gen_useragent
|
||||
from searx.utils import match_language, gen_useragent, eval_xpath
|
||||
|
||||
logger = logger.getChild('bing engine')
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
|
|
@ -30,9 +34,13 @@ base_url = 'https://www.bing.com/'
|
|||
search_string = 'search?{query}&first={offset}'
|
||||
|
||||
|
||||
def _get_offset_from_pageno(pageno):
|
||||
return (pageno - 1) * 10 + 1
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
offset = (params['pageno'] - 1) * 10 + 1
|
||||
offset = _get_offset_from_pageno(params.get('pageno', 0))
|
||||
|
||||
if params['language'] == 'all':
|
||||
lang = 'EN'
|
||||
|
|
@ -47,29 +55,21 @@ def request(query, params):
|
|||
|
||||
params['url'] = base_url + search_path
|
||||
|
||||
params['headers']['User-Agent'] = gen_useragent('Windows NT 6.3; WOW64')
|
||||
|
||||
return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
|
||||
results = []
|
||||
result_len = 0
|
||||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
try:
|
||||
results.append({'number_of_results': int(dom.xpath('//span[@class="sb_count"]/text()')[0]
|
||||
.split()[0].replace(',', ''))})
|
||||
except:
|
||||
pass
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath('//div[@class="sa_cc"]'):
|
||||
link = result.xpath('.//h3/a')[0]
|
||||
for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
|
||||
link = eval_xpath(result, './/h3/a')[0]
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(link)
|
||||
content = extract_text(result.xpath('.//p'))
|
||||
content = extract_text(eval_xpath(result, './/p'))
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
|
|
@ -77,18 +77,35 @@ def response(resp):
|
|||
'content': content})
|
||||
|
||||
# parse results again if nothing is found yet
|
||||
for result in dom.xpath('//li[@class="b_algo"]'):
|
||||
link = result.xpath('.//h2/a')[0]
|
||||
for result in eval_xpath(dom, '//li[@class="b_algo"]'):
|
||||
link = eval_xpath(result, './/h2/a')[0]
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(link)
|
||||
content = extract_text(result.xpath('.//p'))
|
||||
content = extract_text(eval_xpath(result, './/p'))
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
'content': content})
|
||||
|
||||
# return results
|
||||
try:
|
||||
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()'))
|
||||
result_len_container = utils.to_string(result_len_container)
|
||||
if "-" in result_len_container:
|
||||
# Remove the part "from-to" for paginated request ...
|
||||
result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:]
|
||||
|
||||
result_len_container = re.sub('[^0-9]', '', result_len_container)
|
||||
if len(result_len_container) > 0:
|
||||
result_len = int(result_len_container)
|
||||
except Exception as e:
|
||||
logger.debug('result error :\n%s', e)
|
||||
pass
|
||||
|
||||
if _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
|
||||
return []
|
||||
|
||||
results.append({'number_of_results': result_len})
|
||||
return results
|
||||
|
||||
|
||||
|
|
@ -96,9 +113,9 @@ def response(resp):
|
|||
def _fetch_supported_languages(resp):
|
||||
supported_languages = []
|
||||
dom = html.fromstring(resp.text)
|
||||
options = dom.xpath('//div[@id="limit-languages"]//input')
|
||||
options = eval_xpath(dom, '//div[@id="limit-languages"]//input')
|
||||
for option in options:
|
||||
code = option.xpath('./@id')[0].replace('_', '-')
|
||||
code = eval_xpath(option, './@id')[0].replace('_', '-')
|
||||
if code == 'nb':
|
||||
code = 'no'
|
||||
supported_languages.append(code)
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@
|
|||
from json import loads
|
||||
from datetime import datetime
|
||||
from searx.url_utils import urlencode
|
||||
from searx.utils import match_language
|
||||
from searx.utils import match_language, html_to_text
|
||||
|
||||
# engine dependent config
|
||||
categories = ['videos']
|
||||
|
|
@ -59,7 +59,7 @@ def response(resp):
|
|||
for res in search_res['list']:
|
||||
title = res['title']
|
||||
url = res['url']
|
||||
content = res['description']
|
||||
content = html_to_text(res['description'])
|
||||
thumbnail = res['thumbnail_360_url']
|
||||
publishedDate = datetime.fromtimestamp(res['created_time'], None)
|
||||
embedded = embedded_url.format(videoid=res['id'])
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ time_range_support = True
|
|||
|
||||
# search-url
|
||||
base_url = 'https://www.deviantart.com/'
|
||||
search_url = base_url + 'browse/all/?offset={offset}&{query}'
|
||||
search_url = base_url + 'search?page={page}&{query}'
|
||||
time_range_url = '&order={range}'
|
||||
|
||||
time_range_dict = {'day': 11,
|
||||
|
|
@ -37,9 +37,7 @@ def request(query, params):
|
|||
if params['time_range'] and params['time_range'] not in time_range_dict:
|
||||
return params
|
||||
|
||||
offset = (params['pageno'] - 1) * 24
|
||||
|
||||
params['url'] = search_url.format(offset=offset,
|
||||
params['url'] = search_url.format(page=params['pageno'],
|
||||
query=urlencode({'q': query}))
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
|
||||
|
|
@ -57,28 +55,27 @@ def response(resp):
|
|||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
regex = re.compile(r'\/200H\/')
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath('.//span[@class="thumb wide"]'):
|
||||
link = result.xpath('.//a[@class="torpedo-thumb-link"]')[0]
|
||||
url = link.attrib.get('href')
|
||||
title = extract_text(result.xpath('.//span[@class="title"]'))
|
||||
thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
|
||||
img_src = regex.sub('/', thumbnail_src)
|
||||
for row in dom.xpath('//div[contains(@data-hook, "content_row")]'):
|
||||
for result in row.xpath('./div'):
|
||||
link = result.xpath('.//a[@data-hook="deviation_link"]')[0]
|
||||
url = link.attrib.get('href')
|
||||
title = link.attrib.get('title')
|
||||
thumbnail_src = result.xpath('.//img')[0].attrib.get('src')
|
||||
img_src = thumbnail_src
|
||||
|
||||
# http to https, remove domain sharding
|
||||
thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
|
||||
thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
|
||||
# http to https, remove domain sharding
|
||||
thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
|
||||
thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)
|
||||
|
||||
url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
|
||||
url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
'img_src': img_src,
|
||||
'thumbnail_src': thumbnail_src,
|
||||
'template': 'images.html'})
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
'img_src': img_src,
|
||||
'thumbnail_src': thumbnail_src,
|
||||
'template': 'images.html'})
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
|
|
|||
|
|
@ -11,11 +11,11 @@
|
|||
|
||||
import re
|
||||
from lxml import html
|
||||
from searx.utils import is_valid_lang
|
||||
from searx.utils import is_valid_lang, eval_xpath
|
||||
from searx.url_utils import urljoin
|
||||
|
||||
categories = ['general']
|
||||
url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
|
||||
url = u'https://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}'
|
||||
weight = 100
|
||||
|
||||
parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
|
||||
|
|
@ -47,14 +47,14 @@ def response(resp):
|
|||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for k, result in enumerate(dom.xpath(results_xpath)[1:]):
|
||||
for k, result in enumerate(eval_xpath(dom, results_xpath)[1:]):
|
||||
try:
|
||||
from_result, to_results_raw = result.xpath('./td')
|
||||
from_result, to_results_raw = eval_xpath(result, './td')
|
||||
except:
|
||||
continue
|
||||
|
||||
to_results = []
|
||||
for to_result in to_results_raw.xpath('./p/a'):
|
||||
for to_result in eval_xpath(to_results_raw, './p/a'):
|
||||
t = to_result.text_content()
|
||||
if t.strip():
|
||||
to_results.append(to_result.text_content())
|
||||
|
|
|
|||
|
|
@ -15,7 +15,8 @@ import string
|
|||
from dateutil import parser
|
||||
from json import loads
|
||||
from lxml import html
|
||||
from searx.url_utils import quote_plus
|
||||
from searx.url_utils import urlencode
|
||||
from datetime import datetime
|
||||
|
||||
# engine dependent config
|
||||
categories = ['news', 'social media']
|
||||
|
|
@ -23,7 +24,7 @@ paging = True
|
|||
|
||||
# search-url
|
||||
base_url = 'https://digg.com/'
|
||||
search_url = base_url + 'api/search/{query}.json?position={position}&format=html'
|
||||
search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'
|
||||
|
||||
# specific xpath variables
|
||||
results_xpath = '//article'
|
||||
|
|
@ -38,9 +39,9 @@ digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
|
|||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
offset = (params['pageno'] - 1) * 10
|
||||
offset = (params['pageno'] - 1) * 20
|
||||
params['url'] = search_url.format(position=offset,
|
||||
query=quote_plus(query))
|
||||
query=urlencode({'q': query}))
|
||||
params['cookies']['frontend.auid'] = ''.join(random.choice(
|
||||
digg_cookie_chars) for _ in range(22))
|
||||
return params
|
||||
|
|
@ -52,30 +53,17 @@ def response(resp):
|
|||
|
||||
search_result = loads(resp.text)
|
||||
|
||||
if 'html' not in search_result or search_result['html'] == '':
|
||||
return results
|
||||
|
||||
dom = html.fromstring(search_result['html'])
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath(results_xpath):
|
||||
url = result.attrib.get('data-contenturl')
|
||||
thumbnail = result.xpath('.//img')[0].attrib.get('src')
|
||||
title = ''.join(result.xpath(title_xpath))
|
||||
content = ''.join(result.xpath(content_xpath))
|
||||
pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
|
||||
publishedDate = parser.parse(pubdate)
|
||||
|
||||
# http to https
|
||||
thumbnail = thumbnail.replace("http://static.digg.com", "https://static.digg.com")
|
||||
for result in search_result['mapped']:
|
||||
|
||||
published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S")
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
results.append({'url': result['url'],
|
||||
'title': result['title'],
|
||||
'content': result['excerpt'],
|
||||
'template': 'videos.html',
|
||||
'publishedDate': publishedDate,
|
||||
'thumbnail': thumbnail})
|
||||
'publishedDate': published,
|
||||
'thumbnail': result['images']['thumbImage']})
|
||||
|
||||
# return results
|
||||
return results
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
from lxml.html import fromstring
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.utils import eval_xpath
|
||||
from searx.url_utils import urlencode
|
||||
|
||||
# engine dependent config
|
||||
|
|
@ -45,16 +46,16 @@ def response(resp):
|
|||
|
||||
# parse results
|
||||
# Quickhits
|
||||
for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'):
|
||||
for r in eval_xpath(doc, '//div[@class="search_quickresult"]/ul/li'):
|
||||
try:
|
||||
res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
|
||||
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
|
||||
except:
|
||||
continue
|
||||
|
||||
if not res_url:
|
||||
continue
|
||||
|
||||
title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
|
||||
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
|
||||
|
||||
# append result
|
||||
results.append({'title': title,
|
||||
|
|
@ -62,13 +63,13 @@ def response(resp):
|
|||
'url': base_url + res_url})
|
||||
|
||||
# Search results
|
||||
for r in doc.xpath('//dl[@class="search_results"]/*'):
|
||||
for r in eval_xpath(doc, '//dl[@class="search_results"]/*'):
|
||||
try:
|
||||
if r.tag == "dt":
|
||||
res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
|
||||
title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
|
||||
res_url = eval_xpath(r, './/a[@class="wikilink1"]/@href')[-1]
|
||||
title = extract_text(eval_xpath(r, './/a[@class="wikilink1"]/@title'))
|
||||
elif r.tag == "dd":
|
||||
content = extract_text(r.xpath('.'))
|
||||
content = extract_text(eval_xpath(r, '.'))
|
||||
|
||||
# append result
|
||||
results.append({'title': title,
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ from json import loads
|
|||
from searx.engines.xpath import extract_text
|
||||
from searx.poolrequests import get
|
||||
from searx.url_utils import urlencode
|
||||
from searx.utils import match_language
|
||||
from searx.utils import match_language, eval_xpath
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
|
|
@ -65,21 +65,36 @@ def get_region_code(lang, lang_list=[]):
|
|||
|
||||
|
||||
def request(query, params):
|
||||
if params['time_range'] and params['time_range'] not in time_range_dict:
|
||||
if params['time_range'] not in (None, 'None', '') and params['time_range'] not in time_range_dict:
|
||||
return params
|
||||
|
||||
offset = (params['pageno'] - 1) * 30
|
||||
|
||||
region_code = get_region_code(params['language'], supported_languages)
|
||||
if region_code:
|
||||
params['url'] = url.format(
|
||||
query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
|
||||
params['url'] = 'https://duckduckgo.com/html/'
|
||||
if params['pageno'] > 1:
|
||||
params['method'] = 'POST'
|
||||
params['data']['q'] = query
|
||||
params['data']['s'] = offset
|
||||
params['data']['dc'] = 30
|
||||
params['data']['nextParams'] = ''
|
||||
params['data']['v'] = 'l'
|
||||
params['data']['o'] = 'json'
|
||||
params['data']['api'] = '/d.js'
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['data']['df'] = time_range_dict[params['time_range']]
|
||||
if region_code:
|
||||
params['data']['kl'] = region_code
|
||||
else:
|
||||
params['url'] = url.format(
|
||||
query=urlencode({'q': query}), offset=offset, dc_param=offset)
|
||||
if region_code:
|
||||
params['url'] = url.format(
|
||||
query=urlencode({'q': query, 'kl': region_code}), offset=offset, dc_param=offset)
|
||||
else:
|
||||
params['url'] = url.format(
|
||||
query=urlencode({'q': query}), offset=offset, dc_param=offset)
|
||||
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
|
||||
if params['time_range'] in time_range_dict:
|
||||
params['url'] += time_range_url.format(range=time_range_dict[params['time_range']])
|
||||
|
||||
return params
|
||||
|
||||
|
|
@ -91,17 +106,19 @@ def response(resp):
|
|||
doc = fromstring(resp.text)
|
||||
|
||||
# parse results
|
||||
for r in doc.xpath(result_xpath):
|
||||
for i, r in enumerate(eval_xpath(doc, result_xpath)):
|
||||
if i >= 30:
|
||||
break
|
||||
try:
|
||||
res_url = r.xpath(url_xpath)[-1]
|
||||
res_url = eval_xpath(r, url_xpath)[-1]
|
||||
except:
|
||||
continue
|
||||
|
||||
if not res_url:
|
||||
continue
|
||||
|
||||
title = extract_text(r.xpath(title_xpath))
|
||||
content = extract_text(r.xpath(content_xpath))
|
||||
title = extract_text(eval_xpath(r, title_xpath))
|
||||
content = extract_text(eval_xpath(r, content_xpath))
|
||||
|
||||
# append result
|
||||
results.append({'title': title,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,14 @@
|
|||
"""
|
||||
DuckDuckGo (definitions)
|
||||
|
||||
- `Instant Answer API`_
|
||||
- `DuckDuckGo query`_
|
||||
|
||||
.. _Instant Answer API: https://duckduckgo.com/api
|
||||
.. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
from lxml import html
|
||||
from re import compile
|
||||
|
|
@ -25,7 +36,8 @@ def result_to_text(url, text, htmlResult):
|
|||
def request(query, params):
|
||||
params['url'] = url.format(query=urlencode({'q': query}))
|
||||
language = match_language(params['language'], supported_languages, language_aliases)
|
||||
params['headers']['Accept-Language'] = language.split('-')[0]
|
||||
language = language.split('-')[0]
|
||||
params['headers']['Accept-Language'] = language
|
||||
return params
|
||||
|
||||
|
||||
|
|
@ -43,8 +55,9 @@ def response(resp):
|
|||
|
||||
# add answer if there is one
|
||||
answer = search_res.get('Answer', '')
|
||||
if answer != '':
|
||||
results.append({'answer': html_to_text(answer)})
|
||||
if answer:
|
||||
if search_res.get('AnswerType', '') not in ['calc']:
|
||||
results.append({'answer': html_to_text(answer)})
|
||||
|
||||
# add infobox
|
||||
if 'Definition' in search_res:
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
from lxml import html, etree
|
||||
import re
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.utils import eval_xpath
|
||||
from searx.url_utils import quote, urljoin
|
||||
from searx import logger
|
||||
|
||||
|
|
@ -52,9 +53,9 @@ def response(resp):
|
|||
dom = html.fromstring(resp.text)
|
||||
|
||||
try:
|
||||
number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
|
||||
'//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
|
||||
)
|
||||
number_of_results_string =\
|
||||
re.sub('[^0-9]', '',
|
||||
eval_xpath(dom, '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0])
|
||||
|
||||
results.append({'number_of_results': int(number_of_results_string)})
|
||||
|
||||
|
|
@ -62,12 +63,12 @@ def response(resp):
|
|||
logger.debug("Couldn't read number of results.")
|
||||
pass
|
||||
|
||||
for result in dom.xpath('//section[not(contains(@class, "essay"))]'):
|
||||
for result in eval_xpath(dom, '//section[not(contains(@class, "essay"))]'):
|
||||
try:
|
||||
url = result.xpath('.//h2/a')[0].get('href')
|
||||
url = eval_xpath(result, './/h2/a')[0].get('href')
|
||||
url = urljoin(base_url, url)
|
||||
title = result.xpath('string(.//h2/a)').strip()
|
||||
content = extract_text(result.xpath('.//p'))
|
||||
title = eval_xpath(result, 'string(.//h2/a)').strip()
|
||||
content = extract_text(eval_xpath(result, './/p'))
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
|
|
|
|||
|
|
@ -18,13 +18,13 @@ categories = ['files']
|
|||
paging = True
|
||||
|
||||
# search-url
|
||||
base_url = 'https://f-droid.org/'
|
||||
search_url = base_url + 'repository/browse/?{query}'
|
||||
base_url = 'https://search.f-droid.org/'
|
||||
search_url = base_url + '?{query}'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
query = urlencode({'fdfilter': query, 'fdpage': params['pageno']})
|
||||
query = urlencode({'q': query, 'page': params['pageno'], 'lang': ''})
|
||||
params['url'] = search_url.format(query=query)
|
||||
return params
|
||||
|
||||
|
|
@ -35,17 +35,16 @@ def response(resp):
|
|||
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
for app in dom.xpath('//div[@id="appheader"]'):
|
||||
url = app.xpath('./ancestor::a/@href')[0]
|
||||
title = app.xpath('./p/span/text()')[0]
|
||||
img_src = app.xpath('.//img/@src')[0]
|
||||
for app in dom.xpath('//a[@class="package-header"]'):
|
||||
app_url = app.xpath('./@href')[0]
|
||||
app_title = extract_text(app.xpath('./div/h4[@class="package-name"]/text()'))
|
||||
app_content = extract_text(app.xpath('./div/div/span[@class="package-summary"]')).strip() \
|
||||
+ ' - ' + extract_text(app.xpath('./div/div/span[@class="package-license"]')).strip()
|
||||
app_img_src = app.xpath('./img[@class="package-icon"]/@src')[0]
|
||||
|
||||
content = extract_text(app.xpath('./p')[0])
|
||||
content = content.replace(title, '', 1).strip()
|
||||
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
'content': content,
|
||||
'img_src': img_src})
|
||||
results.append({'url': app_url,
|
||||
'title': app_title,
|
||||
'content': app_content,
|
||||
'img_src': app_img_src})
|
||||
|
||||
return results
|
||||
|
|
|
|||
|
|
@ -16,7 +16,8 @@ from json import loads
|
|||
from time import time
|
||||
import re
|
||||
from searx.engines import logger
|
||||
from searx.url_utils import urlencode, unquote
|
||||
from searx.url_utils import urlencode
|
||||
from searx.utils import ecma_unescape, html_to_text
|
||||
|
||||
logger = logger.getChild('flickr-noapi')
|
||||
|
||||
|
|
@ -75,11 +76,10 @@ def response(resp):
|
|||
|
||||
for index in legend:
|
||||
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
|
||||
author = unquote(photo.get('realname', ''))
|
||||
source = unquote(photo.get('username', '')) + ' @ Flickr'
|
||||
title = unquote(photo.get('title', ''))
|
||||
content = unquote(photo.get('description', ''))
|
||||
|
||||
author = ecma_unescape(photo.get('realname', ''))
|
||||
source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
|
||||
title = ecma_unescape(photo.get('title', ''))
|
||||
content = html_to_text(ecma_unescape(photo.get('description', '')))
|
||||
img_src = None
|
||||
# From the biggest to the lowest format
|
||||
for image_size in image_sizes:
|
||||
|
|
|
|||
|
|
@ -10,7 +10,10 @@
|
|||
@parse url, title, content, thumbnail, img_src
|
||||
"""
|
||||
|
||||
from cgi import escape
|
||||
try:
|
||||
from cgi import escape
|
||||
except:
|
||||
from html import escape
|
||||
from lxml import html
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.url_utils import urljoin, urlencode
|
||||
|
|
|
|||
|
|
@ -14,7 +14,9 @@ import random
|
|||
from json import loads
|
||||
from time import time
|
||||
from lxml.html import fromstring
|
||||
from searx.poolrequests import get
|
||||
from searx.url_utils import urlencode
|
||||
from searx.utils import eval_xpath
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
|
|
@ -30,13 +32,9 @@ search_string = 'search?{query}'\
|
|||
'&c=main'\
|
||||
'&s={offset}'\
|
||||
'&format=json'\
|
||||
'&qh=0'\
|
||||
'&qlang={lang}'\
|
||||
'&langcountry={lang}'\
|
||||
'&ff={safesearch}'\
|
||||
'&rxiec={rxieu}'\
|
||||
'&ulse={ulse}'\
|
||||
'&rand={rxikd}' # current unix timestamp
|
||||
|
||||
'&rand={rxikd}'
|
||||
# specific xpath variables
|
||||
results_xpath = '//response//result'
|
||||
url_xpath = './/url'
|
||||
|
|
@ -45,9 +43,26 @@ content_xpath = './/sum'
|
|||
|
||||
supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
|
||||
|
||||
extra_param = '' # gigablast requires a random extra parameter
|
||||
# which can be extracted from the source code of the search page
|
||||
|
||||
|
||||
def parse_extra_param(text):
    """Extract the extra query parameter gigablast requires from a search page.

    Lines of ``text`` that start with ``var url=`` or ``url=url+`` each carry
    one single-quoted URL fragment.  The fragments are concatenated in order
    and the last ``&``-separated piece of the result is stored in the
    module-level ``extra_param``.

    :param text: source of a gigablast search page
    :returns: None; the result is published through the global
              ``extra_param``, which becomes ``''`` when no fragment is found
    """
    global extra_param
    fragments = []
    for line in text.splitlines():
        if not line.startswith(('var url=', 'url=url+')):
            continue
        pieces = line.split("'")
        # A matching line without a single-quoted fragment carries nothing
        # usable; skip it instead of failing with IndexError.
        if len(pieces) > 1:
            fragments.append(pieces[1])
    # only the last parameter of the assembled URL is the one to replay
    extra_param = ''.join(fragments).split('&')[-1]
|
||||
|
||||
|
||||
def init(engine_settings=None):
    """Prime the module-level ``extra_param`` from a live gigablast search page.

    :param engine_settings: accepted so the function can be called with the
                            engine's settings; not used here
    """
    # Fetch over HTTPS, like supported_languages_url, so the page the
    # parameter is scraped from cannot be altered in transit.
    parse_extra_param(get('https://gigablast.com/search?c=main&qlangcountry=en-us&q=south&s=10').text)
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
print("EXTRAPARAM:", extra_param)
|
||||
offset = (params['pageno'] - 1) * number_of_results
|
||||
|
||||
if params['language'] == 'all':
|
||||
|
|
@ -66,13 +81,11 @@ def request(query, params):
|
|||
search_path = search_string.format(query=urlencode({'q': query}),
|
||||
offset=offset,
|
||||
number_of_results=number_of_results,
|
||||
rxikd=int(time() * 1000),
|
||||
rxieu=random.randint(1000000000, 9999999999),
|
||||
ulse=random.randint(100000000, 999999999),
|
||||
lang=language,
|
||||
rxikd=int(time() * 1000),
|
||||
safesearch=safesearch)
|
||||
|
||||
params['url'] = base_url + search_path
|
||||
params['url'] = base_url + search_path + '&' + extra_param
|
||||
|
||||
return params
|
||||
|
||||
|
|
@ -82,7 +95,11 @@ def response(resp):
|
|||
results = []
|
||||
|
||||
# parse results
|
||||
response_json = loads(resp.text)
|
||||
try:
|
||||
response_json = loads(resp.text)
|
||||
except:
|
||||
parse_extra_param(resp.text)
|
||||
raise Exception('extra param expired, please reload')
|
||||
|
||||
for result in response_json['results']:
|
||||
# append result
|
||||
|
|
@ -98,9 +115,9 @@ def response(resp):
|
|||
def _fetch_supported_languages(resp):
|
||||
supported_languages = []
|
||||
dom = fromstring(resp.text)
|
||||
links = dom.xpath('//span[@id="menu2"]/a')
|
||||
links = eval_xpath(dom, '//span[@id="menu2"]/a')
|
||||
for link in links:
|
||||
href = link.xpath('./@href')[0].split('lang%3A')
|
||||
href = eval_xpath(link, './@href')[0].split('lang%3A')
|
||||
if len(href) == 2:
|
||||
code = href[1].split('_')
|
||||
if len(code) == 2:
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ from lxml import html, etree
|
|||
from searx.engines.xpath import extract_text, extract_url
|
||||
from searx import logger
|
||||
from searx.url_utils import urlencode, urlparse, parse_qsl
|
||||
from searx.utils import match_language
|
||||
from searx.utils import match_language, eval_xpath
|
||||
|
||||
logger = logger.getChild('google engine')
|
||||
|
||||
|
|
@ -107,13 +107,12 @@ images_path = '/images'
|
|||
supported_languages_url = 'https://www.google.com/preferences?#languages'
|
||||
|
||||
# specific xpath variables
|
||||
results_xpath = '//div[@class="g"]'
|
||||
url_xpath = './/h3/a/@href'
|
||||
title_xpath = './/h3'
|
||||
content_xpath = './/span[@class="st"]'
|
||||
content_misc_xpath = './/div[@class="f slp"]'
|
||||
suggestion_xpath = '//p[@class="_Bmc"]'
|
||||
spelling_suggestion_xpath = '//a[@class="spell"]'
|
||||
results_xpath = '//div[contains(@class, "ZINbbc")]'
|
||||
url_xpath = './/div[@class="kCrYT"][1]/a/@href'
|
||||
title_xpath = './/div[@class="kCrYT"][1]/a/div[1]'
|
||||
content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]'
|
||||
suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]'
|
||||
spelling_suggestion_xpath = '//div[@id="scc"]//a'
|
||||
|
||||
# map : detail location
|
||||
map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
|
||||
|
|
@ -156,7 +155,7 @@ def parse_url(url_string, google_hostname):
|
|||
|
||||
# returns extract_text on the first result selected by the xpath or None
|
||||
def extract_text_from_dom(result, xpath):
|
||||
r = result.xpath(xpath)
|
||||
r = eval_xpath(result, xpath)
|
||||
if len(r) > 0:
|
||||
return extract_text(r[0])
|
||||
return None
|
||||
|
|
@ -199,9 +198,6 @@ def request(query, params):
|
|||
params['headers']['Accept-Language'] = language + ',' + language + '-' + country
|
||||
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
|
||||
# Force Internet Explorer 12 user agent to avoid loading the new UI that Searx can't parse
|
||||
params['headers']['User-Agent'] = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
|
||||
|
||||
params['google_hostname'] = google_hostname
|
||||
|
||||
return params
|
||||
|
|
@ -226,21 +222,21 @@ def response(resp):
|
|||
# convert the text to dom
|
||||
dom = html.fromstring(resp.text)
|
||||
|
||||
instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
|
||||
instant_answer = eval_xpath(dom, '//div[@id="_vBb"]//text()')
|
||||
if instant_answer:
|
||||
results.append({'answer': u' '.join(instant_answer)})
|
||||
try:
|
||||
results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
|
||||
results_num = int(eval_xpath(dom, '//div[@id="resultStats"]//text()')[0]
|
||||
.split()[1].replace(',', ''))
|
||||
results.append({'number_of_results': results_num})
|
||||
except:
|
||||
pass
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath(results_xpath):
|
||||
for result in eval_xpath(dom, results_xpath):
|
||||
try:
|
||||
title = extract_text(result.xpath(title_xpath)[0])
|
||||
url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
|
||||
title = extract_text(eval_xpath(result, title_xpath)[0])
|
||||
url = parse_url(extract_url(eval_xpath(result, url_xpath), google_url), google_hostname)
|
||||
parsed_url = urlparse(url, google_hostname)
|
||||
|
||||
# map result
|
||||
|
|
@ -249,7 +245,7 @@ def response(resp):
|
|||
continue
|
||||
# if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
|
||||
# print "yooooo"*30
|
||||
# x = result.xpath(map_near)
|
||||
# x = eval_xpath(result, map_near)
|
||||
# if len(x) > 0:
|
||||
# # map : near the location
|
||||
# results = results + parse_map_near(parsed_url, x, google_hostname)
|
||||
|
|
@ -273,9 +269,7 @@ def response(resp):
|
|||
content = extract_text_from_dom(result, content_xpath)
|
||||
if content is None:
|
||||
continue
|
||||
content_misc = extract_text_from_dom(result, content_misc_xpath)
|
||||
if content_misc is not None:
|
||||
content = content_misc + "<br />" + content
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
|
|
@ -286,11 +280,11 @@ def response(resp):
|
|||
continue
|
||||
|
||||
# parse suggestion
|
||||
for suggestion in dom.xpath(suggestion_xpath):
|
||||
for suggestion in eval_xpath(dom, suggestion_xpath):
|
||||
# append suggestion
|
||||
results.append({'suggestion': extract_text(suggestion)})
|
||||
|
||||
for correction in dom.xpath(spelling_suggestion_xpath):
|
||||
for correction in eval_xpath(dom, spelling_suggestion_xpath):
|
||||
results.append({'correction': extract_text(correction)})
|
||||
|
||||
# return results
|
||||
|
|
@ -299,9 +293,9 @@ def response(resp):
|
|||
|
||||
def parse_images(result, google_hostname):
|
||||
results = []
|
||||
for image in result.xpath(images_xpath):
|
||||
url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname)
|
||||
img_src = extract_text(image.xpath(image_img_src_xpath)[0])
|
||||
for image in eval_xpath(result, images_xpath):
|
||||
url = parse_url(extract_text(eval_xpath(image, image_url_xpath)[0]), google_hostname)
|
||||
img_src = extract_text(eval_xpath(image, image_img_src_xpath)[0])
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
|
|
@ -388,10 +382,10 @@ def attributes_to_html(attributes):
|
|||
def _fetch_supported_languages(resp):
|
||||
supported_languages = {}
|
||||
dom = html.fromstring(resp.text)
|
||||
options = dom.xpath('//*[@id="langSec"]//input[@name="lr"]')
|
||||
options = eval_xpath(dom, '//*[@id="langSec"]//input[@name="lr"]')
|
||||
for option in options:
|
||||
code = option.xpath('./@value')[0].split('_')[-1]
|
||||
name = option.xpath('./@data-name')[0].title()
|
||||
code = eval_xpath(option, './@value')[0].split('_')[-1]
|
||||
name = eval_xpath(option, './@data-name')[0].title()
|
||||
supported_languages[code] = {"name": name}
|
||||
|
||||
return supported_languages
|
||||
|
|
|
|||
|
|
@ -70,11 +70,21 @@ def response(resp):
|
|||
|
||||
try:
|
||||
metadata = loads(result)
|
||||
img_format = "{0} {1}x{2}".format(metadata['ity'], str(metadata['ow']), str(metadata['oh']))
|
||||
source = "{0} ({1})".format(metadata['st'], metadata['isu'])
|
||||
|
||||
img_format = metadata.get('ity', '')
|
||||
img_width = metadata.get('ow', '')
|
||||
img_height = metadata.get('oh', '')
|
||||
if img_width and img_height:
|
||||
img_format += " {0}x{1}".format(img_width, img_height)
|
||||
|
||||
source = metadata.get('st', '')
|
||||
source_url = metadata.get('isu', '')
|
||||
if source_url:
|
||||
source += " ({0})".format(source_url)
|
||||
|
||||
results.append({'url': metadata['ru'],
|
||||
'title': metadata['pt'],
|
||||
'content': metadata['s'],
|
||||
'content': metadata.get('s', ''),
|
||||
'source': source,
|
||||
'img_format': img_format,
|
||||
'thumbnail_src': metadata['tu'],
|
||||
|
|
|
|||
|
|
@ -75,15 +75,17 @@ def response(resp):
|
|||
|
||||
# get thumbnails
|
||||
script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
|
||||
id = result.xpath('.//div[@class="s"]//img/@id')[0]
|
||||
thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id,
|
||||
script)
|
||||
tmp = []
|
||||
if len(thumbnails_data) != 0:
|
||||
tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
|
||||
thumbnail = ''
|
||||
if len(tmp) != 0:
|
||||
thumbnail = tmp[-1]
|
||||
ids = result.xpath('.//div[@class="s"]//img/@id')
|
||||
if len(ids) > 0:
|
||||
thumbnails_data = \
|
||||
re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + ids[0],
|
||||
script)
|
||||
tmp = []
|
||||
if len(thumbnails_data) != 0:
|
||||
tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
|
||||
thumbnail = ''
|
||||
if len(tmp) != 0:
|
||||
thumbnail = tmp[-1]
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
|
|
|
|||
100
searx/engines/invidious.py
Normal file
100
searx/engines/invidious.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# Invidious (Videos)
|
||||
#
|
||||
# @website https://invidio.us/
|
||||
# @provide-api yes (https://github.com/omarroth/invidious/wiki/API)
|
||||
#
|
||||
# @using-api yes
|
||||
# @results JSON
|
||||
# @stable yes
|
||||
# @parse url, title, content, publishedDate, thumbnail, embedded
|
||||
|
||||
from searx.url_utils import quote_plus
|
||||
from dateutil import parser
|
||||
import time
|
||||
|
||||
# engine dependent config
|
||||
categories = ["videos", "music"]
|
||||
paging = True
|
||||
language_support = True
|
||||
time_range_support = True
|
||||
|
||||
# search-url
|
||||
base_url = "https://invidio.us/"
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
    """Build the Invidious search API request for *query*.

    Writes the ``api/v1/search`` URL into ``params["url"]``. The URL always
    carries the URL-encoded query and the page number; an upload-date filter
    and a region are appended when the request provides them.

    :param query: search terms (encoded with ``quote_plus``)
    :param params: request parameters; ``pageno``, ``time_range`` and
        ``language`` are read, ``url`` is written
    :return: the same ``params`` dict
    """
    # searx time-range names -> values sent in the ``date`` URL parameter
    time_range_dict = {
        "day": "today",
        "week": "week",
        "month": "month",
        "year": "year",
    }

    search_url = base_url + "api/v1/search?q={query}"
    params["url"] = search_url.format(query=quote_plus(query))
    params["url"] += "&page={pageno}".format(pageno=params["pageno"])

    if params["time_range"] in time_range_dict:
        params["url"] += "&date={timerange}".format(
            timerange=time_range_dict[params["time_range"]]
        )

    if params["language"] != "all":
        lang = params["language"].split("-")
        # only "<language>-<COUNTRY>" codes carry a country part
        if len(lang) == 2:
            # the Invidious search API names the country parameter
            # ``region``; the previous ``range`` key is not a documented
            # parameter, so the country was never taken into account
            params["url"] += "&region={region}".format(region=lang[1])

    return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
    """Convert an Invidious search API response into searx video results.

    Only entries whose ``type`` is ``"video"`` and that carry a ``videoId``
    are kept. Each result uses the ``videos.html`` template and contains the
    watch URL, title, description, publication date, an embeddable iframe
    and the ``sddefault`` thumbnail URL (empty string when there is none).

    :param resp: HTTP response whose ``json()`` yields the list of entries
    :return: list of result dicts
    """
    results = []

    search_results = resp.json()
    embedded_url = (
        '<iframe width="540" height="304" '
        + 'data-src="'
        + base_url
        + 'embed/{videoid}" '
        + 'frameborder="0" allowfullscreen></iframe>'
    )

    base_invidious_url = base_url + "watch?v="

    for result in search_results:
        # skip every entry that is not a video
        if result.get("type", None) != "video":
            continue

        videoid = result.get("videoId", None)
        if not videoid:
            continue

        url = base_invidious_url + videoid
        embedded = embedded_url.format(videoid=videoid)

        # use .get() so a thumbnail entry without a "quality" key is
        # skipped instead of raising KeyError and losing the whole page
        thumbs = result.get("videoThumbnails", [])
        thumb = next(
            (th for th in thumbs if th.get("quality") == "sddefault"), None
        )
        thumbnail = thumb.get("url", "") if thumb else ""

        # "published" is formatted by time.ctime (local time) and parsed
        # back, which yields a datetime without timezone information
        publishedDate = parser.parse(
            time.ctime(result.get("published", 0))
        )

        results.append(
            {
                "url": url,
                "title": result.get("title", ""),
                "content": result.get("description", ""),
                "template": "videos.html",
                "publishedDate": publishedDate,
                "embedded": embedded,
                "thumbnail": thumbnail,
            }
        )

    return results
|
||||
|
|
@ -24,7 +24,7 @@ result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
|
|||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
params['url'] = base_url + search_string.format(query=query)
|
||||
params['url'] = base_url + search_string.format(query=query.decode('utf-8'))
|
||||
|
||||
return params
|
||||
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ def request(query, params):
|
|||
language = match_language(params['language'], supported_languages, language_aliases)
|
||||
params['url'] += '&locale=' + language.replace('-', '_').lower()
|
||||
|
||||
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
|
||||
return params
|
||||
|
||||
|
||||
|
|
|
|||
78
searx/engines/seedpeer.py
Normal file
78
searx/engines/seedpeer.py
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
# Seedpeer (Videos, Music, Files)
|
||||
#
|
||||
# @website https://seedpeer.me
|
||||
# @provide-api no (nothing found)
|
||||
#
|
||||
# @using-api no
|
||||
# @results HTML (using search portal)
|
||||
# @stable yes (HTML can change)
|
||||
# @parse url, title, content, seed, leech, magnetlink
|
||||
|
||||
from lxml import html
|
||||
from json import loads
|
||||
from operator import itemgetter
|
||||
from searx.url_utils import quote, urljoin
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
|
||||
url = 'https://seedpeer.me/'
|
||||
search_url = url + 'search/{search_term}?page={page_no}'
|
||||
torrent_file_url = url + 'torrent/{torrent_hash}'
|
||||
|
||||
# specific xpath variables
|
||||
script_xpath = '//script[@type="text/javascript"][not(@src)]'
|
||||
torrent_xpath = '(//table)[2]/tbody/tr'
|
||||
link_xpath = '(./td)[1]/a/@href'
|
||||
age_xpath = '(./td)[2]'
|
||||
size_xpath = '(./td)[3]'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
    """Fill ``params['url']`` with the Seedpeer search URL for *query*.

    The query is percent-encoded with ``quote`` and the page number is taken
    from ``params['pageno']``. Returns the same ``params`` dict.
    """
    encoded_term = quote(query)
    page_number = params['pageno']
    params['url'] = search_url.format(search_term=encoded_term, page_no=page_number)
    return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
    """Parse a Seedpeer search page into torrent results.

    Torrent metadata (name, seeds, peers, size, hash) is read from the JSON
    object embedded in the first inline script of the page; the age text and
    the detail link are read from the matching row of the result table.

    :param resp: HTTP response whose ``text`` is the HTML search page
    :return: list of result dicts (``torrent.html`` template) sorted by seed
        count, highest first; an empty list when the inline script or its
        JSON cannot be read
    """
    results = []
    dom = html.fromstring(resp.text)
    result_rows = dom.xpath(torrent_xpath)

    try:
        script_element = dom.xpath(script_xpath)[0]
        # the JSON payload starts at the first '{' of the script text
        json_string = script_element.text[script_element.text.find('{'):]
        torrents_json = loads(json_string)
    except Exception:
        # missing script, empty script text or invalid JSON: no results.
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt
        # and SystemExit are no longer swallowed here.
        return []

    # parse results: table rows and JSON entries are paired by position
    for torrent_row, torrent_json in zip(result_rows, torrents_json['data']['list']):
        title = torrent_json['name']
        seed = int(torrent_json['seeds'])
        leech = int(torrent_json['peers'])
        size = int(torrent_json['size'])
        torrent_hash = torrent_json['hash']

        torrentfile = torrent_file_url.format(torrent_hash=torrent_hash)
        magnetlink = 'magnet:?xt=urn:btih:{}'.format(torrent_hash)

        age = extract_text(torrent_row.xpath(age_xpath))
        link = torrent_row.xpath(link_xpath)[0]

        href = urljoin(url, link)

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': age,
                        'seed': seed,
                        'leech': leech,
                        'filesize': size,
                        'torrentfile': torrentfile,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
|
||||
|
|
@ -51,7 +51,9 @@ def get_client_id():
|
|||
|
||||
if response.ok:
|
||||
tree = html.fromstring(response.content)
|
||||
script_tags = tree.xpath("//script[contains(@src, '/assets/app')]")
|
||||
# script_tags has been moved from /assets/app/ to /assets/ path. I
|
||||
# found client_id in https://a-v2.sndcdn.com/assets/49-a0c01933-3.js
|
||||
script_tags = tree.xpath("//script[contains(@src, '/assets/')]")
|
||||
app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None]
|
||||
|
||||
# extracts valid app_js urls from soundcloud.com content
|
||||
|
|
@ -66,7 +68,7 @@ def get_client_id():
|
|||
return ""
|
||||
|
||||
|
||||
def init():
|
||||
def init(engine_settings=None):
|
||||
global guest_client_id
|
||||
# api-key
|
||||
guest_client_id = get_client_id()
|
||||
|
|
|
|||
|
|
@ -15,6 +15,8 @@ from dateutil import parser
|
|||
from datetime import datetime, timedelta
|
||||
import re
|
||||
from searx.engines.xpath import extract_text
|
||||
from searx.languages import language_codes
|
||||
from searx.utils import eval_xpath
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
|
|
@ -22,7 +24,7 @@ categories = ['general']
|
|||
# (probably the parameter qid), require
|
||||
# storing of qid's between multiple search-calls
|
||||
|
||||
# paging = False
|
||||
paging = True
|
||||
language_support = True
|
||||
|
||||
# search-url
|
||||
|
|
@ -32,23 +34,32 @@ search_url = base_url + 'do/search'
|
|||
# specific xpath variables
|
||||
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
|
||||
# not ads: div[@class="result"] are the direct children of div[@id="results"]
|
||||
results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
|
||||
link_xpath = './/h3/a'
|
||||
content_xpath = './p[@class="search-item__body"]'
|
||||
results_xpath = '//div[@class="w-gl__result"]'
|
||||
link_xpath = './/a[@class="w-gl__result-title"]'
|
||||
content_xpath = './/p[@class="w-gl__description"]'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
offset = (params['pageno'] - 1) * 10
|
||||
|
||||
params['url'] = search_url
|
||||
params['method'] = 'POST'
|
||||
params['data'] = {'query': query,
|
||||
'startat': offset}
|
||||
params['data'] = {
|
||||
'query': query,
|
||||
'page': params['pageno'],
|
||||
'cat': 'web',
|
||||
'cmd': 'process_search',
|
||||
'engine0': 'v1all',
|
||||
}
|
||||
|
||||
# set language if specified
|
||||
if params['language'] != 'all':
|
||||
params['data']['with_language'] = ('lang_' + params['language'].split('-')[0])
|
||||
language = 'english'
|
||||
for lc, _, _, lang in language_codes:
|
||||
if lc == params['language']:
|
||||
language = lang
|
||||
params['data']['language'] = language
|
||||
params['data']['lui'] = language
|
||||
|
||||
return params
|
||||
|
||||
|
|
@ -60,8 +71,8 @@ def response(resp):
|
|||
dom = html.fromstring(resp.text)
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath(results_xpath):
|
||||
links = result.xpath(link_xpath)
|
||||
for result in eval_xpath(dom, results_xpath):
|
||||
links = eval_xpath(result, link_xpath)
|
||||
if not links:
|
||||
continue
|
||||
link = links[0]
|
||||
|
|
@ -77,8 +88,8 @@ def response(resp):
|
|||
|
||||
title = extract_text(link)
|
||||
|
||||
if result.xpath(content_xpath):
|
||||
content = extract_text(result.xpath(content_xpath))
|
||||
if eval_xpath(result, content_xpath):
|
||||
content = extract_text(eval_xpath(result, content_xpath))
|
||||
else:
|
||||
content = ''
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ from searx.poolrequests import get
|
|||
from searx.engines.xpath import extract_text
|
||||
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
|
||||
from searx.url_utils import urlencode
|
||||
from searx.utils import match_language
|
||||
from searx.utils import match_language, eval_xpath
|
||||
|
||||
from json import loads
|
||||
from lxml.html import fromstring
|
||||
|
|
@ -57,22 +57,6 @@ language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator
|
|||
calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
|
||||
media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
|
||||
|
||||
# xpath_cache
|
||||
xpath_cache = {}
|
||||
|
||||
|
||||
def get_xpath(xpath_str):
    """Return the compiled ``etree.XPath`` for *xpath_str*.

    Compiled expressions are kept in the module-level ``xpath_cache`` dict,
    keyed by the expression string, so each expression is compiled once.
    """
    compiled = xpath_cache.get(xpath_str, None)
    if compiled:
        return compiled

    # not cached yet: compile and remember it
    compiled = etree.XPath(xpath_str)
    xpath_cache[xpath_str] = compiled
    return compiled
|
||||
|
||||
|
||||
def eval_xpath(element, xpath_str):
    """Evaluate *xpath_str* on *element* using the cached compiled XPath."""
    return get_xpath(xpath_str)(element)
|
||||
|
||||
|
||||
def get_id_cache(result):
|
||||
id_cache = {}
|
||||
|
|
|
|||
|
|
@ -21,7 +21,8 @@ search_url = base_url + u'w/api.php?'\
|
|||
'action=query'\
|
||||
'&format=json'\
|
||||
'&{query}'\
|
||||
'&prop=extracts|pageimages'\
|
||||
'&prop=extracts|pageimages|pageprops'\
|
||||
'&ppprop=disambiguation'\
|
||||
'&exintro'\
|
||||
'&explaintext'\
|
||||
'&pithumbsize=300'\
|
||||
|
|
@ -79,12 +80,15 @@ def response(resp):
|
|||
|
||||
# wikipedia article's unique id
|
||||
# first valid id is assumed to be the requested article
|
||||
if 'pages' not in search_result['query']:
|
||||
return results
|
||||
|
||||
for article_id in search_result['query']['pages']:
|
||||
page = search_result['query']['pages'][article_id]
|
||||
if int(article_id) > 0:
|
||||
break
|
||||
|
||||
if int(article_id) < 0:
|
||||
if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
|
||||
return []
|
||||
|
||||
title = page.get('title')
|
||||
|
|
@ -96,6 +100,7 @@ def response(resp):
|
|||
extract = page.get('extract')
|
||||
|
||||
summary = extract_first_paragraph(extract, title, image)
|
||||
summary = summary.replace('() ', '')
|
||||
|
||||
# link to wikipedia article
|
||||
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ def obtain_token():
|
|||
return token
|
||||
|
||||
|
||||
def init():
|
||||
def init(engine_settings=None):
|
||||
obtain_token()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -11,8 +11,8 @@
|
|||
"""
|
||||
|
||||
from lxml import html
|
||||
import re
|
||||
from searx.url_utils import urlencode, urljoin
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images']
|
||||
|
|
@ -34,41 +34,18 @@ def request(query, params):
|
|||
def response(resp):
|
||||
results = []
|
||||
|
||||
# get links from result-text
|
||||
regex = re.compile('(</a>|<a)')
|
||||
results_parts = re.split(regex, resp.text)
|
||||
|
||||
cur_element = ''
|
||||
|
||||
# iterate over link parts
|
||||
for result_part in results_parts:
|
||||
dom = html.fromstring(resp.text)
|
||||
for res in dom.xpath('//div[@class="List-item MainListing"]'):
|
||||
# processed start and end of link
|
||||
if result_part == '<a':
|
||||
cur_element = result_part
|
||||
continue
|
||||
elif result_part != '</a>':
|
||||
cur_element += result_part
|
||||
continue
|
||||
|
||||
cur_element += result_part
|
||||
|
||||
# fix xml-error
|
||||
cur_element = cur_element.replace('"></a>', '"/></a>')
|
||||
|
||||
dom = html.fromstring(cur_element)
|
||||
link = dom.xpath('//a')[0]
|
||||
link = res.xpath('//a')[0]
|
||||
|
||||
url = urljoin(base_url, link.attrib.get('href'))
|
||||
title = link.attrib.get('title', '')
|
||||
title = extract_text(link)
|
||||
|
||||
thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
|
||||
thumbnail_src = urljoin(base_url, res.xpath('.//img')[0].attrib['src'])
|
||||
# TODO: get image with higher resolution
|
||||
img_src = thumbnail_src
|
||||
|
||||
# check if url is showing to a photo
|
||||
if '/photo/' not in url:
|
||||
continue
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
'title': title,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
from lxml import html
|
||||
from lxml.etree import _ElementStringResult, _ElementUnicodeResult
|
||||
from searx.utils import html_to_text
|
||||
from searx.utils import html_to_text, eval_xpath
|
||||
from searx.url_utils import unquote, urlencode, urljoin, urlparse
|
||||
|
||||
search_url = None
|
||||
|
|
@ -104,15 +104,15 @@ def response(resp):
|
|||
results = []
|
||||
dom = html.fromstring(resp.text)
|
||||
if results_xpath:
|
||||
for result in dom.xpath(results_xpath):
|
||||
url = extract_url(result.xpath(url_xpath), search_url)
|
||||
title = extract_text(result.xpath(title_xpath))
|
||||
content = extract_text(result.xpath(content_xpath))
|
||||
for result in eval_xpath(dom, results_xpath):
|
||||
url = extract_url(eval_xpath(result, url_xpath), search_url)
|
||||
title = extract_text(eval_xpath(result, title_xpath))
|
||||
content = extract_text(eval_xpath(result, content_xpath))
|
||||
tmp_result = {'url': url, 'title': title, 'content': content}
|
||||
|
||||
# add thumbnail if available
|
||||
if thumbnail_xpath:
|
||||
thumbnail_xpath_result = result.xpath(thumbnail_xpath)
|
||||
thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
|
||||
if len(thumbnail_xpath_result) > 0:
|
||||
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
|
||||
|
||||
|
|
@ -120,14 +120,14 @@ def response(resp):
|
|||
else:
|
||||
for url, title, content in zip(
|
||||
(extract_url(x, search_url) for
|
||||
x in dom.xpath(url_xpath)),
|
||||
map(extract_text, dom.xpath(title_xpath)),
|
||||
map(extract_text, dom.xpath(content_xpath))
|
||||
x in eval_xpath(dom, url_xpath)),
|
||||
map(extract_text, eval_xpath(dom, title_xpath)),
|
||||
map(extract_text, eval_xpath(dom, content_xpath))
|
||||
):
|
||||
results.append({'url': url, 'title': title, 'content': content})
|
||||
|
||||
if not suggestion_xpath:
|
||||
return results
|
||||
for suggestion in dom.xpath(suggestion_xpath):
|
||||
for suggestion in eval_xpath(dom, suggestion_xpath):
|
||||
results.append({'suggestion': extract_text(suggestion)})
|
||||
return results
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
from lxml import html
|
||||
from searx.engines.xpath import extract_text, extract_url
|
||||
from searx.url_utils import unquote, urlencode
|
||||
from searx.utils import match_language
|
||||
from searx.utils import match_language, eval_xpath
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general']
|
||||
|
|
@ -109,21 +109,21 @@ def response(resp):
|
|||
dom = html.fromstring(resp.text)
|
||||
|
||||
try:
|
||||
results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0]
|
||||
results_num = int(eval_xpath(dom, '//div[@class="compPagination"]/span[last()]/text()')[0]
|
||||
.split()[0].replace(',', ''))
|
||||
results.append({'number_of_results': results_num})
|
||||
except:
|
||||
pass
|
||||
|
||||
# parse results
|
||||
for result in dom.xpath(results_xpath):
|
||||
for result in eval_xpath(dom, results_xpath):
|
||||
try:
|
||||
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
|
||||
title = extract_text(result.xpath(title_xpath)[0])
|
||||
url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url))
|
||||
title = extract_text(eval_xpath(result, title_xpath)[0])
|
||||
except:
|
||||
continue
|
||||
|
||||
content = extract_text(result.xpath(content_xpath)[0])
|
||||
content = extract_text(eval_xpath(result, content_xpath)[0])
|
||||
|
||||
# append result
|
||||
results.append({'url': url,
|
||||
|
|
@ -131,7 +131,7 @@ def response(resp):
|
|||
'content': content})
|
||||
|
||||
# if no suggestion found, return results
|
||||
suggestions = dom.xpath(suggestion_xpath)
|
||||
suggestions = eval_xpath(dom, suggestion_xpath)
|
||||
if not suggestions:
|
||||
return results
|
||||
|
||||
|
|
@ -148,9 +148,9 @@ def response(resp):
|
|||
def _fetch_supported_languages(resp):
|
||||
supported_languages = []
|
||||
dom = html.fromstring(resp.text)
|
||||
options = dom.xpath('//div[@id="yschlang"]/span/label/input')
|
||||
options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input')
|
||||
for option in options:
|
||||
code_parts = option.xpath('./@value')[0][5:].split('_')
|
||||
code_parts = eval_xpath(option, './@value')[0][5:].split('_')
|
||||
if len(code_parts) == 2:
|
||||
code = code_parts[0] + '-' + code_parts[1].upper()
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -67,12 +67,8 @@ def response(resp):
|
|||
if videoid is not None:
|
||||
url = base_youtube_url + videoid
|
||||
thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'
|
||||
title = video.get('title', {}).get('simpleText', videoid)
|
||||
description_snippet = video.get('descriptionSnippet', {})
|
||||
if 'runs' in description_snippet:
|
||||
content = reduce(lambda a, b: a + b.get('text', ''), description_snippet.get('runs'), '')
|
||||
else:
|
||||
content = description_snippet.get('simpleText', '')
|
||||
title = get_text_from_json(video.get('title', {}))
|
||||
content = get_text_from_json(video.get('descriptionSnippet', {}))
|
||||
embedded = embedded_url.format(videoid=videoid)
|
||||
|
||||
# append result
|
||||
|
|
@ -85,3 +81,10 @@ def response(resp):
|
|||
|
||||
# return results
|
||||
return results
|
||||
|
||||
|
||||
def get_text_from_json(element):
    """Return the plain text stored in a JSON text element.

    The element is a dict that holds its text either as a ``runs`` list of
    fragments (the ``text`` value of each fragment is concatenated, a
    fragment without ``text`` contributes nothing) or as a single
    ``simpleText`` string.

    :param element: dict with a ``runs`` list or a ``simpleText`` value
    :return: the text, or an empty string when neither key is present
    """
    if 'runs' in element:
        # join the fragments in one pass instead of folding with reduce()
        return ''.join(run.get('text', '') for run in element.get('runs'))
    return element.get('simpleText', '')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue