forked from zaclys/searxng
commit
db703a0283
|
@ -15,6 +15,7 @@ setup.cfg
|
||||||
*.pyc
|
*.pyc
|
||||||
*/*.pyc
|
*/*.pyc
|
||||||
*~
|
*~
|
||||||
|
*.swp
|
||||||
|
|
||||||
/node_modules
|
/node_modules
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -142,6 +142,17 @@ def load_engine(engine_data):
|
||||||
engine.stats['page_load_time'] = 0
|
engine.stats['page_load_time'] = 0
|
||||||
engine.stats['page_load_count'] = 0
|
engine.stats['page_load_count'] = 0
|
||||||
|
|
||||||
|
# tor related settings
|
||||||
|
if settings['outgoing'].get('using_tor_proxy'):
|
||||||
|
# use onion url if using tor.
|
||||||
|
if hasattr(engine, 'onion_url'):
|
||||||
|
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
|
||||||
|
elif 'onions' in engine.categories:
|
||||||
|
# exclude onion engines if not using tor.
|
||||||
|
return None
|
||||||
|
|
||||||
|
engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)
|
||||||
|
|
||||||
for category_name in engine.categories:
|
for category_name in engine.categories:
|
||||||
categories.setdefault(category_name, []).append(engine)
|
categories.setdefault(category_name, []).append(engine)
|
||||||
|
|
||||||
|
@ -252,8 +263,9 @@ def get_engines_stats(preferences):
|
||||||
|
|
||||||
|
|
||||||
def load_engines(engine_list):
|
def load_engines(engine_list):
|
||||||
global engines
|
global engines, engine_shortcuts
|
||||||
engines.clear()
|
engines.clear()
|
||||||
|
engine_shortcuts.clear()
|
||||||
for engine_data in engine_list:
|
for engine_data in engine_list:
|
||||||
engine = load_engine(engine_data)
|
engine = load_engine(engine_data)
|
||||||
if engine is not None:
|
if engine is not None:
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
"""
|
||||||
|
Ahmia (Onions)
|
||||||
|
|
||||||
|
@website http://msydqstlz2kzerdg.onion
|
||||||
|
@provides-api no
|
||||||
|
|
||||||
|
@using-api no
|
||||||
|
@results HTML
|
||||||
|
@stable no
|
||||||
|
@parse url, title, content
|
||||||
|
"""
|
||||||
|
|
||||||
|
from urllib.parse import urlencode, urlparse, parse_qs
|
||||||
|
from lxml.html import fromstring
|
||||||
|
from searx.engines.xpath import extract_url, extract_text
|
||||||
|
|
||||||
|
# engine config
|
||||||
|
categories = ['onions']
|
||||||
|
paging = True
|
||||||
|
page_size = 10
|
||||||
|
|
||||||
|
# search url
|
||||||
|
search_url = 'http://msydqstlz2kzerdg.onion/search/?{query}'
|
||||||
|
time_range_support = True
|
||||||
|
time_range_dict = {'day': 1,
|
||||||
|
'week': 7,
|
||||||
|
'month': 30}
|
||||||
|
|
||||||
|
# xpaths
|
||||||
|
results_xpath = '//li[@class="result"]'
|
||||||
|
url_xpath = './h4/a/@href'
|
||||||
|
title_xpath = './h4/a[1]'
|
||||||
|
content_xpath = './/p[1]'
|
||||||
|
correction_xpath = '//*[@id="didYouMean"]//a'
|
||||||
|
number_of_results_xpath = '//*[@id="totalResults"]'
|
||||||
|
|
||||||
|
|
||||||
|
def request(query, params):
    """Fill ``params['url']`` with the Ahmia search URL for *query*.

    When ``params['time_range']`` is one of the keys of ``time_range_dict``
    the matching number of days is appended as the ``d`` query argument.

    :param query: search terms entered by the user.
    :param params: request parameters; ``time_range`` is read, ``url`` is written.
    :returns: the same ``params`` mapping.
    """
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query)

    selected_range = params['time_range']
    if selected_range in time_range_dict:
        day_filter = urlencode({'d': time_range_dict[selected_range]})
        params['url'] = params['url'] + '&' + day_filter

    return params
|
||||||
|
|
||||||
|
|
||||||
|
def response(resp):
    """Parse an Ahmia result page into a list of searx result dictionaries.

    :param resp: response object; ``resp.text`` holds the HTML and
        ``resp.search_params`` the parameters of the originating request
        (``pageno`` is read, defaulting to 1).
    :returns: list containing one dict per onion result (``url``, ``title``,
        ``content``, ``is_onion``), one ``correction`` dict per spelling
        suggestion and, when the counter can be parsed as an integer, one
        ``number_of_results`` dict.
    """
    results = []
    dom = fromstring(resp.text)

    # trim results so there's not way too many at once: only the slice of
    # ``page_size`` entries that belongs to the requested page is kept
    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
    all_results = dom.xpath(results_xpath)
    trimmed_results = all_results[first_result_index:first_result_index + page_size]

    # get results
    for result in trimmed_results:
        # remove ahmia url and extract the actual url for the result, which
        # is carried in the ``redirect_url`` query argument of the link
        raw_url = extract_url(result.xpath(url_xpath), search_url)
        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]

        title = extract_text(result.xpath(title_xpath))
        content = extract_text(result.xpath(content_xpath))

        results.append({'url': cleaned_url,
                        'title': title,
                        'content': content,
                        'is_onion': True})

    # get spelling corrections
    for correction in dom.xpath(correction_xpath):
        results.append({'correction': extract_text(correction)})

    # get number of results
    number_of_results = dom.xpath(number_of_results_xpath)
    if number_of_results:
        try:
            results.append({'number_of_results': int(extract_text(number_of_results))})
        except (TypeError, ValueError):
            # the counter is optional: ignore text that is not a plain
            # integer instead of silencing every possible error
            pass

    return results
|
|
@ -0,0 +1,64 @@
|
||||||
|
"""
|
||||||
|
not Evil (Onions)
|
||||||
|
|
||||||
|
@website http://hss3uro2hsxfogfq.onion
|
||||||
|
@provides-api yes (http://hss3uro2hsxfogfq.onion/api.htm)
|
||||||
|
|
||||||
|
@using-api no
|
||||||
|
@results HTML
|
||||||
|
@stable no
|
||||||
|
@parse url, title, content
|
||||||
|
"""
|
||||||
|
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
from lxml import html
|
||||||
|
from searx.engines.xpath import extract_text
|
||||||
|
|
||||||
|
# engine dependent config
|
||||||
|
categories = ['onions']
|
||||||
|
paging = True
|
||||||
|
page_size = 20
|
||||||
|
|
||||||
|
# search-url
|
||||||
|
base_url = 'http://hss3uro2hsxfogfq.onion/'
|
||||||
|
search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'
|
||||||
|
|
||||||
|
# specific xpath variables
|
||||||
|
results_xpath = '//*[@id="content"]/div/p'
|
||||||
|
url_xpath = './span[1]'
|
||||||
|
title_xpath = './a[1]'
|
||||||
|
content_xpath = './text()'
|
||||||
|
|
||||||
|
|
||||||
|
# do search-request
|
||||||
|
def request(query, params):
    """Fill ``params['url']`` with the not Evil search URL for *query*.

    The ``start`` argument of the URL is a result offset, so the page number
    in ``params['pageno']`` is converted with ``page_size``.

    :param query: search terms entered by the user.
    :param params: request parameters; ``pageno`` is read, ``url`` is written.
    :returns: the same ``params`` mapping.
    """
    start_offset = page_size * (params['pageno'] - 1)
    encoded_query = urlencode({'q': query})

    relative_url = search_url.format(query=encoded_query,
                                     pageno=start_offset,
                                     page_size=page_size)
    params['url'] = base_url + relative_url

    return params
|
||||||
|
|
||||||
|
|
||||||
|
# get response from search-request
|
||||||
|
def response(resp):
    """Parse a not Evil result page into a list of searx result dictionaries.

    :param resp: response object; its ``encoding`` is forced to UTF-8 before
        ``resp.text`` is read.
    :returns: list of dicts with ``url``, ``title``, ``content`` and
        ``is_onion`` (always ``True``).
    """
    results = []

    # needed because otherwise requests guesses wrong encoding
    resp.encoding = 'utf8'
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        url_nodes = result.xpath(url_xpath)
        title_nodes = result.xpath(title_xpath)

        # a paragraph lacking the url or the title node is not a result;
        # skip it instead of failing the whole page with an IndexError
        if not url_nodes or not title_nodes:
            continue

        url = extract_text(url_nodes[0])
        title = extract_text(title_nodes[0])
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'is_onion': True})

    return results
|
|
@ -10,6 +10,8 @@ thumbnail_xpath = False
|
||||||
paging = False
|
paging = False
|
||||||
suggestion_xpath = ''
|
suggestion_xpath = ''
|
||||||
results_xpath = ''
|
results_xpath = ''
|
||||||
|
cached_xpath = ''
|
||||||
|
cached_url = ''
|
||||||
|
|
||||||
# parameters for engines with paging support
|
# parameters for engines with paging support
|
||||||
#
|
#
|
||||||
|
@ -36,6 +38,8 @@ def request(query, params):
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
results = []
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
|
is_onion = True if 'onions' in categories else False
|
||||||
|
|
||||||
if results_xpath:
|
if results_xpath:
|
||||||
for result in eval_xpath(dom, results_xpath):
|
for result in eval_xpath(dom, results_xpath):
|
||||||
url = extract_url(eval_xpath(result, url_xpath), search_url)
|
url = extract_url(eval_xpath(result, url_xpath), search_url)
|
||||||
|
@ -49,15 +53,33 @@ def response(resp):
|
||||||
if len(thumbnail_xpath_result) > 0:
|
if len(thumbnail_xpath_result) > 0:
|
||||||
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
|
tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
|
||||||
|
|
||||||
|
# add alternative cached url if available
|
||||||
|
if cached_xpath:
|
||||||
|
tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))
|
||||||
|
|
||||||
|
if is_onion:
|
||||||
|
tmp_result['is_onion'] = True
|
||||||
|
|
||||||
results.append(tmp_result)
|
results.append(tmp_result)
|
||||||
|
else:
|
||||||
|
if cached_xpath:
|
||||||
|
for url, title, content, cached in zip(
|
||||||
|
(extract_url(x, search_url) for
|
||||||
|
x in dom.xpath(url_xpath)),
|
||||||
|
map(extract_text, dom.xpath(title_xpath)),
|
||||||
|
map(extract_text, dom.xpath(content_xpath)),
|
||||||
|
map(extract_text, dom.xpath(cached_xpath))
|
||||||
|
):
|
||||||
|
results.append({'url': url, 'title': title, 'content': content,
|
||||||
|
'cached_url': cached_url + cached, 'is_onion': is_onion})
|
||||||
else:
|
else:
|
||||||
for url, title, content in zip(
|
for url, title, content in zip(
|
||||||
(extract_url(x, search_url) for
|
(extract_url(x, search_url) for
|
||||||
x in eval_xpath(dom, url_xpath)),
|
x in dom.xpath(url_xpath)),
|
||||||
map(extract_text, eval_xpath(dom, title_xpath)),
|
map(extract_text, dom.xpath(title_xpath)),
|
||||||
map(extract_text, eval_xpath(dom, content_xpath))
|
map(extract_text, dom.xpath(content_xpath))
|
||||||
):
|
):
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
|
||||||
|
|
||||||
if not suggestion_xpath:
|
if not suggestion_xpath:
|
||||||
return results
|
return results
|
||||||
|
|
|
@ -28,6 +28,7 @@ from searx import logger, settings, static_path
|
||||||
logger = logger.getChild('plugins')
|
logger = logger.getChild('plugins')
|
||||||
|
|
||||||
from searx.plugins import (oa_doi_rewrite,
|
from searx.plugins import (oa_doi_rewrite,
|
||||||
|
ahmia_filter,
|
||||||
hash_plugin,
|
hash_plugin,
|
||||||
https_rewrite,
|
https_rewrite,
|
||||||
infinite_scroll,
|
infinite_scroll,
|
||||||
|
@ -181,3 +182,7 @@ if 'enabled_plugins' in settings:
|
||||||
plugin.default_on = True
|
plugin.default_on = True
|
||||||
else:
|
else:
|
||||||
plugin.default_on = False
|
plugin.default_on = False
|
||||||
|
|
||||||
|
# load tor specific plugins
|
||||||
|
if settings['outgoing'].get('using_tor_proxy'):
|
||||||
|
plugins.register(ahmia_filter)
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
'''
|
||||||
|
SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
'''
|
||||||
|
|
||||||
|
from hashlib import md5
|
||||||
|
from os.path import join
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from searx import searx_dir
|
||||||
|
|
||||||
|
name = "Ahmia blacklist"
|
||||||
|
description = "Filter out onion results that appear in Ahmia's blacklist. (See https://ahmia.fi/blacklist)"
|
||||||
|
default_on = True
|
||||||
|
preference_section = 'onions'
|
||||||
|
|
||||||
|
ahmia_blacklist = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_ahmia_blacklist():
    """Return the Ahmia blacklist, reading it from disk on first use.

    The file ``data/ahmia_blacklist.txt`` below ``searx_dir`` is read once,
    split on whitespace and kept in the module-level ``ahmia_blacklist``.

    :returns: list of blacklist entries (strings).
    """
    global ahmia_blacklist
    # ``is None`` rather than a truthiness test: an empty blacklist is a
    # valid cached value and must not trigger a file read on every call
    if ahmia_blacklist is None:
        with open(join(searx_dir, "data", "ahmia_blacklist.txt"), 'r') as f:
            ahmia_blacklist = f.read().split()
    return ahmia_blacklist
|
||||||
|
|
||||||
|
|
||||||
|
def not_blacklisted(result):
    """Tell whether *result* may be kept in the result list.

    Results not flagged ``is_onion`` are always kept. For onion results the
    MD5 hex digest of the url's hostname is looked up in the Ahmia blacklist.

    :param result: result dictionary; ``is_onion`` and ``url`` are read.
    :returns: ``True`` to keep the result, ``False`` when its hostname hash
        is blacklisted.
    """
    if not result.get('is_onion'):
        return True

    hostname = urlparse(result.get('url') or '').hostname
    if hostname is None:
        # an empty or host-less url has nothing to hash and therefore cannot
        # match a blacklist entry; previously this raised AttributeError
        return True

    result_hash = md5(hostname.encode()).hexdigest()
    return result_hash not in get_ahmia_blacklist()
|
||||||
|
|
||||||
|
|
||||||
|
def post_search(request, search):
    """Drop blacklisted onion results from the merged results of *search*.

    :param request: unused by this plugin.
    :param search: search object; the ``_merged_results`` list of its
        ``result_container`` is replaced by the filtered list.
    :returns: always ``True``.
    """
    container = search.result_container
    kept_results = [item for item in container._merged_results if not_blacklisted(item)]
    container._merged_results = kept_results
    return True
|
|
@ -60,8 +60,10 @@ outgoing: # communication with search engines
|
||||||
# see http://docs.python-requests.org/en/latest/user/advanced/#proxies
|
# see http://docs.python-requests.org/en/latest/user/advanced/#proxies
|
||||||
# SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks
|
# SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks
|
||||||
# proxies :
|
# proxies :
|
||||||
# http : http://127.0.0.1:8080
|
# http : socks5h://127.0.0.1:9050
|
||||||
# https: http://127.0.0.1:8080
|
# https: socks5h://127.0.0.1:9050
|
||||||
|
# using_tor_proxy : True
|
||||||
|
# extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
|
||||||
# uncomment below section only if you have more than one network interface
|
# uncomment below section only if you have more than one network interface
|
||||||
# which can be the source of outgoing search requests
|
# which can be the source of outgoing search requests
|
||||||
# source_ips:
|
# source_ips:
|
||||||
|
@ -89,6 +91,12 @@ engines:
|
||||||
shortcut: apkm
|
shortcut: apkm
|
||||||
disabled: True
|
disabled: True
|
||||||
|
|
||||||
|
# Requires Tor
|
||||||
|
- name : ahmia
|
||||||
|
engine : ahmia
|
||||||
|
categories : onions
|
||||||
|
shortcut : ah
|
||||||
|
|
||||||
- name : arch linux wiki
|
- name : arch linux wiki
|
||||||
engine : archlinux
|
engine : archlinux
|
||||||
shortcut : al
|
shortcut : al
|
||||||
|
@ -514,6 +522,11 @@ engines:
|
||||||
timeout: 5.0
|
timeout: 5.0
|
||||||
shortcut : npm
|
shortcut : npm
|
||||||
|
|
||||||
|
# Requires Tor
|
||||||
|
- name : not evil
|
||||||
|
engine : not_evil
|
||||||
|
shortcut : ne
|
||||||
|
|
||||||
- name : nyaa
|
- name : nyaa
|
||||||
engine : nyaa
|
engine : nyaa
|
||||||
shortcut : nt
|
shortcut : nt
|
||||||
|
@ -698,6 +711,18 @@ engines:
|
||||||
url: https://torrentz2.eu/
|
url: https://torrentz2.eu/
|
||||||
timeout : 3.0
|
timeout : 3.0
|
||||||
|
|
||||||
|
# Requires Tor
|
||||||
|
- name : torch
|
||||||
|
engine : xpath
|
||||||
|
paging : True
|
||||||
|
search_url : http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
|
||||||
|
results_xpath : //table//tr
|
||||||
|
url_xpath : ./td[2]/a
|
||||||
|
title_xpath : ./td[2]/b
|
||||||
|
content_xpath : ./td[2]/small
|
||||||
|
categories : onions
|
||||||
|
shortcut : tch
|
||||||
|
|
||||||
- name : twitter
|
- name : twitter
|
||||||
engine : twitter
|
engine : twitter
|
||||||
shortcut : tw
|
shortcut : tw
|
||||||
|
|
|
@ -1,6 +1,11 @@
|
||||||
<div class="result {{ result.class }}{% for e in result.engines %} {{ e }}{% endfor %}">
|
<div class="result {{ result.class }}{% for e in result.engines %} {{ e }}{% endfor %}">
|
||||||
<h3 class="result_title">{% if "icon_"~result.engine~".ico" in favicons %}<img width="14" height="14" class="favicon" src="{{ url_for('static', filename='img/icons/icon_'+result.engine+'.ico') }}" alt="{{result.engine}}" />{% endif %}<a href="{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ result.title|safe }}</a></h3>
|
<h3 class="result_title">{% if "icon_"~result.engine~".ico" in favicons %}<img width="14" height="14" class="favicon" src="{{ url_for('static', filename='img/icons/icon_'+result.engine+'.ico') }}" alt="{{result.engine}}" />{% endif %}<a href="{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ result.title|safe }}</a></h3>
|
||||||
<p class="url">{{ result.pretty_url }}‎ <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
|
<p class="url">{{ result.pretty_url }}‎
|
||||||
|
{% if result.cached_url %}
|
||||||
|
<a class="cache_link" href="{{ result.cached_url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
|
||||||
|
{% elif not result.is_onion %}
|
||||||
|
<a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
|
||||||
|
{% endif %}
|
||||||
{% if result.publishedDate %}<span class="published_date">{{ result.publishedDate }}</span>{% endif %}</p>
|
{% if result.publishedDate %}<span class="published_date">{{ result.publishedDate }}</span>{% endif %}</p>
|
||||||
<p class="content">{% if result.img_src %}<img src="{{ image_proxify(result.img_src) }}" class="image" />{% endif %}{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
|
<p class="content">{% if result.img_src %}<img src="{{ image_proxify(result.img_src) }}" class="image" />{% endif %}{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -32,7 +32,11 @@
|
||||||
<span class="label label-default">{{ engine }}</span>
|
<span class="label label-default">{{ engine }}</span>
|
||||||
{%- endfor -%}
|
{%- endfor -%}
|
||||||
{%- if result.url -%}
|
{%- if result.url -%}
|
||||||
|
{% if result.cached_url %}
|
||||||
|
<small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
|
||||||
|
{% elif not result.is_onion %}
|
||||||
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
|
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
|
||||||
|
{% endif %}
|
||||||
{%- endif -%}
|
{%- endif -%}
|
||||||
{%- if proxify -%}
|
{%- if proxify -%}
|
||||||
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
|
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
|
||||||
|
@ -50,7 +54,11 @@
|
||||||
<span class="label label-default">{{ engine }}</span>
|
<span class="label label-default">{{ engine }}</span>
|
||||||
{%- endfor %}
|
{%- endfor %}
|
||||||
{%- if result.url -%}
|
{%- if result.url -%}
|
||||||
|
{% if result.cached_url %}
|
||||||
|
<small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
|
||||||
|
{% elif not result.is_onion %}
|
||||||
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
|
<small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
|
||||||
|
{% endif %}
|
||||||
{%- endif -%}
|
{%- endif -%}
|
||||||
{% if proxify -%}
|
{% if proxify -%}
|
||||||
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
|
<small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
|
||||||
|
|
|
@ -258,6 +258,7 @@
|
||||||
<fieldset>
|
<fieldset>
|
||||||
<div class="container-fluid">
|
<div class="container-fluid">
|
||||||
{% for plugin in plugins %}
|
{% for plugin in plugins %}
|
||||||
|
{% if plugin.preference_section != 'onions' %}
|
||||||
<div class="panel panel-default">
|
<div class="panel panel-default">
|
||||||
<div class="panel-heading">
|
<div class="panel-heading">
|
||||||
<h3 class="panel-title">{{ _(plugin.name) }}</h3>
|
<h3 class="panel-title">{{ _(plugin.name) }}</h3>
|
||||||
|
@ -271,6 +272,7 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
|
@ -146,6 +146,7 @@ _category_names = (gettext('files'),
|
||||||
gettext('it'),
|
gettext('it'),
|
||||||
gettext('news'),
|
gettext('news'),
|
||||||
gettext('map'),
|
gettext('map'),
|
||||||
|
gettext('onions'),
|
||||||
gettext('science'))
|
gettext('science'))
|
||||||
|
|
||||||
outgoing_proxies = settings['outgoing'].get('proxies') or None
|
outgoing_proxies = settings['outgoing'].get('proxies') or None
|
||||||
|
|
|
@ -0,0 +1,121 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from collections import defaultdict
|
||||||
|
import mock
|
||||||
|
from searx.engines import xpath
|
||||||
|
from searx.testing import SearxTestCase
|
||||||
|
|
||||||
|
|
||||||
|
class TestXpathEngine(SearxTestCase):
    """Tests for the generic xpath engine.

    The engine is configured through module-level attributes, so each test
    assigns the attributes it needs on ``xpath`` before calling it.
    """

    def test_request(self):
        """``request`` substitutes the query and, with paging, the page number."""
        xpath.search_url = 'https://url.com/{query}'
        xpath.categories = []
        xpath.paging = False
        query = 'test_query'
        dicto = defaultdict(dict)
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        # assertEqual: the ``assertEquals`` alias was removed in Python 3.12
        self.assertEqual('https://url.com/test_query', params['url'])

        xpath.search_url = 'https://url.com/q={query}&p={pageno}'
        xpath.paging = True
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        self.assertEqual('https://url.com/q=test_query&p=1', params['url'])

    def test_response(self):
        """``response`` yields url/title/content, cached urls and the onion flag."""
        # without results_xpath
        xpath.url_xpath = '//div[@class="search_result"]//a[@class="result"]/@href'
        xpath.title_xpath = '//div[@class="search_result"]//a[@class="result"]'
        xpath.content_xpath = '//div[@class="search_result"]//p[@class="content"]'

        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        html = u"""
        <div>
            <div class="search_result">
                <a class="result" href="https://result1.com">Result 1</a>
                <p class="content">Content 1</p>
                <a class="cached" href="https://cachedresult1.com">Cache</a>
            </div>
            <div class="search_result">
                <a class="result" href="https://result2.com">Result 2</a>
                <p class="content">Content 2</p>
                <a class="cached" href="https://cachedresult2.com">Cache</a>
            </div>
        </div>
        """
        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, without results_xpath
        xpath.cached_xpath = '//div[@class="search_result"]//a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (no results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])

        # with results_xpath
        xpath.results_xpath = '//div[@class="search_result"]'
        xpath.url_xpath = './/a[@class="result"]/@href'
        xpath.title_xpath = './/a[@class="result"]'
        xpath.content_xpath = './/p[@class="content"]'
        xpath.cached_xpath = None
        xpath.categories = []

        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, with results_xpath
        xpath.cached_xpath = './/a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (with results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])
|
|
@ -0,0 +1,44 @@
|
||||||
|
from searx.testing import SearxTestCase
|
||||||
|
from searx import settings, engines
|
||||||
|
|
||||||
|
|
||||||
|
class TestEnginesInit(SearxTestCase):
    """Checks how ``engines.initialize_engines`` reacts to the Tor settings."""

    @classmethod
    def tearDownClass(cls):
        # reset the outgoing settings touched by the tests below
        outgoing = settings['outgoing']
        outgoing['using_tor_proxy'] = False
        outgoing['extra_proxy_timeout'] = 0

    def test_initialize_engines_default(self):
        """Two plain engines are both registered."""
        first = {'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'}
        second = {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}

        engines.initialize_engines([first, second])

        self.assertEqual(len(engines.engines), 2)
        for expected_name in ('engine1', 'engine2'):
            self.assertIn(expected_name, engines.engines)

    def test_initialize_engines_exclude_onions(self):
        """Without a Tor proxy the engine in the onions category is left out."""
        settings['outgoing']['using_tor_proxy'] = False
        general_engine = {'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1',
                          'categories': 'general'}
        onion_engine = {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2',
                        'categories': 'onions'}

        engines.initialize_engines([general_engine, onion_engine])

        self.assertEqual(len(engines.engines), 1)
        self.assertIn('engine1', engines.engines)
        self.assertNotIn('onions', engines.categories)

    def test_initialize_engines_include_onions(self):
        """With a Tor proxy onion engines load and the extra timeout is added."""
        outgoing = settings['outgoing']
        outgoing['using_tor_proxy'] = True
        outgoing['extra_proxy_timeout'] = 100.0
        general_engine = {'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1',
                          'categories': 'general', 'timeout': 20.0,
                          'onion_url': 'http://engine1.onion'}
        onion_engine = {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2',
                        'categories': 'onions'}

        engines.initialize_engines([general_engine, onion_engine])

        self.assertEqual(len(engines.engines), 2)
        for expected_name in ('engine1', 'engine2'):
            self.assertIn(expected_name, engines.engines)
        self.assertIn('onions', engines.categories)

        loaded = engines.engines['engine1']
        self.assertIn('http://engine1.onion', loaded.search_url)
        # 20.0 configured on the engine + 100.0 extra proxy timeout
        self.assertEqual(loaded.timeout, 120.0)
|
|
@ -0,0 +1,33 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# This script saves Ahmia's blacklist for onion sites.
|
||||||
|
# More info in https://ahmia.fi/blacklist/
|
||||||
|
|
||||||
|
# set path
|
||||||
|
from sys import path
|
||||||
|
from os.path import realpath, dirname, join
|
||||||
|
path.append(realpath(dirname(realpath(__file__)) + '/../'))
|
||||||
|
|
||||||
|
#
|
||||||
|
import requests
|
||||||
|
from searx import searx_dir
|
||||||
|
|
||||||
|
URL = 'https://ahmia.fi/blacklist/banned/'
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_ahmia_blacklist():
    """Download Ahmia's blacklist and return its entries.

    :returns: list of the whitespace-separated entries of the response body.
    :raises Exception: when the server answers with a status other than 200.
    """
    resp = requests.get(URL, timeout=3.0)
    if resp.status_code != 200:
        # status_code is an int: convert it explicitly, otherwise building
        # the message itself fails with a TypeError
        raise Exception("Error fetching Ahmia blacklist, HTTP code " + str(resp.status_code))
    return resp.text.split()
|
||||||
|
|
||||||
|
|
||||||
|
def get_ahmia_blacklist_filename():
    """Return the path of ``ahmia_blacklist.txt`` in the ``data`` directory of ``searx_dir``."""
    data_dir = join(searx_dir, "data")
    return join(data_dir, "ahmia_blacklist.txt")
|
||||||
|
|
||||||
|
|
||||||
|
# Fetch the current blacklist, then write it to the data file with one
# entry per line.
blacklist = fetch_ahmia_blacklist()
with open(get_ahmia_blacklist_filename(), "w") as f:
    serialized_entries = '\n'.join(blacklist)
    f.write(serialized_entries)
|
Loading…
Reference in New Issue