Merge pull request #733 from dalf/searchpy

Search architecture
This commit is contained in:
Adam Tauber 2016-11-04 19:38:21 +01:00 committed by GitHub
commit 51eafdd471
9 changed files with 280 additions and 233 deletions

View File

@ -20,12 +20,12 @@ def extract_doi(url):
return None
def on_result(request, search, result):
    """Point a result that carries a DOI at doai.io.

    `request` and `search` are part of the plugin hook signature and are
    not used here.  `result` is modified in place: when `extract_doi`
    yields a value shorter than 50 characters, both `result['url']` and
    `result['parsed_url']` are replaced.

    Always returns True.
    """
    doi = extract_doi(result['parsed_url'])
    if doi and len(doi) < 50:
        # strip a trailing suffix so only the bare identifier is appended
        for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
            if doi.endswith(suffix):
                doi = doi[:-len(suffix)]
        # NOTE(review): nesting reconstructed from a flattened listing --
        # the two assignments below are assumed to sit under `if doi ...`
        result['url'] = 'http://doai.io/' + doi
        # keep parsed_url in sync with the rewritten url; this used to read
        # ctx['result']['url'], but `ctx` is no longer a parameter
        result['parsed_url'] = urlparse(result['url'])
    return True

View File

@ -220,8 +220,7 @@ def https_url_rewrite(result):
return result
def on_result(request, search, result):
    """Run the HTTPS rewrite on a result whose URL uses plain http.

    `request` and `search` are part of the plugin hook signature and are
    not used here.  `result` is handed to `https_url_rewrite` only when
    the scheme of `result['parsed_url']` is exactly 'http'.

    Always returns True.
    """
    if result['parsed_url'].scheme == 'http':
        https_url_rewrite(result)
    return True

View File

@ -28,19 +28,19 @@ p = re.compile('.*user[ -]agent.*', re.IGNORECASE)
# attach callback to the post search hook
#  request: flask request object
#  search: the search object; this hook reads search.search_query
#          and writes to search.result_container.answers
def post_search(request, search):
    """Answer 'ip' and user-agent queries with data taken from the request.

    Only the first page is handled; for pageno > 1 nothing is changed.
    For the exact query 'ip' the answers are replaced by the first
    X-Forwarded-For entry, falling back to request.remote_addr.  For a
    query matching the module-level pattern `p` the answers are replaced
    by request.user_agent.

    Always returns True.
    """
    if search.search_query.pageno > 1:
        return True
    if search.search_query.query == 'ip':
        x_forwarded_for = request.headers.getlist("X-Forwarded-For")
        if x_forwarded_for:
            ip = x_forwarded_for[0]
        else:
            ip = request.remote_addr
        search.result_container.answers.clear()
        search.result_container.answers.add(ip)
    elif p.match(search.search_query.query):
        ua = request.user_agent
        search.result_container.answers.clear()
        search.result_container.answers.add(ua)
    return True

View File

@ -28,8 +28,8 @@ description = gettext('Remove trackers arguments from the returned URL')
default_on = True
def on_result(request, ctx):
query = ctx['result']['parsed_url'].query
def on_result(request, search, result):
query = result['parsed_url'].query
if query == "":
return True
@ -37,8 +37,8 @@ def on_result(request, ctx):
for reg in regexes:
query = reg.sub('', query)
if query != ctx['result']['parsed_url'].query:
ctx['result']['parsed_url'] = ctx['result']['parsed_url']._replace(query=query)
ctx['result']['url'] = urlunparse(ctx['result']['parsed_url'])
if query != result['parsed_url'].query:
result['parsed_url'] = result['parsed_url']._replace(query=query)
result['url'] = urlunparse(result['parsed_url'])
return True

View File

@ -25,8 +25,8 @@ import string
import re
class Query(object):
"""parse query"""
class RawTextQuery(object):
"""parse raw text query (the value from the html input)"""
def __init__(self, query, disabled_engines):
self.query = query
@ -130,3 +130,19 @@ class Query(object):
def getFullQuery(self):
# get full query including whitespaces
return string.join(self.query_parts, '')
class SearchQuery(object):
    """container for all the search parameters (query, language, etc...)"""

    def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range):
        # plain value holder: each argument is stored unchanged
        # under an attribute of the same name
        self.query = query
        self.engines = engines
        self.categories = categories
        self.lang = lang
        self.safesearch = safesearch
        self.pageno = pageno
        self.time_range = time_range

    def __str__(self):
        # text form is "<query>;<engines>"; the other fields are not included
        return str(self.query) + ";" + str(self.engines)

View File

@ -128,6 +128,8 @@ class ResultContainer(object):
self.suggestions = set()
self.answers = set()
self._number_of_results = []
self._ordered = False
self.paging = False
def extend(self, engine_name, results):
for result in list(results):
@ -153,6 +155,9 @@ class ResultContainer(object):
self.results[engine_name].extend(results)
if not self.paging and engines[engine_name].paging:
self.paging = True
for i, result in enumerate(results):
try:
result['url'] = result['url'].decode('utf-8')
@ -219,7 +224,7 @@ class ResultContainer(object):
with RLock():
self._merged_results.append(result)
def get_ordered_results(self):
def order_results(self):
for result in self._merged_results:
score = result_score(result)
result['score'] = score
@ -269,8 +274,14 @@ class ResultContainer(object):
# update categoryIndex
categoryPositions[category] = {'index': len(gresults), 'count': 8}
# return gresults
return gresults
# update _merged_results
self._ordered = True
self._merged_results = gresults
def get_ordered_results(self):
    """Return the merged results, ordering them first if not done yet.

    `order_results()` is invoked only while `self._ordered` is false;
    the value returned is `self._merged_results` as it stands afterwards.
    """
    if not self._ordered:
        self.order_results()
    return self._merged_results
def results_length(self):
return len(self._merged_results)

View File

@ -25,9 +25,10 @@ from searx.engines import (
categories, engines
)
from searx.utils import gen_useragent
from searx.query import Query
from searx.query import RawTextQuery, SearchQuery
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
logger = logger.getChild('search')
@ -127,86 +128,72 @@ def make_callback(engine_name, callback, params, result_container):
return process_callback
class Search(object):
"""Search information container"""
def __init__(self, request):
# init vars
super(Search, self).__init__()
self.query = None
self.engines = []
self.categories = []
self.paging = False
self.pageno = 1
self.lang = 'all'
self.time_range = None
self.is_advanced = None
def get_search_query_from_webapp(preferences, form):
query = None
query_engines = []
query_categories = []
query_paging = False
query_pageno = 1
query_lang = 'all'
query_time_range = None
# set blocked engines
self.disabled_engines = request.preferences.engines.get_disabled()
self.result_container = ResultContainer()
self.request_data = {}
disabled_engines = preferences.engines.get_disabled()
# set specific language if set
self.lang = request.preferences.get_value('language')
query_lang = preferences.get_value('language')
# set request method
if request.method == 'POST':
self.request_data = request.form
else:
self.request_data = request.args
# safesearch
query_safesearch = preferences.get_value('safesearch')
# TODO better exceptions
if not self.request_data.get('q'):
if not form.get('q'):
raise Exception('noquery')
# set pagenumber
pageno_param = self.request_data.get('pageno', '1')
pageno_param = form.get('pageno', '1')
if not pageno_param.isdigit() or int(pageno_param) < 1:
pageno_param = 1
self.pageno = int(pageno_param)
query_pageno = int(pageno_param)
# parse query, if tags are set, which change
# the search engine or search-language
query_obj = Query(self.request_data['q'], self.disabled_engines)
query_obj.parse_query()
raw_text_query = RawTextQuery(form['q'], disabled_engines)
raw_text_query.parse_query()
# set query
self.query = query_obj.getSearchQuery()
query = raw_text_query.getSearchQuery()
# get last selected language in query, if possible
# TODO support search with multiple languages
if len(query_obj.languages):
self.lang = query_obj.languages[-1]
if len(raw_text_query.languages):
query_lang = raw_text_query.languages[-1]
self.time_range = self.request_data.get('time_range')
self.is_advanced = self.request_data.get('advanced_search')
query_time_range = form.get('time_range')
self.engines = query_obj.engines
query_engines = raw_text_query.engines
# if engines are calculated from query,
# set categories by using that information
if self.engines and query_obj.specific:
self.categories = list(set(engine['category']
for engine in self.engines))
if query_engines and raw_text_query.specific:
query_categories = list(set(engine['category']
for engine in query_engines))
# otherwise, using defined categories to
# calculate which engines should be used
else:
# set categories/engines
load_default_categories = True
for pd_name, pd in self.request_data.items():
for pd_name, pd in form.items():
if pd_name == 'categories':
self.categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories)
query_categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories)
elif pd_name == 'engines':
pd_engines = [{'category': engines[engine].categories[0],
'name': engine}
for engine in map(unicode.strip, pd.split(',')) if engine in engines]
if pd_engines:
self.engines.extend(pd_engines)
query_engines.extend(pd_engines)
load_default_categories = False
elif pd_name.startswith('category_'):
category = pd_name[9:]
@ -217,45 +204,54 @@ class Search(object):
if pd != 'off':
# add category to list
self.categories.append(category)
elif category in self.categories:
query_categories.append(category)
elif category in query_categories:
# remove category from list if property is set to 'off'
self.categories.remove(category)
query_categories.remove(category)
if not load_default_categories:
if not self.categories:
self.categories = list(set(engine['category']
for engine in self.engines))
return
if not query_categories:
query_categories = list(set(engine['category']
for engine in engines))
else:
# if no category is specified for this search,
# use the user-defined default configuration
# (which is stored in the cookie)
if not self.categories:
cookie_categories = request.preferences.get_value('categories')
if not query_categories:
cookie_categories = preferences.get_value('categories')
for ccateg in cookie_categories:
if ccateg in categories:
self.categories.append(ccateg)
query_categories.append(ccateg)
# if still no category is specified, using general
# as default-category
if not self.categories:
self.categories = ['general']
if not query_categories:
query_categories = ['general']
# using all engines for that search, which are
# declared under the specific categories
for categ in self.categories:
self.engines.extend({'category': categ,
for categ in query_categories:
query_engines.extend({'category': categ,
'name': engine.name}
for engine in categories[categ]
if (engine.name, categ) not in self.disabled_engines)
if (engine.name, categ) not in disabled_engines)
# remove suspended engines
self.engines = [e for e in self.engines
if engines[e['name']].suspend_end_time <= time()]
return SearchQuery(query, query_engines, query_categories,
query_lang, query_safesearch, query_pageno, query_time_range)
class Search(object):
"""Search information container"""
def __init__(self, search_query):
# init vars
super(Search, self).__init__()
self.search_query = search_query
self.result_container = ResultContainer()
# do search-request
def search(self, request):
def search(self):
global number_of_searches
# init vars
@ -268,23 +264,30 @@ class Search(object):
# user_agent = request.headers.get('User-Agent', '')
user_agent = gen_useragent()
search_query = self.search_query
# start search-request for all selected engines
for selected_engine in self.engines:
for selected_engine in search_query.engines:
if selected_engine['name'] not in engines:
continue
engine = engines[selected_engine['name']]
# skip suspended engines
if engine.suspend_end_time and engine.suspend_end_time <= time():
continue
# if paging is not supported, skip
if self.pageno > 1 and not engine.paging:
if search_query.pageno > 1 and not engine.paging:
continue
# if search-language is set and engine does not
# provide language-support, skip
if self.lang != 'all' and not engine.language_support:
if search_query.lang != 'all' and not engine.language_support:
continue
if self.time_range and not engine.time_range_support:
# if time_range is not supported, skip
if search_query.time_range and not engine.time_range_support:
continue
# set default request parameters
@ -292,21 +295,20 @@ class Search(object):
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
request_params['started'] = time()
request_params['pageno'] = self.pageno
request_params['pageno'] = search_query.pageno
if hasattr(engine, 'language') and engine.language:
request_params['language'] = engine.language
else:
request_params['language'] = self.lang
request_params['language'] = search_query.lang
# 0 = None, 1 = Moderate, 2 = Strict
request_params['safesearch'] = request.preferences.get_value('safesearch')
request_params['time_range'] = self.time_range
request_params['advanced_search'] = self.is_advanced
request_params['safesearch'] = search_query.safesearch
request_params['time_range'] = search_query.time_range
# update request parameters dependent on
# search-engine (contained in engines folder)
engine.request(self.query.encode('utf-8'), request_params)
engine.request(search_query.query.encode('utf-8'), request_params)
if request_params['url'] is None:
# TODO add support of offline engines
@ -346,10 +348,32 @@ class Search(object):
selected_engine['name']))
if not requests:
return self
return self.result_container
# send all search-request
threaded_requests(requests)
start_new_thread(gc.collect, tuple())
# return results, suggestions, answers and infoboxes
return self
return self.result_container
class SearchWithPlugins(Search):
    """Similar to the Search class but call the plugins."""

    def __init__(self, search_query, request):
        super(SearchWithPlugins, self).__init__(search_query)
        # kept so the plugin hooks can be called with the originating request
        self.request = request

    def search(self):
        """Run the search with the plugin hooks around it.

        Calls the 'pre_search' hook, runs the parent search when that call
        returns a true value, then calls 'post_search' and finally
        'on_result' once per ordered result.  Returns self.result_container.
        """
        # NOTE(review): nesting reconstructed from a flattened listing --
        # only the parent search() is assumed to be guarded by pre_search;
        # confirm post_search/on_result are meant to run regardless
        if plugins.call('pre_search', self.request, self):
            super(SearchWithPlugins, self).search()

        plugins.call('post_search', self.request, self)

        results = self.result_container.get_ordered_results()

        for result in results:
            plugins.call('on_result', self.request, self, result)

        return self.result_container

View File

@ -62,8 +62,8 @@ from searx.utils import (
)
from searx.version import VERSION_STRING
from searx.languages import language_codes
from searx.search import Search
from searx.query import Query
from searx.search import Search, SearchWithPlugins, get_search_query_from_webapp
from searx.query import RawTextQuery, SearchQuery
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
from searx.preferences import Preferences, ValidationException
@ -373,11 +373,13 @@ def pre_request():
logger.warning('Invalid config')
request.preferences = preferences
# request.form
request.form = dict(request.form.items())
for k, v in request.args.items():
if k not in request.form:
request.form[k] = v
# request.user_plugins
request.user_plugins = []
allowed_plugins = preferences.plugins.get_enabled()
disabled_plugins = preferences.plugins.get_disabled()
@ -400,30 +402,33 @@ def index():
'index.html',
)
# search
search_query = None
result_container = None
try:
search = Search(request)
search_query = get_search_query_from_webapp(request.preferences, request.form)
# search = Search(search_query) # without plugins
search = SearchWithPlugins(search_query, request)
result_container = search.search()
except:
return render(
'index.html',
)
if plugins.call('pre_search', request, locals()):
search.search(request)
results = result_container.get_ordered_results()
plugins.call('post_search', request, locals())
results = search.result_container.get_ordered_results()
# UI
advanced_search = request.form.get('advanced_search', None)
output_format = request.form.get('format', 'html')
if output_format not in ['html', 'csv', 'json', 'rss']:
output_format = 'html'
# output
for result in results:
plugins.call('on_result', request, locals())
if not search.paging and engines[result['engine']].paging:
search.paging = True
if search.request_data.get('format', 'html') == 'html':
if output_format == 'html':
if 'content' in result and result['content']:
result['content'] = highlight_content(result['content'][:1024], search.query.encode('utf-8'))
result['title'] = highlight_content(result['title'], search.query.encode('utf-8'))
result['content'] = highlight_content(result['content'][:1024], search_query.query.encode('utf-8'))
result['title'] = highlight_content(result['title'], search_query.query.encode('utf-8'))
else:
if result.get('content'):
result['content'] = html_to_text(result['content']).strip()
@ -450,16 +455,16 @@ def index():
else:
result['publishedDate'] = format_date(result['publishedDate'])
number_of_results = search.result_container.results_number()
if number_of_results < search.result_container.results_length():
number_of_results = result_container.results_number()
if number_of_results < result_container.results_length():
number_of_results = 0
if search.request_data.get('format') == 'json':
return Response(json.dumps({'query': search.query,
if output_format == 'json':
return Response(json.dumps({'query': search_query.query,
'number_of_results': number_of_results,
'results': results}),
mimetype='application/json')
elif search.request_data.get('format') == 'csv':
elif output_format == 'csv':
csv = UnicodeWriter(cStringIO.StringIO())
keys = ('title', 'url', 'content', 'host', 'engine', 'score')
csv.writerow(keys)
@ -468,14 +473,14 @@ def index():
csv.writerow([row.get(key, '') for key in keys])
csv.stream.seek(0)
response = Response(csv.stream.read(), mimetype='application/csv')
cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query.encode('utf-8'))
cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query.encode('utf-8'))
response.headers.add('Content-Disposition', cont_disp)
return response
elif search.request_data.get('format') == 'rss':
elif output_format == 'rss':
response_rss = render(
'opensearch_response_rss.xml',
results=results,
q=search.request_data['q'],
q=request.form['q'],
number_of_results=number_of_results,
base_url=get_base_url()
)
@ -484,17 +489,17 @@ def index():
return render(
'results.html',
results=results,
q=search.request_data['q'],
selected_categories=search.categories,
paging=search.paging,
q=request.form['q'],
selected_categories=search_query.categories,
pageno=search_query.pageno,
time_range=search_query.time_range,
number_of_results=format_decimal(number_of_results),
pageno=search.pageno,
advanced_search=search.is_advanced,
time_range=search.time_range,
advanced_search=advanced_search,
suggestions=result_container.suggestions,
answers=result_container.answers,
infoboxes=result_container.infoboxes,
paging=result_container.paging,
base_url=get_base_url(),
suggestions=search.result_container.suggestions,
answers=search.result_container.answers,
infoboxes=search.result_container.infoboxes,
theme=get_current_theme_name(),
favicons=global_favicons[themes.index(get_current_theme_name())]
)
@ -511,30 +516,23 @@ def about():
@app.route('/autocompleter', methods=['GET', 'POST'])
def autocompleter():
"""Return autocompleter results"""
request_data = {}
# select request method
if request.method == 'POST':
request_data = request.form
else:
request_data = request.args
# set blocked engines
disabled_engines = request.preferences.engines.get_disabled()
# parse query
query = Query(request_data.get('q', '').encode('utf-8'), disabled_engines)
query.parse_query()
raw_text_query = RawTextQuery(request.form.get('q', '').encode('utf-8'), disabled_engines)
raw_text_query.parse_query()
# check if search query is set
if not query.getSearchQuery():
if not raw_text_query.getSearchQuery():
return '', 400
# run autocompleter
completer = autocomplete_backends.get(request.preferences.get_value('autocomplete'))
# parse searx specific autocompleter results like !bang
raw_results = searx_bang(query)
raw_results = searx_bang(raw_text_query)
# normal autocompletion results only appear if max 3 inner results returned
if len(raw_results) <= 3 and completer:
@ -545,19 +543,19 @@ def autocompleter():
else:
language = language.split('_')[0]
# run autocompletion
raw_results.extend(completer(query.getSearchQuery(), language))
raw_results.extend(completer(raw_text_query.getSearchQuery(), language))
# parse results (write :language and !engine back to result string)
results = []
for result in raw_results:
query.changeSearchQuery(result)
raw_text_query.changeSearchQuery(result)
# add parsed result
results.append(query.getFullQuery())
results.append(raw_text_query.getFullQuery())
# return autocompleter results
if request_data.get('format') == 'x-suggestions':
return Response(json.dumps([query.query, results]),
if request.form.get('format') == 'x-suggestions':
return Response(json.dumps([raw_text_query.query, results]),
mimetype='application/json')
return Response(json.dumps(results),

View File

@ -6,9 +6,8 @@ from mock import Mock
def get_search_mock(query, **kwargs):
    """Build a stand-in search object for the plugin tests.

    The returned Mock exposes `search_query` (a Mock carrying `query` plus
    any extra keyword arguments, e.g. pageno) and `result_container`
    (a Mock whose `answers` is a fresh empty set).
    """
    return Mock(search_query=Mock(query=query, **kwargs),
                result_container=Mock(answers=set()))
class PluginStoreTest(SearxTestCase):
@ -52,39 +51,39 @@ class SelfIPTest(SearxTestCase):
request = Mock(user_plugins=store.plugins,
remote_addr='127.0.0.1')
request.headers.getlist.return_value = []
ctx = get_search_mock(query='ip', pageno=1)
store.call('post_search', request, ctx)
self.assertTrue('127.0.0.1' in ctx['search'].result_container.answers)
search = get_search_mock(query='ip', pageno=1)
store.call('post_search', request, search)
self.assertTrue('127.0.0.1' in search.result_container.answers)
ctx = get_search_mock(query='ip', pageno=2)
store.call('post_search', request, ctx)
self.assertFalse('127.0.0.1' in ctx['search'].result_container.answers)
search = get_search_mock(query='ip', pageno=2)
store.call('post_search', request, search)
self.assertFalse('127.0.0.1' in search.result_container.answers)
# User agent test
request = Mock(user_plugins=store.plugins,
user_agent='Mock')
request.headers.getlist.return_value = []
ctx = get_search_mock(query='user-agent', pageno=1)
store.call('post_search', request, ctx)
self.assertTrue('Mock' in ctx['search'].result_container.answers)
search = get_search_mock(query='user-agent', pageno=1)
store.call('post_search', request, search)
self.assertTrue('Mock' in search.result_container.answers)
ctx = get_search_mock(query='user-agent', pageno=2)
store.call('post_search', request, ctx)
self.assertFalse('Mock' in ctx['search'].result_container.answers)
search = get_search_mock(query='user-agent', pageno=2)
store.call('post_search', request, search)
self.assertFalse('Mock' in search.result_container.answers)
ctx = get_search_mock(query='user-agent', pageno=1)
store.call('post_search', request, ctx)
self.assertTrue('Mock' in ctx['search'].result_container.answers)
search = get_search_mock(query='user-agent', pageno=1)
store.call('post_search', request, search)
self.assertTrue('Mock' in search.result_container.answers)
ctx = get_search_mock(query='user-agent', pageno=2)
store.call('post_search', request, ctx)
self.assertFalse('Mock' in ctx['search'].result_container.answers)
search = get_search_mock(query='user-agent', pageno=2)
store.call('post_search', request, search)
self.assertFalse('Mock' in search.result_container.answers)
ctx = get_search_mock(query='What is my User-Agent?', pageno=1)
store.call('post_search', request, ctx)
self.assertTrue('Mock' in ctx['search'].result_container.answers)
search = get_search_mock(query='What is my User-Agent?', pageno=1)
store.call('post_search', request, search)
self.assertTrue('Mock' in search.result_container.answers)
ctx = get_search_mock(query='What is my User-Agent?', pageno=2)
store.call('post_search', request, ctx)
self.assertFalse('Mock' in ctx['search'].result_container.answers)
search = get_search_mock(query='What is my User-Agent?', pageno=2)
store.call('post_search', request, search)
self.assertFalse('Mock' in search.result_container.answers)