searxng/searx/search.py
Alexandre Flament 01e2648e93 Simplify search.py, basically updated PR #518
The timeouts in settings.yml are about the total time (not only the HTTP request, but also preparing the request and parsing the response).
It was more or less the case before, since the threaded_requests function ignored the thread after the timeout even if the HTTP request had ended.

New / changed stats :
* page_load_time : records the time of the HTTP request only
* page_load_count : the number of HTTP requests
* engine_time : the total execution time of an engine
* engine_time_count : the number of "engine_time" measures

The avg response times in the preferences are the engine response times (engine_time / engine_time_count).
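
For illustration, that average can be computed from the two counters roughly like this (a minimal sketch; the stats keys are the ones listed above, the helper name is made up):

    def average_engine_time(stats):
        # engine_time / engine_time_count, guarding against engines
        # that have not been measured yet
        if stats['engine_time_count'] == 0:
            return None
        return stats['engine_time'] / float(stats['engine_time_count'])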

To sum up :
* Search.search() filters out the engines that can't process the request
* Search.search() calls the search_multiple_requests function
* search_multiple_requests creates one thread per engine; each thread runs the search_one_request function
* search_one_request calls the engine's request function, makes the HTTP request, calls the engine's response function and extends the result_container
* search_multiple_requests waits for the threads to finish (or time out)
2016-11-05 13:45:20 +01:00


'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import gc
import threading
from thread import start_new_thread
from time import time
from uuid import uuid4
import searx.poolrequests as requests_lib
from searx.engines import (
categories, engines
)
from searx.utils import gen_useragent
from searx.query import RawTextQuery, SearchQuery
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
logger = logger.getChild('search')

number_of_searches = 0

# one lock shared by all search threads when updating the engine stats
# (creating a fresh threading.RLock() inline for each "with" block would
# not synchronize anything, so a single module-level lock is used)
stats_lock = threading.RLock()

def send_http_request(engine, request_params, timeout_limit):
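    """Send the HTTP request described by request_params and update the
    engine stats; return the response, or False on timeout or error."""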
response = None
try:
        # create a dictionary which contains all
        # information about the request
request_args = dict(
headers=request_params['headers'],
cookies=request_params['cookies'],
timeout=timeout_limit,
verify=request_params['verify']
)
# specific type of request (GET or POST)
if request_params['method'] == 'GET':
req = requests_lib.get
else:
req = requests_lib.post
request_args['data'] = request_params['data']
# for page_load_time stats
time_before_request = time()
# send the request
response = req(request_params['url'], **request_args)
        with stats_lock:
# no error : reset the suspend variables
engine.continuous_errors = 0
engine.suspend_end_time = 0
# update stats with current page-load-time
# only the HTTP request
engine.stats['page_load_time'] += time() - time_before_request
engine.stats['page_load_count'] += 1
        # check if the request timed out (the response is not parsed in this case)
timeout_overhead = 0.2 # seconds
search_duration = time() - request_params['started']
if search_duration > timeout_limit + timeout_overhead:
            logger.error('engine timeout on HTTP request: '
                         '{0} (search duration: {1} s, timeout: {2} s)'
                         .format(engine.name, search_duration, timeout_limit))
            with stats_lock:
engine.stats['errors'] += 1
return False
# everything is ok : return the response
return response
    except Exception:
# increase errors stats
        with stats_lock:
engine.stats['errors'] += 1
engine.continuous_errors += 1
engine.suspend_end_time = time() + min(60, engine.continuous_errors)
# print engine name and specific error message
logger.exception('engine crash: {0}'.format(engine.name))
return False
def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
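    """Run the search for a single engine: build the request parameters,
    send the HTTP request, parse the response and extend result_container;
    return True on success, False otherwise."""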
engine = engines[engine_name]
# update request parameters dependent on
# search-engine (contained in engines folder)
engine.request(query, request_params)
# TODO add support of offline engines
if request_params['url'] is None:
return False
# ignoring empty urls
if not request_params['url']:
return False
# send request
response = send_http_request(engine, request_params, timeout_limit)
# parse response
success = None
if response:
# parse the response
response.search_params = request_params
search_results = engine.response(response)
# add results
for result in search_results:
result['engine'] = engine.name
result_container.extend(engine.name, search_results)
success = True
else:
success = False
    with stats_lock:
        # update stats : total time
        engine.stats['engine_time'] += time() - request_params['started']
        engine.stats['engine_time_count'] += 1

    return success

def search_multiple_requests(requests, result_container, timeout_limit):
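    """Start one thread per engine request and wait until every thread
    has finished or timeout_limit seconds have elapsed."""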
start_time = time()
    search_id = str(uuid4())
for engine_name, query, request_params in requests:
th = threading.Thread(
target=search_one_request,
args=(engine_name, query, request_params, result_container, timeout_limit),
name=search_id,
)
th._engine_name = engine_name
th.start()
for th in threading.enumerate():
if th.name == search_id:
remaining_time = max(0.0, timeout_limit - (time() - start_time))
th.join(remaining_time)
if th.isAlive():
logger.warning('engine timeout: {0}'.format(th._engine_name))

# get default request parameters
def default_request_params():
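    """Return a new dictionary holding the default request parameters."""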
return {
'method': 'GET',
'headers': {},
'data': {},
'url': '',
'cookies': {},
'verify': True
}
def get_search_query_from_webapp(preferences, form):
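    """Build a SearchQuery from the submitted form and the user preferences:
    query string, engines, categories, language, safesearch, page number
    and time range."""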
query = None
query_engines = []
query_categories = []
query_paging = False
query_pageno = 1
query_lang = 'all'
query_time_range = None
# set blocked engines
disabled_engines = preferences.engines.get_disabled()
# set specific language if set
query_lang = preferences.get_value('language')
# safesearch
query_safesearch = preferences.get_value('safesearch')
# TODO better exceptions
if not form.get('q'):
raise Exception('noquery')
# set pagenumber
pageno_param = form.get('pageno', '1')
if not pageno_param.isdigit() or int(pageno_param) < 1:
pageno_param = 1
query_pageno = int(pageno_param)
    # parse query, if tags are set, which change
    # the search engine or search-language
raw_text_query = RawTextQuery(form['q'], disabled_engines)
raw_text_query.parse_query()
# set query
query = raw_text_query.getSearchQuery()
# get last selected language in query, if possible
    # TODO support search with multiple languages
if len(raw_text_query.languages):
query_lang = raw_text_query.languages[-1]
query_time_range = form.get('time_range')
query_engines = raw_text_query.engines
    # if engines are calculated from the query,
    # set categories by using that information
if query_engines and raw_text_query.specific:
query_categories = list(set(engine['category']
for engine in query_engines))
    # otherwise, use the defined categories to
    # calculate which engines should be used
else:
# set categories/engines
load_default_categories = True
for pd_name, pd in form.items():
if pd_name == 'categories':
query_categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories)
elif pd_name == 'engines':
pd_engines = [{'category': engines[engine].categories[0],
'name': engine}
for engine in map(unicode.strip, pd.split(',')) if engine in engines]
if pd_engines:
query_engines.extend(pd_engines)
load_default_categories = False
elif pd_name.startswith('category_'):
category = pd_name[9:]
# if category is not found in list, skip
if category not in categories:
continue
if pd != 'off':
# add category to list
query_categories.append(category)
elif category in query_categories:
# remove category from list if property is set to 'off'
query_categories.remove(category)
if not load_default_categories:
if not query_categories:
query_categories = list(set(engine['category']
for engine in engines))
else:
        # if no category is specified for this search,
        # use the user-defined default configuration
        # (stored in the cookie)
if not query_categories:
cookie_categories = preferences.get_value('categories')
for ccateg in cookie_categories:
if ccateg in categories:
query_categories.append(ccateg)
        # if still no category is specified, use 'general'
        # as the default category
if not query_categories:
query_categories = ['general']
    # use all engines for this search that are declared
    # under the selected categories and not disabled
for categ in query_categories:
query_engines.extend({'category': categ,
'name': engine.name}
for engine in categories[categ]
if (engine.name, categ) not in disabled_engines)
return SearchQuery(query, query_engines, query_categories,
query_lang, query_safesearch, query_pageno, query_time_range)
class Search(object):
"""Search information container"""
def __init__(self, search_query):
# init vars
super(Search, self).__init__()
self.search_query = search_query
self.result_container = ResultContainer()
# do search-request
def search(self):
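        """Filter out the engines that can't process this request, query
        the remaining ones in parallel and return the result_container."""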
global number_of_searches
# start time
start_time = time()
# init vars
requests = []
# increase number of searches
number_of_searches += 1
# set default useragent
# user_agent = request.headers.get('User-Agent', '')
user_agent = gen_useragent()
search_query = self.search_query
# max of all selected engine timeout
timeout_limit = 0
        # start search-request for all selected engines
for selected_engine in search_query.engines:
if selected_engine['name'] not in engines:
continue
engine = engines[selected_engine['name']]
            # skip engines that are still suspended after previous errors
            if engine.suspend_end_time and engine.suspend_end_time > time():
                continue
# if paging is not supported, skip
if search_query.pageno > 1 and not engine.paging:
continue
# if search-language is set and engine does not
# provide language-support, skip
if search_query.lang != 'all' and not engine.language_support:
continue
# if time_range is not supported, skip
if search_query.time_range and not engine.time_range_support:
continue
# set default request parameters
request_params = default_request_params()
request_params['headers']['User-Agent'] = user_agent
request_params['category'] = selected_engine['category']
request_params['started'] = start_time
request_params['pageno'] = search_query.pageno
if hasattr(engine, 'language') and engine.language:
request_params['language'] = engine.language
else:
request_params['language'] = search_query.lang
# 0 = None, 1 = Moderate, 2 = Strict
request_params['safesearch'] = search_query.safesearch
request_params['time_range'] = search_query.time_range
# append request to list
requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))
# update timeout_limit
timeout_limit = max(timeout_limit, engine.timeout)
if requests:
            # send all search-requests
search_multiple_requests(requests, self.result_container, timeout_limit - (time() - start_time))
start_new_thread(gc.collect, tuple())
# return results, suggestions, answers and infoboxes
return self.result_container
class SearchWithPlugins(Search):
"""Similar to the Search class but call the plugins."""
def __init__(self, search_query, request):
super(SearchWithPlugins, self).__init__(search_query)
self.request = request
def search(self):
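        """Run the search and call the plugin hooks: pre_search (which can
        cancel the search), post_search, and on_result for each result."""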
if plugins.call('pre_search', self.request, self):
super(SearchWithPlugins, self).search()
plugins.call('post_search', self.request, self)
results = self.result_container.get_ordered_results()
for result in results:
plugins.call('on_result', self.request, self, result)
return self.result_container
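
# Typical usage (a sketch, not part of the original module; the actual
# wiring lives in searx/webapp.py):
#
#     search_query = get_search_query_from_webapp(preferences, request.form)
#     search = SearchWithPlugins(search_query, request)
#     result_container = search.search()
#     results = result_container.get_ordered_results()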