Merge pull request #798 from dalf/searchpy4

[mod] add a search_one_request_safe function wrapper
This commit is contained in:
Adam Tauber 2016-12-30 13:01:04 +01:00 committed by GitHub
commit eaa0fb8102

View File

@ -20,6 +20,7 @@ import threading
from thread import start_new_thread from thread import start_new_thread
from time import time from time import time
from uuid import uuid4 from uuid import uuid4
import requests.exceptions
import searx.poolrequests as requests_lib import searx.poolrequests as requests_lib
from searx.engines import ( from searx.engines import (
categories, engines categories, engines
@ -37,109 +38,117 @@ number_of_searches = 0
def send_http_request(engine, request_params, timeout_limit): def send_http_request(engine, request_params, timeout_limit):
response = None # for page_load_time stats
try: time_before_request = time()
# create dictionary which contain all
# informations about the request
request_args = dict(
headers=request_params['headers'],
cookies=request_params['cookies'],
timeout=timeout_limit,
verify=request_params['verify']
)
# specific type of request (GET or POST)
if request_params['method'] == 'GET':
req = requests_lib.get
else:
req = requests_lib.post
request_args['data'] = request_params['data']
# for page_load_time stats # create dictionary which contain all
time_before_request = time() # informations about the request
request_args = dict(
headers=request_params['headers'],
cookies=request_params['cookies'],
timeout=timeout_limit,
verify=request_params['verify']
)
# send the request # specific type of request (GET or POST)
response = req(request_params['url'], **request_args) if request_params['method'] == 'GET':
req = requests_lib.get
else:
req = requests_lib.post
request_args['data'] = request_params['data']
with threading.RLock(): # send the request
# no error : reset the suspend variables response = req(request_params['url'], **request_args)
engine.continuous_errors = 0
engine.suspend_end_time = 0
# update stats with current page-load-time
# only the HTTP request
engine.stats['page_load_time'] += time() - time_before_request
engine.stats['page_load_count'] += 1
# is there a timeout (no parsing in this case) # is there a timeout (no parsing in this case)
timeout_overhead = 0.2 # seconds timeout_overhead = 0.2 # seconds
search_duration = time() - request_params['started'] search_duration = time() - request_params['started']
if search_duration > timeout_limit + timeout_overhead: if search_duration > timeout_limit + timeout_overhead:
logger.exception('engine timeout on HTTP request:' raise Timeout(response=response)
'{0} (search duration : {1} ms, time-out: {2} )'
.format(engine.name, search_duration, timeout_limit))
with threading.RLock():
engine.stats['errors'] += 1
return False
# everything is ok : return the response with threading.RLock():
return response # no error : reset the suspend variables
engine.continuous_errors = 0
engine.suspend_end_time = 0
# update stats with current page-load-time
# only the HTTP request
engine.stats['page_load_time'] += time() - time_before_request
engine.stats['page_load_count'] += 1
except: # everything is ok : return the response
# increase errors stats return response
with threading.RLock():
engine.stats['errors'] += 1
engine.continuous_errors += 1
engine.suspend_end_time = time() + min(60, engine.continuous_errors)
# print engine name and specific error message
logger.exception('engine crash: {0}'.format(engine.name))
return False
def search_one_request(engine_name, query, request_params, result_container, timeout_limit): def search_one_request(engine, query, request_params, timeout_limit):
engine = engines[engine_name]
# update request parameters dependent on # update request parameters dependent on
# search-engine (contained in engines folder) # search-engine (contained in engines folder)
engine.request(query, request_params) engine.request(query, request_params)
# TODO add support of offline engines
if request_params['url'] is None:
return False
# ignoring empty urls # ignoring empty urls
if request_params['url'] is None:
return []
if not request_params['url']: if not request_params['url']:
return False return []
# send request # send request
response = send_http_request(engine, request_params, timeout_limit) response = send_http_request(engine, request_params, timeout_limit)
# parse response # parse the response
success = None response.search_params = request_params
if response: return engine.response(response)
# parse the response
response.search_params = request_params
try: def search_one_request_safe(engine_name, query, request_params, result_container, timeout_limit):
search_results = engine.response(response) start_time = time()
except: engine = engines[engine_name]
logger.exception('engine crash: {0}'.format(engine.name))
search_results = [] try:
# send requests and parse the results
search_results = search_one_request(engine, query, request_params, timeout_limit)
# add results # add results
for result in search_results: for result in search_results:
result['engine'] = engine.name result['engine'] = engine_name
result_container.extend(engine_name, search_results)
result_container.extend(engine.name, search_results) # update engine time when there is no exception
with threading.RLock():
engine.stats['engine_time'] += time() - start_time
engine.stats['engine_time_count'] += 1
success = True return True
else:
success = False
with threading.RLock(): except Exception as e:
# update stats : total time engine.stats['errors'] += 1
engine.stats['engine_time'] += time() - request_params['started']
engine.stats['engine_time_count'] += 1
return success search_duration = time() - start_time
requests_exception = False
if (issubclass(e.__class__, requests.exceptions.Timeout)):
# requests timeout (connect or read)
logger.error("engine {0} : HTTP requests timeout"
"(search duration : {1} s, timeout: {2} s) : {3}"
.format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
requests_exception = True
if (issubclass(e.__class__, requests.exceptions.RequestException)):
# other requests exception
logger.exception("engine {0} : requests exception"
"(search duration : {1} s, timeout: {2} s) : {3}"
.format(engine_name, search_duration, timeout_limit, e))
requests_exception = True
else:
# others errors
logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
# update continuous_errors / suspend_end_time
if requests_exception:
with threading.RLock():
engine.continuous_errors += 1
engine.suspend_end_time = time() + min(60, engine.continuous_errors)
#
return False
def search_multiple_requests(requests, result_container, timeout_limit): def search_multiple_requests(requests, result_container, timeout_limit):
@ -148,7 +157,7 @@ def search_multiple_requests(requests, result_container, timeout_limit):
for engine_name, query, request_params in requests: for engine_name, query, request_params in requests:
th = threading.Thread( th = threading.Thread(
target=search_one_request, target=search_one_request_safe,
args=(engine_name, query, request_params, result_container, timeout_limit), args=(engine_name, query, request_params, result_container, timeout_limit),
name=search_id, name=search_id,
) )