mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
[mod] split searx.search into different processors
see searx.search.processors.abstract.EngineProcessor. First, searx calls the get_params method; if the return value is not None, searx then calls the search method.
This commit is contained in:
parent
c0cc01e936
commit
7ec8bc3ea7
16 changed files with 476 additions and 316 deletions
41
searx/search/processors/__init__.py
Normal file
41
searx/search/processors/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
from .online import OnlineProcessor
|
||||
from .offline import OfflineProcessor
|
||||
from .online_dictionary import OnlineDictionaryProcessor
|
||||
from .online_currency import OnlineCurrencyProcessor
|
||||
from .abstract import EngineProcessor
|
||||
from searx import logger
|
||||
import searx.engines as engines
|
||||
|
||||
|
||||
# Public names exported by the processors package.
__all__ = ['EngineProcessor', 'OfflineProcessor', 'OnlineProcessor',
           'OnlineDictionaryProcessor', 'OnlineCurrencyProcessor', 'processors']

logger = logger.getChild('search.processors')

# Maps engine name -> EngineProcessor instance; populated by initialize().
processors = {}
|
||||
|
||||
|
||||
def get_processor_class(engine_type):
    """Return the processor class whose ``engine_type`` matches, or None."""
    known_classes = (OnlineProcessor, OfflineProcessor,
                     OnlineDictionaryProcessor, OnlineCurrencyProcessor)
    return next((klass for klass in known_classes
                 if klass.engine_type == engine_type), None)
|
||||
|
||||
|
||||
def get_processor(engine, engine_name):
    """Instantiate the processor matching *engine*'s ``engine_type``
    (defaults to 'online'), or return None for an unknown type."""
    klass = get_processor_class(getattr(engine, 'engine_type', 'online'))
    if klass is None:
        return None
    return klass(engine, engine_name)
|
||||
|
||||
|
||||
def initialize(engine_list):
    """Initialize the engines, then build and register a processor for
    each successfully loaded engine."""
    engines.initialize_engines(engine_list)
    for engine_name, engine in engines.engines.items():
        processor = get_processor(engine, engine_name)
        if processor is not None:
            processors[engine_name] = processor
        else:
            logger.error('Error get processor for engine %s', engine_name)
|
||||
39
searx/search/processors/abstract.py
Normal file
39
searx/search/processors/abstract.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
from abc import abstractmethod
|
||||
from searx import logger
|
||||
|
||||
|
||||
# NOTE(review): this child name repeats the 'searx' root (sibling modules use
# 'search.processor.*' without it) — confirm the doubled prefix is intended.
logger = logger.getChild('searx.search.processor')
|
||||
|
||||
|
||||
class EngineProcessor:
    """Base class for all engine processors.

    Holds the engine module and its name, and builds the request
    parameters common to every engine type.  Subclasses implement
    :meth:`search`.
    """

    def __init__(self, engine, engine_name):
        self.engine = engine
        self.engine_name = engine_name

    def get_params(self, search_query, engine_category):
        """Return the common request parameters for *search_query*.

        Returns ``None`` when the engine cannot serve the query: a page
        other than the first without paging support, or a time-range
        filter without time-range support.
        """
        engine = self.engine

        # guard: paging requested but unsupported
        if search_query.pageno > 1 and not engine.paging:
            return None

        # guard: time range requested but unsupported
        if search_query.time_range and not engine.time_range_support:
            return None

        # an engine-fixed language overrides the query language
        language = getattr(engine, 'language', None) or search_query.lang

        return {
            'category': engine_category,
            'pageno': search_query.pageno,
            'safesearch': search_query.safesearch,
            'time_range': search_query.time_range,
            'language': language,
        }

    @abstractmethod
    def search(self, query, params, result_container, start_time, timeout_limit):
        """Run the query against the engine; implemented per engine type."""
|
||||
51
searx/search/processors/offline.py
Normal file
51
searx/search/processors/offline.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import threading
|
||||
from time import time
|
||||
from searx import logger
|
||||
from searx.metrology.error_recorder import record_exception, record_error
|
||||
from searx.search.processors.abstract import EngineProcessor
|
||||
|
||||
|
||||
# Module logger, nested under the searx root logger.
logger = logger.getChild('search.processor.offline')
|
||||
|
||||
|
||||
class OfflineProcessor(EngineProcessor):
    """Processor for engines that compute results locally (no HTTP request)."""

    engine_type = 'offline'

    # Fix: the original code used ``with threading.RLock():`` which creates a
    # brand-new lock on every call and therefore never serializes anything.
    # A single shared class-level lock makes the stats updates actually atomic.
    _stats_lock = threading.RLock()

    def _record_stats_on_error(self, result_container, start_time):
        # Book the elapsed time and count the failure against the engine.
        engine_time = time() - start_time
        result_container.add_timing(self.engine_name, engine_time, engine_time)

        with self._stats_lock:
            self.engine.stats['errors'] += 1

    def _search_basic(self, query, params):
        # Delegate to the engine module's own search() implementation.
        return self.engine.search(query, params)

    def search(self, query, params, result_container, start_time, timeout_limit):
        """Run the offline engine, recording results, timing and errors.

        ValueError is treated as invalid user input (logged, counted, but
        not reported as an engine crash); any other exception is reported
        as an unresponsive engine.
        """
        try:
            search_results = self._search_basic(query, params)

            if search_results:
                result_container.extend(self.engine_name, search_results)

            engine_time = time() - start_time
            result_container.add_timing(self.engine_name, engine_time, engine_time)
            with self._stats_lock:
                self.engine.stats['engine_time'] += engine_time
                self.engine.stats['engine_time_count'] += 1

        except ValueError as e:
            # invalid input from the user, not an engine failure
            record_exception(self.engine_name, e)
            self._record_stats_on_error(result_container, start_time)
            logger.exception('engine {0} : invalid input : {1}'.format(self.engine_name, e))
        except Exception as e:
            record_exception(self.engine_name, e)
            self._record_stats_on_error(result_container, start_time)
            result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash', str(e))
            logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
        else:
            # NOTE(review): '_timeout' is presumably set on the worker thread
            # by the search coordinator when the global timeout fires — confirm.
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')
|
||||
211
searx/search/processors/online.py
Normal file
211
searx/search/processors/online.py
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
from urllib.parse import urlparse
|
||||
from time import time
|
||||
import threading
|
||||
|
||||
import requests.exceptions
|
||||
|
||||
import searx.poolrequests as poolrequests
|
||||
from searx.engines import settings
|
||||
from searx import logger
|
||||
from searx.utils import gen_useragent
|
||||
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
|
||||
SearxEngineTooManyRequestsException,)
|
||||
from searx.metrology.error_recorder import record_exception, record_error
|
||||
|
||||
from searx.search.processors.abstract import EngineProcessor
|
||||
|
||||
|
||||
# Module logger, nested under the searx root logger.
logger = logger.getChild('search.processor.online')
|
||||
|
||||
# Baseline request parameters merged into every online engine request.
# NOTE(review): the nested dicts ('headers', 'data', 'cookies') are shared
# mutable objects — a plain dict.update() re-uses them across requests, so
# any per-request mutation leaks into this constant; verify callers copy them.
DEFAULT_PARAMS = {
    'method': 'GET',
    'headers': {},
    'data': {},
    'url': '',
    'cookies': {},
    'verify': True,
    'auth': None
}
|
||||
|
||||
|
||||
class OnlineProcessor(EngineProcessor):
    """Processor for engines queried over HTTP."""

    engine_type = 'online'

    # Fix: the original code used ``with threading.RLock():`` which creates a
    # brand-new lock on every call and therefore never serializes anything.
    # One shared class-level lock makes stats/suspend updates actually atomic.
    _stats_lock = threading.RLock()

    def get_params(self, search_query, engine_category):
        """Build the HTTP request parameters for this engine.

        Returns ``None`` when the base class skips the query or the engine
        is currently suspended.
        """
        params = super().get_params(search_query, engine_category)
        if params is None:
            return None

        # skip suspended engines
        if self.engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', self.engine_name)
            return None

        # add default params.  Fix: copy the mutable values (headers, data,
        # cookies) so the per-request mutations below and inside
        # engine.request() don't leak into the shared DEFAULT_PARAMS dicts.
        for key, value in DEFAULT_PARAMS.items():
            params[key] = value.copy() if isinstance(value, dict) else value

        # add an user agent
        params['headers']['User-Agent'] = gen_useragent()

        return params

    def _send_http_request(self, params):
        """Send the HTTP request described by *params* and return the response."""
        # collect all request keyword arguments
        request_args = dict(
            headers=params['headers'],
            cookies=params['cookies'],
            verify=params['verify'],
            auth=params['auth']
        )

        # setting engine based proxies
        if hasattr(self.engine, 'proxies'):
            request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies)

        # max_redirects
        max_redirects = params.get('max_redirects')
        if max_redirects:
            request_args['max_redirects'] = max_redirects

        # soft_max_redirects: exceeding it records an error but keeps the response
        soft_max_redirects = params.get('soft_max_redirects', max_redirects or 0)

        # raise_for_status
        request_args['raise_for_httperror'] = params.get('raise_for_httperror', False)

        # specific type of request (GET or POST)
        if params['method'] == 'GET':
            req = poolrequests.get
        else:
            req = poolrequests.post

        request_args['data'] = params['data']

        # send the request
        response = req(params['url'], **request_args)

        # check soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect : record an error
            # but the engine might still return valid results.
            status_code = str(response.status_code or '')
            reason = response.reason or ''
            hostname = str(urlparse(response.url or '').netloc)
            record_error(self.engine_name,
                         '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects),
                         (status_code, reason, hostname))

        return response

    def _search_basic(self, query, params):
        """Run one request/response cycle; return parsed results or None."""
        # update request parameters dependent on
        # search-engine (contained in engines folder)
        self.engine.request(query, params)

        # ignore empty urls (the engine declined the query);
        # a single falsiness test covers both None and '' (the original
        # code tested the two cases separately).
        if not params['url']:
            return None

        # send request
        response = self._send_http_request(params)

        # parse the response
        response.search_params = params
        return self.engine.response(response)

    def search(self, query, params, result_container, start_time, timeout_limit):
        """Query the engine, record results/timing/stats, and manage the
        engine's suspension state on HTTP failures."""
        # set timeout for all HTTP requests
        poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time)
        # reset the HTTP total time
        poolrequests.reset_time_for_thread()

        # suppose everything will be alright
        requests_exception = False
        suspended_time = None

        try:
            # send requests and parse the results
            search_results = self._search_basic(query, params)

            # check if the engine accepted the request
            if search_results is not None:
                # yes, so add results
                result_container.extend(self.engine_name, search_results)

                # update engine time when there is no exception
                engine_time = time() - start_time
                page_load_time = poolrequests.get_time_for_thread()
                result_container.add_timing(self.engine_name, engine_time, page_load_time)
                with self._stats_lock:
                    self.engine.stats['engine_time'] += engine_time
                    self.engine.stats['engine_time_count'] += 1
                    # update stats with the total HTTP time
                    self.engine.stats['page_load_time'] += page_load_time
                    self.engine.stats['page_load_count'] += 1
        except Exception as e:
            record_exception(self.engine_name, e)

            # Timing
            engine_time = time() - start_time
            page_load_time = poolrequests.get_time_for_thread()
            result_container.add_timing(self.engine_name, engine_time, page_load_time)

            # Record the errors
            with self._stats_lock:
                self.engine.stats['errors'] += 1

            if (issubclass(e.__class__, requests.exceptions.Timeout)):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout')
                # requests timeout (connect or read)
                logger.error("engine {0} : HTTP requests timeout"
                             "(search duration : {1} s, timeout: {2} s) : {3}"
                             .format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__))
                requests_exception = True
            elif (issubclass(e.__class__, requests.exceptions.RequestException)):
                result_container.add_unresponsive_engine(self.engine_name, 'HTTP error')
                # other requests exception
                logger.exception("engine {0} : requests exception"
                                 "(search duration : {1} s, timeout: {2} s) : {3}"
                                 .format(self.engine_name, engine_time, timeout_limit, e))
                requests_exception = True
            elif (issubclass(e.__class__, SearxEngineCaptchaException)):
                result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required')
                # Fix: the original passed the format string without ever
                # interpolating it ("engine {0} : CAPTCHA" was logged verbatim).
                logger.exception('engine %s : CAPTCHA', self.engine_name)
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
                result_container.add_unresponsive_engine(self.engine_name, 'too many requests')
                # Fix: same uninterpolated format string as above.
                logger.exception('engine %s : Too many requests', self.engine_name)
                suspended_time = e.suspended_time  # pylint: disable=no-member
            elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
                result_container.add_unresponsive_engine(self.engine_name, 'blocked')
                # Fix: same uninterpolated format string as above.
                logger.exception('engine %s : Searx is blocked', self.engine_name)
                suspended_time = e.suspended_time  # pylint: disable=no-member
            else:
                result_container.add_unresponsive_engine(self.engine_name, 'unexpected crash')
                # others errors
                logger.exception('engine {0} : exception : {1}'.format(self.engine_name, e))
        else:
            # NOTE(review): '_timeout' is presumably set on the worker thread
            # by the search coordinator when the global timeout fires — confirm.
            if getattr(threading.current_thread(), '_timeout', False):
                record_error(self.engine_name, 'Timeout')

        # suspend the engine if there is an HTTP error
        # or suspended_time is defined
        with self._stats_lock:
            if requests_exception or suspended_time:
                # update continuous_errors / suspend_end_time
                self.engine.continuous_errors += 1
                if suspended_time is None:
                    suspended_time = min(settings['search']['max_ban_time_on_fail'],
                                         self.engine.continuous_errors * settings['search']['ban_time_on_fail'])
                self.engine.suspend_end_time = time() + suspended_time
            else:
                # reset the suspend variables
                self.engine.continuous_errors = 0
                self.engine.suspend_end_time = 0
|
||||
57
searx/search/processors/online_currency.py
Normal file
57
searx/search/processors/online_currency.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import unicodedata
|
||||
import re
|
||||
|
||||
from searx.data import CURRENCIES
|
||||
from .online import OnlineProcessor
|
||||
|
||||
|
||||
# Matches conversion queries like "10.5 dollars in euros":
# group 1 = amount, group 2 = source currency, group 3 = target currency.
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
|
||||
|
||||
|
||||
def normalize_name(name):
    """Normalize a currency name for lookup in CURRENCIES['names']:
    lowercase, hyphens to spaces, trailing plural 's' stripped, runs of
    spaces collapsed, then NFKD-normalized."""
    folded = name.lower().replace('-', ' ').rstrip('s')
    collapsed = re.sub(' +', ' ', folded)
    return unicodedata.normalize('NFKD', collapsed).lower()
|
||||
|
||||
|
||||
def name_to_iso4217(name):
    """Map a human currency *name* to its ISO 4217 code, falling back to
    the normalized name itself when unknown.

    Fix: removed the pointless ``global CURRENCIES`` — the function only
    reads the module-level constant, so the declaration had no effect.
    """
    name = normalize_name(name)
    currency = CURRENCIES['names'].get(name, [name])
    return currency[0]
|
||||
|
||||
|
||||
def iso4217_to_name(iso4217, language):
    """Return the *language*-localized name of an ISO 4217 code, falling
    back to the code itself when no translation exists.

    Fix: removed the pointless ``global CURRENCIES`` — the function only
    reads the module-level constant, so the declaration had no effect.
    """
    return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
|
||||
|
||||
|
||||
class OnlineCurrencyProcessor(OnlineProcessor):
    """Online processor for currency-conversion queries of the shape
    '<amount> <from-currency> in|to <to-currency>'."""

    engine_type = 'online_currency'

    def get_params(self, search_query, engine_category):
        """Extend the base parameters with the parsed conversion data;
        return None unless the query parses as a currency conversion."""
        params = super().get_params(search_query, engine_category)
        if params is None:
            return None

        match = parser_re.match(search_query.query)
        if not match:
            return None

        amount_str, from_currency, to_currency = match.groups()
        try:
            amount = float(amount_str)
        except ValueError:
            # amount did not parse as a number: not a conversion query
            return None
        from_currency = name_to_iso4217(from_currency.strip())
        to_currency = name_to_iso4217(to_currency.strip())

        params.update({
            'amount': amount,
            'from': from_currency,
            'to': to_currency,
            'from_name': iso4217_to_name(from_currency, 'en'),
            'to_name': iso4217_to_name(to_currency, 'en'),
        })
        return params
|
||||
37
searx/search/processors/online_dictionary.py
Normal file
37
searx/search/processors/online_dictionary.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
import re
|
||||
|
||||
from searx.utils import is_valid_lang
|
||||
from .online import OnlineProcessor
|
||||
|
||||
|
||||
# Matches dictionary queries like "en-fr word":
# group 1 = source language, group 2 = target language, group 3 = term.
parser_re = re.compile('.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I)
|
||||
|
||||
|
||||
class OnlineDictionaryProcessor(OnlineProcessor):
    """Online processor for dictionary queries like 'en-fr word'."""

    # NOTE: 'dictionnary' (sic) is the identifier engines are configured
    # with elsewhere in the project; the spelling must stay as-is.
    engine_type = 'online_dictionnary'

    def get_params(self, search_query, engine_category):
        """Extend the base parameters with from_lang/to_lang/query; return
        None unless the query matches '<lang>-<lang> <term>' with two
        valid languages."""
        params = super().get_params(search_query, engine_category)
        if params is None:
            return None

        match = parser_re.match(search_query.query)
        if not match:
            return None

        from_lang, to_lang, query = match.groups()
        from_lang = is_valid_lang(from_lang)
        to_lang = is_valid_lang(to_lang)
        if not (from_lang and to_lang):
            return None

        params.update({
            'from_lang': from_lang,
            'to_lang': to_lang,
            'query': query,
        })
        return params
|
||||
Loading…
Add table
Add a link
Reference in a new issue