From 3a9f513521d006a7939538cce368d7b799e32c30 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 5 Jan 2021 11:24:39 +0100 Subject: [PATCH] [enh] checker: background check See settings.yml for the options SIGUSR1 signal starts the checker. The result is available at /stats/checker --- searx/search/__init__.py | 72 ++------------------ searx/search/checker/__init__.py | 3 + searx/search/checker/__main__.py | 30 ++++++-- searx/search/checker/background.py | 106 +++++++++++++++++++++++++++++ searx/search/checker/impl.py | 12 +++- searx/search/models.py | 69 +++++++++++++++++++ searx/settings.yml | 45 +++++++----- searx/webapp.py | 12 +++- setup.py | 3 +- 9 files changed, 255 insertions(+), 97 deletions(-) create mode 100644 searx/search/checker/background.py create mode 100644 searx/search/models.py diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 7768d21e9..f777e8595 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url from searx.results import ResultContainer from searx import logger from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery from searx.search.processors import processors, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker logger = logger.getChild('search') @@ -45,75 +47,11 @@ else: sys.exit(1) -def initialize(settings_engines=None): +def initialize(settings_engines=None, enable_checker=False): settings_engines = settings_engines or settings['engines'] initialize_processors(settings_engines) - - -class EngineRef: - - __slots__ = 'name', 'category' - - def __init__(self, name: str, category: str): - self.name = name - self.category = category - - def __repr__(self): - return "EngineRef({!r}, {!r})".format(self.name, self.category) - - def __eq__(self, other): - return self.name == other.name and self.category == other.category - - def __hash__(self): - return 
hash((self.name, self.category)) - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ - 'timeout_limit', 'external_bang' - - def __init__(self, - query: str, - engineref_list: typing.List[EngineRef], - lang: str='all', - safesearch: int=0, - pageno: int=1, - time_range: typing.Optional[str]=None, - timeout_limit: typing.Optional[float]=None, - external_bang: typing.Optional[str]=None): - self.query = query - self.engineref_list = engineref_list - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = time_range - self.timeout_limit = timeout_limit - self.external_bang = external_bang - - @property - def categories(self): - return list(set(map(lambda engineref: engineref.category, self.engineref_list))) - - def __repr__(self): - return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ - format(self.query, self.engineref_list, self.lang, self.safesearch, - self.pageno, self.time_range, self.timeout_limit, self.external_bang) - - def __eq__(self, other): - return self.query == other.query\ - and self.engineref_list == other.engineref_list\ - and self.lang == other.lang\ - and self.safesearch == other.safesearch\ - and self.pageno == other.pageno\ - and self.time_range == other.time_range\ - and self.timeout_limit == other.timeout_limit\ - and self.external_bang == other.external_bang - - def __hash__(self): - return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, - self.timeout_limit, self.external_bang)) + if enable_checker: + initialize_checker() class Search: diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py index 442d5a09d..85b9178df 100644 --- a/searx/search/checker/__init__.py +++ b/searx/search/checker/__init__.py @@ -1 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from .impl import 
Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 2f808237a..37b7e6cda 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,9 +1,13 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import sys import os +import argparse import searx.search -import searx.search.processors import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: @@ -18,20 +22,24 @@ else: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" -def iter_processor(): - if len(sys.argv) > 1: - for name, processor in searx.search.processors.items(): - if name in sys.argv: +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: yield name, processor + else: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) else: for name, processor in searx.search.processors.items(): yield name, processor -def main(): +def run(engine_name_list): searx.search.initialize() broken_urls = [] - for name, processor in iter_processor(): + for name, processor in iter_processor(engine_name_list): if sys.stdout.isatty(): print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) checker = searx.search.checker.Checker(processor) @@ -48,5 +56,13 @@ def main(): print('Error fetching', url) +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. 
Empty for all engines.') + args = parser.parse_args() + run(args.engine_name_list) + + if __name__ == '__main__': main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 000000000..45188ab38 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str('CHECKER_RESULT') + if serialized_result is not None: + return json.loads(serialized_result) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = {} + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result[name] = {'status': True} + else: + result[name] = {'status': False, 'errors': checker.test_results.errors} + + storage.set_str('CHECKER_RESULT', json.dumps(result)) + logger.info('Check done') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = 
random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + schedule(every[0], _run_with_delay) + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # special case when debug is activated + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check the value of checker.scheduling now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index f55b6d0f5..abef5f8e9 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import typing import types import functools @@ -11,7 +13,7 @@ import requests.exceptions from searx import poolrequests, logger from searx.results import ResultContainer -from searx.search import SearchQuery, EngineRef +from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -240,18 +242,24 @@ class ResultContainerTests: self._check_infoboxes(self.result_container.infoboxes) def has_infobox(self): + """Check the ResultContainer has at least one infobox""" if
len(self.result_container.infoboxes) == 0: self._record_error('No infobox') def has_answer(self): + """Check the ResultContainer has at least one answer""" if len(self.result_container.answers) == 0: self._record_error('No answer') def has_language(self, lang): + """Check at least one title or content of the results is written in `lang`. + + Detected using pycld3; may not be accurate""" if lang not in self.languages: self._record_error(lang + ' not found') def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" result_types = set() results = self.result_container.get_ordered_results() if len(results) > 0: @@ -267,6 +275,7 @@ class ResultContainerTests: self._record_error('No result') def one_title_contains(self, title: str): + """Check one of the titles contains `title` (case insensitive comparison)""" title = title.lower() for result in self.result_container.get_ordered_results(): if title in result['title'].lower(): @@ -287,6 +296,7 @@ class CheckerTests: self.result_container_tests_list = result_container_tests_list def unique_results(self): + """Check the results of each ResultContainer are unique""" urls_list = [rct.result_urls for rct in self.result_container_tests_list] if len(urls_list[0]) > 0: # results on the first page diff --git a/searx/search/models.py b/searx/search/models.py new file mode 100644 index 000000000..80ceaa223 --- /dev/null +++ b/searx/search/models.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing + + +class EngineRef: + + __slots__ = 'name', 'category' + + def __init__(self, name: str, category: str): + self.name = name + self.category = category + + def __repr__(self): + return "EngineRef({!r}, {!r})".format(self.name, self.category) + + def __eq__(self, other): + return self.name == other.name and self.category == other.category + + def __hash__(self): + return hash((self.name, self.category)) + + +class SearchQuery: + """container for all the search
parameters (query, language, etc...)""" + + __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ + 'timeout_limit', 'external_bang' + + def __init__(self, + query: str, + engineref_list: typing.List[EngineRef], + lang: str='all', + safesearch: int=0, + pageno: int=1, + time_range: typing.Optional[str]=None, + timeout_limit: typing.Optional[float]=None, + external_bang: typing.Optional[str]=None): + self.query = query + self.engineref_list = engineref_list + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + self.timeout_limit = timeout_limit + self.external_bang = external_bang + + @property + def categories(self): + return list(set(map(lambda engineref: engineref.category, self.engineref_list))) + + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang + + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) diff --git a/searx/settings.yml b/searx/settings.yml index 3094fc7a7..55c9849c1 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -102,24 +102,33 @@ outgoing: # communication with search engines # - "HTTPS rewrite" # - ... 
-additional_tests: - rosebud: &test_rosebud - matrix: - query: rosebud - lang: en - result_container: - - not_empty - - [one_title_contains', 'citizen kane'] - test: - - unique_results - -tests: - infobox: &tests_infobox - infobox: - matrix: - query: ["linux", "new york", "bbc"] - result_container: - - has_infobox +checker: + # disable checker when in debug mode + off_when_debug: True + # scheduling: interval or int + # use "scheduling: False" to disable scheduling + scheduling: + start_after: [300, 1800] # delay to start the first run of the checker + every: [86400, 90000] # how often the checker runs + # additional tests: only for the YAML anchors (see the engines section) + additional_tests: + rosebud: &test_rosebud + matrix: + query: rosebud + lang: en + result_container: + - not_empty + - ['one_title_contains', 'citizen kane'] + test: + - unique_results + # tests: only for the YAML anchors (see the engines section) + tests: + infobox: &tests_infobox + infobox: + matrix: + query: ["linux", "new york", "bbc"] + result_container: + - has_infobox engines: - name: apk mirror diff --git a/searx/webapp.py b/searx/webapp.py index 10f4ce78c..985eced18 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori from searx.utils import html_to_text, gen_useragent, dict_subset, match_language from searx.version import VERSION_STRING from searx.languages import language_codes as languages -from searx.search import SearchWithPlugins, initialize +from searx.search import SearchWithPlugins, initialize as search_initialize +from searx.search.checker import get_result as checker_get_result from searx.query import RawTextQuery from searx.autocomplete import searx_bang, backends as autocomplete_backends from searx.plugins import plugins @@ -81,7 +82,6 @@ from searx.answerers import answerers from searx.poolrequests import get_global_proxies from searx.metrology.error_recorder import 
errors_per_engines - # serve pages with HTTP/1.1 from werkzeug.serving import WSGIRequestHandler WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0')) @@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai # initialize the engines except on the first run of the werkzeug server. if not werkzeug_reloader\ or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"): - initialize() + search_initialize(enable_checker=True) babel = Babel(app) @@ -977,6 +977,12 @@ def stats_errors(): return jsonify(result) +@app.route('/stats/checker', methods=['GET']) +def stats_checker(): + result = checker_get_result() + return jsonify(result) + + @app.route('/robots.txt', methods=['GET']) def robots(): return Response("""User-agent: * diff --git a/setup.py b/setup.py index f89f16820..09a3021ee 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,8 @@ setup( }, entry_points={ 'console_scripts': [ - 'searx-run = searx.webapp:run' + 'searx-run = searx.webapp:run', + 'searx-checker = searx.search.checker.__main__:main' ] }, package_data={