From 8cbc9f2d5823eb984e99e15c963e306610007fa1 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Thu, 24 Dec 2020 09:28:16 +0100 Subject: [PATCH 01/12] [enh] add checker --- Dockerfile | 3 + requirements.txt | 1 + searx/search/__init__.py | 7 + searx/search/checker/__init__.py | 1 + searx/search/checker/__main__.py | 51 +++ searx/search/checker/impl.py | 388 +++++++++++++++++++ searx/search/processors/abstract.py | 12 + searx/search/processors/online.py | 44 +++ searx/search/processors/online_currency.py | 10 + searx/search/processors/online_dictionary.py | 18 + utils/searx.sh | 6 +- 11 files changed, 539 insertions(+), 2 deletions(-) create mode 100644 searx/search/checker/__init__.py create mode 100644 searx/search/checker/__main__.py create mode 100644 searx/search/checker/impl.py diff --git a/Dockerfile b/Dockerfile index 3894aa968..f251d06ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,6 +41,8 @@ RUN apk upgrade --no-cache \ openssl-dev \ tar \ git \ + protoc \ + protobuf-dev \ && apk add --no-cache \ ca-certificates \ su-exec \ @@ -53,6 +55,7 @@ RUN apk upgrade --no-cache \ uwsgi \ uwsgi-python3 \ brotli \ + protobuf \ && pip3 install --upgrade pip \ && pip3 install --no-cache -r requirements.txt \ && apk del build-dependencies \ diff --git a/requirements.txt b/requirements.txt index ecf8e0c62..776bbc20b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ pygments==2.1.3 python-dateutil==2.8.1 pyyaml==5.3.1 requests[socks]==2.25.1 +pycld3==0.20 diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 0d45f0b7c..7768d21e9 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -64,6 +64,9 @@ class EngineRef: def __eq__(self, other): return self.name == other.name and self.category == other.category + def __hash__(self): + return hash((self.name, self.category)) + class SearchQuery: """container for all the search parameters (query, language, etc...)""" @@ -108,6 +111,10 @@ class SearchQuery: and self.timeout_limit == other.timeout_limit\ and self.external_bang == other.external_bang + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) + class Search: """Search information container""" diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py new file mode 100644 index 000000000..442d5a09d --- /dev/null +++ b/searx/search/checker/__init__.py @@ -0,0 +1 @@ +from .impl import Checker diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py new file mode 100644 index 000000000..c071e6437 --- /dev/null +++ b/searx/search/checker/__main__.py @@ -0,0 +1,51 @@ +import sys + +import searx.search +import searx.search.processors +import searx.search.checker + + +if sys.stdout.isatty(): + RESET_SEQ = "\033[0m" + COLOR_SEQ = "\033[1;%dm" + BOLD_SEQ = "\033[1m" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) +else: + RESET_SEQ = "" + COLOR_SEQ = "" + BOLD_SEQ = "" + BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" + + +def iter_processor(): + if len(sys.argv) > 1: + for name, processor in searx.search.processors.items(): + if name in sys.argv: + yield name, processor + else: + for name, processor in searx.search.processors.items(): + yield name, processor + + +def main(): + searx.search.initialize() + broken_urls = [] + for name, processor in iter_processor(): + if sys.stdout.isatty(): + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) + checker = searx.search.checker.Checker(processor) + checker.run() + if checker.test_results.succesfull: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, GREEN, ' OK', RESET_SEQ) + else: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Error ', str(errors), RESET_SEQ) + + broken_urls += checker.test_results.broken_urls + + for url in broken_urls: + print('Error fetching', url) + + +if __name__ == '__main__': + main() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py new file mode 100644 index 000000000..f55b6d0f5 --- /dev/null +++ b/searx/search/checker/impl.py @@ -0,0 +1,388 @@ +import typing +import types +import functools +import itertools +from time import time +from urllib.parse import urlparse + +import re +import cld3 +import requests.exceptions + +from searx import poolrequests, logger +from searx.results import ResultContainer +from searx.search import SearchQuery, EngineRef +from searx.search.processors import EngineProcessor + + +HTML_TAGS = [ + 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', + 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', + 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', + 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', + 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', + 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', + 'frame', 'frameset' +] + + +def get_check_no_html(): + rep = ['<' + tag + '[^\>]*>' for tag in HTML_TAGS] + rep += ['' for tag in HTML_TAGS] + pattern = re.compile('|'.join(rep)) + + def f(text): + return pattern.search(text.lower()) is None + + return f + + +_check_no_html = get_check_no_html() + + +def _is_url(url): + try: + result = urlparse(url) + except ValueError: + return False + if result.scheme not in ('http', 'https'): + return False + return True + + +@functools.lru_cache(maxsize=8192) +def _is_url_image(image_url): + if not isinstance(image_url, str): + return False + + if image_url.startswith('//'): + image_url = 'https:' + image_url + + if image_url.startswith('data:'): + return image_url.startswith('data:image/') + + if not _is_url(image_url): + return False + + retry = 2 + + while retry > 0: + a = time() + try: + poolrequests.set_timeout_for_thread(10.0, time()) + r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US;q=0.5,en;q=0.3', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-GPC': '1', + 'Cache-Control': 'max-age=0' + }) + if r.headers["content-type"].startswith('image/'): + return True + return False + except requests.exceptions.Timeout: + logger.error('Timeout for %s: %i', image_url, int(time() - a)) + retry -= 1 + except requests.exceptions.RequestException: + logger.exception('Exception for %s', image_url) + return False + + +def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: + return { + 'query': search_query.query, + 'lang': search_query.lang, + 'pageno': search_query.pageno, + 'safesearch': search_query.safesearch, + 'time_range': search_query.time_range, + } + + +def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ + -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: + param1 = _search_query_to_dict(sq1) + param2 = _search_query_to_dict(sq2) + common = {} + diff = {} + for k, value1 in param1.items(): + value2 = param2[k] + if value1 == value2: + common[k] = value1 + else: + diff[k] = (value1, value2) + return (common, diff) + + +class TestResults: + + __slots__ = 'errors', 'broken_urls' + + def __init__(self): + self.errors: typing.Dict[str, typing.List[str]] = {} + self.broken_urls = [] + + def add_error(self, test, message): + errors_for_test = self.errors.setdefault(test, []) + if message not in errors_for_test: + errors_for_test.append(message) + + def add_broken_url(self, url): + if url not in self.broken_urls: + self.broken_urls.append(url) + + @property + def succesfull(self): + return len(self.errors) == 0 + + def __iter__(self): + for test_name, errors in self.errors.items(): + for error in sorted(errors): + yield (test_name, error) + + +class ResultContainerTests: + + __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results' + + def __init__(self, + test_results: TestResults, + test_name: str, + search_query: SearchQuery, + result_container: ResultContainer): + self.test_name = test_name + self.search_query = search_query + self.result_container = result_container + self.languages: typing.Set[str] = set() + self.test_results = test_results + self.stop_test = False + + @property + def result_urls(self): + results = self.result_container.get_ordered_results() + return [result['url'] for result in results] + + def _record_error(self, message: str) -> None: + self.test_results.add_error(self.test_name, message) + + def _add_language(self, text: str) -> typing.Optional[str]: + r = cld3.get_language(str(text)) # pylint: disable=E1101 + if r is not None and r.probability >= 0.9 and r.is_reliable: + self.languages.add(r.language) + return None + + def _check_result(self, result): + if not _check_no_html(result.get('title', '')): + self._record_error('HTML in title') + if not _check_no_html(result.get('content', '')): + self._record_error('HTML in content') + + self._add_language(result.get('title', '')) + self._add_language(result.get('content', '')) + + template = result.get('template', 'default.html') + if template == 'default.html': + return + if template == 'code.html': + return + if template == 'torrent.html': + return + if template == 'map.html': + return + if template == 'images.html': + thumbnail_src = result.get('thumbnail_src') + if thumbnail_src is not None: + if not _is_url_image(thumbnail_src): + self.test_results.add_broken_url(thumbnail_src) + self._record_error('thumbnail_src URL is invalid') + elif not _is_url_image(result.get('img_src')): + self.test_results.add_broken_url(result.get('img_src')) + self._record_error('img_src URL is invalid') + if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): + self._record_error('thumbnail URL is invalid') + + def _check_results(self, results: list): + for result in results: + self._check_result(result) + + def _check_answers(self, answers): + for answer in answers: + if not _check_no_html(answer): + self._record_error('HTML in answer') + + def _check_infoboxes(self, infoboxes): + for infobox in infoboxes: + if not _check_no_html(infobox.get('content', '')): + self._record_error('HTML in infobox content') + self._add_language(infobox.get('content', '')) + for attribute in infobox.get('attributes', {}): + if not _check_no_html(attribute.get('value', '')): + self._record_error('HTML in infobox attribute value') + + def check_basic(self): + if len(self.result_container.unresponsive_engines) > 0: + for message in self.result_container.unresponsive_engines: + self._record_error(message[1] + ' ' + (message[2] or '')) + self.stop_test = True + return + + results = self.result_container.get_ordered_results() + if len(results) > 0: + self._check_results(results) + + if len(self.result_container.answers) > 0: + self._check_answers(self.result_container.answers) + + if len(self.result_container.infoboxes) > 0: + self._check_infoboxes(self.result_container.infoboxes) + + def has_infobox(self): + if len(self.result_container.infoboxes) == 0: + self._record_error('No infobox') + + def has_answer(self): + if len(self.result_container.answers) == 0: + self._record_error('No answer') + + def has_language(self, lang): + if lang not in self.languages: + self._record_error(lang + ' not found') + + def not_empty(self): + result_types = set() + results = self.result_container.get_ordered_results() + if len(results) > 0: + result_types.add('results') + + if len(self.result_container.answers) > 0: + result_types.add('answers') + + if len(self.result_container.infoboxes) > 0: + result_types.add('infoboxes') + + if len(result_types) == 0: + self._record_error('No result') + + def one_title_contains(self, title: str): + title = title.lower() + for result in self.result_container.get_ordered_results(): + if title in result['title'].lower(): + return + self._record_error(('{!r} not found in the title'.format(title))) + + +class CheckerTests: + + __slots__ = 'test_results', 'test_name', 'result_container_tests_list' + + def __init__(self, + test_results: TestResults, + test_name: str, + result_container_tests_list: typing.List[ResultContainerTests]): + self.test_results = test_results + self.test_name = test_name + self.result_container_tests_list = result_container_tests_list + + def unique_results(self): + urls_list = [rct.result_urls for rct in self.result_container_tests_list] + if len(urls_list[0]) > 0: + # results on the first page + for i, urls_i in enumerate(urls_list): + for j, urls_j in enumerate(urls_list): + if i < j and urls_i == urls_j: + common, diff = _search_query_diff(self.result_container_tests_list[i].search_query, + self.result_container_tests_list[j].search_query) + common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()]) + diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()]) + diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()]) + self.test_results.add_error(self.test_name, + 'results are identitical for {} and {} ({})' + .format(diff1_str, diff2_str, common_str)) + + +class Checker: + + __slots__ = 'processor', 'tests', 'test_results' + + def __init__(self, processor: EngineProcessor): + self.processor = processor + self.tests = self.processor.get_tests() + self.test_results = TestResults() + + @property + def engineref_list(self): + engine_name = self.processor.engine_name + engine_category = self.processor.engine.categories[0] + return [EngineRef(engine_name, engine_category)] + + @staticmethod + def search_query_matrix_iterator(engineref_list, matrix): + p = [] + for name, values in matrix.items(): + if isinstance(values, (tuple, list)): + l = [(name, value) for value in values] + else: + l = [(name, values)] + p.append(l) + + for kwargs in itertools.product(*p): + kwargs = {k: v for k, v in kwargs} + query = kwargs['query'] + params = dict(kwargs) + del params['query'] + yield SearchQuery(query, engineref_list, **params) + + def call_test(self, obj, test_description): + if isinstance(test_description, (tuple, list)): + method, args = test_description[0], test_description[1:] + else: + method = test_description + args = () + if isinstance(method, str) and hasattr(obj, method): + getattr(obj, method)(*args) + elif isinstance(method, types.FunctionType): + method(*args) + else: + self.test_results.add_error(obj.test_name, + 'method {!r} ({}) not found for {}' + .format(method, method.__class__.__name__, obj.__class__.__name__)) + + def call_tests(self, obj, test_descriptions): + for test_description in test_descriptions: + self.call_test(obj, test_description) + + def search(self, search_query: SearchQuery) -> ResultContainer: + result_container = ResultContainer() + engineref_category = search_query.engineref_list[0].category + params = self.processor.get_params(search_query, engineref_category) + if params is not None: + self.processor.search(search_query.query, params, result_container, time(), 5) + return result_container + + def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests: + result_container = self.search(search_query) + result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container) + result_container_check.check_basic() + return result_container_check + + def run_test(self, test_name): + test_parameters = self.tests[test_name] + search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix'])) + rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list] + stop_test = False + if 'result_container' in test_parameters: + for rct in rct_list: + stop_test = stop_test or rct.stop_test + if not rct.stop_test: + self.call_tests(rct, test_parameters['result_container']) + if not stop_test: + if 'test' in test_parameters: + checker_tests = CheckerTests(self.test_results, test_name, rct_list) + self.call_tests(checker_tests, test_parameters['test']) + + def run(self): + for test_name in self.tests: + self.run_test(test_name) diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py index cf3fd7236..eb8d296ec 100644 --- a/searx/search/processors/abstract.py +++ b/searx/search/processors/abstract.py @@ -37,3 +37,15 @@ class EngineProcessor: @abstractmethod def search(self, query, params, result_container, start_time, timeout_limit): pass + + def get_tests(self): + tests = getattr(self.engine, 'tests', None) + if tests is None: + tests = getattr(self.engine, 'additional_tests', {}) + tests.update(self.get_default_tests()) + return tests + else: + return tests + + def get_default_tests(self): + return {} diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index b62f8059e..54d63b4c9 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -211,3 +211,47 @@ class OnlineProcessor(EngineProcessor): # reset the suspend variables self.engine.continuous_errors = 0 self.engine.suspend_end_time = 0 + + def get_default_tests(self): + tests = {} + + tests['simple'] = { + 'matrix': {'query': ('time', 'time')}, + 'result_container': ['not_empty'], + } + + if getattr(self.engine, 'paging', False): + # [1, 2, 3] --> isinstance(l, (list, tuple)) ?? + tests['paging'] = { + 'matrix': {'query': 'time', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + + if getattr(self.engine, 'time_range', False): + tests['time_range'] = { + 'matrix': {'query': 'time', + 'time_range': (None, 'day')}, + 'result_container': ['not_empty'], + 'test': ['unique_results'] + } + + if getattr(self.engine, 'lang', False): + tests['lang_fr'] = { + 'matrix': {'query': 'paris', 'lang': 'fr'}, + 'result_container': ['not_empty', ('has_lang', 'fr')], + } + tests['lang_en'] = { + 'matrix': {'query': 'paris', 'lang': 'en'}, + 'result_container': ['not_empty', ('has_lang', 'en')], + } + + if getattr(self.engine, 'safesearch', False): + tests['safesearch'] = { + 'matrix': {'query': 'porn', + 'safesearch': (0, 2)}, + 'test': ['unique_results'] + } + + return tests diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py index f0e919c03..132c10594 100644 --- a/searx/search/processors/online_currency.py +++ b/searx/search/processors/online_currency.py @@ -55,3 +55,13 @@ class OnlineCurrencyProcessor(OnlineProcessor): params['from_name'] = iso4217_to_name(from_currency, 'en') params['to_name'] = iso4217_to_name(to_currency, 'en') return params + + def get_default_tests(self): + tests = {} + + tests['currency'] = { + 'matrix': {'query': '1337 usd in rmb'}, + 'result_container': ['has_answer'], + } + + return tests diff --git a/searx/search/processors/online_dictionary.py b/searx/search/processors/online_dictionary.py index 8e9ef1620..987c710a1 100644 --- a/searx/search/processors/online_dictionary.py +++ b/searx/search/processors/online_dictionary.py @@ -35,3 +35,21 @@ class OnlineDictionaryProcessor(OnlineProcessor): params['query'] = query return params + + def get_default_tests(self): + tests = {} + + if getattr(self.engine, 'paging', False): + tests['translation_paging'] = { + 'matrix': {'query': 'en-es house', + 'pageno': (1, 2, 3)}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + 'test': ['unique_results'] + } + else: + tests['translation'] = { + 'matrix': {'query': 'en-es house'}, + 'result_container': ['not_empty', ('one_title_contains', 'house')], + } + + return tests diff --git a/utils/searx.sh b/utils/searx.sh index b7d3b8e1c..134f0dbb1 100755 --- a/utils/searx.sh +++ b/utils/searx.sh @@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\ python3-dev python3-babel python3-venv uwsgi uwsgi-plugin-python3 git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev +libprotobuf-dev protobuf-compiler shellcheck" BUILD_PACKAGES_debian="\ @@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\ python python-pip python-lxml python-babel uwsgi uwsgi-plugin-python git base-devel libxml2 +protobuf shellcheck" BUILD_PACKAGES_arch="\ @@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\ python python-pip python-lxml python-babel uwsgi uwsgi-plugin-python3 git @development-tools libxml2 -ShellCheck" +ShellCheck protobuf-compiler protobuf-devel" BUILD_PACKAGES_fedora="\ firefox graphviz graphviz-gd ImageMagick librsvg2-tools @@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\ python36 python36-pip python36-lxml python-babel uwsgi uwsgi-plugin-python3 git @development-tools libxml2 -ShellCheck" +ShellCheck protobuf-compiler protobuf-devel" BUILD_PACKAGES_centos="\ firefox graphviz graphviz-gd ImageMagick librsvg2-tools From 16a889dd8f4886cedf92a008ca89724bba0992dd Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 29 Dec 2020 11:01:35 +0100 Subject: [PATCH 02/12] [enh] checker: add rosebud test --- searx/settings.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index 30afbf957..f0ec05542 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -102,6 +102,17 @@ outgoing: # communication with search engines # - "HTTPS rewrite" # - ... +additional_tests: + rosebud: &test_rosebud + matrix: + query: rosebud + lang: en + result_container: + - not_empty + - [one_title_contains', 'citizen kane'] + test: + - unique_results + engines: - name: apk mirror engine: apkmirror @@ -278,6 +289,8 @@ engines: engine : etools shortcut : eto disabled : True + additional_tests: + rosebud: *test_rosebud - name : etymonline engine : xpath @@ -343,6 +356,8 @@ engines: shortcut : gb timeout : 3.0 disabled: True + additional_tests: + rosebud: *test_rosebud - name : gentoo engine : gentoo @@ -646,6 +661,8 @@ engines: shortcut : qw categories : general disabled : True + additional_tests: + rosebud: *test_rosebud - name : qwant images engine : qwant @@ -745,6 +762,8 @@ engines: shortcut : sp timeout : 6.0 disabled : True + additional_tests: + rosebud: *test_rosebud - name : tokyotoshokan engine : tokyotoshokan @@ -856,6 +875,8 @@ engines: number_of_results : 5 search_type : text disabled : True + additional_tests: + rosebud: *test_rosebud - name : wikisource engine : mediawiki From ca0889d488d232d357baacc8f16320c74333092c Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 29 Dec 2020 11:02:18 +0100 Subject: [PATCH 03/12] [enh] checker: wikidata & ddd: add specific tests --- searx/settings.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index f0ec05542..3094fc7a7 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -113,6 +113,14 @@ additional_tests: test: - unique_results +tests: + infobox: &tests_infobox + infobox: + matrix: + query: ["linux", "new york", "bbc"] + result_container: + - has_infobox + engines: - name: apk mirror engine: apkmirror @@ -229,6 +237,7 @@ engines: shortcut : ddd weight : 2 disabled : True + tests: *tests_infobox # cloudflare protected # - name : digbt @@ -273,6 +282,7 @@ engines: shortcut : wd timeout : 3.0 weight : 2 + tests: *tests_infobox - name : duckduckgo engine : duckduckgo From 9c581466e136f7cb82d5ffe6c052fbd9e93ab39f Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 30 Dec 2020 15:24:29 +0100 Subject: [PATCH 04/12] [fix] do not colorize output on dumb terminals Signed-off-by: Markus Heiser --- searx/search/checker/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index c071e6437..2f808237a 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,11 +1,12 @@ import sys +import os import searx.search import searx.search.processors import searx.search.checker -if sys.stdout.isatty(): +if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: RESET_SEQ = "\033[0m" COLOR_SEQ = "\033[1;%dm" BOLD_SEQ = "\033[1m" From 6e2872f43625aba71eba019e16f7fbd74743f590 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 5 Jan 2021 11:22:48 +0100 Subject: [PATCH 05/12] [enh] add searx.shared shared dictionary between the workers (UWSGI or werkzeug) scheduler: run a task once every x seconds (UWSGI or werkzeug) --- dockerfiles/uwsgi.ini | 3 + searx/shared/__init__.py | 31 ++++++++++ searx/shared/shared_abstract.py | 15 +++++ searx/shared/shared_simple.py | 38 ++++++++++++ searx/shared/shared_uwsgi.py | 62 +++++++++++++++++++ .../etc/uwsgi/apps-archlinux/searx.ini | 5 +- .../etc/uwsgi/apps-available/searx.ini | 3 + 7 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 searx/shared/__init__.py create mode 100644 searx/shared/shared_abstract.py create mode 100644 searx/shared/shared_simple.py create mode 100644 searx/shared/shared_uwsgi.py diff --git a/dockerfiles/uwsgi.ini b/dockerfiles/uwsgi.ini index 398a440d9..818a99cc0 100644 --- a/dockerfiles/uwsgi.ini +++ b/dockerfiles/uwsgi.ini @@ -42,3 +42,6 @@ static-map = /static=/usr/local/searx/searx/static static-expires = /* 864000 static-gzip-all = True offload-threads = %k + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 diff --git a/searx/shared/__init__.py b/searx/shared/__init__.py new file mode 100644 index 000000000..83d3a2742 --- /dev/null +++ b/searx/shared/__init__.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import logging + +logger = logging.getLogger('searx.shared') + +try: + import uwsgi +except: + # no uwsgi + from .shared_simple import SimpleSharedDict as SharedDict, schedule + logger.info('Use shared_simple implementation') +else: + try: + uwsgi.cache_update('dummy', b'dummy') + if uwsgi.cache_get('dummy') != b'dummy': + raise Exception() + except: + # uwsgi.ini configuration problem: disable all scheduling + logger.error('uwsgi.ini configuration error, add this line to your uwsgi.ini\n' + 'cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1') + from .shared_simple import SimpleSharedDict as SharedDict + + def schedule(delay, func, *args): + pass + else: + # uwsgi + from .shared_uwsgi import UwsgiCacheSharedDict as SharedDict, schedule + logger.info('Use shared_uwsgi implementation') + +storage = SharedDict() diff --git a/searx/shared/shared_abstract.py b/searx/shared/shared_abstract.py new file mode 100644 index 000000000..3fede417e --- /dev/null +++ b/searx/shared/shared_abstract.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +class SharedDict: + + def get_int(self, key): + pass + + def set_int(self, key, value): + pass + + def get_str(self, key): + pass + + def set_str(self, key, value): + pass diff --git a/searx/shared/shared_simple.py b/searx/shared/shared_simple.py new file mode 100644 index 000000000..5b970aad9 --- /dev/null +++ b/searx/shared/shared_simple.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import threading + +from . import shared_abstract + + +class SimpleSharedDict(shared_abstract.SharedDict): + + __slots__ = 'd', + + def __init__(self): + self.d = {} + + def get_int(self, key): + return self.d.get(key, None) + + def set_int(self, key, value): + self.d[key] = value + + def get_str(self, key): + return self.d.get(key, None) + + def set_str(self, key, value): + self.d[key] = value + + +def schedule(delay, func, *args): + def call_later(): + t = threading.Timer(delay, wrapper) + t.daemon = True + t.start() + + def wrapper(): + call_later() + func(*args) + + call_later() diff --git a/searx/shared/shared_uwsgi.py b/searx/shared/shared_uwsgi.py new file mode 100644 index 000000000..136bf687e --- /dev/null +++ b/searx/shared/shared_uwsgi.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import time +import uwsgi # pylint: disable=E0401 +from . import shared_abstract + + +_last_signal = 10 + + +class UwsgiCacheSharedDict(shared_abstract.SharedDict): + + def get_int(self, key): + value = uwsgi.cache_get(key) + if value is None: + return value + else: + return int.from_bytes(value, 'big') + + def set_int(self, key, value): + b = value.to_bytes(4, 'big') + uwsgi.cache_update(key, b) + + def get_str(self, key): + value = uwsgi.cache_get(key) + if value is None: + return value + else: + return value.decode('utf-8') + + def set_str(self, key, value): + b = value.encode('utf-8') + uwsgi.cache_update(key, b) + + +def schedule(delay, func, *args): + """ + Can be implemented using a spooler. + https://uwsgi-docs.readthedocs.io/en/latest/PythonDecorators.html + + To make the uwsgi configuration simple, use the alternative implementation. + """ + global _last_signal + + def sighandler(signum): + now = int(time.time()) + uwsgi.lock() + try: + updating = uwsgi.cache_get('updating') + if updating is not None: + updating = int.from_bytes(updating, 'big') + if now - updating < delay: + return + uwsgi.cache_update('updating', now.to_bytes(4, 'big')) + finally: + uwsgi.unlock() + func(*args) + + signal_num = _last_signal + _last_signal += 1 + uwsgi.register_signal(signal_num, 'worker', sighandler) + uwsgi.add_timer(signal_num, delay) diff --git a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini index 9dd2e6f2f..71cece3c4 100644 --- a/utils/templates/etc/uwsgi/apps-archlinux/searx.ini +++ b/utils/templates/etc/uwsgi/apps-archlinux/searx.ini @@ -82,4 +82,7 @@ http = ${SEARX_INTERNAL_HTTP} # mkdir -p /run/uwsgi/app/searx # chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx # -# socket = /run/uwsgi/app/searx/socket \ No newline at end of file +# socket = /run/uwsgi/app/searx/socket + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 diff --git a/utils/templates/etc/uwsgi/apps-available/searx.ini b/utils/templates/etc/uwsgi/apps-available/searx.ini index 4d69da0cf..45214ef13 100644 --- a/utils/templates/etc/uwsgi/apps-available/searx.ini +++ b/utils/templates/etc/uwsgi/apps-available/searx.ini @@ -82,3 +82,6 @@ http = ${SEARX_INTERNAL_HTTP} # chown -R ${SERVICE_USER}:${SERVICE_GROUP} /run/uwsgi/app/searx # # socket = /run/uwsgi/app/searx/socket + +# Cache +cache2 = name=searxcache,items=2000,blocks=2000,blocksize=4096,bitmap=1 From 3a9f513521d006a7939538cce368d7b799e32c30 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 5 Jan 2021 11:24:39 +0100 Subject: [PATCH 06/12] [enh] checker: background check See settings.yml for the options SIGUSR1 signal starts the checker. The result is available at /stats/checker --- searx/search/__init__.py | 72 ++------------------ searx/search/checker/__init__.py | 3 + searx/search/checker/__main__.py | 30 ++++++-- searx/search/checker/background.py | 106 +++++++++++++++++++++++++++++ searx/search/checker/impl.py | 12 +++- searx/search/models.py | 69 +++++++++++++++++++ searx/settings.yml | 45 +++++++----- searx/webapp.py | 12 +++- setup.py | 3 +- 9 files changed, 255 insertions(+), 97 deletions(-) create mode 100644 searx/search/checker/background.py create mode 100644 searx/search/models.py diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 7768d21e9..f777e8595 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url from searx.results import ResultContainer from searx import logger from searx.plugins import plugins +from searx.search.models import EngineRef, SearchQuery from searx.search.processors import processors, initialize as initialize_processors +from searx.search.checker import initialize as initialize_checker logger = logger.getChild('search') @@ -45,75 +47,11 @@ else: sys.exit(1) -def initialize(settings_engines=None): +def initialize(settings_engines=None, enable_checker=False): settings_engines = settings_engines or settings['engines'] initialize_processors(settings_engines) - - -class EngineRef: - - __slots__ = 'name', 'category' - - def __init__(self, name: str, category: str): - self.name = name - self.category = category - - def __repr__(self): - return "EngineRef({!r}, {!r})".format(self.name, self.category) - - def __eq__(self, other): - return self.name == other.name and self.category == other.category - - def __hash__(self): - return hash((self.name, self.category)) - - -class SearchQuery: - """container for all the search parameters (query, language, etc...)""" - - __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ - 'timeout_limit', 'external_bang' - - def __init__(self, - query: str, - engineref_list: typing.List[EngineRef], - lang: str='all', - safesearch: int=0, - pageno: int=1, - time_range: typing.Optional[str]=None, - timeout_limit: typing.Optional[float]=None, - external_bang: typing.Optional[str]=None): - self.query = query - self.engineref_list = engineref_list - self.lang = lang - self.safesearch = safesearch - self.pageno = pageno - self.time_range = time_range - self.timeout_limit = timeout_limit - self.external_bang = external_bang - - @property - def categories(self): - return list(set(map(lambda engineref: engineref.category, self.engineref_list))) - - def __repr__(self): - return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ - format(self.query, self.engineref_list, self.lang, self.safesearch, - self.pageno, self.time_range, self.timeout_limit, self.external_bang) - - def __eq__(self, other): - return self.query == other.query\ - and self.engineref_list == other.engineref_list\ - and self.lang == other.lang\ - and self.safesearch == other.safesearch\ - and self.pageno == other.pageno\ - and self.time_range == other.time_range\ - and self.timeout_limit == other.timeout_limit\ - and self.external_bang == other.external_bang - - def __hash__(self): - return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, - self.timeout_limit, self.external_bang)) + if enable_checker: + initialize_checker() class Search: diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py index 442d5a09d..85b9178df 100644 --- a/searx/search/checker/__init__.py +++ b/searx/search/checker/__init__.py @@ -1 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from .impl import Checker +from .background import initialize, get_result diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 2f808237a..37b7e6cda 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,9 +1,13 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import sys import os +import argparse import searx.search -import searx.search.processors import searx.search.checker +from searx.search import processors +from searx.engines import engine_shortcuts if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: @@ -18,20 +22,24 @@ else: BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" -def iter_processor(): - if len(sys.argv) > 1: - for name, processor in searx.search.processors.items(): - if name in sys.argv: +def iter_processor(engine_name_list): + if len(engine_name_list) > 0: + for name in engine_name_list: + name = engine_shortcuts.get(name, name) + processor = processors.get(name) + if processor is not None: yield name, processor + else: + print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) else: for name, processor in searx.search.processors.items(): yield name, processor -def main(): +def run(engine_name_list): searx.search.initialize() broken_urls = [] - for name, processor in iter_processor(): + for name, processor in iter_processor(engine_name_list): if sys.stdout.isatty(): print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) checker = searx.search.checker.Checker(processor) @@ -48,5 +56,13 @@ def main(): print('Error fetching', url) +def main(): + parser = argparse.ArgumentParser(description='Check searx engines.') + parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', + help='engines name or shortcut list. Empty for all engines.') + args = parser.parse_args() + run(args.engine_name_list) + + if __name__ == '__main__': main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py new file mode 100644 index 000000000..45188ab38 --- /dev/null +++ b/searx/search/checker/background.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import random +import time +import threading +import os +import signal + +from searx import logger, settings, searx_debug +from searx.exceptions import SearxSettingsException +from searx.search.processors import processors +from searx.search.checker import Checker +from searx.shared import schedule, storage + + +CHECKER_RESULT = 'CHECKER_RESULT' +running = threading.Lock() + + +def _get_interval(every, error_msg): + if isinstance(every, int): + every = (every, every) + if not isinstance(every, (tuple, list))\ + or len(every) != 2\ + or not isinstance(every[0], int)\ + or not isinstance(every[1], int): + raise SearxSettingsException(error_msg, None) + return every + + +def _get_every(): + every = settings.get('checker', {}).get('scheduling', {}).get('every', (300, 1800)) + return _get_interval(every, 'checker.scheduling.every is not a int or list') + + +def get_result(): + serialized_result = storage.get_str('CHECKER_RESULT') + if serialized_result is not None: + return json.loads(serialized_result) + + +def run(): + if not running.acquire(blocking=False): + return + try: + logger.info('Starting checker') + result = {} + for name, processor in processors.items(): + logger.debug('Checking %s engine', name) + checker = Checker(processor) + checker.run() + if checker.test_results.succesfull: + result[name] = {'status': True} + else: + result[name] = {'status': False, 'errors': checker.test_results.errors} + + storage.set_str('CHECKER_RESULT', json.dumps(result)) + logger.info('Check done') + finally: + running.release() + + +def _run_with_delay(): + every = _get_every() + delay = random.randint(0, every[1] - every[0]) + logger.debug('Start checker in %i seconds', delay) + time.sleep(delay) + run() + + +def _start_scheduling(): + every = _get_every() + schedule(every[0], _run_with_delay) + run() + + +def _signal_handler(signum, frame): + t = threading.Thread(target=run) + t.daemon = True + t.start() + + +def initialize(): + logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) + signal.signal(signal.SIGUSR1, _signal_handler) + + # special case when debug is activate + if searx_debug and settings.get('checker', {}).get('off_when_debug', True): + logger.info('debug mode: checker is disabled') + return + + # check value of checker.scheduling.every now + scheduling = settings.get('checker', {}).get('scheduling', None) + if scheduling is None or not scheduling: + logger.info('Checker scheduler is disabled') + return + + # + start_after = scheduling.get('start_after', (300, 1800)) + start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list') + delay = random.randint(start_after[0], start_after[1]) + logger.info('Start checker in %i seconds', delay) + t = threading.Timer(delay, _start_scheduling) + t.daemon = True + t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index f55b6d0f5..abef5f8e9 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + import typing import types import functools @@ -11,7 +13,7 @@ import requests.exceptions from searx import poolrequests, logger from searx.results import ResultContainer -from searx.search import SearchQuery, EngineRef +from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -240,18 +242,24 @@ class ResultContainerTests: self._check_infoboxes(self.result_container.infoboxes) def has_infobox(self): + """Check the ResultContainer has at least one infobox""" if len(self.result_container.infoboxes) == 0: self._record_error('No infobox') def has_answer(self): + """Check the ResultContainer has at least one answer""" if len(self.result_container.answers) == 0: self._record_error('No answer') def has_language(self, lang): + """Check at least one title or content of the results is written in the `lang`. + + Detected using pycld3, may be not accurate""" if lang not in self.languages: self._record_error(lang + ' not found') def not_empty(self): + """Check the ResultContainer has at least one answer or infobox or result""" result_types = set() results = self.result_container.get_ordered_results() if len(results) > 0: @@ -267,6 +275,7 @@ class ResultContainerTests: self._record_error('No result') def one_title_contains(self, title: str): + """Check one of the title contains `title` (case insensitive comparaison)""" title = title.lower() for result in self.result_container.get_ordered_results(): if title in result['title'].lower(): @@ -287,6 +296,7 @@ class CheckerTests: self.result_container_tests_list = result_container_tests_list def unique_results(self): + """Check the results of each ResultContain is unique""" urls_list = [rct.result_urls for rct in self.result_container_tests_list] if len(urls_list[0]) > 0: # results on the first page diff --git a/searx/search/models.py b/searx/search/models.py new file mode 100644 index 000000000..80ceaa223 --- /dev/null +++ b/searx/search/models.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + +import typing + + +class EngineRef: + + __slots__ = 'name', 'category' + + def __init__(self, name: str, category: str): + self.name = name + self.category = category + + def __repr__(self): + return "EngineRef({!r}, {!r})".format(self.name, self.category) + + def __eq__(self, other): + return self.name == other.name and self.category == other.category + + def __hash__(self): + return hash((self.name, self.category)) + + +class SearchQuery: + """container for all the search parameters (query, language, etc...)""" + + __slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\ + 'timeout_limit', 'external_bang' + + def __init__(self, + query: str, + engineref_list: typing.List[EngineRef], + lang: str='all', + safesearch: int=0, + pageno: int=1, + time_range: typing.Optional[str]=None, + timeout_limit: typing.Optional[float]=None, + external_bang: typing.Optional[str]=None): + self.query = query + self.engineref_list = engineref_list + self.lang = lang + self.safesearch = safesearch + self.pageno = pageno + self.time_range = time_range + self.timeout_limit = timeout_limit + self.external_bang = external_bang + + @property + def categories(self): + return list(set(map(lambda engineref: engineref.category, self.engineref_list))) + + def __repr__(self): + return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\ + format(self.query, self.engineref_list, self.lang, self.safesearch, + self.pageno, self.time_range, self.timeout_limit, self.external_bang) + + def __eq__(self, other): + return self.query == other.query\ + and self.engineref_list == other.engineref_list\ + and self.lang == other.lang\ + and self.safesearch == other.safesearch\ + and self.pageno == other.pageno\ + and self.time_range == other.time_range\ + and self.timeout_limit == other.timeout_limit\ + and self.external_bang == other.external_bang + + def __hash__(self): + return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range, + self.timeout_limit, self.external_bang)) diff --git a/searx/settings.yml b/searx/settings.yml index 3094fc7a7..55c9849c1 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -102,24 +102,33 @@ outgoing: # communication with search engines # - "HTTPS rewrite" # - ... -additional_tests: - rosebud: &test_rosebud - matrix: - query: rosebud - lang: en - result_container: - - not_empty - - [one_title_contains', 'citizen kane'] - test: - - unique_results - -tests: - infobox: &tests_infobox - infobox: - matrix: - query: ["linux", "new york", "bbc"] - result_container: - - has_infobox +checker: + # disable checker when in debug mode + off_when_debug: True + # scheduling: interval or int + # use "scheduling: False" to disable scheduling + scheduling: + start_after: [300, 1800] # delay to start the first run of the checker + every: [86400, 90000] # how often the checker runs + # additional tests: only for the YAML anchors (see the engines section) + additional_tests: + rosebud: &test_rosebud + matrix: + query: rosebud + lang: en + result_container: + - not_empty + - ['one_title_contains', 'citizen kane'] + test: + - unique_results + # tests: only for the YAML anchors (see the engines section) + tests: + infobox: &tests_infobox + infobox: + matrix: + query: ["linux", "new york", "bbc"] + result_container: + - has_infobox engines: - name: apk mirror diff --git a/searx/webapp.py b/searx/webapp.py index 10f4ce78c..985eced18 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori from searx.utils import html_to_text, gen_useragent, dict_subset, match_language from searx.version import VERSION_STRING from searx.languages import language_codes as languages -from searx.search import SearchWithPlugins, initialize +from searx.search import SearchWithPlugins, initialize as search_initialize +from searx.search.checker import get_result as checker_get_result from searx.query import RawTextQuery from searx.autocomplete import searx_bang, backends as autocomplete_backends from searx.plugins import plugins @@ -81,7 +82,6 @@ from searx.answerers import answerers from searx.poolrequests import get_global_proxies from searx.metrology.error_recorder import errors_per_engines - # serve pages with HTTP/1.1 from werkzeug.serving import WSGIRequestHandler WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0')) @@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai # initialize the engines except on the first run of the werkzeug server. if not werkzeug_reloader\ or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"): - initialize() + search_initialize(enable_checker=True) babel = Babel(app) @@ -977,6 +977,12 @@ def stats_errors(): return jsonify(result) +@app.route('/stats/checker', methods=['GET']) +def stats_checker(): + result = checker_get_result() + return jsonify(result) + + @app.route('/robots.txt', methods=['GET']) def robots(): return Response("""User-agent: * diff --git a/setup.py b/setup.py index f89f16820..09a3021ee 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,8 @@ setup( }, entry_points={ 'console_scripts': [ - 'searx-run = searx.webapp:run' + 'searx-run = searx.webapp:run', + 'searx-checker = searx.search.checker.__main__:main' ] }, package_data={ From 45bfab77d0154c60f58be0453307cb03b48dca35 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 8 Jan 2021 19:04:04 +0100 Subject: [PATCH 07/12] |mod] checker: improve searx-checker command line * output is unbuffered * verbose mode describe more precisly the errrors --- searx/search/checker/__main__.py | 54 +++++++++++++++++++++++--------- searx/search/checker/impl.py | 46 ++++++++++++++++----------- 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py index 37b7e6cda..75b37e6c5 100644 --- a/searx/search/checker/__main__.py +++ b/searx/search/checker/__main__.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: AGPL-3.0-or-later import sys +import io import os import argparse +import logging import searx.search import searx.search.checker @@ -10,6 +12,14 @@ from searx.search import processors from searx.engines import engine_shortcuts +# configure logging +root = logging.getLogger() +handler = logging.StreamHandler(sys.stdout) +for h in root.handlers: + root.removeHandler(h) +root.addHandler(handler) + +# color only for a valid terminal if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: RESET_SEQ = "\033[0m" COLOR_SEQ = "\033[1;%dm" @@ -21,7 +31,12 @@ else: BOLD_SEQ = "" BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" +# equivalent of 'python -u' (unbuffered stdout, stderr) +stdout = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True) +stderr = io.TextIOWrapper(open(sys.stderr.fileno(), 'wb', 0), write_through=True) + +# iterator of processors def iter_processor(engine_name_list): if len(engine_name_list) > 0: for name in engine_name_list: @@ -30,38 +45,49 @@ def iter_processor(engine_name_list): if processor is not None: yield name, processor else: - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}') else: for name, processor in searx.search.processors.items(): yield name, processor -def run(engine_name_list): +# actual check & display +def run(engine_name_list, verbose): searx.search.initialize() - broken_urls = [] for name, processor in iter_processor(engine_name_list): - if sys.stdout.isatty(): - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') + if not sys.stdout.isatty(): + stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') checker = searx.search.checker.Checker(processor) checker.run() if checker.test_results.succesfull: - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, GREEN, ' OK', RESET_SEQ) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') + if verbose: + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') else: - errors = [test_name + ': ' + error for test_name, error in checker.test_results] - print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Error ', str(errors), RESET_SEQ) - - broken_urls += checker.test_results.broken_urls - - for url in broken_urls: - print('Error fetching', url) + stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') + if not verbose: + errors = [test_name + ': ' + error for test_name, error in checker.test_results] + stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') + else: + stdout.write('\n') + stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') + for test_name, logs in checker.test_results.logs.items(): + for log in logs: + stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') +# call by setup.py def main(): parser = argparse.ArgumentParser(description='Check searx engines.') parser.add_argument('engine_name_list', metavar='engine name', type=str, nargs='*', help='engines name or shortcut list. Empty for all engines.') + parser.add_argument('--verbose', '-v', + action='store_true', dest='verbose', + help='Display details about the test results', + default=False) args = parser.parse_args() - run(args.engine_name_list) + run(args.engine_name_list, args.verbose) if __name__ == '__main__': diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index abef5f8e9..71a941f73 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -17,6 +17,8 @@ from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor +logger = logger.getChild('searx.search.checker') + HTML_TAGS = [ 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', @@ -121,20 +123,25 @@ def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\ class TestResults: - __slots__ = 'errors', 'broken_urls' + __slots__ = 'errors', 'logs', 'languages' def __init__(self): self.errors: typing.Dict[str, typing.List[str]] = {} - self.broken_urls = [] + self.logs: typing.Dict[str, typing.List[typing.Any]] = {} + self.languages: typing.Set[str] = set() - def add_error(self, test, message): + def add_error(self, test, message, *args): + # message to self.errors errors_for_test = self.errors.setdefault(test, []) if message not in errors_for_test: errors_for_test.append(message) + # (message, *args) to self.logs + logs_for_test = self.logs.setdefault(test, []) + if (message, *args) not in logs_for_test: + logs_for_test.append((message, *args)) - def add_broken_url(self, url): - if url not in self.broken_urls: - self.broken_urls.append(url) + def add_language(self, language): + self.languages.add(language) @property def succesfull(self): @@ -167,20 +174,23 @@ class ResultContainerTests: results = self.result_container.get_ordered_results() return [result['url'] for result in results] - def _record_error(self, message: str) -> None: - self.test_results.add_error(self.test_name, message) + def _record_error(self, message: str, *args) -> None: + sq = _search_query_to_dict(self.search_query) + sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) + self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') def _add_language(self, text: str) -> typing.Optional[str]: r = cld3.get_language(str(text)) # pylint: disable=E1101 - if r is not None and r.probability >= 0.9 and r.is_reliable: + if r is not None and r.probability >= 0.98 and r.is_reliable: self.languages.add(r.language) + self.test_results.add_language(r.language) return None def _check_result(self, result): if not _check_no_html(result.get('title', '')): - self._record_error('HTML in title') + self._record_error('HTML in title', repr(result.get('title', ''))) if not _check_no_html(result.get('content', '')): - self._record_error('HTML in content') + self._record_error('HTML in content', repr(result.get('content', ''))) self._add_language(result.get('title', '')) self._add_language(result.get('content', '')) @@ -198,13 +208,11 @@ class ResultContainerTests: thumbnail_src = result.get('thumbnail_src') if thumbnail_src is not None: if not _is_url_image(thumbnail_src): - self.test_results.add_broken_url(thumbnail_src) - self._record_error('thumbnail_src URL is invalid') + self._record_error('thumbnail_src URL is invalid', thumbnail_src) elif not _is_url_image(result.get('img_src')): - self.test_results.add_broken_url(result.get('img_src')) - self._record_error('img_src URL is invalid') + self._record_error('img_src URL is invalid', result.get('img_src')) if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): - self._record_error('thumbnail URL is invalid') + self._record_error('thumbnail URL is invalid', result.get('img_src')) def _check_results(self, results: list): for result in results: @@ -213,16 +221,16 @@ class ResultContainerTests: def _check_answers(self, answers): for answer in answers: if not _check_no_html(answer): - self._record_error('HTML in answer') + self._record_error('HTML in answer', answer) def _check_infoboxes(self, infoboxes): for infobox in infoboxes: if not _check_no_html(infobox.get('content', '')): - self._record_error('HTML in infobox content') + self._record_error('HTML in infobox content', infobox.get('content', '')) self._add_language(infobox.get('content', '')) for attribute in infobox.get('attributes', {}): if not _check_no_html(attribute.get('value', '')): - self._record_error('HTML in infobox attribute value') + self._record_error('HTML in infobox attribute value', attribute.get('value', '')) def check_basic(self): if len(self.result_container.unresponsive_engines) > 0: From f3e1bd308f8abb62b3ce0070973e0a494d15b122 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 8 Jan 2021 19:05:56 +0100 Subject: [PATCH 08/12] [mod] checker: minor adjustements on the default tests the query "time" is convinient because most of the search engine will return some results, but some engines in the general category will return documentation about the HTML tags