From 8cbc9f2d5823eb984e99e15c963e306610007fa1 Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Thu, 24 Dec 2020 09:28:16 +0100
Subject: [PATCH] [enh] add checker

---
Review notes (this section sits after the three-dash line, so it is not
part of the commit message):

  * searx/search/checker/impl.py, get_check_no_html(): the closing-tag
    alternatives are spelled out as '</' + tag + '>'.  Empty alternatives
    make the compiled pattern match every string, which would flag every
    title/content as containing HTML.  The '[^\>]*>' fragment is now a raw
    string so Python does not warn about the '\>' escape.
  * searx/search/checker/impl.py, HTML_TAGS: 'kdb' corrected to 'kbd'.
  * searx/search/checker/impl.py, _is_url_image(): a response without a
    Content-Type header is treated as "not an image" instead of raising
    KeyError, which is not caught by the RequestException handlers.
  * searx/search/processors/online.py: the language tests call
    'has_language', the method ResultContainerTests actually defines.
    Checker.call_test() records 'method ... not found' for unknown names.
  * searx/search/processors/abstract.py, get_tests(): the engine's
    'additional_tests' dict is copied before the default tests are merged
    in, so the engine module attribute is no longer mutated.

  All fixes are same-line changes: hunk sizes and the diffstat below are
  unchanged.  The 'index' lines still carry the blob ids of the unfixed
  content.

 Dockerfile                                   |   3 +
 requirements.txt                             |   1 +
 searx/search/__init__.py                     |   7 +
 searx/search/checker/__init__.py             |   1 +
 searx/search/checker/__main__.py             |  51 +++
 searx/search/checker/impl.py                 | 388 +++++++++++++++++++
 searx/search/processors/abstract.py          |  12 +
 searx/search/processors/online.py            |  44 +++
 searx/search/processors/online_currency.py   |  10 +
 searx/search/processors/online_dictionary.py |  18 +
 utils/searx.sh                               |   6 +-
 11 files changed, 539 insertions(+), 2 deletions(-)
 create mode 100644 searx/search/checker/__init__.py
 create mode 100644 searx/search/checker/__main__.py
 create mode 100644 searx/search/checker/impl.py

diff --git a/Dockerfile b/Dockerfile
index 3894aa968..f251d06ea 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,6 +41,8 @@ RUN apk upgrade --no-cache \
     openssl-dev \
     tar \
     git \
+    protoc \
+    protobuf-dev \
  && apk add --no-cache \
     ca-certificates \
     su-exec \
@@ -53,6 +55,7 @@ RUN apk upgrade --no-cache \
     uwsgi \
     uwsgi-python3 \
     brotli \
+    protobuf \
  && pip3 install --upgrade pip \
  && pip3 install --no-cache -r requirements.txt \
  && apk del build-dependencies \
diff --git a/requirements.txt b/requirements.txt
index ecf8e0c62..776bbc20b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ pygments==2.1.3
 python-dateutil==2.8.1
 pyyaml==5.3.1
 requests[socks]==2.25.1
+pycld3==0.20
diff --git a/searx/search/__init__.py b/searx/search/__init__.py
index 0d45f0b7c..7768d21e9 100644
--- a/searx/search/__init__.py
+++ b/searx/search/__init__.py
@@ -64,6 +64,9 @@ class EngineRef:
     def __eq__(self, other):
         return self.name == other.name and self.category == other.category
 
+    def __hash__(self):
+        return hash((self.name, self.category))
+
 
 class SearchQuery:
     """container for all the search parameters (query, language, etc...)"""
@@ -108,6 +111,10 @@ class SearchQuery:
             and self.timeout_limit == other.timeout_limit\
             and self.external_bang == other.external_bang
 
+    def __hash__(self):
+        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
+                     self.timeout_limit, self.external_bang))
+
 
 class Search:
     """Search information container"""
diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py
new file mode 100644
index 000000000..442d5a09d
--- /dev/null
+++ b/searx/search/checker/__init__.py
@@ -0,0 +1 @@
+from .impl import Checker
diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py
new file mode 100644
index 000000000..c071e6437
--- /dev/null
+++ b/searx/search/checker/__main__.py
@@ -0,0 +1,51 @@
+import sys
+
+import searx.search
+import searx.search.processors
+import searx.search.checker
+
+
+if sys.stdout.isatty():
+    RESET_SEQ = "\033[0m"
+    COLOR_SEQ = "\033[1;%dm"
+    BOLD_SEQ = "\033[1m"
+    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8))
+else:
+    RESET_SEQ = ""
+    COLOR_SEQ = ""
+    BOLD_SEQ = ""
+    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
+
+
+def iter_processor():
+    if len(sys.argv) > 1:
+        for name, processor in searx.search.processors.items():
+            if name in sys.argv:
+                yield name, processor
+    else:
+        for name, processor in searx.search.processors.items():
+            yield name, processor
+
+
+def main():
+    searx.search.initialize()
+    broken_urls = []
+    for name, processor in iter_processor():
+        if sys.stdout.isatty():
+            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
+        checker = searx.search.checker.Checker(processor)
+        checker.run()
+        if checker.test_results.succesfull:
+            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, GREEN, ' OK', RESET_SEQ)
+        else:
+            errors = [test_name + ': ' + error for test_name, error in checker.test_results]
+            print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Error ', str(errors), RESET_SEQ)
+
+        broken_urls += checker.test_results.broken_urls
+
+    for url in broken_urls:
+        print('Error fetching', url)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py
new file mode 100644
index 000000000..f55b6d0f5
--- /dev/null
+++ b/searx/search/checker/impl.py
@@ -0,0 +1,388 @@
+import typing
+import types
+import functools
+import itertools
+from time import time
+from urllib.parse import urlparse
+
+import re
+import cld3
+import requests.exceptions
+
+from searx import poolrequests, logger
+from searx.results import ResultContainer
+from searx.search import SearchQuery, EngineRef
+from searx.search.processors import EngineProcessor
+
+
+HTML_TAGS = [
+    'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
+    'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
+    'code', 'data', 'dfn', 'em', 'i', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
+    'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
+    'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
+    'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
+    'frame', 'frameset'
+]
+
+
+def get_check_no_html():
+    rep = ['<' + tag + r'[^\>]*>' for tag in HTML_TAGS]
+    rep += ['</' + tag + '>' for tag in HTML_TAGS]
+    pattern = re.compile('|'.join(rep))
+
+    def f(text):
+        return pattern.search(text.lower()) is None
+
+    return f
+
+
+_check_no_html = get_check_no_html()
+
+
+def _is_url(url):
+    try:
+        result = urlparse(url)
+    except ValueError:
+        return False
+    if result.scheme not in ('http', 'https'):
+        return False
+    return True
+
+
+@functools.lru_cache(maxsize=8192)
+def _is_url_image(image_url):
+    if not isinstance(image_url, str):
+        return False
+
+    if image_url.startswith('//'):
+        image_url = 'https:' + image_url
+
+    if image_url.startswith('data:'):
+        return image_url.startswith('data:image/')
+
+    if not _is_url(image_url):
+        return False
+
+    retry = 2
+
+    while retry > 0:
+        a = time()
+        try:
+            poolrequests.set_timeout_for_thread(10.0, time())
+            r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-GPC': '1',
+                'Cache-Control': 'max-age=0'
+            })
+            if r.headers.get("content-type", "").startswith('image/'):
+                return True
+            return False
+        except requests.exceptions.Timeout:
+            logger.error('Timeout for %s: %i', image_url, int(time() - a))
+            retry -= 1
+        except requests.exceptions.RequestException:
+            logger.exception('Exception for %s', image_url)
+            return False
+
+
+def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
+    return {
+        'query': search_query.query,
+        'lang': search_query.lang,
+        'pageno': search_query.pageno,
+        'safesearch': search_query.safesearch,
+        'time_range': search_query.time_range,
+    }
+
+
+def _search_query_diff(sq1: SearchQuery, sq2: SearchQuery)\
+        -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
+    param1 = _search_query_to_dict(sq1)
+    param2 = _search_query_to_dict(sq2)
+    common = {}
+    diff = {}
+    for k, value1 in param1.items():
+        value2 = param2[k]
+        if value1 == value2:
+            common[k] = value1
+        else:
+            diff[k] = (value1, value2)
+    return (common, diff)
+
+
+class TestResults:
+
+    __slots__ = 'errors', 'broken_urls'
+
+    def __init__(self):
+        self.errors: typing.Dict[str, typing.List[str]] = {}
+        self.broken_urls = []
+
+    def add_error(self, test, message):
+        errors_for_test = self.errors.setdefault(test, [])
+        if message not in errors_for_test:
+            errors_for_test.append(message)
+
+    def add_broken_url(self, url):
+        if url not in self.broken_urls:
+            self.broken_urls.append(url)
+
+    @property
+    def succesfull(self):
+        return len(self.errors) == 0
+
+    def __iter__(self):
+        for test_name, errors in self.errors.items():
+            for error in sorted(errors):
+                yield (test_name, error)
+
+
+class ResultContainerTests:
+
+    __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'
+
+    def __init__(self,
+                 test_results: TestResults,
+                 test_name: str,
+                 search_query: SearchQuery,
+                 result_container: ResultContainer):
+        self.test_name = test_name
+        self.search_query = search_query
+        self.result_container = result_container
+        self.languages: typing.Set[str] = set()
+        self.test_results = test_results
+        self.stop_test = False
+
+    @property
+    def result_urls(self):
+        results = self.result_container.get_ordered_results()
+        return [result['url'] for result in results]
+
+    def _record_error(self, message: str) -> None:
+        self.test_results.add_error(self.test_name, message)
+
+    def _add_language(self, text: str) -> typing.Optional[str]:
+        r = cld3.get_language(str(text))  # pylint: disable=E1101
+        if r is not None and r.probability >= 0.9 and r.is_reliable:
+            self.languages.add(r.language)
+        return None
+
+    def _check_result(self, result):
+        if not _check_no_html(result.get('title', '')):
+            self._record_error('HTML in title')
+        if not _check_no_html(result.get('content', '')):
+            self._record_error('HTML in content')
+
+        self._add_language(result.get('title', ''))
+        self._add_language(result.get('content', ''))
+
+        template = result.get('template', 'default.html')
+        if template == 'default.html':
+            return
+        if template == 'code.html':
+            return
+        if template == 'torrent.html':
+            return
+        if template == 'map.html':
+            return
+        if template == 'images.html':
+            thumbnail_src = result.get('thumbnail_src')
+            if thumbnail_src is not None:
+                if not _is_url_image(thumbnail_src):
+                    self.test_results.add_broken_url(thumbnail_src)
+                    self._record_error('thumbnail_src URL is invalid')
+            elif not _is_url_image(result.get('img_src')):
+                self.test_results.add_broken_url(result.get('img_src'))
+                self._record_error('img_src URL is invalid')
+        if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
+            self._record_error('thumbnail URL is invalid')
+
+    def _check_results(self, results: list):
+        for result in results:
+            self._check_result(result)
+
+    def _check_answers(self, answers):
+        for answer in answers:
+            if not _check_no_html(answer):
+                self._record_error('HTML in answer')
+
+    def _check_infoboxes(self, infoboxes):
+        for infobox in infoboxes:
+            if not _check_no_html(infobox.get('content', '')):
+                self._record_error('HTML in infobox content')
+            self._add_language(infobox.get('content', ''))
+            for attribute in infobox.get('attributes', {}):
+                if not _check_no_html(attribute.get('value', '')):
+                    self._record_error('HTML in infobox attribute value')
+
+    def check_basic(self):
+        if len(self.result_container.unresponsive_engines) > 0:
+            for message in self.result_container.unresponsive_engines:
+                self._record_error(message[1] + ' ' + (message[2] or ''))
+            self.stop_test = True
+            return
+
+        results = self.result_container.get_ordered_results()
+        if len(results) > 0:
+            self._check_results(results)
+
+        if len(self.result_container.answers) > 0:
+            self._check_answers(self.result_container.answers)
+
+        if len(self.result_container.infoboxes) > 0:
+            self._check_infoboxes(self.result_container.infoboxes)
+
+    def has_infobox(self):
+        if len(self.result_container.infoboxes) == 0:
+            self._record_error('No infobox')
+
+    def has_answer(self):
+        if len(self.result_container.answers) == 0:
+            self._record_error('No answer')
+
+    def has_language(self, lang):
+        if lang not in self.languages:
+            self._record_error(lang + ' not found')
+
+    def not_empty(self):
+        result_types = set()
+        results = self.result_container.get_ordered_results()
+        if len(results) > 0:
+            result_types.add('results')
+
+        if len(self.result_container.answers) > 0:
+            result_types.add('answers')
+
+        if len(self.result_container.infoboxes) > 0:
+            result_types.add('infoboxes')
+
+        if len(result_types) == 0:
+            self._record_error('No result')
+
+    def one_title_contains(self, title: str):
+        title = title.lower()
+        for result in self.result_container.get_ordered_results():
+            if title in result['title'].lower():
+                return
+        self._record_error(('{!r} not found in the title'.format(title)))
+
+
+class CheckerTests:
+
+    __slots__ = 'test_results', 'test_name', 'result_container_tests_list'
+
+    def __init__(self,
+                 test_results: TestResults,
+                 test_name: str,
+                 result_container_tests_list: typing.List[ResultContainerTests]):
+        self.test_results = test_results
+        self.test_name = test_name
+        self.result_container_tests_list = result_container_tests_list
+
+    def unique_results(self):
+        urls_list = [rct.result_urls for rct in self.result_container_tests_list]
+        if len(urls_list[0]) > 0:
+            # results on the first page
+            for i, urls_i in enumerate(urls_list):
+                for j, urls_j in enumerate(urls_list):
+                    if i < j and urls_i == urls_j:
+                        common, diff = _search_query_diff(self.result_container_tests_list[i].search_query,
+                                                          self.result_container_tests_list[j].search_query)
+                        common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
+                        diff1_str = ', ' .join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
+                        diff2_str = ', ' .join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
+                        self.test_results.add_error(self.test_name,
+                                                    'results are identitical for {} and {} ({})'
+                                                    .format(diff1_str, diff2_str, common_str))
+
+
+class Checker:
+
+    __slots__ = 'processor', 'tests', 'test_results'
+
+    def __init__(self, processor: EngineProcessor):
+        self.processor = processor
+        self.tests = self.processor.get_tests()
+        self.test_results = TestResults()
+
+    @property
+    def engineref_list(self):
+        engine_name = self.processor.engine_name
+        engine_category = self.processor.engine.categories[0]
+        return [EngineRef(engine_name, engine_category)]
+
+    @staticmethod
+    def search_query_matrix_iterator(engineref_list, matrix):
+        p = []
+        for name, values in matrix.items():
+            if isinstance(values, (tuple, list)):
+                l = [(name, value) for value in values]
+            else:
+                l = [(name, values)]
+            p.append(l)
+
+        for kwargs in itertools.product(*p):
+            kwargs = {k: v for k, v in kwargs}
+            query = kwargs['query']
+            params = dict(kwargs)
+            del params['query']
+            yield SearchQuery(query, engineref_list, **params)
+
+    def call_test(self, obj, test_description):
+        if isinstance(test_description, (tuple, list)):
+            method, args = test_description[0], test_description[1:]
+        else:
+            method = test_description
+            args = ()
+        if isinstance(method, str) and hasattr(obj, method):
+            getattr(obj, method)(*args)
+        elif isinstance(method, types.FunctionType):
+            method(*args)
+        else:
+            self.test_results.add_error(obj.test_name,
+                                        'method {!r} ({}) not found for {}'
+                                        .format(method, method.__class__.__name__, obj.__class__.__name__))
+
+    def call_tests(self, obj, test_descriptions):
+        for test_description in test_descriptions:
+            self.call_test(obj, test_description)
+
+    def search(self, search_query: SearchQuery) -> ResultContainer:
+        result_container = ResultContainer()
+        engineref_category = search_query.engineref_list[0].category
+        params = self.processor.get_params(search_query, engineref_category)
+        if params is not None:
+            self.processor.search(search_query.query, params, result_container, time(), 5)
+        return result_container
+
+    def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
+        result_container = self.search(search_query)
+        result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
+        result_container_check.check_basic()
+        return result_container_check
+
+    def run_test(self, test_name):
+        test_parameters = self.tests[test_name]
+        search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
+        rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
+        stop_test = False
+        if 'result_container' in test_parameters:
+            for rct in rct_list:
+                stop_test = stop_test or rct.stop_test
+                if not rct.stop_test:
+                    self.call_tests(rct, test_parameters['result_container'])
+        if not stop_test:
+            if 'test' in test_parameters:
+                checker_tests = CheckerTests(self.test_results, test_name, rct_list)
+                self.call_tests(checker_tests, test_parameters['test'])
+
+    def run(self):
+        for test_name in self.tests:
+            self.run_test(test_name)
diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py
index cf3fd7236..eb8d296ec 100644
--- a/searx/search/processors/abstract.py
+++ b/searx/search/processors/abstract.py
@@ -37,3 +37,15 @@ class EngineProcessor:
     @abstractmethod
     def search(self, query, params, result_container, start_time, timeout_limit):
         pass
+
+    def get_tests(self):
+        tests = getattr(self.engine, 'tests', None)
+        if tests is None:
+            tests = dict(getattr(self.engine, 'additional_tests', {}))
+            tests.update(self.get_default_tests())
+            return tests
+        else:
+            return tests
+
+    def get_default_tests(self):
+        return {}
diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py
index b62f8059e..54d63b4c9 100644
--- a/searx/search/processors/online.py
+++ b/searx/search/processors/online.py
@@ -211,3 +211,47 @@ class OnlineProcessor(EngineProcessor):
             # reset the suspend variables
             self.engine.continuous_errors = 0
             self.engine.suspend_end_time = 0
+
+    def get_default_tests(self):
+        tests = {}
+
+        tests['simple'] = {
+            'matrix': {'query': ('time', 'time')},
+            'result_container': ['not_empty'],
+        }
+
+        if getattr(self.engine, 'paging', False):
+            # [1, 2, 3] --> isinstance(l, (list, tuple)) ??
+            tests['paging'] = {
+                'matrix': {'query': 'time',
+                           'pageno': (1, 2, 3)},
+                'result_container': ['not_empty'],
+                'test': ['unique_results']
+            }
+
+        if getattr(self.engine, 'time_range', False):
+            tests['time_range'] = {
+                'matrix': {'query': 'time',
+                           'time_range': (None, 'day')},
+                'result_container': ['not_empty'],
+                'test': ['unique_results']
+            }
+
+        if getattr(self.engine, 'lang', False):
+            tests['lang_fr'] = {
+                'matrix': {'query': 'paris', 'lang': 'fr'},
+                'result_container': ['not_empty', ('has_language', 'fr')],
+            }
+            tests['lang_en'] = {
+                'matrix': {'query': 'paris', 'lang': 'en'},
+                'result_container': ['not_empty', ('has_language', 'en')],
+            }
+
+        if getattr(self.engine, 'safesearch', False):
+            tests['safesearch'] = {
+                'matrix': {'query': 'porn',
+                           'safesearch': (0, 2)},
+                'test': ['unique_results']
+            }
+
+        return tests
diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py
index f0e919c03..132c10594 100644
--- a/searx/search/processors/online_currency.py
+++ b/searx/search/processors/online_currency.py
@@ -55,3 +55,13 @@ class OnlineCurrencyProcessor(OnlineProcessor):
         params['from_name'] = iso4217_to_name(from_currency, 'en')
         params['to_name'] = iso4217_to_name(to_currency, 'en')
         return params
+
+    def get_default_tests(self):
+        tests = {}
+
+        tests['currency'] = {
+            'matrix': {'query': '1337 usd in rmb'},
+            'result_container': ['has_answer'],
+        }
+
+        return tests
diff --git a/searx/search/processors/online_dictionary.py b/searx/search/processors/online_dictionary.py
index 8e9ef1620..987c710a1 100644
--- a/searx/search/processors/online_dictionary.py
+++ b/searx/search/processors/online_dictionary.py
@@ -35,3 +35,21 @@ class OnlineDictionaryProcessor(OnlineProcessor):
         params['query'] = query
 
         return params
+
+    def get_default_tests(self):
+        tests = {}
+
+        if getattr(self.engine, 'paging', False):
+            tests['translation_paging'] = {
+                'matrix': {'query': 'en-es house',
+                           'pageno': (1, 2, 3)},
+                'result_container': ['not_empty', ('one_title_contains', 'house')],
+                'test': ['unique_results']
+            }
+        else:
+            tests['translation'] = {
+                'matrix': {'query': 'en-es house'},
+                'result_container': ['not_empty', ('one_title_contains', 'house')],
+            }
+
+        return tests
diff --git a/utils/searx.sh b/utils/searx.sh
index b7d3b8e1c..134f0dbb1 100755
--- a/utils/searx.sh
+++ b/utils/searx.sh
@@ -46,6 +46,7 @@ SEARX_PACKAGES_debian="\
 python3-dev python3-babel python3-venv
 uwsgi uwsgi-plugin-python3
 git build-essential libxslt-dev zlib1g-dev libffi-dev libssl-dev
+libprotobuf-dev protobuf-compiler
 shellcheck"
 
 BUILD_PACKAGES_debian="\
@@ -58,6 +59,7 @@ SEARX_PACKAGES_arch="\
 python python-pip python-lxml python-babel
 uwsgi uwsgi-plugin-python
 git base-devel libxml2
+protobuf
 shellcheck"
 
 BUILD_PACKAGES_arch="\
@@ -69,7 +71,7 @@ SEARX_PACKAGES_fedora="\
 python python-pip python-lxml python-babel
 uwsgi uwsgi-plugin-python3
 git @development-tools libxml2
-ShellCheck"
+ShellCheck protobuf-compiler protobuf-devel"
 
 BUILD_PACKAGES_fedora="\
 firefox graphviz graphviz-gd ImageMagick librsvg2-tools
@@ -82,7 +84,7 @@ SEARX_PACKAGES_centos="\
 python36 python36-pip python36-lxml python-babel
 uwsgi uwsgi-plugin-python3
 git @development-tools libxml2
-ShellCheck"
+ShellCheck protobuf-compiler protobuf-devel"
 
 BUILD_PACKAGES_centos="\
 firefox graphviz graphviz-gd ImageMagick librsvg2-tools