From e242005916c560bc92dbc7239c24f0ce435d654d Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Tue, 26 Jul 2022 21:43:20 +0200 Subject: [PATCH] Add searx.results module Based on #1412 --- docs/src/searx.results.models.rst | 47 +++++ searx/engines/__init__.py | 2 + searx/{results.py => results/__init__.py} | 53 +++--- searx/results/models.py | 213 ++++++++++++++++++++++ searx/webapp.py | 5 +- 5 files changed, 295 insertions(+), 25 deletions(-) create mode 100644 docs/src/searx.results.models.rst rename searx/{results.py => results/__init__.py} (90%) create mode 100644 searx/results/models.py diff --git a/docs/src/searx.results.models.rst b/docs/src/searx.results.models.rst new file mode 100644 index 000000000..b6d152d22 --- /dev/null +++ b/docs/src/searx.results.models.rst @@ -0,0 +1,47 @@ +.. _results models: + +==================== +searx.results.models +==================== + +Main Results +------------ + +.. autoclass:: searx.results.models.MainResult + :members: + +.. autoclass:: searx.results.models.UrlResult + :members: + +.. autoclass:: searx.results.models.KeyValueResult + :members: + +Infoboxes +--------- + +.. autoclass:: searx.results.models.Infobox + :members: + +.. autoclass:: searx.results.models.InfoboxAttribute + :members: + +.. autoclass:: searx.results.models.InfoboxImage + :members: + +.. autoclass:: searx.results.models.InfoboxUrl + :members: + +.. autoclass:: searx.results.models.InfoboxRelatedTopic + :members: + +Others +------ + +.. autoclass:: searx.results.models.Answer + :members: + +.. autoclass:: searx.results.models.Correction + :members: + +.. 
autoclass:: searx.results.models.Suggestion + :members: diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 52bb5f20d..2a58eb67e 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -68,6 +68,8 @@ class Engine: # pylint: disable=too-few-public-methods safesearch: bool time_range_support: bool timeout: float + weight: float + display_error_messages: bool # Defaults for the namespace of an engine module, see :py:func:`load_engine` diff --git a/searx/results.py b/searx/results/__init__.py similarity index 90% rename from searx/results.py rename to searx/results/__init__.py index 5dd1bff21..864989494 100644 --- a/searx/results.py +++ b/searx/results/__init__.py @@ -1,13 +1,17 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# pyright: basic + import re from collections import defaultdict from operator import itemgetter from threading import RLock -from typing import List, NamedTuple, Set +from typing import NamedTuple, Optional, List, Set, Dict, cast from urllib.parse import urlparse, unquote from searx import logger from searx.engines import engines from searx.metrics import histogram_observe, counter_add, count_error +from . 
import models CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) @@ -55,7 +59,7 @@ def compare_urls(url_a, url_b): return unquote(path_a) == unquote(path_b) -def merge_two_infoboxes(infobox1, infobox2): +def merge_two_infoboxes(infobox1: models.Infobox, infobox2: models.Infobox): # get engines weights if hasattr(engines[infobox1['engine']], 'weight'): weight1 = engines[infobox1['engine']].weight @@ -91,8 +95,8 @@ def merge_two_infoboxes(infobox1, infobox2): infobox1['urls'] = urls1 - if 'img_src' in infobox2: - img1 = infobox1.get('img_src', None) + if infobox2.get('img_src') is not None: + img1 = infobox1.get('img_src') img2 = infobox2.get('img_src') if img1 is None: infobox1['img_src'] = img2 @@ -127,7 +131,7 @@ def merge_two_infoboxes(infobox1, infobox2): infobox1['content'] = content2 -def result_score(result): +def result_score(result: models.Result): weight = 1.0 for result_engine in result['engines']: @@ -173,18 +177,18 @@ class ResultContainer: def __init__(self): super().__init__() - self._merged_results = [] - self.infoboxes = [] - self.suggestions = set() - self.answers = {} - self.corrections = set() + self._merged_results: List[models.MainResult] = [] + self.infoboxes: List[models.Infobox] = [] + self.suggestions: Set[models.Suggestion] = set() + self.answers: Dict[str, models.Answer] = {} + self.corrections: Set[models.Correction] = set() self._number_of_results = [] self.engine_data = defaultdict(dict) - self._closed = False - self.paging = False + self._closed: bool = False + self.paging: bool = False self.unresponsive_engines: Set[UnresponsiveEngine] = set() self.timings: List[Timing] = [] - self.redirect_url = None + self.redirect_url: Optional[str] = None self.on_result = lambda _: True self._lock = RLock() @@ -193,7 +197,7 @@ class ResultContainer: return standard_result_count = 0 - error_msgs = set() + error_msgs: Set[str] = set() for result in list(results): result['engine'] = engine_name if 'suggestion' in 
result and self.on_result(result): @@ -234,7 +238,7 @@ class ResultContainer: if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging: self.paging = True - def _merge_infobox(self, infobox): + def _merge_infobox(self, infobox: models.Infobox): add_infobox = True infobox_id = infobox.get('id', None) infobox['engines'] = set([infobox['engine']]) @@ -249,7 +253,7 @@ class ResultContainer: if add_infobox: self.infoboxes.append(infobox) - def _is_valid_url_result(self, result, error_msgs): + def _is_valid_url_result(self, result: models.UrlResult, error_msgs: Set[str]) -> bool: if 'url' in result: if not isinstance(result['url'], str): logger.debug('result: invalid URL: %s', str(result)) @@ -269,7 +273,7 @@ class ResultContainer: return True - def _normalize_url_result(self, result): + def _normalize_url_result(self, result: models.UrlResult): """Return True if the result is valid""" result['parsed_url'] = urlparse(result['url']) @@ -288,9 +292,9 @@ class ResultContainer: # strip multiple spaces and carriage returns from content if result.get('content'): - result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) + result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) # type: ignore - def __merge_url_result(self, result, position): + def __merge_url_result(self, result: models.UrlResult, position: int): result['engines'] = set([result['engine']]) with self._lock: duplicated = self.__find_duplicated_http_result(result) @@ -302,11 +306,12 @@ class ResultContainer: result['positions'] = [position] self._merged_results.append(result) - def __find_duplicated_http_result(self, result): + def __find_duplicated_http_result(self, result: models.UrlResult) -> Optional[models.UrlResult]: result_template = result.get('template') for merged_result in self._merged_results: if 'parsed_url' not in merged_result: continue + merged_result = cast(models.UrlResult, merged_result) if compare_urls(result['parsed_url'], 
merged_result['parsed_url']) and result_template == merged_result.get( 'template' ): @@ -320,10 +325,10 @@ class ResultContainer: return merged_result return None - def __merge_duplicated_http_result(self, duplicated, result, position): + def __merge_duplicated_http_result(self, duplicated: models.UrlResult, result: models.UrlResult, position: int): # using content with more text if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')): - duplicated['content'] = result['content'] + duplicated['content'] = result['content'] # type: ignore # merge all result's parameters not found in duplicate for key in result.keys(): @@ -341,11 +346,11 @@ class ResultContainer: duplicated['url'] = result['parsed_url'].geturl() duplicated['parsed_url'] = result['parsed_url'] - def __merge_result_no_url(self, result, position): + def __merge_result_no_url(self, result: models.KeyValueResult, position: int): result['engines'] = set([result['engine']]) result['positions'] = [position] with self._lock: - self._merged_results.append(result) + self._merged_results.append(result) # type: ignore def close(self): self._closed = True diff --git a/searx/results/models.py b/searx/results/models.py new file mode 100644 index 000000000..8daff09eb --- /dev/null +++ b/searx/results/models.py @@ -0,0 +1,213 @@ +from typing import List, Dict, Set +from typing_extensions import TypedDict, NotRequired, Required, Literal +from urllib.parse import ParseResult +from datetime import datetime + + +__all__ = [ + 'Result', + 'UrlResult', + 'Answer', + 'Correction', + 'Suggestion', + 'Infobox', + 'InfoboxUrl', + 'InfoboxImage', + 'InfoboxAttribute', + 'InfoboxRelatedTopic', +] + + +class Result(TypedDict): + """A result from any type""" + + engine: str + """Internal field. DO NOT USE""" + + weight: float + """Internal field. DO NOT USE""" + + engines: Set[str] + """Internal field. DO NOT USE""" + + category: str + """Internal field. 
DO NOT USE""" + + positions: List[int] + """Internal field. DO NOT USE""" + + score: float + """Internal field. DO NOT USE""" + + +class MainResult(Result): + """Result that is going to be displayed as a "main" result""" + + template: NotRequired[str] + """Template to display the result. The default value is "default.html". + See searx/templates/simple/result_templates""" + + +class UrlResult(MainResult): + """Typical main result: a URL, a title and a short description""" + + title: str + """Title of the result""" + + url: str + """URL of the result""" + + parsed_url: ParseResult + """Engines don't have to set this value: it is automatically initialized from the url field. + However, plugins have to manually update this field when they change the url field""" + + content: NotRequired[str] + """Content of the result""" + + iframe_src: NotRequired[str] + """URL of an iframe to add to the result.""" + + audio_src: NotRequired[str] + """URL of