From 5dd28ff04b39a239ff7d0d65df7065e9c3983f90 Mon Sep 17 00:00:00 2001 From: Martin Fischer Date: Fri, 28 Jan 2022 10:38:37 +0100 Subject: [PATCH] [enh] introduce Engine class Previously engines were defined in modules, which were then stapled with logger, supported_languages & language_aliases via monkey patching in searx/engines/__init__.py. Monkey patching is obviously a bad practice since it confuses humans as well as static type checkers. But there were more problems: * there were no type hints for the method input and output types * the request method did not even make a clear distinction between input and output (abusing a single "params" dictionary for both) This commit introduces a new class-based architecture for engines, in a backwards-compatible manner: allowing old-style module engines to be used along with new-style class-based engines. --- docs/admin/engines/configured_engines.rst | 8 +- searx/engine.py | 100 ++++++++++++++++++++++ searx/engines/__init__.py | 17 +++- searx/results.py | 12 +++ searx/search/processors/online.py | 17 +++- 5 files changed, 147 insertions(+), 7 deletions(-) create mode 100644 searx/engine.py diff --git a/docs/admin/engines/configured_engines.rst b/docs/admin/engines/configured_engines.rst index c7b6a1f52..c81ffb203 100644 --- a/docs/admin/engines/configured_engines.rst +++ b/docs/admin/engines/configured_engines.rst @@ -48,12 +48,14 @@ Explanation of the :ref:`general engine configuration` shown in the table {% for mod in engines %} + {% set mod_name = mod['__name__'] or mod.__module__ %} + * - `{{mod.name}} <{{mod.about and mod.about.website}}>`_ - ``!{{mod.shortcut}}`` - - {%- if 'searx.engines.' + mod.__name__ in documented_modules %} - :py:mod:`~searx.engines.{{mod.__name__}}` + - {%- if 'searx.engines.' 
+ mod_name in documented_modules %} + :py:mod:`~searx.engines.{{mod_name}}` {%- else %} - :origin:`{{mod.__name__}} ` + :origin:`{{mod_name}} ` {%- endif %} - {{(mod.disabled and "y") or ""}} {%- if mod.about and mod.about.language %} diff --git a/searx/engine.py b/searx/engine.py new file mode 100644 index 000000000..5fa37080a --- /dev/null +++ b/searx/engine.py @@ -0,0 +1,100 @@ +# pyright: strict +from logging import Logger +from typing import Iterable, List, NamedTuple, Optional, Dict, Union +from typing_extensions import Literal, TypedDict, NotRequired +from dataclasses import dataclass + +from httpx import Response + + +class Engine: + categories: Optional[List[str]] + paging = False + time_range_support = False + supported_languages: List[str] + language_aliases: Dict[str, str] + about: 'About' + + def __init__(self, logger: Logger) -> None: + self.logger = logger + + +class About(TypedDict, total=False): + website: str + wikidata_id: Optional[str] + official_api_documentation: Optional[str] + use_official_api: bool + require_api_key: bool + results: Literal["HTML", "JSON"] + language: NotRequired[str] + + +class OnlineEngine(Engine): + def request(self, query: str, ctx: 'QueryContext') -> 'OnlineRequest': + raise NotImplementedError() + + def response(self, response: Response) -> List['Result']: + raise NotImplementedError() + + +class QueryContext(NamedTuple): + category: str + """current category""" + safesearch: Literal[0, 1, 2] + """desired content safety (normal, moderate, strict)""" + time_range: Optional[Literal['day', 'week', 'month', 'year']] + """current time range (if any)""" + pageno: int + """current page number""" + language: str + """specific language code like ``en_US``, or ``all`` if unspecified""" + + +@dataclass +class OnlineRequest: + url: str + """requested URL""" + method: Literal['GET', 'POST'] = 'GET' + """HTTP request method""" + headers: Optional[Dict[str, str]] = None + """HTTP headers""" + data: Optional[Dict[str, str]] = None + 
"""data to be sent as the HTTP body""" + cookies: Optional[Dict[str, str]] = None + """HTTP cookies""" + verify: bool = True + """Assert that the TLS certificate is valid""" + allow_redirects: bool = True + """follow redirects""" + max_redirects: Optional[int] = None + """maximum redirects, hard limit""" + soft_max_redirects: Optional[int] = None + """maximum redirects, soft limit. Record an error but don't stop the engine""" + raise_for_httperror: bool = True + """raise an exception if the HTTP code of response is >= 300""" + + def set_header(self, name: str, value: str): + if self.headers is None: + self.headers = {} + self.headers[name] = value + + +Result = Union['StandardResult', 'InfoBox'] + + +@dataclass +class StandardResult: + url: str + title: str + content: str = '' + + +@dataclass +class InfoBox(StandardResult): + img_src: Optional[str] = None + links: Iterable['Link'] = () + + +class Link(TypedDict): + title: str + url: str diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 98a118bb4..e9e53791c 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -20,6 +20,11 @@ from searx import logger, settings from searx.data import ENGINES_LANGUAGES from searx.network import get from searx.utils import load_module, gen_useragent, find_language_aliases +from searx.engine import Engine + +_NEXTGEN_ENGINES = { +} +"""maps module names to class names for engines that are implemented using the new class-based approach""" logger = logger.getChild('engines') @@ -121,6 +126,11 @@ def load_engine(engine_data: dict) -> Optional[ConfiguredEngine]: logger.exception('Cannot load engine "{}"'.format(engine_module)) return None + if engine_data['engine'] in _NEXTGEN_ENGINES: + engine = getattr(engine, _NEXTGEN_ENGINES[engine_data['engine']])( + logger=logger.getChild(engine_name), + ) + update_engine_attributes(engine, engine_data) set_language_attributes(engine) update_attributes_for_tor(engine) @@ -204,10 +214,11 @@ def 
_get_supported_languages(engine: ConfiguredEngine) -> Collection[str]: def set_language_attributes(engine: ConfiguredEngine): - engine.supported_languages = _get_supported_languages(engine) + if not isinstance(engine, Engine): + engine.supported_languages = _get_supported_languages(engine) - # find custom aliases for non standard language codes - engine.language_aliases.update(find_language_aliases(engine.supported_languages)) + # find custom aliases for non standard language codes + engine.language_aliases.update(find_language_aliases(engine.supported_languages)) # language_support engine.language_support = len(engine.supported_languages) > 0 diff --git a/searx/results.py b/searx/results.py index 1ac6a5ab5..404f00996 100644 --- a/searx/results.py +++ b/searx/results.py @@ -6,6 +6,7 @@ from typing import List, NamedTuple, Set from urllib.parse import urlparse, unquote from searx import logger +from searx.engine import InfoBox, StandardResult from searx.engines import engines from searx.metrics import histogram_observe, counter_add, count_error @@ -195,6 +196,17 @@ class ResultContainer: standard_result_count = 0 error_msgs = set() for result in list(results): + if isinstance(result, InfoBox): + result = { + 'infobox': result.title, + 'id': result.url, + 'content': result.content, + 'img_src': result.img_src, + 'urls': result.links, + } + elif isinstance(result, StandardResult): + result = result.__dict__ + result['engine'] = engine_name if 'suggestion' in result and self.on_result(result): self.suggestions.add(result['suggestion']) diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 8d8275df1..c0aa27e06 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -17,6 +17,7 @@ from searx.exceptions import ( SearxEngineTooManyRequestsException, ) from searx.metrics.error_recorder import count_error +from searx.engine import OnlineEngine, QueryContext from .abstract import EngineProcessor @@ -114,7 
+115,21 @@ class OnlineProcessor(EngineProcessor): def _search_basic(self, query, params): # update request parameters dependent on # search-engine (contained in engines folder) - self.engine.request(query, params) + if isinstance(self.engine, OnlineEngine): + params.update( + self.engine.request( + query, + QueryContext( + category=params['category'], + safesearch=params['safesearch'], + time_range=params['time_range'], + pageno=params['pageno'], + language=params['language'], + ), + ).__dict__ + ) + else: + self.engine.request(query, params) # ignoring empty urls if params['url'] is None: