[enh] introduce Engine class

Previously engines were defined in modules, which were then stapled with logger, supported_languages & language_aliases via monkey patching in searx/engines/__init__.py. Monkey patching is obviously a bad practice since it confuses humans as well as static type checkers. But there were more problems: * there were no type hints for the method input and output types * the request method did not even make a clear distinction between input and output (abusing a single "params" dictionary for both) This commit introduces a new class-based architecture for engines, in a backwards-compatible manner: allowing old-style module engines to be used along with new-style class-based engines.
2024-01-01 19:24:07 +01:00 · 2022-01-28 10:38:37 +01:00 · 2022-01-28 10:38:37 +01:00 · 5dd28ff04b
commit 5dd28ff04b
parent 73e71508e4
5 changed files with 147 additions and 7 deletions
--- a/docs/admin/engines/configured_engines.rst
+++ b/docs/admin/engines/configured_engines.rst
@ -48,12 +48,14 @@ Explanation of the :ref:`general engine configuration` shown in the table
      {% for mod in engines %}
      {% set mod_name = mod['__name__'] or mod.__module__ %}
      * - `{{mod.name}} <{{mod.about and mod.about.website}}>`_
        - ``!{{mod.shortcut}}``
-        - {%- if 'searx.engines.' + mod.__name__ in documented_modules %}
+        - {%- if 'searx.engines.' + mod_name in documented_modules %}
-          :py:mod:`~searx.engines.{{mod.__name__}}`
+          :py:mod:`~searx.engines.{{mod_name}}`
          {%- else %}
-          :origin:`{{mod.__name__}} <searx/engines/{{mod.__name__}}.py>`
+          :origin:`{{mod_name}} <searx/engines/{{mod_name}}.py>`
          {%- endif %}
        - {{(mod.disabled and "y") or ""}}
          {%- if mod.about and  mod.about.language %}
--- a/searx/engine.py
+++ b/searx/engine.py
@ -0,0 +1,100 @@
 # pyright: strict
 from logging import Logger
 from typing import Iterable, List, NamedTuple, Optional, Dict, Union
 from typing_extensions import Literal, TypedDict, NotRequired
 from dataclasses import dataclass
 from httpx import Response
 class Engine:
    """Base class of the class-based engines.

    Declares the attributes such an engine carries and stores the logger
    passed to the constructor.
    """

    # Annotated only; no default value is assigned in this class.
    categories: Optional[List[str]]
    # NOTE(review): presumably tells whether the engine supports paging -- confirm.
    paging = False
    # NOTE(review): presumably tells whether the engine can filter by time range -- confirm.
    time_range_support = False
    # Annotated only; no default value is assigned in this class.
    # NOTE(review): ``set_language_attributes()`` reads ``supported_languages``
    # of Engine instances without assigning it first, so it presumably has to
    # be provided by the subclass or by the engine settings -- confirm.
    supported_languages: List[str]
    # Annotated only; no default value is assigned in this class.
    language_aliases: Dict[str, str]
    # Metadata about the engine, typed by the ``About`` dictionary.
    about: 'About'

    def __init__(self, logger: Logger) -> None:
        """Keep ``logger`` on the instance as ``self.logger``."""
        self.logger = logger
 class About(TypedDict, total=False):
    """Typed dictionary for the ``about`` metadata of an engine.

    The class is declared with ``total=False``, so every key is optional for
    a type checker.
    """

    # Rendered as the link target of the engine name in the documentation table.
    website: str
    wikidata_id: Optional[str]
    official_api_documentation: Optional[str]
    use_official_api: bool
    require_api_key: bool
    # Only the two literal values below are accepted by a type checker.
    results: Literal["HTML", "JSON"]
    # ``NotRequired`` is redundant here because of ``total=False``.
    # NOTE(review): it suggests the other keys were meant to be required -- confirm.
    language: NotRequired[str]
 class OnlineEngine(Engine):
    """Engine interface made of a ``request`` / ``response`` method pair.

    Both methods are placeholders here and raise ``NotImplementedError``.
    """

    def request(self, query: str, ctx: 'QueryContext') -> 'OnlineRequest':
        """Return the ``OnlineRequest`` to send for ``query`` in the context ``ctx``.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError

    def response(self, response: Response) -> List['Result']:
        """Return the list of results obtained from the HTTP ``response``.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError
 class QueryContext(NamedTuple):
    """Read-only search parameters handed to ``OnlineEngine.request``.

    Being a ``NamedTuple``, an instance cannot be modified after creation.
    The online processor builds it from the ``category``, ``safesearch``,
    ``time_range``, ``pageno`` and ``language`` entries of its ``params``
    dictionary.
    """

    category: str
    """current category"""
    safesearch: Literal[0, 1, 2]
    """desired content safety (normal, moderate, strict)"""
    time_range: Optional[Literal['day', 'week', 'month', 'year']]
    """current time range (if any)"""
    pageno: int
    """current page number"""
    language: str
    """specific language code like ``en_US``, or ``all`` if unspecified"""
@dataclass
class OnlineRequest:
    """Description of the HTTP request returned by ``OnlineEngine.request``.

    Only ``url`` is mandatory; every other field has a default value.
    """

    url: str
    """requested URL"""
    method: Literal['GET', 'POST'] = 'GET'
    """HTTP request method"""
    headers: Optional[Dict[str, str]] = None
    """HTTP headers"""
    data: Optional[Dict[str, str]] = None
    """data to be sent as the HTTP body"""
    cookies: Optional[Dict[str, str]] = None
    """HTTP cookies"""
    verify: bool = True
    """Assert that the TLS certificate is valid"""
    allow_redirects: bool = True
    """follow redirects"""
    max_redirects: Optional[int] = None
    """maximum redirects, hard limit"""
    soft_max_redirects: Optional[int] = None
    """maximum redirects, soft limit. Record an error but don't stop the engine"""
    raise_for_httperror: bool = True
    """raise an exception if the HTTP code of response is >= 300"""

    def set_header(self, name: str, value: str):
        """Set the HTTP header ``name`` to ``value``.

        The ``headers`` dictionary is created on first use; an existing
        dictionary is updated in place.
        """
        current = self.headers
        if current is None:
            # First header: create the mapping and attach it to the request.
            current = self.headers = {}
        current[name] = value
 # Element type of the list returned by ``OnlineEngine.response``.
 # ``InfoBox`` subclasses ``StandardResult``, so it is listed for readability.
 Result = Union['StandardResult', 'InfoBox']
@dataclass
 class StandardResult:
    """Search result consisting of a URL, a title and an optional content text."""

    url: str
    title: str
    # Defaults to the empty string when the engine provides no content.
    content: str = ''
@dataclass
 class InfoBox(StandardResult):
    """Result with the ``StandardResult`` fields plus an image and a list of links."""

    # No image unless the engine sets one.
    img_src: Optional[str] = None
    # The default is an empty tuple, i.e. no links.
    links: Iterable['Link'] = ()
 class Link(TypedDict):
    """Typed dictionary for one element of ``InfoBox.links``; both keys are required."""

    title: str
    url: str
--- a/searx/engines/init.py
+++ b/searx/engines/init.py
@ -20,6 +20,11 @@ from searx import logger, settings
 from searx.data import ENGINES_LANGUAGES
 from searx.network import get
 from searx.utils import load_module, gen_useragent, find_language_aliases
 from searx.engine import Engine
 # Empty in this revision: no engine is registered as class-based yet.
 # ``load_engine`` looks up ``engine_data['engine']`` in this mapping and, on a
 # hit, instantiates the named class taken from the loaded engine module.
 _NEXTGEN_ENGINES = {
 }
 """maps module names to class names for engines that are implemented using the new class-based approach"""
 logger = logger.getChild('engines')
@ -121,6 +126,11 @@ def load_engine(engine_data: dict) -> Optional[ConfiguredEngine]:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None
    if engine_data['engine'] in _NEXTGEN_ENGINES:
        engine = getattr(engine, _NEXTGEN_ENGINES[engine_data['engine']])(
            logger=logger.getChild(engine_name),
        )
    update_engine_attributes(engine, engine_data)
    set_language_attributes(engine)
    update_attributes_for_tor(engine)
@ -204,10 +214,11 @@ def _get_supported_languages(engine: ConfiguredEngine) -> Collection[str]:
 def set_language_attributes(engine: ConfiguredEngine):
-    engine.supported_languages = _get_supported_languages(engine)
+    if not isinstance(engine, Engine):
        engine.supported_languages = _get_supported_languages(engine)
-    # find custom aliases for non standard language codes
+        # find custom aliases for non standard language codes
-    engine.language_aliases.update(find_language_aliases(engine.supported_languages))
+        engine.language_aliases.update(find_language_aliases(engine.supported_languages))
    # language_support
    engine.language_support = len(engine.supported_languages) > 0
--- a/searx/results.py
+++ b/searx/results.py
@ -6,6 +6,7 @@ from typing import List, NamedTuple, Set
 from urllib.parse import urlparse, unquote
 from searx import logger
 from searx.engine import InfoBox, StandardResult
 from searx.engines import engines
 from searx.metrics import histogram_observe, counter_add, count_error
@ -195,6 +196,17 @@ class ResultContainer:
        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            if isinstance(result, InfoBox):
                result = {
                    'infobox': result.title,
                    'id': result.url,
                    'content': result.content,
                    'img_src': result.img_src,
                    'urls': result.links,
                }
            elif isinstance(result, StandardResult):
                result = result.__dict__
            result['engine'] = engine_name
            if 'suggestion' in result and self.on_result(result):
                self.suggestions.add(result['suggestion'])
--- a/searx/search/processors/online.py
+++ b/searx/search/processors/online.py
@ -17,6 +17,7 @@ from searx.exceptions import (
    SearxEngineTooManyRequestsException,
 )
 from searx.metrics.error_recorder import count_error
 from searx.engine import OnlineEngine, QueryContext
 from .abstract import EngineProcessor
@ -114,7 +115,21 @@ class OnlineProcessor(EngineProcessor):
    def _search_basic(self, query, params):
        # update request parameters dependent on
        # search-engine (contained in engines folder)
-        self.engine.request(query, params)
+        if isinstance(self.engine, OnlineEngine):
            params.update(
                self.engine.request(
                    query,
                    QueryContext(
                        category=params['category'],
                        safesearch=params['safesearch'],
                        time_range=params['time_range'],
                        pageno=params['pageno'],
                        language=params['language'],
                    ),
                ).__dict__
            )
        else:
            self.engine.request(query, params)
        # ignoring empty urls
        if params['url'] is None: