[enh] introduce Engine class

Previously engines were defined in modules, which were then
stapled with logger, supported_languages & language_aliases
via monkey patching in searx/engines/__init__.py.

Monkey patching is obviously a bad practice since it confuses
humans as well as static type checkers. But there were more problems:

* there were no type hints for the method input and output types

* the request method did not even make a clear distinction between input
  and output (abusing a single "params" dictionary for both)

This commit introduces a new class-based architecture for engines,
in a backwards-compatible manner: allowing old-style module engines
to be used along with new-style class-based engines.
This commit is contained in:
Martin Fischer 2022-01-28 10:38:37 +01:00
parent 73e71508e4
commit 5dd28ff04b
5 changed files with 147 additions and 7 deletions

View file

@ -48,12 +48,14 @@ Explanation of the :ref:`general engine configuration` shown in the table
{% for mod in engines %}
{% set mod_name = mod['__name__'] or mod.__module__ %}
* - `{{mod.name}} <{{mod.about and mod.about.website}}>`_
- ``!{{mod.shortcut}}``
- {%- if 'searx.engines.' + mod.__name__ in documented_modules %}
:py:mod:`~searx.engines.{{mod.__name__}}`
- {%- if 'searx.engines.' + mod_name in documented_modules %}
:py:mod:`~searx.engines.{{mod_name}}`
{%- else %}
:origin:`{{mod.__name__}} <searx/engines/{{mod.__name__}}.py>`
:origin:`{{mod_name}} <searx/engines/{{mod_name}}.py>`
{%- endif %}
- {{(mod.disabled and "y") or ""}}
{%- if mod.about and mod.about.language %}

100
searx/engine.py Normal file
View file

@ -0,0 +1,100 @@
# pyright: strict
from logging import Logger
from typing import Iterable, List, NamedTuple, Optional, Dict, Union
from typing_extensions import Literal, TypedDict, NotRequired
from dataclasses import dataclass
from httpx import Response
class Engine:
    """Base class for new-style (class-based) engines.

    Declares the attributes an engine carries and keeps the logger that is
    passed to the constructor.
    """

    # Annotation only: this class does not assign a value.
    categories: Optional[List[str]]
    paging: bool = False
    time_range_support: bool = False
    # Annotations only: this class does not assign values to these either.
    supported_languages: List[str]
    language_aliases: Dict[str, str]
    about: 'About'

    def __init__(self, logger: Logger) -> None:
        """Store *logger* on the instance as ``self.logger``."""
        self.logger = logger
class About(TypedDict, total=False):
website: str
wikidata_id: Optional[str]
official_api_documentation: Optional[str]
use_official_api: bool
require_api_key: bool
results: Literal["HTML", "JSON"]
language: NotRequired[str]
class OnlineEngine(Engine):
    """Engine exposing a ``request`` / ``response`` method pair.

    Both methods raise :class:`NotImplementedError` in this class, so a
    subclass has to override them to be usable.
    """

    def request(self, query: str, ctx: 'QueryContext') -> 'OnlineRequest':
        """Return the :class:`OnlineRequest` for *query* in the context *ctx*.

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError()

    def response(self, response: Response) -> List['Result']:
        """Return the list of results taken from *response*.

        Raises:
            NotImplementedError: always, unless overridden.
        """
        raise NotImplementedError()
class QueryContext(NamedTuple):
    """Immutable, read-only inputs handed to ``OnlineEngine.request``."""

    category: str
    """current category"""

    safesearch: Literal[0, 1, 2]
    """desired content safety (normal, moderate, strict)"""

    time_range: Optional[Literal['day', 'week', 'month', 'year']]
    """current time range (if any)"""

    pageno: int
    """current page number"""

    language: str
    """specific language code like ``en_US``, or ``all`` if unspecified"""
@dataclass
class OnlineRequest:
    """Description of the HTTP request an online engine wants to be sent.

    Only ``url`` is mandatory; every other field has a default.
    """

    url: str
    """requested URL"""

    method: Literal['GET', 'POST'] = 'GET'
    """HTTP request method"""

    headers: Optional[Dict[str, str]] = None
    """HTTP headers"""

    data: Optional[Dict[str, str]] = None
    """data to be sent as the HTTP body"""

    cookies: Optional[Dict[str, str]] = None
    """HTTP cookies"""

    verify: bool = True
    """Assert that the TLS certificate is valid"""

    allow_redirects: bool = True
    """follow redirects"""

    max_redirects: Optional[int] = None
    """maximum redirects, hard limit"""

    soft_max_redirects: Optional[int] = None
    """maximum redirects, soft limit. Record an error but don't stop the engine"""

    raise_for_httperror: bool = True
    """raise an exception if the HTTP code of response is >= 300"""

    def set_header(self, name: str, value: str) -> None:
        """Set header *name* to *value*, creating the header dict on first use."""
        # ``headers`` defaults to None rather than a shared mutable dict, so
        # it is created lazily per instance.
        if self.headers is None:
            self.headers = {}
        self.headers[name] = value
# Type of a single item in the list returned by ``OnlineEngine.response``.
Result = Union['StandardResult', 'InfoBox']
@dataclass
class StandardResult:
    """A result made of a URL, a title and optional content text."""

    url: str
    title: str
    content: str = ''
@dataclass
class InfoBox(StandardResult):
    """A :class:`StandardResult` extended with an image source and links."""

    img_src: Optional[str] = None
    # Immutable empty tuple as default, so instances never share a mutable one.
    links: Iterable['Link'] = ()
class Link(TypedDict):
    """Typed dictionary for one link: a title and the URL it points to."""

    title: str
    url: str

View file

@ -20,6 +20,11 @@ from searx import logger, settings
from searx.data import ENGINES_LANGUAGES
from searx.network import get
from searx.utils import load_module, gen_useragent, find_language_aliases
from searx.engine import Engine
_NEXTGEN_ENGINES = {
}
"""maps module names to class names for engines that are implemented using the new class-based approach"""
logger = logger.getChild('engines')
@ -121,6 +126,11 @@ def load_engine(engine_data: dict) -> Optional[ConfiguredEngine]:
logger.exception('Cannot load engine "{}"'.format(engine_module))
return None
if engine_data['engine'] in _NEXTGEN_ENGINES:
engine = getattr(engine, _NEXTGEN_ENGINES[engine_data['engine']])(
logger=logger.getChild(engine_name),
)
update_engine_attributes(engine, engine_data)
set_language_attributes(engine)
update_attributes_for_tor(engine)
@ -204,10 +214,11 @@ def _get_supported_languages(engine: ConfiguredEngine) -> Collection[str]:
def set_language_attributes(engine: ConfiguredEngine):
engine.supported_languages = _get_supported_languages(engine)
if not isinstance(engine, Engine):
engine.supported_languages = _get_supported_languages(engine)
# find custom aliases for non standard language codes
engine.language_aliases.update(find_language_aliases(engine.supported_languages))
# find custom aliases for non standard language codes
engine.language_aliases.update(find_language_aliases(engine.supported_languages))
# language_support
engine.language_support = len(engine.supported_languages) > 0

View file

@ -6,6 +6,7 @@ from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote
from searx import logger
from searx.engine import InfoBox, StandardResult
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error
@ -195,6 +196,17 @@ class ResultContainer:
standard_result_count = 0
error_msgs = set()
for result in list(results):
if isinstance(result, InfoBox):
result = {
'infobox': result.title,
'id': result.url,
'content': result.content,
'img_src': result.img_src,
'urls': result.links,
}
elif isinstance(result, StandardResult):
result = result.__dict__
result['engine'] = engine_name
if 'suggestion' in result and self.on_result(result):
self.suggestions.add(result['suggestion'])

View file

@ -17,6 +17,7 @@ from searx.exceptions import (
SearxEngineTooManyRequestsException,
)
from searx.metrics.error_recorder import count_error
from searx.engine import OnlineEngine, QueryContext
from .abstract import EngineProcessor
@ -114,7 +115,21 @@ class OnlineProcessor(EngineProcessor):
def _search_basic(self, query, params):
# update request parameters dependent on
# search-engine (contained in engines folder)
self.engine.request(query, params)
if isinstance(self.engine, OnlineEngine):
params.update(
self.engine.request(
query,
QueryContext(
category=params['category'],
safesearch=params['safesearch'],
time_range=params['time_range'],
pageno=params['pageno'],
language=params['language'],
),
).__dict__
)
else:
self.engine.request(query, params)
# ignoring empty urls
if params['url'] is None: