[enh] introduce Engine class

Previously engines were defined in modules, which were then
stapled with logger, supported_languages & language_aliases
via monkey patching in searx/engines/__init__.py.

Monkey patching is obviously a bad practice since it confuses
humans as well as static type checkers. But there were more problems:

* there were no type hints for the method input and output types

* the request method did not even make a clear distinction between input
  and output (abusing a single "params" dictionary for both)

This commit introduces a new class-based architecture for engines,
in a backwards-compatible manner: allowing old-style module engines
to be used along with new-style class-based engines.
This commit is contained in:
Martin Fischer 2022-01-28 10:38:37 +01:00
parent 73e71508e4
commit 5dd28ff04b
5 changed files with 147 additions and 7 deletions

View file

@ -48,12 +48,14 @@ Explanation of the :ref:`general engine configuration` shown in the table
{% for mod in engines %} {% for mod in engines %}
{% set mod_name = mod['__name__'] or mod.__module__ %}
* - `{{mod.name}} <{{mod.about and mod.about.website}}>`_ * - `{{mod.name}} <{{mod.about and mod.about.website}}>`_
- ``!{{mod.shortcut}}`` - ``!{{mod.shortcut}}``
- {%- if 'searx.engines.' + mod.__name__ in documented_modules %} - {%- if 'searx.engines.' + mod_name in documented_modules %}
:py:mod:`~searx.engines.{{mod.__name__}}` :py:mod:`~searx.engines.{{mod_name}}`
{%- else %} {%- else %}
:origin:`{{mod.__name__}} <searx/engines/{{mod.__name__}}.py>` :origin:`{{mod_name}} <searx/engines/{{mod_name}}.py>`
{%- endif %} {%- endif %}
- {{(mod.disabled and "y") or ""}} - {{(mod.disabled and "y") or ""}}
{%- if mod.about and mod.about.language %} {%- if mod.about and mod.about.language %}

100
searx/engine.py Normal file
View file

@ -0,0 +1,100 @@
# pyright: strict
from logging import Logger
from typing import Iterable, List, NamedTuple, Optional, Dict, Union
from typing_extensions import Literal, TypedDict, NotRequired
from dataclasses import dataclass
from httpx import Response
class Engine:
    """Base class for engines implemented as classes.

    Class-level attributes describe the engine; every instance additionally
    carries the logger it was constructed with.
    """

    # Feature flags: both are off unless overridden.
    paging = False
    time_range_support = False

    # Declared for type checkers only -- no value is assigned in this class.
    # NOTE(review): presumably provided by subclasses or while the engine is
    # being loaded -- confirm against the loader.
    about: 'About'
    categories: Optional[List[str]]
    supported_languages: List[str]
    language_aliases: Dict[str, str]

    def __init__(self, logger: Logger) -> None:
        """Keep *logger* on the instance as ``self.logger``."""
        self.logger = logger
class About(TypedDict, total=False):
    """Typed dictionary describing the ``about`` attribute of an engine.

    The class is declared with ``total=False``, so a type checker treats
    every key as optional.
    """

    website: str
    wikidata_id: Optional[str]
    official_api_documentation: Optional[str]
    use_official_api: bool
    require_api_key: bool
    # only the two literal strings "HTML" and "JSON" are accepted
    results: Literal["HTML", "JSON"]
    # NOTE(review): ``NotRequired`` adds nothing while the class is declared
    # with ``total=False`` -- confirm which of the two was intended.
    language: NotRequired[str]
class OnlineEngine(Engine):
    """Engine that is queried through an HTTP request and response.

    Both hooks below only raise :py:exc:`NotImplementedError`.
    NOTE(review): presumably every concrete engine overrides them -- confirm.
    """

    def request(self, query: str, ctx: 'QueryContext') -> 'OnlineRequest':
        """Describe the HTTP request to send for *query* in context *ctx*.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError

    def response(self, response: Response) -> List['Result']:
        """Turn the HTTP *response* into a list of results.

        :raises NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError
class QueryContext(NamedTuple):
    """Per-query settings handed to :py:meth:`OnlineEngine.request`.

    This is a named tuple: the order of the fields below is also the
    positional order of the tuple, so it must not be changed.
    """

    category: str
    """current category"""
    safesearch: Literal[0, 1, 2]
    """desired content safety (normal, moderate, strict)"""
    time_range: Optional[Literal['day', 'week', 'month', 'year']]
    """current time range (if any)"""
    pageno: int
    """current page number"""
    language: str
    """specific language code like ``en_US``, or ``all`` if unspecified"""
@dataclass
class OnlineRequest:
    """Description of an HTTP request, as returned by ``OnlineEngine.request``.

    Only ``url`` is mandatory; every other field has a default.
    """

    url: str
    """requested URL"""

    method: Literal['GET', 'POST'] = 'GET'
    """HTTP request method"""

    headers: Optional[Dict[str, str]] = None
    """HTTP headers"""

    data: Optional[Dict[str, str]] = None
    """data to be sent as the HTTP body"""

    cookies: Optional[Dict[str, str]] = None
    """HTTP cookies"""

    verify: bool = True
    """Assert that the TLS certificate is valid"""

    allow_redirects: bool = True
    """follow redirects"""

    max_redirects: Optional[int] = None
    """maximum redirects, hard limit"""

    soft_max_redirects: Optional[int] = None
    """maximum redirects, soft limit. Record an error but don't stop the engine"""

    raise_for_httperror: bool = True
    """raise an exception if the HTTP code of response is >= 300"""

    def set_header(self, name: str, value: str):
        """Set HTTP header *name* to *value*.

        The ``headers`` dictionary is created on first use; an existing
        dictionary is updated in place.
        """
        current = self.headers
        if current is None:
            # Lazily create the mapping and keep a local alias to it.
            current = self.headers = {}
        current[name] = value
# Type alias for the items of the list returned by ``OnlineEngine.response``.
Result = Union['StandardResult', 'InfoBox']
@dataclass
class StandardResult:
    """Result made of a URL, a title and an optional text content.

    The field order defines the positional order of the generated
    ``__init__``.
    """

    url: str
    title: str
    # optional; the empty string is used when no content is given
    content: str = ''
@dataclass
class InfoBox(StandardResult):
    """A :py:class:`StandardResult` extended by an image source and links."""

    # optional; ``None`` when no image source is given
    img_src: Optional[str] = None
    # the default is an empty tuple, so no mutable default is shared
    links: Iterable['Link'] = ()
class Link(TypedDict):
    """Typed dictionary with the two required string keys ``title`` and ``url``."""

    title: str
    url: str

View file

@ -20,6 +20,11 @@ from searx import logger, settings
from searx.data import ENGINES_LANGUAGES from searx.data import ENGINES_LANGUAGES
from searx.network import get from searx.network import get
from searx.utils import load_module, gen_useragent, find_language_aliases from searx.utils import load_module, gen_useragent, find_language_aliases
from searx.engine import Engine
_NEXTGEN_ENGINES = {
}
"""maps module names to class names for engines that are implemented using the new class-based approach"""
logger = logger.getChild('engines') logger = logger.getChild('engines')
@ -121,6 +126,11 @@ def load_engine(engine_data: dict) -> Optional[ConfiguredEngine]:
logger.exception('Cannot load engine "{}"'.format(engine_module)) logger.exception('Cannot load engine "{}"'.format(engine_module))
return None return None
if engine_data['engine'] in _NEXTGEN_ENGINES:
engine = getattr(engine, _NEXTGEN_ENGINES[engine_data['engine']])(
logger=logger.getChild(engine_name),
)
update_engine_attributes(engine, engine_data) update_engine_attributes(engine, engine_data)
set_language_attributes(engine) set_language_attributes(engine)
update_attributes_for_tor(engine) update_attributes_for_tor(engine)
@ -204,10 +214,11 @@ def _get_supported_languages(engine: ConfiguredEngine) -> Collection[str]:
def set_language_attributes(engine: ConfiguredEngine): def set_language_attributes(engine: ConfiguredEngine):
engine.supported_languages = _get_supported_languages(engine) if not isinstance(engine, Engine):
engine.supported_languages = _get_supported_languages(engine)
# find custom aliases for non standard language codes # find custom aliases for non standard language codes
engine.language_aliases.update(find_language_aliases(engine.supported_languages)) engine.language_aliases.update(find_language_aliases(engine.supported_languages))
# language_support # language_support
engine.language_support = len(engine.supported_languages) > 0 engine.language_support = len(engine.supported_languages) > 0

View file

@ -6,6 +6,7 @@ from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
from searx import logger from searx import logger
from searx.engine import InfoBox, StandardResult
from searx.engines import engines from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error from searx.metrics import histogram_observe, counter_add, count_error
@ -195,6 +196,17 @@ class ResultContainer:
standard_result_count = 0 standard_result_count = 0
error_msgs = set() error_msgs = set()
for result in list(results): for result in list(results):
if isinstance(result, InfoBox):
result = {
'infobox': result.title,
'id': result.url,
'content': result.content,
'img_src': result.img_src,
'urls': result.links,
}
elif isinstance(result, StandardResult):
result = result.__dict__
result['engine'] = engine_name result['engine'] = engine_name
if 'suggestion' in result and self.on_result(result): if 'suggestion' in result and self.on_result(result):
self.suggestions.add(result['suggestion']) self.suggestions.add(result['suggestion'])

View file

@ -17,6 +17,7 @@ from searx.exceptions import (
SearxEngineTooManyRequestsException, SearxEngineTooManyRequestsException,
) )
from searx.metrics.error_recorder import count_error from searx.metrics.error_recorder import count_error
from searx.engine import OnlineEngine, QueryContext
from .abstract import EngineProcessor from .abstract import EngineProcessor
@ -114,7 +115,21 @@ class OnlineProcessor(EngineProcessor):
def _search_basic(self, query, params): def _search_basic(self, query, params):
# update request parameters dependent on # update request parameters dependent on
# search-engine (contained in engines folder) # search-engine (contained in engines folder)
self.engine.request(query, params) if isinstance(self.engine, OnlineEngine):
params.update(
self.engine.request(
query,
QueryContext(
category=params['category'],
safesearch=params['safesearch'],
time_range=params['time_range'],
pageno=params['pageno'],
language=params['language'],
),
).__dict__
)
else:
self.engine.request(query, params)
# ignoring empty urls # ignoring empty urls
if params['url'] is None: if params['url'] is None: