[mod] limiter: add config file /etc/searxng/limiter.toml

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-05-26 17:24:43 +02:00
parent 1ec325adcc
commit 66fdec0eb9
12 changed files with 459 additions and 12 deletions

View File

@ -16,3 +16,4 @@ redis==4.5.5
markdown-it-py==2.2.0 markdown-it-py==2.2.0
typing_extensions==4.6.2 typing_extensions==4.6.2
fasttext-predict==0.9.2.1 fasttext-predict==0.9.2.1
pytomlpp==1.0.13

View File

@ -13,12 +13,15 @@ Accept_ header ..
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
""" """
# pylint: disable=unused-argument
from typing import Optional, Tuple from typing import Optional, Tuple
import flask import flask
from searx.tools import config
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
if 'text/html' not in request.accept_mimetypes: if 'text/html' not in request.accept_mimetypes:
return 429, "bot detected, HTTP header Accept did not contain text/html" return 429, "bot detected, HTTP header Accept did not contain text/html"
return None return None

View File

@ -14,12 +14,15 @@ bot if the Accept-Encoding_ header ..
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
""" """
# pylint: disable=unused-argument
from typing import Optional, Tuple from typing import Optional, Tuple
import flask import flask
from searx.tools import config
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')]
if not ('gzip' in accept_list or 'deflate' in accept_list): if not ('gzip' in accept_list or 'deflate' in accept_list):
return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate" return 429, "bot detected, HTTP header Accept-Encoding did not contain gzip nor deflate"

View File

@ -11,13 +11,15 @@ if the Accept-Language_ header is unset.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
""" """
# pylint: disable=unused-argument
from typing import Optional, Tuple from typing import Optional, Tuple
import flask import flask
from searx.tools import config
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
if request.headers.get('Accept-Language', '').strip() == '': if request.headers.get('Accept-Language', '').strip() == '':
return 429, "bot detected, missing HTTP header Accept-Language" return 429, "bot detected, missing HTTP header Accept-Language"
return None return None

View File

@ -11,13 +11,15 @@ the Connection_ header is set to ``close``.
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
""" """
# pylint: disable=unused-argument
from typing import Optional, Tuple from typing import Optional, Tuple
import flask import flask
from searx.tools import config
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
if request.headers.get('Connection', '').strip() == 'close': if request.headers.get('Connection', '').strip() == 'close':
return 429, "bot detected, HTTP header 'Connection=close'" return 429, "bot detected, HTTP header 'Connection=close'"
return None return None

View File

@ -12,11 +12,15 @@ the User-Agent_ header is unset or matches the regular expression
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
""" """
# pylint: disable=unused-argument
from typing import Optional, Tuple from typing import Optional, Tuple
import re import re
import flask import flask
from searx.tools import config
USER_AGENT = ( USER_AGENT = (
r'(' r'('
+ r'unknown' + r'unknown'
@ -44,7 +48,7 @@ def regexp_user_agent():
return _regexp return _regexp
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
user_agent = request.headers.get('User-Agent', 'unknown') user_agent = request.headers.get('User-Agent', 'unknown')
if regexp_user_agent().match(user_agent): if regexp_user_agent().match(user_agent):
return ( return (

View File

@ -1,4 +1,5 @@
""" """.. _botdetection.ip_limit:
Method ``ip_limit`` Method ``ip_limit``
------------------- -------------------
@ -22,6 +23,8 @@ The :py:obj:`link_token` method is used to investigate whether a request is
from typing import Optional, Tuple from typing import Optional, Tuple
import flask import flask
from searx.tools import config
from searx import redisdb from searx import redisdb
from searx import logger from searx import logger
@ -56,7 +59,7 @@ API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WONDOW`""" """Maximum requests from one IP in the :py:obj:`API_WONDOW`"""
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: def filter_request(request: flask.Request, cfg: config.Config) -> Optional[Tuple[int, str]]:
redis_client = redisdb.client() redis_client = redisdb.client()
x_forwarded_for = request.headers.get('X-Forwarded-For', '') x_forwarded_for = request.headers.get('X-Forwarded-For', '')
@ -68,7 +71,9 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
if c > API_MAX: if c > API_MAX:
return 429, "BLOCK %s: API limit exceeded" return 429, "BLOCK %s: API limit exceeded"
suspicious = link_token.is_suspicious(request) suspicious = False
if cfg['botdetection.ip_limit.link_token']:
suspicious = link_token.is_suspicious(request)
if suspicious: if suspicious:
c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW) c = incr_sliding_window(redis_client, 'IP limit - BURST_WINDOW:' + x_forwarded_for, BURST_WINDOW)

View File

@ -38,8 +38,11 @@ and set the redis-url connection. Check the value, it depends on your redis DB
""" """
from typing import Optional, Tuple from typing import Optional, Tuple
from pathlib import Path
import flask import flask
import pytomlpp as toml
from searx.tools import config
from searx.botdetection import ( from searx.botdetection import (
http_accept, http_accept,
http_accept_encoding, http_accept_encoding,
@ -49,6 +52,42 @@ from searx.botdetection import (
ip_limit, ip_limit,
) )
LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
"""Base configuration (schema) of the botdetection."""
LIMITER_CFG = Path('/etc/searxng/limiter.toml')
"""Lokal Limiter configuration."""
CFG_DEPRECATED = {
# "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
}
CFG = config.Config({}, {})
def init_cfg(log):
global CFG # pylint: disable=global-statement
CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED)
if not LIMITER_CFG.exists():
log.warning("missing config file: %s", LIMITER_CFG)
return
log.warning("load config file: %s", LIMITER_CFG)
try:
upd_cfg = toml.load(LIMITER_CFG)
except toml.DecodeError as exc:
msg = str(exc).replace('\t', '').replace('\n', ' ')
log.error("%s: %s", LIMITER_CFG, msg)
raise
is_valid, issue_list = CFG.validate(upd_cfg)
for msg in issue_list:
log.error(str(msg))
if not is_valid:
raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!")
CFG.update(upd_cfg)
def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
@ -58,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
for func in [ for func in [
http_user_agent, http_user_agent,
]: ]:
val = func.filter_request(request) val = func.filter_request(request, CFG)
if val is not None: if val is not None:
return val return val
@ -72,7 +111,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]:
http_user_agent, http_user_agent,
ip_limit, ip_limit,
]: ]:
val = func.filter_request(request) val = func.filter_request(request, CFG)
if val is not None: if val is not None:
return val return val

View File

@ -0,0 +1,3 @@
[botdetection.ip_limit]
link_token = true

View File

@ -38,5 +38,6 @@ def init(app: flask.Flask, settings) -> bool:
if not redisdb.client(): if not redisdb.client():
logger.error("The limiter requires Redis") logger.error("The limiter requires Redis")
return False return False
limiter.init_cfg(logger)
app.before_request(pre_request) app.before_request(pre_request)
return True return True

8
searx/tools/__init__.py Normal file
View File

@ -0,0 +1,8 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _tools src:
A collection of *utilities* used by SearXNG, but without SearXNG specific
peculiarities.
"""

376
searx/tools/config.py Normal file
View File

@ -0,0 +1,376 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Configuration class :py:class:`Config` with deep-update, schema validation
and deprecated names.
The :py:class:`Config` class implements a configuration that is based on
structured dictionaries. The configuration schema is defined in a dictionary
structure and the configuration data is given in a dictionary structure.
"""
from __future__ import annotations
import copy
import typing
import logging
import pathlib
import pytomlpp as toml
__all__ = ['Config', 'UNSET', 'SchemaIssue']
log = logging.getLogger(__name__)
class FALSE:
"""Class of ``False`` singelton"""
# pylint: disable=multiple-statements
def __init__(self, msg):
self.msg = msg
def __bool__(self):
return False
def __str__(self):
return self.msg
__repr__ = __str__
UNSET = FALSE('<UNSET>')
class SchemaIssue(ValueError):
"""Exception to store and/or raise a message from a schema issue."""
def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
self.level = level
super().__init__(msg)
def __str__(self):
return f"[cfg schema {self.level}] {self.args[0]}"
class Config:
"""Base class used for configuration"""
UNSET = UNSET
@classmethod
def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path, deprecated: dict) -> Config:
# init schema
log.debug("load schema file: %s", schema_file)
cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated)
if not cfg_file.exists():
log.warning("missing config file: %s", cfg_file)
return cfg
# load configuration
log.debug("load config file: %s", cfg_file)
try:
upd_cfg = toml.load(cfg_file)
except toml.DecodeError as exc:
msg = str(exc).replace('\t', '').replace('\n', ' ')
log.error("%s: %s", cfg_file, msg)
raise
is_valid, issue_list = cfg.validate(upd_cfg)
for msg in issue_list:
log.error(str(msg))
if not is_valid:
raise TypeError(f"schema of {cfg_file} is invalid!")
cfg.update(upd_cfg)
return cfg
def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
"""Construtor of class Config.
:param cfg_schema: Schema of the configuration
:param deprecated: dictionary that maps deprecated configuration names to a messages
These values are needed for validation, see :py:obj:`validate`.
"""
self.cfg_schema = cfg_schema
self.deprecated = deprecated
self.cfg = copy.deepcopy(cfg_schema)
def __getitem__(self, key: str):
return self.get(key)
def validate(self, cfg: dict):
"""Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`.
Validation is done by :py:obj:`validate`."""
return validate(self.cfg_schema, cfg, self.deprecated)
def update(self, upd_cfg: dict):
"""Update this configuration by ``upd_cfg``."""
dict_deepupdate(self.cfg, upd_cfg)
def default(self, name: str):
"""Returns default value of field ``name`` in ``self.cfg_schema``."""
return value(name, self.cfg_schema)
def get(self, name: str, default=UNSET, replace=True):
"""Returns the value to which ``name`` points in the configuration.
If there is no such ``name`` in the config and the ``default`` is
:py:obj:`UNSET`, a :py:obj:`KeyError` is raised.
"""
parent = self._get_parent_dict(name)
val = parent.get(name.split('.')[-1], UNSET)
if val is UNSET:
if default is UNSET:
raise KeyError(name)
val = default
if replace and isinstance(val, str):
val = val % self
return val
def set(self, name: str, val):
"""Set the value to which ``name`` points in the configuration.
If there is no such ``name`` in the config, a :py:obj:`KeyError` is
raised.
"""
parent = self._get_parent_dict(name)
parent[name.split('.')[-1]] = val
def _get_parent_dict(self, name):
parent_name = '.'.join(name.split('.')[:-1])
if parent_name:
parent = value(parent_name, self.cfg)
else:
parent = self.cfg
if (parent is UNSET) or (not isinstance(parent, dict)):
raise KeyError(parent_name)
return parent
def path(self, name: str, default=UNSET):
"""Get a :py:class:`pathlib.Path` object from a config string."""
val = self.get(name, default)
if val is UNSET:
if default is UNSET:
raise KeyError(name)
return default
return pathlib.Path(str(val))
def pyobj(self, name, default=UNSET):
"""Get python object refered by full qualiffied name (FQN) in the config
string."""
fqn = self.get(name, default)
if fqn is UNSET:
if default is UNSET:
raise KeyError(name)
return default
(modulename, name) = str(fqn).rsplit('.', 1)
m = __import__(modulename, {}, {}, [name], 0)
return getattr(m, name)
# working with dictionaries
def value(name: str, data_dict: dict):
"""Returns the value to which ``name`` points in the ``dat_dict``.
.. code: python
>>> data_dict = {
"foo": {"bar": 1 },
"bar": {"foo": 2 },
"foobar": [1, 2, 3],
}
>>> value('foobar', data_dict)
[1, 2, 3]
>>> value('foo.bar', data_dict)
1
>>> value('foo.bar.xxx', data_dict)
<UNSET>
"""
ret_val = data_dict
for part in name.split('.'):
if isinstance(ret_val, dict):
ret_val = ret_val.get(part, UNSET)
if ret_val is UNSET:
break
return ret_val
def validate(
schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
) -> typing.Tuple[bool, list]:
"""Deep validation of dictionary in ``data_dict`` against dictionary in
``schema_dict``. Argument deprecated is a dictionary that maps deprecated
configuration names to a messages::
deprecated = {
"foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
"..." : "..."
}
The function returns a python tuple ``(is_valid, issue_list)``:
``is_valid``:
A bool value indicating ``data_dict`` is valid or not.
``issue_list``:
A list of messages (:py:obj:`SchemaIssue`) from the validation::
[schema warn] data_dict: deprecated 'fontlib.foo': <DEPRECATED['foo.bar']>
[schema invalid] data_dict: key unknown 'fontlib.foo'
[schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ...
If ``schema_dict`` or ``data_dict`` is not a dictionary type a
:py:obj:`SchemaIssue` is raised.
"""
names = []
is_valid = True
issue_list = []
if not isinstance(schema_dict, dict):
raise SchemaIssue('invalid', "schema_dict is not a dict type")
if not isinstance(data_dict, dict):
raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type")
is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated)
return is_valid, issue_list
def _validate(
names: typing.List,
issue_list: typing.List,
schema_dict: typing.Dict,
data_dict: typing.Dict,
deprecated: typing.Dict[str, str],
) -> typing.Tuple[bool, typing.List]:
is_valid = True
for key, data_value in data_dict.items():
names.append(key)
name = '.'.join(names)
deprecated_msg = deprecated.get(name)
# print("XXX %s: key %s // data_value: %s" % (name, key, data_value))
if deprecated_msg:
issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}"))
schema_value = value(name, schema_dict)
# print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value))
if schema_value is UNSET:
if not deprecated_msg:
issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict"))
is_valid = False
elif type(schema_value) != type(data_value): # pylint: disable=unidiomatic-typecheck
issue_list.append(
SchemaIssue(
'invalid',
(f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"),
)
)
is_valid = False
elif isinstance(data_value, dict):
_valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated)
is_valid = is_valid and _valid
names.pop()
return is_valid, issue_list
def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
"""Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.
For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:
0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a
:py:obj:`TypeError`.
1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``.
2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a
(deep-) copy of ``upd_val``.
3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the
list in ``upd_val``.
4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
``upd_val``.
"""
# pylint: disable=too-many-branches
if not isinstance(base_dict, dict):
raise TypeError("argument 'base_dict' is not a ditionary type")
if not isinstance(upd_dict, dict):
raise TypeError("argument 'upd_dict' is not a ditionary type")
if names is None:
names = []
for upd_key, upd_val in upd_dict.items():
# For each upd_key & upd_val pair in upd_dict:
if isinstance(upd_val, dict):
if upd_key in base_dict:
# if base_dict[upd_key] exists, recursively deep-update it
if not isinstance(base_dict[upd_key], dict):
raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict")
dict_deepupdate(
base_dict[upd_key],
upd_val,
names
+ [
upd_key,
],
)
else:
# if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val
base_dict[upd_key] = copy.deepcopy(upd_val)
elif isinstance(upd_val, list):
if upd_key in base_dict:
# if base_dict[upd_key] exists, base_dict[up_key] is extended by
# the list from upd_val
if not isinstance(base_dict[upd_key], list):
raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict")
base_dict[upd_key].extend(upd_val)
else:
# if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the
# list in upd_val.
base_dict[upd_key] = copy.deepcopy(upd_val)
elif isinstance(upd_val, set):
if upd_key in base_dict:
# if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val
if not isinstance(base_dict[upd_key], set):
raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict")
base_dict[upd_key].update(upd_val.copy())
else:
# if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the
# set in upd_val
base_dict[upd_key] = upd_val.copy()
else:
# for any other type of upd_val replace or add base_dict[upd_key] by a copy
# of upd_val
base_dict[upd_key] = copy.copy(upd_val)