mirror of https://github.com/searxng/searxng.git
[mod] botdetection - outsourcing to an external project
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
2310b507b3
commit
fa68aba8c5
|
@ -16,4 +16,4 @@ redis==4.6.0
|
|||
markdown-it-py==3.0.0
|
||||
typing_extensions==4.8.0
|
||||
fasttext-predict==0.9.2.1
|
||||
pytomlpp==1.0.13
|
||||
botdetection @ git+https://github.com/return42/botdetection.git@master
|
||||
|
|
|
@ -1,48 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _botdetection src:
|
||||
|
||||
Implementations used for bot detection.
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import pathlib
|
||||
|
||||
import redis
|
||||
from .config import Config
|
||||
|
||||
from ._helpers import logger
|
||||
from ._helpers import dump_request
|
||||
from ._helpers import get_real_ip
|
||||
from ._helpers import get_network
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
# Child logger for this package's init module.
logger = logger.getChild('init')

# Helpers re-exported as the public API of the package.
__all__ = ['dump_request', 'get_network', 'get_real_ip', 'too_many_requests']

# Path of the TOML schema shipped next to this module.
CFG_SCHEMA = pathlib.Path(__file__).parent / "schema.toml"
"""Base configuration (schema) of the botdetection."""

# Maps deprecated configuration names to a deprecation message; passed to
# Config.from_toml() below and reported during validation.
CFG_DEPRECATED = {
    # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
}
|
||||
|
||||
|
||||
@dataclass
class Context:
    """A global context of the botdetection"""

    # pylint: disable=too-few-public-methods

    # Redis connection used by the rate-limit methods (see ip_limit).
    redis_client: redis.Redis | None = None
    # NOTE(review): this default is evaluated once at class-definition time,
    # so all Context instances would share the same Config object.  That is
    # fine for the module-level singleton ``ctx`` below, but worth confirming
    # if more than one Context is ever created.
    cfg: Config = Config.from_toml(schema_file=CFG_SCHEMA, cfg_file=None, deprecated=CFG_DEPRECATED)

    def init(self, toml_cfg: pathlib.Path, redis_client: redis.Redis | None):
        """Overlay the values from ``toml_cfg`` on the schema defaults and
        register the redis client."""
        self.redis_client = redis_client
        self.cfg.load_toml(toml_cfg)


# Singleton context shared across the botdetection modules.
ctx = Context()
|
|
@ -1,129 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
# pylint: disable=missing-module-docstring, invalid-name
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
IPv4Address,
|
||||
IPv6Address,
|
||||
ip_network,
|
||||
)
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import config
|
||||
|
||||
logger = logging.getLogger('botdetection')  # root logger of the botdetection package
|
||||
|
||||
|
||||
def dump_request(request: flask.Request):
    """Return a one-line summary of the request: its path plus the HTTP
    header values relevant for bot detection (for debug logging)."""
    fields = [request.path]
    for header in ('X-Forwarded-For', 'X-Real-IP'):
        fields.append("%s: %s" % (header, request.headers.get(header)))
    fields.append("form: %s" % request.form)
    for header in (
        'Accept',
        'Accept-Language',
        'Accept-Encoding',
        'Content-Type',
        'Content-Length',
        'Connection',
        'User-Agent',
    ):
        fields.append("%s: %s" % (header, request.headers.get(header)))
    return " || ".join(fields)
|
||||
|
||||
|
||||
def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None:
    """Returns a HTTP 429 response object and writes a DEBUG message to the
    'botdetection' logger.  This function is used in part by the filter methods
    to return the default ``Too Many Requests`` response.

    :param network: the (client) network the blocked request came from
    :param log_msg: reason for the block, written to the log
    """

    logger.debug("BLOCK %s: %s", network.compressed, log_msg)
    return flask.make_response(('Too Many Requests', 429))
|
||||
|
||||
|
||||
def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network:
    """Return the (client) network ``real_ip`` belongs to, using the
    IPv4/IPv6 prefix length configured in ``real_ip.ipv4_prefix`` /
    ``real_ip.ipv6_prefix``."""
    cfg_key = 'real_ip.ipv6_prefix' if real_ip.version == 6 else 'real_ip.ipv4_prefix'
    return ip_network(f"{real_ip}/{cfg[cfg_key]}", strict=False)
|
||||
|
||||
|
||||
# Error messages already emitted -- used to avoid flooding the log with the
# same complaint on every request.
_logged_errors = []


def _log_error_only_once(err_msg):
    """Log ``err_msg`` with level ERROR, but only the first time it occurs."""
    if err_msg in _logged_errors:
        return
    logger.error(err_msg)
    _logged_errors.append(err_msg)
|
||||
|
||||
|
||||
def get_real_ip(request: flask.Request) -> str:
    """Returns real IP of the request. Since not all proxies set all the HTTP
    headers and incoming headers can be faked it may happen that the IP cannot
    be determined correctly.

    .. sidebar:: :py:obj:`flask.Request.remote_addr`

       SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``).

    This function tries to get the remote IP in the order listed below,
    additional some tests are done and if inconsistencies or errors are
    detected, they are logged.

    The remote IP of the request is taken from (first match):

    - X-Forwarded-For_ header
    - `X-real-IP header <https://github.com/searxng/searxng/issues/1237#issuecomment-1147564516>`__
    - :py:obj:`flask.Request.remote_addr`

    .. _ProxyFix:
       https://werkzeug.palletsprojects.com/middleware/proxy_fix/

    .. _X-Forwarded-For:
       https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For

    """

    forwarded_for = request.headers.get("X-Forwarded-For")
    real_ip = request.headers.get('X-Real-IP')
    remote_addr = request.remote_addr
    # logger.debug(
    #     "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr
    # )

    if not forwarded_for:
        _log_error_only_once("X-Forwarded-For header is not set!")
    else:
        # import here to avoid a circular import at module load time
        from . import ctx  # pylint: disable=import-outside-toplevel, cyclic-import

        # X-Forwarded-For is a comma separated proxy chain; pick the entry
        # ``real_ip.x_for`` positions from the end (the IP appended by our
        # own, trusted reverse proxy), clamped to the length of the chain.
        forwarded_for = [x.strip() for x in forwarded_for.split(',')]
        x_for: int = ctx.cfg['real_ip.x_for']  # type: ignore
        forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)]

    if not real_ip:
        _log_error_only_once("X-Real-IP header is not set!")

    # Consistency checks: the three sources should agree; mismatches are only
    # logged, they do not change which value is returned.
    if forwarded_for and real_ip and forwarded_for != real_ip:
        logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for)

    if forwarded_for and remote_addr and forwarded_for != remote_addr:
        logger.warning(
            "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for
        )

    if real_ip and remote_addr and real_ip != remote_addr:
        logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip)

    # first match wins; '0.0.0.0' is the fallback when nothing is available
    request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0'
    # logger.debug("get_real_ip() -> %s", request_ip)
    return request_ip
|
|
@ -1,378 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Configuration class :py:class:`Config` with deep-update, schema validation
|
||||
and deprecated names.
|
||||
|
||||
The :py:class:`Config` class implements a configuration that is based on
|
||||
structured dictionaries. The configuration schema is defined in a dictionary
|
||||
structure and the configuration data is given in a dictionary structure.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any
|
||||
|
||||
import copy
|
||||
import typing
|
||||
import logging
|
||||
import pathlib
|
||||
import pytomlpp as toml
|
||||
|
||||
# Public API of this module.
__all__ = ['Config', 'UNSET', 'SchemaIssue']

# Module logger.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FALSE:
    """A falsy sentinel that carries a message (``False`` singleton class)."""

    def __init__(self, msg):
        self.msg = msg

    def __bool__(self):
        # instances always evaluate to False in a boolean context
        return False

    def __str__(self):
        return self.msg

    def __repr__(self):
        return self.msg


UNSET = FALSE('<UNSET>')
|
||||
|
||||
|
||||
class SchemaIssue(ValueError):
    """Exception that carries a message (plus a severity ``level``) from a
    schema validation issue."""

    def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str):
        super().__init__(msg)
        self.level = level

    def __str__(self):
        return "[cfg schema %s] %s" % (self.level, self.args[0])
|
||||
|
||||
|
||||
class Config:
    """Base class used for configuration"""

    # re-export the sentinel so holders of a Config instance need not import it
    UNSET = UNSET

    @classmethod
    def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path | None, deprecated: dict) -> Config:
        """Build a Config from ``schema_file`` and (optionally) overlay the
        values from ``cfg_file`` on top of the schema defaults.  A missing
        ``cfg_file`` is only logged as a warning -- the defaults are used."""
        # init schema
        log.debug("load schema file: %s", schema_file)
        cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated)
        if cfg_file is None:
            return cfg
        if not cfg_file.exists():
            log.warning("missing config file: %s", cfg_file)
            return cfg
        # load configuration from toml file
        cfg.load_toml(cfg_file)
        return cfg

    def load_toml(self, cfg_file: pathlib.Path):
        """Load ``cfg_file``, validate it against the schema (issues are
        logged as errors) and deep-update this configuration by its values.

        :raises TypeError: if ``cfg_file`` does not validate against the schema
        """
        log.debug("load config file: %s", cfg_file)
        try:
            upd_cfg = toml.load(cfg_file)
        except toml.DecodeError as exc:
            # flatten the (possibly multi-line) TOML error into one log line
            msg = str(exc).replace('\t', '').replace('\n', ' ')
            log.error("%s: %s", cfg_file, msg)
            raise

        is_valid, issue_list = self.validate(upd_cfg)
        for msg in issue_list:
            log.error(str(msg))
        if not is_valid:
            raise TypeError(f"schema of {cfg_file} is invalid!")
        self.update(upd_cfg)

    def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
        """Constructor of class Config.

        :param cfg_schema: Schema of the configuration
        :param deprecated: dictionary that maps deprecated configuration names to a messages

        These values are needed for validation, see :py:obj:`validate`.

        """
        self.cfg_schema = cfg_schema
        self.deprecated = deprecated
        # the effective config starts as a deep copy of the schema (= the
        # defaults) and is later overlaid by update() / load_toml()
        self.cfg = copy.deepcopy(cfg_schema)

    def __getitem__(self, key: str) -> Any:
        # dict-like read access: cfg['a.b'] is equivalent to cfg.get('a.b')
        return self.get(key)

    def validate(self, cfg: dict):
        """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`.
        Validation is done by :py:obj:`validate`."""

        return validate(self.cfg_schema, cfg, self.deprecated)

    def update(self, upd_cfg: dict):
        """Update this configuration by ``upd_cfg``."""

        dict_deepupdate(self.cfg, upd_cfg)

    def default(self, name: str):
        """Returns default value of field ``name`` in ``self.cfg_schema``."""
        return value(name, self.cfg_schema)

    def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any:
        """Returns the value to which ``name`` points in the configuration.

        If there is no such ``name`` in the config and the ``default`` is
        :py:obj:`UNSET`, a :py:obj:`KeyError` is raised.

        With ``replace`` (the default) a string value is %-interpolated with
        this Config as the mapping, i.e. ``%(other.name)s`` placeholders are
        resolved through :py:obj:`Config.__getitem__`.
        """

        parent = self._get_parent_dict(name)
        val = parent.get(name.split('.')[-1], UNSET)
        if val is UNSET:
            if default is UNSET:
                raise KeyError(name)
            val = default

        if replace and isinstance(val, str):
            val = val % self
        return val

    def set(self, name: str, val):
        """Set the value to which ``name`` points in the configuration.

        If there is no such ``name`` in the config, a :py:obj:`KeyError` is
        raised.
        """
        parent = self._get_parent_dict(name)
        parent[name.split('.')[-1]] = val

    def _get_parent_dict(self, name):
        # Resolve the dict holding the last component of the dotted ``name``;
        # raises KeyError when the path does not lead to a dict.
        parent_name = '.'.join(name.split('.')[:-1])
        if parent_name:
            parent = value(parent_name, self.cfg)
        else:
            parent = self.cfg
        if (parent is UNSET) or (not isinstance(parent, dict)):
            raise KeyError(parent_name)
        return parent

    def path(self, name: str, default: Any = UNSET):
        """Get a :py:class:`pathlib.Path` object from a config string."""

        val = self.get(name, default)
        if val is UNSET:
            if default is UNSET:
                raise KeyError(name)
            return default
        return pathlib.Path(str(val))

    def pyobj(self, name, default: Any = UNSET):
        """Get the python object referred to by the fully qualified name (FQN)
        stored in the config string."""

        fqn = self.get(name, default)
        if fqn is UNSET:
            if default is UNSET:
                raise KeyError(name)
            return default
        (modulename, name) = str(fqn).rsplit('.', 1)
        m = __import__(modulename, {}, {}, [name], 0)
        return getattr(m, name)
|
||||
|
||||
|
||||
# working with dictionaries
|
||||
|
||||
|
||||
def value(name: str, data_dict: dict):
    """Returns the value to which ``name`` points in the ``data_dict``.

    .. code: python

       >>> data_dict = {
           "foo": {"bar": 1 },
           "bar": {"foo": 2 },
           "foobar": [1, 2, 3],
       }
       >>> value('foobar', data_dict)
       [1, 2, 3]
       >>> value('foo.bar', data_dict)
       1
       >>> value('foo.bar.xxx', data_dict)
       <UNSET>

    """

    ret_val = data_dict
    for part in name.split('.'):
        if not isinstance(ret_val, dict):
            # The dotted name descends past a non-dict value (e.g.
            # 'foo.bar.xxx' where 'foo.bar' is a scalar): there is nothing
            # left to look up -> UNSET.  (The previous implementation
            # silently returned the intermediate scalar here, contradicting
            # the docstring example above.)
            return UNSET
        ret_val = ret_val.get(part, UNSET)
        if ret_val is UNSET:
            break
    return ret_val
|
||||
|
||||
|
||||
def validate(
    schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str]
) -> typing.Tuple[bool, list]:
    """Deep validation of dictionary ``data_dict`` against ``schema_dict``.

    ``deprecated`` maps deprecated configuration names to messages::

        deprecated = {
            "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'",
            "..." : "..."
        }

    Returns a tuple ``(is_valid, issue_list)``:

    ``is_valid``:
      A bool value indicating ``data_dict`` is valid or not.

    ``issue_list``:
      A list of messages (:py:obj:`SchemaIssue`) from the validation::

          [schema warn] data_dict: deprecated 'fontlib.foo': <DEPRECATED['foo.bar']>
          [schema invalid] data_dict: key unknown 'fontlib.foo'
          [schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ...

    If ``schema_dict`` or ``data_dict`` is not a dictionary type a
    :py:obj:`SchemaIssue` is raised.
    """
    if not isinstance(schema_dict, dict):
        raise SchemaIssue('invalid', "schema_dict is not a dict type")
    if not isinstance(data_dict, dict):
        raise SchemaIssue('invalid', "data_dict issue is not a dict type")
    # _validate walks both structures recursively, collecting issues
    return _validate([], [], schema_dict, data_dict, deprecated)
|
||||
|
||||
|
||||
def _validate(
    names: typing.List,
    issue_list: typing.List,
    schema_dict: typing.Dict,
    data_dict: typing.Dict,
    deprecated: typing.Dict[str, str],
) -> typing.Tuple[bool, typing.List]:
    """Recursive worker of :py:obj:`validate`.

    ``names`` is the (mutated in place) path of keys leading to the current
    nesting level; ``issue_list`` is appended to in place and also returned.
    """

    is_valid = True

    for key, data_value in data_dict.items():

        names.append(key)
        name = '.'.join(names)

        deprecated_msg = deprecated.get(name)
        # print("XXX %s: key %s // data_value: %s" % (name, key, data_value))
        if deprecated_msg:
            issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}"))

        schema_value = value(name, schema_dict)
        # print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value))
        if schema_value is UNSET:
            # unknown keys invalidate the data -- unless they are deprecated
            # (already reported above as a warning)
            if not deprecated_msg:
                issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict"))
                is_valid = False

        elif type(schema_value) != type(data_value):  # pylint: disable=unidiomatic-typecheck
            issue_list.append(
                SchemaIssue(
                    'invalid',
                    (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"),
                )
            )
            is_valid = False

        elif isinstance(data_value, dict):
            # descend into the nested dict; issues accumulate in issue_list
            _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated)
            is_valid = is_valid and _valid
        names.pop()

    return is_valid, issue_list
|
||||
|
||||
|
||||
def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None):
    """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``.

    For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``:

    0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a
       :py:obj:`TypeError`.

    1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``.

    2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a
       (deep-) copy of ``upd_val``.

    3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the
       list in ``upd_val``.

    4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in
       ``upd_val``.

    :param names: internal -- path of keys leading to the current nesting
        level, used to build the key path shown in error messages.
    """
    # pylint: disable=too-many-branches
    # fixed typo in both error messages: "ditionary" --> "dictionary"
    if not isinstance(base_dict, dict):
        raise TypeError("argument 'base_dict' is not a dictionary type")
    if not isinstance(upd_dict, dict):
        raise TypeError("argument 'upd_dict' is not a dictionary type")

    if names is None:
        names = []

    for upd_key, upd_val in upd_dict.items():
        # For each upd_key & upd_val pair in upd_dict:

        if isinstance(upd_val, dict):

            if upd_key in base_dict:
                # if base_dict[upd_key] exists, recursively deep-update it
                if not isinstance(base_dict[upd_key], dict):
                    raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict")
                dict_deepupdate(
                    base_dict[upd_key],
                    upd_val,
                    names
                    + [
                        upd_key,
                    ],
                )

            else:
                # if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val
                base_dict[upd_key] = copy.deepcopy(upd_val)

        elif isinstance(upd_val, list):

            if upd_key in base_dict:
                # if base_dict[upd_key] exists, base_dict[up_key] is extended by
                # the list from upd_val
                if not isinstance(base_dict[upd_key], list):
                    raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict")
                base_dict[upd_key].extend(upd_val)

            else:
                # if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the
                # list in upd_val.
                base_dict[upd_key] = copy.deepcopy(upd_val)

        elif isinstance(upd_val, set):

            if upd_key in base_dict:
                # if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val
                if not isinstance(base_dict[upd_key], set):
                    raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict")
                base_dict[upd_key].update(upd_val.copy())

            else:
                # if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the
                # set in upd_val
                base_dict[upd_key] = upd_val.copy()

        else:
            # for any other type of upd_val replace or add base_dict[upd_key] by a copy
            # of upd_val
            base_dict[upd_key] = copy.copy(upd_val)
|
|
@ -1,39 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept``
|
||||
----------------------
|
||||
|
||||
The ``http_accept`` method evaluates a request as the request of a bot if the
|
||||
Accept_ header ..
|
||||
|
||||
- did not contain ``text/html``
|
||||
|
||||
.. _Accept:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Answer with HTTP 429 when the Accept_ header of the request does not
    accept ``text/html``; return ``None`` (pass) otherwise."""
    if 'text/html' in request.accept_mimetypes:
        return None
    return too_many_requests(network, "HTTP header Accept did not contain text/html")
|
|
@ -1,41 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept_encoding``
|
||||
-------------------------------
|
||||
|
||||
The ``http_accept_encoding`` method evaluates a request as the request of a
|
||||
bot if the Accept-Encoding_ header ..
|
||||
|
||||
- did not contain ``gzip`` AND ``deflate`` (if both values are missed)
|
||||
- did not contain ``text/html``
|
||||
|
||||
.. _Accept-Encoding:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Answer with HTTP 429 when the Accept-Encoding_ header of the request
    offers neither ``gzip`` nor ``deflate``; return ``None`` (pass) otherwise."""
    offered = {token.strip() for token in request.headers.get('Accept-Encoding', '').split(',')}
    if 'gzip' in offered or 'deflate' in offered:
        return None
    return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate")
|
|
@ -1,35 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_accept_language``
|
||||
-------------------------------
|
||||
|
||||
The ``http_accept_language`` method evaluates a request as the request of a bot
|
||||
if the Accept-Language_ header is unset.
|
||||
|
||||
.. _Accept-Language:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Answer with HTTP 429 when the Accept-Language_ header of the request
    is unset or blank; return ``None`` (pass) otherwise."""
    if request.headers.get('Accept-Language', '').strip():
        return None
    return too_many_requests(network, "missing HTTP header Accept-Language")
|
|
@ -1,37 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_connection``
|
||||
--------------------------
|
||||
|
||||
The ``http_connection`` method evaluates a request as the request of a bot if
|
||||
the Connection_ header is set to ``close``.
|
||||
|
||||
.. _Connection:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Answer with HTTP 429 when the Connection_ header of the request is
    set to ``close``; return ``None`` (pass) otherwise."""

    if request.headers.get('Connection', '').strip() == 'close':
        # fixed unbalanced quote in the message ("'Connection=close")
        return too_many_requests(network, "HTTP header 'Connection=close'")
    return None
|
|
@ -1,67 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``http_user_agent``
|
||||
--------------------------
|
||||
|
||||
The ``http_user_agent`` method evaluates a request as the request of a bot if
|
||||
the User-Agent_ header is unset or matches the regular expression
|
||||
:py:obj:`USER_AGENT`.
|
||||
|
||||
.. _User-Agent:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import config
|
||||
from ._helpers import too_many_requests
|
||||
|
||||
|
||||
USER_AGENT = (
    r'('
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot'
    + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot'
    # unmaintained Farside instances
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and client to block
    + '|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches to User-Agent_ from known *bots*"""

# compiled pattern, built lazily by regexp_user_agent()
_regexp = None


def regexp_user_agent():
    """Compile :py:obj:`USER_AGENT` on first use and return the cached,
    compiled pattern."""
    global _regexp  # pylint: disable=global-statement
    if _regexp is None:
        _regexp = re.compile(USER_AGENT)
    return _regexp
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Answer with HTTP 429 when the User-Agent_ header of the request
    matches the bot pattern :py:obj:`USER_AGENT` (an unset header counts as
    ``unknown`` and is blocked as well); return ``None`` (pass) otherwise."""
    ua = request.headers.get('User-Agent', 'unknown')
    if regexp_user_agent().match(ua) is None:
        return None
    return too_many_requests(network, f"bot detected, HTTP header User-Agent: {ua}")
|
|
@ -1,163 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _botdetection.ip_limit:
|
||||
|
||||
Method ``ip_limit``
|
||||
-------------------
|
||||
|
||||
The ``ip_limit`` method counts request from an IP in *sliding windows*. If
|
||||
there are to many requests in a sliding window, the request is evaluated as a
|
||||
bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_
|
||||
header. To take privacy only the hash value of an IP is stored in the redis DB
|
||||
and at least for a maximum of 10 minutes.
|
||||
|
||||
The :py:obj:`.link_token` method can be used to investigate whether a request is
|
||||
*suspicious*. To activate the :py:obj:`.link_token` method in the
|
||||
:py:obj:`.ip_limit` method add the following configuration:
|
||||
|
||||
.. code:: toml
|
||||
|
||||
[botdetection.ip_limit]
|
||||
link_token = true
|
||||
|
||||
If the :py:obj:`.link_token` method is activated and a request is *suspicious*
|
||||
the request rates are reduced:
|
||||
|
||||
- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS`
|
||||
- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS`
|
||||
|
||||
To intercept bots that get their IPs from a range of IPs, there is a
|
||||
:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored
|
||||
for a longer time. IPs stored in this sliding window have a maximum of
|
||||
:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP
|
||||
makes a request that is not suspicious, the sliding window for this IP is
|
||||
dropped.
|
||||
|
||||
.. _X-Forwarded-For:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
|
||||
|
||||
|
||||
Config
|
||||
~~~~~~
|
||||
|
||||
.. code:: toml
|
||||
|
||||
[botdetection.ip_limit]
|
||||
|
||||
# To get unlimited access in a local network, by default link-lokal addresses
|
||||
# (networks) are not monitored by the ip_limit
|
||||
filter_link_local = false
|
||||
|
||||
# activate link_token method in the ip_limit method
|
||||
link_token = false
|
||||
|
||||
Implementations
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
)
|
||||
|
||||
import flask
|
||||
import werkzeug
|
||||
|
||||
from . import ctx
|
||||
from .redislib import incr_sliding_window, drop_counter
|
||||
from . import link_token
|
||||
from . import config
|
||||
from ._helpers import (
|
||||
too_many_requests,
|
||||
logger,
|
||||
)
|
||||
|
||||
|
||||
logger = logger.getChild('ip_limit')  # child logger of this module

BURST_WINDOW: int = 20
"""Time (sec) before sliding window for *burst* requests expires."""

BURST_MAX: int = 15
"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`"""

BURST_MAX_SUSPICIOUS: int = 2
"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`"""

LONG_WINDOW: int = 600
"""Time (sec) before the longer sliding window expires."""

LONG_MAX: int = 150
"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`"""

LONG_MAX_SUSPICIOUS: int = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""

API_WINDOW: int = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""

API_MAX: int = 4
"""Maximum requests from one IP in the :py:obj:`API_WINDOW`"""

SUSPICIOUS_IP_WINDOW: int = 3600 * 24 * 30
"""Time (sec) before sliding window for one suspicious IP expires."""

SUSPICIOUS_IP_MAX: int = 3
"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`."""
|
||||
|
||||
|
||||
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Rate-limit requests per client network using sliding windows counted
    in redis; return a blocking response (429 or a redirect) or ``None`` to
    pass the request.

    NOTE(review): the redis key for the API window uses a ':' separator
    ('ip_limit.API_WINDOW:' + ip) while all other keys concatenate the IP
    directly -- looks inconsistent, but changing a key name would reset live
    counters, so it is only flagged here.
    """

    # pylint: disable=too-many-return-statements

    # link-local networks are exempt unless explicitly configured otherwise
    if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']:
        logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed)
        return None

    # non-HTML (API) requests get their own, much stricter window
    if request.args.get('format', 'html') != 'html':
        c = incr_sliding_window(ctx.redis_client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW)
        if c > API_MAX:
            return too_many_requests(network, "too many request in API_WINDOW")

    if cfg['botdetection.ip_limit.link_token']:

        suspicious = link_token.is_suspicious(network, request, True)

        if not suspicious:
            # this IP is no longer suspicious: release ip again / delete the counter of this IP
            drop_counter(ctx.redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed)
            return None

        # this IP is suspicious: count requests from this IP
        c = incr_sliding_window(
            ctx.redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW
        )
        if c > SUSPICIOUS_IP_MAX:
            logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network)
            return flask.redirect(flask.url_for('index'), code=302)

        # suspicious IPs get the reduced burst/long limits
        c = incr_sliding_window(ctx.redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
        if c > BURST_MAX_SUSPICIOUS:
            return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)")

        c = incr_sliding_window(ctx.redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
        if c > LONG_MAX_SUSPICIOUS:
            return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)")

        return None

    # vanilla limiter without extensions counts BURST_MAX and LONG_MAX
    c = incr_sliding_window(ctx.redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW)
    if c > BURST_MAX:
        return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)")

    c = incr_sliding_window(ctx.redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW)
    if c > LONG_MAX:
        return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)")

    return None
|
|
@ -1,80 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
""".. _botdetection.ip_lists:
|
||||
|
||||
Method ``ip_lists``
|
||||
-------------------
|
||||
|
||||
The ``ip_lists`` method implements IP :py:obj:`block- <block_ip>` and
|
||||
:py:obj:`pass-lists <pass_ip>`.
|
||||
|
||||
|
||||
Config
|
||||
~~~~~~
|
||||
|
||||
.. code:: toml
|
||||
|
||||
[botdetection.ip_lists]
|
||||
|
||||
pass_ip = [
|
||||
'140.238.172.132', # IPv4 of check.searx.space
|
||||
'192.168.0.0/16', # IPv4 private network
|
||||
'fe80::/10' # IPv6 linklocal
|
||||
]
|
||||
block_ip = [
|
||||
'93.184.216.34', # IPv4 of example.org
|
||||
'257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class
|
||||
]
|
||||
|
||||
|
||||
Implementations
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
"""
|
||||
# pylint: disable=unused-argument
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Tuple
|
||||
from ipaddress import (
|
||||
ip_network,
|
||||
IPv4Address,
|
||||
IPv6Address,
|
||||
)
|
||||
|
||||
from . import config
|
||||
from ._helpers import logger
|
||||
|
||||
logger = logger.getChild('ip_limit')
|
||||
|
||||
|
||||
def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
|
||||
"""Checks if the IP on the subnet is in one of the members of the
|
||||
``botdetection.ip_lists.pass_ip`` list.
|
||||
"""
|
||||
return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg)
|
||||
|
||||
|
||||
def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]:
|
||||
"""Checks if the IP on the subnet is in one of the members of the
|
||||
``botdetection.ip_lists.block_ip`` list.
|
||||
"""
|
||||
|
||||
block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg)
|
||||
if block:
|
||||
msg += " To remove IP from list, please contact the maintainer of the service."
|
||||
return block, msg
|
||||
|
||||
|
||||
def ip_is_subnet_of_member_in_list(
|
||||
real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config
|
||||
) -> Tuple[bool, str]:
|
||||
|
||||
for net in cfg.get(list_name, default=[]):
|
||||
try:
|
||||
net = ip_network(net, strict=False)
|
||||
except ValueError:
|
||||
logger.error("invalid IP %s in %s", net, list_name)
|
||||
continue
|
||||
if real_ip.version == net.version and real_ip in net:
|
||||
return True, f"IP matches {net.compressed} in {list_name}."
|
||||
return False, f"IP is not a member of an item in the f{list_name} list"
|
|
@ -1,179 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""
|
||||
Method ``link_token``
|
||||
---------------------
|
||||
|
||||
The ``link_token`` method evaluates a request as :py:obj:`suspicious
|
||||
<is_suspicious>` if the URL ``/client<token>.css`` is not requested by the
|
||||
client. By adding a random component (the token) in the URL, a bot can not send
|
||||
a ping by request a static URL.
|
||||
|
||||
.. note::
|
||||
|
||||
This method requires a redis DB and needs a HTTP X-Forwarded-For_ header.
|
||||
|
||||
.. _X-Forwarded-For:
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
|
||||
|
||||
To get in use of this method a flask URL route needs to be added:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@app.route('/client<token>.css', methods=['GET', 'POST'])
|
||||
def client_token(token=None):
|
||||
link_token.ping(request, token)
|
||||
return Response('', mimetype='text/css')
|
||||
|
||||
And in the HTML template from flask a stylesheet link is needed (the value of
|
||||
``link_token`` comes from :py:obj:`get_token`):
|
||||
|
||||
.. code:: html
|
||||
|
||||
<link rel="stylesheet"
|
||||
href="{{ url_for('client_token', token=link_token) }}"
|
||||
type="text/css" />
|
||||
|
||||
|
||||
Config
|
||||
~~~~~~
|
||||
|
||||
.. code:: toml
|
||||
|
||||
[botdetection.link_token]
|
||||
# Livetime (sec) of limiter's CSS token.
|
||||
TOKEN_LIVE_TIME = 600
|
||||
|
||||
# Livetime (sec) of the ping-key from a client (request)
|
||||
PING_LIVE_TIME = 3600
|
||||
|
||||
# Prefix of all ping-keys generated by link_token.get_ping_key
|
||||
PING_KEY = 'botdetection.link_token.PING_KEY'
|
||||
|
||||
# Key for which the current token is stored in the DB
|
||||
TOKEN_KEY = 'botdetection.link_token.TOKEN_KEY'
|
||||
|
||||
|
||||
Implementations
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from ipaddress import (
|
||||
IPv4Network,
|
||||
IPv6Network,
|
||||
ip_address,
|
||||
)
|
||||
|
||||
import string
|
||||
import random
|
||||
import flask
|
||||
|
||||
from . import ctx
|
||||
from .redislib import secret_hash
|
||||
|
||||
from ._helpers import (
|
||||
logger,
|
||||
get_network,
|
||||
get_real_ip,
|
||||
)
|
||||
|
||||
|
||||
logger = logger.getChild('link_token')
|
||||
|
||||
|
||||
PING_KEY = 'botdetection.link_token.PING_KEY'
|
||||
"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""
|
||||
|
||||
TOKEN_KEY = 'botdetection.link_token.TOKEN_KEY'
|
||||
"""Key for which the current token is stored in the DB"""
|
||||
|
||||
|
||||
def _cfg(name):
|
||||
return ctx.cfg.get(f'botdetection.link_token.{name}')
|
||||
|
||||
|
||||
def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False):
|
||||
"""Checks whether a valid ping is exists for this (client) network, if not
|
||||
this request is rated as *suspicious*. If a valid ping exists and argument
|
||||
``renew`` is ``True`` the expire time of this ping is reset to
|
||||
``PING_LIVE_TIME``.
|
||||
|
||||
"""
|
||||
if not ctx.redis_client:
|
||||
return False
|
||||
|
||||
ping_key = get_ping_key(network, request)
|
||||
if not ctx.redis_client.get(ping_key):
|
||||
logger.info("missing ping (IP: %s) / request: %s", network.compressed, ping_key)
|
||||
return True
|
||||
|
||||
if renew:
|
||||
ctx.redis_client.set(ping_key, 1, ex=_cfg('PING_LIVE_TIME'))
|
||||
|
||||
logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key)
|
||||
return False
|
||||
|
||||
|
||||
def ping(request: flask.Request, token: str):
|
||||
"""This function is called by a request to URL ``/client<token>.css``. If
|
||||
``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB.
|
||||
The expire time of this ping-key is ``PING_LIVE_TIME``.
|
||||
|
||||
"""
|
||||
if not ctx.redis_client:
|
||||
return
|
||||
if not token_is_valid(token):
|
||||
return
|
||||
|
||||
real_ip = ip_address(get_real_ip(request))
|
||||
network = get_network(real_ip, ctx.cfg)
|
||||
|
||||
ping_key = get_ping_key(network, request)
|
||||
logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key)
|
||||
|
||||
ctx.redis_client.set(ping_key, 1, ex=_cfg('PING_LIVE_TIME'))
|
||||
|
||||
|
||||
def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str:
|
||||
"""Generates a hashed key that fits (more or less) to a *WEB-browser
|
||||
session* in a network."""
|
||||
return (
|
||||
PING_KEY
|
||||
+ "["
|
||||
+ secret_hash(
|
||||
network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '')
|
||||
)
|
||||
+ "]"
|
||||
)
|
||||
|
||||
|
||||
def token_is_valid(token) -> bool:
|
||||
valid = token == get_token()
|
||||
logger.debug("token is valid --> %s", valid)
|
||||
return valid
|
||||
|
||||
|
||||
def get_token() -> str:
|
||||
"""Returns current token. If there is no currently active token a new token
|
||||
is generated randomly and stored in the redis DB.
|
||||
|
||||
Config:
|
||||
|
||||
- ``TOKEN_LIVE_TIME``
|
||||
- ``TOKEN_KEY``
|
||||
|
||||
"""
|
||||
if not ctx.redis_client:
|
||||
# This function is also called when limiter is inactive / no redis DB
|
||||
# (see render function in webapp.py)
|
||||
return '12345678'
|
||||
token_key = _cfg('TOKEN_KEY')
|
||||
token = ctx.redis_client.get(token_key)
|
||||
if token:
|
||||
token = token.decode('UTF-8')
|
||||
else:
|
||||
token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16))
|
||||
ctx.redis_client.set(token_key, token, ex=_cfg('TOKEN_LIVE_TIME'))
|
||||
return token
|
|
@ -1,263 +0,0 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""A collection of convenient functions and redis/lua scripts.
|
||||
|
||||
This code was partial inspired by the `Bullet-Proofing Lua Scripts in RedisPy`_
|
||||
article.
|
||||
|
||||
.. _Bullet-Proofing Lua Scripts in RedisPy:
|
||||
https://redis.com/blog/bullet-proofing-lua-scripts-in-redispy/
|
||||
|
||||
Config
|
||||
~~~~~~
|
||||
|
||||
.. code:: toml
|
||||
|
||||
[botdetection.redis]
|
||||
|
||||
# FQDN of a function definition. A function with which the DB keys of the Redis
|
||||
# DB are to be annonymized.
|
||||
secret_hash = ''
|
||||
|
||||
# A prefix to all keys store by the botdetection in the redis DB
|
||||
REDIS_KEY_PREFIX = 'botdetection_'
|
||||
|
||||
|
||||
Implementations
|
||||
~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from . import ctx
|
||||
|
||||
REDIS_KEY_PREFIX = 'botdetection'
|
||||
"""A prefix applied to all keys store by the botdetection in the redis DB."""
|
||||
|
||||
LUA_SCRIPT_STORAGE = {}
|
||||
"""A global dictionary to cache client's ``Script`` objects, used by
|
||||
:py:obj:`lua_script_storage`"""
|
||||
|
||||
|
||||
def secret_hash(name: str) -> str:
|
||||
"""Returns a annonymized name if ``secret_hash`` is configured, otherwise
|
||||
the ``name`` is returned unchanged."""
|
||||
func = ctx.cfg.pyobj('botdetection.redis.secret_hash', default=None) # type: ignore
|
||||
if not func:
|
||||
return name
|
||||
return func(name)
|
||||
|
||||
|
||||
def _prefix(val: str | None = None) -> str:
|
||||
if val is None:
|
||||
val = ctx.cfg.get('botdetection.redis.REDIS_KEY_PREFIX', default=REDIS_KEY_PREFIX) # type: ignore
|
||||
return str(val)
|
||||
|
||||
|
||||
def lua_script_storage(client, script):
|
||||
"""Returns a redis :py:obj:`Script
|
||||
<redis.commands.core.CoreCommands.register_script>` instance.
|
||||
|
||||
Due to performance reason the ``Script`` object is instantiated only once
|
||||
for a client (``client.register_script(..)``) and is cached in
|
||||
:py:obj:`LUA_SCRIPT_STORAGE`.
|
||||
|
||||
"""
|
||||
|
||||
# redis connection can be closed, lets use the id() of the redis connector
|
||||
# as key in the script-storage:
|
||||
client_id = id(client)
|
||||
|
||||
if LUA_SCRIPT_STORAGE.get(client_id) is None:
|
||||
LUA_SCRIPT_STORAGE[client_id] = {}
|
||||
|
||||
if LUA_SCRIPT_STORAGE[client_id].get(script) is None:
|
||||
LUA_SCRIPT_STORAGE[client_id][script] = client.register_script(script)
|
||||
|
||||
return LUA_SCRIPT_STORAGE[client_id][script]
|
||||
|
||||
|
||||
PURGE_BY_PREFIX = """
|
||||
local prefix = tostring(ARGV[1])
|
||||
for i, name in ipairs(redis.call('KEYS', prefix .. '*')) do
|
||||
redis.call('EXPIRE', name, 0)
|
||||
end
|
||||
"""
|
||||
|
||||
|
||||
def purge_by_prefix(client, prefix: str | None):
|
||||
"""Purge all keys with ``prefix`` from database.
|
||||
|
||||
Queries all keys in the database by the given prefix and set expire time to
|
||||
zero. The default prefix will drop all keys which has been set by
|
||||
:py:obj:`REDIS_KEY_PREFIX`.
|
||||
|
||||
The implementation is the lua script from string :py:obj:`PURGE_BY_PREFIX`.
|
||||
The lua script uses EXPIRE_ instead of DEL_: if there are a lot keys to
|
||||
delete and/or their values are big, `DEL` could take more time and blocks
|
||||
the command loop while `EXPIRE` turns back immediate.
|
||||
|
||||
:param prefix: prefix of the key to delete (default: :py:obj:`REDIS_KEY_PREFIX`)
|
||||
:type name: str
|
||||
|
||||
.. _EXPIRE: https://redis.io/commands/expire/
|
||||
.. _DEL: https://redis.io/commands/del/
|
||||
|
||||
"""
|
||||
script = lua_script_storage(client, PURGE_BY_PREFIX)
|
||||
script(args=[_prefix(prefix)])
|
||||
|
||||
|
||||
INCR_COUNTER = """
|
||||
local limit = tonumber(ARGV[1])
|
||||
local expire = tonumber(ARGV[2])
|
||||
local c_name = KEYS[1]
|
||||
|
||||
local c = redis.call('GET', c_name)
|
||||
|
||||
if not c then
|
||||
c = redis.call('INCR', c_name)
|
||||
if expire > 0 then
|
||||
redis.call('EXPIRE', c_name, expire)
|
||||
end
|
||||
else
|
||||
c = tonumber(c)
|
||||
if limit == 0 or c < limit then
|
||||
c = redis.call('INCR', c_name)
|
||||
end
|
||||
end
|
||||
return c
|
||||
"""
|
||||
|
||||
|
||||
def incr_counter(client, name: str, limit: int = 0, expire: int = 0):
|
||||
"""Increment a counter and return the new value.
|
||||
|
||||
If counter with redis key :py:obj:`REDIS_KEY_PREFIX` + ``counter_<name>``
|
||||
does not exists it is created with initial value 1 returned. The
|
||||
replacement ``<name>`` is a *secret hash* of the value from argument
|
||||
``name`` (see :py:func:`secret_hash`).
|
||||
|
||||
The implementation of the redis counter is the lua script from string
|
||||
:py:obj:`INCR_COUNTER`.
|
||||
|
||||
:param name: name of the counter
|
||||
:type name: str
|
||||
|
||||
:param expire: live-time of the counter in seconds (default ``None`` means
|
||||
infinite).
|
||||
:type expire: int / see EXPIRE_
|
||||
|
||||
:param limit: limit where the counter stops to increment (default ``None``)
|
||||
:type limit: int / limit is 2^64 see INCR_
|
||||
|
||||
:return: value of the incremented counter
|
||||
:type return: int
|
||||
|
||||
.. _EXPIRE: https://redis.io/commands/expire/
|
||||
.. _INCR: https://redis.io/commands/incr/
|
||||
|
||||
A simple demo of a counter with expire time and limit::
|
||||
|
||||
>>> for i in range(6):
|
||||
... i, incr_counter(client, "foo", 3, 5) # max 3, duration 5 sec
|
||||
... time.sleep(1) # from the third call on max has been reached
|
||||
...
|
||||
(0, 1)
|
||||
(1, 2)
|
||||
(2, 3)
|
||||
(3, 3)
|
||||
(4, 3)
|
||||
(5, 1)
|
||||
|
||||
"""
|
||||
script = lua_script_storage(client, INCR_COUNTER)
|
||||
name = _prefix() + "counter_" + secret_hash(name)
|
||||
c = script(args=[limit, expire], keys=[name])
|
||||
return c
|
||||
|
||||
|
||||
def drop_counter(client, name):
|
||||
"""Drop counter with redis key :py:obj:`REDIS_KEY_PREFIX` +
|
||||
``counter_<name>``
|
||||
|
||||
The replacement ``<name>`` is a *secret hash* of the value from argument
|
||||
``name`` (see :py:func:`incr_counter` and :py:func:`incr_sliding_window`).
|
||||
|
||||
"""
|
||||
name = _prefix() + "counter_" + secret_hash(name)
|
||||
client.delete(name)
|
||||
|
||||
|
||||
INCR_SLIDING_WINDOW = """
|
||||
local expire = tonumber(ARGV[1])
|
||||
local name = KEYS[1]
|
||||
local current_time = redis.call('TIME')
|
||||
|
||||
redis.call('ZREMRANGEBYSCORE', name, 0, current_time[1] - expire)
|
||||
redis.call('ZADD', name, current_time[1], current_time[1] .. current_time[2])
|
||||
local result = redis.call('ZCOUNT', name, 0, current_time[1] + 1)
|
||||
redis.call('EXPIRE', name, expire)
|
||||
return result
|
||||
"""
|
||||
|
||||
|
||||
def incr_sliding_window(client, name: str, duration: int):
|
||||
"""Increment a sliding-window counter and return the new value.
|
||||
|
||||
If counter with redis key :py:obj:`REDIS_KEY_PREFIX` + ``counter_<name>``
|
||||
does not exists it is created with initial value 1 returned. The
|
||||
replacement ``<name>`` is a *secret hash* of the value from argument
|
||||
``name`` (see :py:func:`secret_hash`).
|
||||
|
||||
:param name: name of the counter
|
||||
:type name: str
|
||||
|
||||
:param duration: live-time of the sliding window in seconds
|
||||
:typeduration: int
|
||||
|
||||
:return: value of the incremented counter
|
||||
:type return: int
|
||||
|
||||
The implementation of the redis counter is the lua script from string
|
||||
:py:obj:`INCR_SLIDING_WINDOW`. The lua script uses `sorted sets in Redis`_
|
||||
to implement a sliding window for the redis key :py:obj:`REDIS_KEY_PREFIX` +
|
||||
``counter_<name>`` (ZADD_). The current TIME_ is used to score the items in
|
||||
the sorted set and the time window is moved by removing items with a score
|
||||
lower current time minus *duration* time (ZREMRANGEBYSCORE_).
|
||||
|
||||
The EXPIRE_ time (the duration of the sliding window) is refreshed on each
|
||||
call (increment) and if there is no call in this duration, the sorted
|
||||
set expires from the redis DB.
|
||||
|
||||
The return value is the amount of items in the sorted set (ZCOUNT_), what
|
||||
means the number of calls in the sliding window.
|
||||
|
||||
.. _Sorted sets in Redis:
|
||||
https://redis.com/ebook/part-1-getting-started/chapter-1-getting-to-know-redis/1-2-what-redis-data-structures-look-like/1-2-5-sorted-sets-in-redis/
|
||||
.. _TIME: https://redis.io/commands/time/
|
||||
.. _ZADD: https://redis.io/commands/zadd/
|
||||
.. _EXPIRE: https://redis.io/commands/expire/
|
||||
.. _ZREMRANGEBYSCORE: https://redis.io/commands/zremrangebyscore/
|
||||
.. _ZCOUNT: https://redis.io/commands/zcount/
|
||||
|
||||
A simple demo of the sliding window::
|
||||
|
||||
>>> for i in range(5):
|
||||
... incr_sliding_window(client, "foo", 3) # duration 3 sec
|
||||
... time.sleep(1) # from the third call (second) on the window is moved
|
||||
...
|
||||
1
|
||||
2
|
||||
3
|
||||
3
|
||||
3
|
||||
>>> time.sleep(3) # wait until expire
|
||||
>>> incr_sliding_window(client, "foo", 3)
|
||||
1
|
||||
|
||||
"""
|
||||
script = lua_script_storage(client, INCR_SLIDING_WINDOW)
|
||||
name = _prefix() + "counter_" + secret_hash(name)
|
||||
c = script(args=[duration], keys=[name])
|
||||
return c
|
|
@ -1,58 +0,0 @@
|
|||
[real_ip]
|
||||
|
||||
# Number of values to trust for X-Forwarded-For.
|
||||
|
||||
x_for = 1
|
||||
|
||||
# The prefix defines the number of leading bits in an address that are compared
|
||||
# to determine whether or not an address is part of a (client) network.
|
||||
|
||||
ipv4_prefix = 32
|
||||
ipv6_prefix = 48
|
||||
|
||||
[botdetection.redis]
|
||||
|
||||
# FQDN of a function definition. A function with which the DB keys of the Redis
|
||||
# DB are to be annonymized.
|
||||
secret_hash = ''
|
||||
|
||||
# A prefix to all keys store by the botdetection in the redis DB
|
||||
REDIS_KEY_PREFIX = 'botdetection_'
|
||||
|
||||
[botdetection.ip_limit]
|
||||
|
||||
# To get unlimited access in a local network, by default link-lokal addresses
|
||||
# (networks) are not monitored by the ip_limit
|
||||
filter_link_local = false
|
||||
|
||||
# activate link_token method in the ip_limit method
|
||||
link_token = false
|
||||
|
||||
[botdetection.link_token]
|
||||
# Livetime (sec) of limiter's CSS token.
|
||||
TOKEN_LIVE_TIME = 600
|
||||
|
||||
# Livetime (sec) of the ping-key from a client (request)
|
||||
PING_LIVE_TIME = 3600
|
||||
|
||||
# Prefix of all ping-keys generated by link_token.get_ping_key
|
||||
PING_KEY = 'botdetection.link_token.PING_KEY'
|
||||
|
||||
# Key for which the current token is stored in the DB
|
||||
TOKEN_KEY = 'botdetection.link_token.TOKEN_KEY'
|
||||
|
||||
[botdetection.ip_lists]
|
||||
|
||||
# In the limiter, the ip_lists method has priority over all other methods -> if
|
||||
# an IP is in the pass_ip list, it has unrestricted access and it is also not
|
||||
# checked if e.g. the "user agent" suggests a bot (e.g. curl).
|
||||
|
||||
block_ip = [
|
||||
# '93.184.216.34', # IPv4 of example.org
|
||||
# '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class
|
||||
]
|
||||
|
||||
pass_ip = [
|
||||
# '192.168.0.0/16', # IPv4 private network
|
||||
# 'fe80::/10' # IPv6 linklocal / wins over botdetection.ip_limit.filter_link_local
|
||||
]
|
|
@ -101,12 +101,8 @@ from ipaddress import ip_address
|
|||
import flask
|
||||
import werkzeug
|
||||
|
||||
from searx import (
|
||||
logger,
|
||||
redisdb,
|
||||
)
|
||||
from searx import botdetection
|
||||
from searx.botdetection import (
|
||||
import botdetection
|
||||
from botdetection import (
|
||||
http_accept,
|
||||
http_accept_encoding,
|
||||
http_accept_language,
|
||||
|
@ -118,6 +114,11 @@ from searx.botdetection import (
|
|||
dump_request,
|
||||
)
|
||||
|
||||
from searx import (
|
||||
logger,
|
||||
redisdb,
|
||||
)
|
||||
|
||||
# the configuration are limiter.toml and "limiter" in settings.yml so, for
|
||||
# coherency, the logger is "limiter"
|
||||
logger = logger.getChild('limiter')
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
import re
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx.botdetection._helpers import get_real_ip
|
||||
from botdetection._helpers import get_real_ip
|
||||
|
||||
name = gettext('Self Information')
|
||||
description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".')
|
||||
|
|
|
@ -17,8 +17,9 @@ import time
|
|||
import importlib
|
||||
from typing import Callable
|
||||
|
||||
from botdetection.redislib import lua_script_storage
|
||||
|
||||
from searx.redisdb import client as get_redis_client
|
||||
from searx.botdetection.redislib import lua_script_storage
|
||||
|
||||
|
||||
logger = logging.getLogger('searx.search.checker')
|
||||
|
|
|
@ -49,6 +49,8 @@ from flask_babel import (
|
|||
format_decimal,
|
||||
)
|
||||
|
||||
import botdetection
|
||||
|
||||
from searx import (
|
||||
logger,
|
||||
get_setting,
|
||||
|
@ -58,7 +60,6 @@ from searx import (
|
|||
|
||||
from searx import infopage
|
||||
from searx import limiter
|
||||
from searx import botdetection
|
||||
|
||||
from searx.data import ENGINE_DESCRIPTIONS
|
||||
from searx.results import Timing
|
||||
|
|
Loading…
Reference in New Issue