From a2ee107dd79f180473e427f8c10585e4a8ed29ac Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 2 Nov 2023 15:35:38 +0100 Subject: [PATCH] [mod] botdetection - outsourcing to an external project Signed-off-by: Markus Heiser --- docs/admin/settings/settings_server.rst | 2 +- docs/conf.py | 1 + docs/src/searx.botdetection.rst | 62 ---- requirements.txt | 2 +- searx/botdetection/__init__.py | 48 --- searx/botdetection/_helpers.py | 129 ------- searx/botdetection/config.py | 378 --------------------- searx/botdetection/http_accept.py | 39 --- searx/botdetection/http_accept_encoding.py | 41 --- searx/botdetection/http_accept_language.py | 35 -- searx/botdetection/http_connection.py | 37 -- searx/botdetection/http_user_agent.py | 67 ---- searx/botdetection/ip_limit.py | 163 --------- searx/botdetection/ip_lists.py | 80 ----- searx/botdetection/link_token.py | 179 ---------- searx/botdetection/redislib.py | 263 -------------- searx/botdetection/schema.toml | 58 ---- searx/limiter.py | 13 +- searx/plugins/self_info.py | 2 +- searx/search/checker/scheduler.py | 3 +- searx/webapp.py | 3 +- 21 files changed, 15 insertions(+), 1590 deletions(-) delete mode 100644 docs/src/searx.botdetection.rst delete mode 100644 searx/botdetection/__init__.py delete mode 100644 searx/botdetection/_helpers.py delete mode 100644 searx/botdetection/config.py delete mode 100644 searx/botdetection/http_accept.py delete mode 100644 searx/botdetection/http_accept_encoding.py delete mode 100644 searx/botdetection/http_accept_language.py delete mode 100644 searx/botdetection/http_connection.py delete mode 100644 searx/botdetection/http_user_agent.py delete mode 100644 searx/botdetection/ip_limit.py delete mode 100644 searx/botdetection/ip_lists.py delete mode 100644 searx/botdetection/link_token.py delete mode 100644 searx/botdetection/redislib.py delete mode 100644 searx/botdetection/schema.toml diff --git a/docs/admin/settings/settings_server.rst b/docs/admin/settings/settings_server.rst index 453c4a23f..03c6d9dd3 100644 --- a/docs/admin/settings/settings_server.rst +++ b/docs/admin/settings/settings_server.rst @@ -42,7 +42,7 @@ - ``server: limiter`` option :ref:`see below ` - ``server: pass_searxng_org`` option :ref:`see below ` - - :py:obj:`searx.botdetection.link_token` in the :ref:`limiter` + - :py:obj:`botdetection.link_token` in the :ref:`limiter` .. _activate limiter: diff --git a/docs/conf.py b/docs/conf.py index 2ed85a800..8a7e69ae5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -150,6 +150,7 @@ intersphinx_mapping = { "linuxdoc" : ("https://return42.github.io/linuxdoc/", None), "sphinx" : ("https://www.sphinx-doc.org/en/master/", None), "redis": ('https://redis.readthedocs.io/en/stable/', None), + "botdetection" : ("https://searxng.org/botdetection/", None), } issues_github_path = "searxng/searxng" diff --git a/docs/src/searx.botdetection.rst b/docs/src/searx.botdetection.rst deleted file mode 100644 index 04cb81dfd..000000000 --- a/docs/src/searx.botdetection.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _botdetection: - -============= -Bot Detection -============= - -.. contents:: - :depth: 2 - :local: - :backlinks: entry - -.. automodule:: searx.botdetection - :members: - -.. _botdetection ip_lists: - -IP lists -======== - -.. automodule:: searx.botdetection.ip_lists - :members: - - -.. _botdetection rate limit: - -Rate limit -========== - -.. automodule:: searx.botdetection.ip_limit - :members: - -.. automodule:: searx.botdetection.link_token - :members: - - -.. 
_botdetection probe headers: - -Probe HTTP headers -================== - -.. automodule:: searx.botdetection.http_accept - :members: - -.. automodule:: searx.botdetection.http_accept_encoding - :members: - -.. automodule:: searx.botdetection.http_accept_language - :members: - -.. automodule:: searx.botdetection.http_connection - :members: - -.. automodule:: searx.botdetection.http_user_agent - :members: - -.. _botdetection config: - -Config -====== - -.. automodule:: searx.botdetection.config - :members: diff --git a/requirements.txt b/requirements.txt index 10df212b4..01f9292a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,4 @@ redis==5.0.1 markdown-it-py==3.0.0 typing_extensions==4.9.0 fasttext-predict==0.9.2.2 -pytomlpp==1.0.13 +botdetection==20240226.0 diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py deleted file mode 100644 index 3047ca997..000000000 --- a/searx/botdetection/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _botdetection src: - -Implementations used for bot detection. - -""" -from __future__ import annotations - -from dataclasses import dataclass -import pathlib - -import redis -from .config import Config - -from ._helpers import logger -from ._helpers import dump_request -from ._helpers import get_real_ip -from ._helpers import get_network -from ._helpers import too_many_requests - -logger = logger.getChild('init') - -__all__ = ['dump_request', 'get_network', 'get_real_ip', 'too_many_requests'] - -CFG_SCHEMA = pathlib.Path(__file__).parent / "schema.toml" -"""Base configuration (schema) of the botdetection.""" - -CFG_DEPRECATED = { - # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." -} - - -@dataclass -class Context: - """A global context of the botdetection""" - - # pylint: disable=too-few-public-methods - - redis_client: redis.Redis | None = None - cfg: Config = Config.from_toml(schema_file=CFG_SCHEMA, cfg_file=None, deprecated=CFG_DEPRECATED) - - def init(self, toml_cfg: pathlib.Path, redis_client: redis.Redis | None): - self.redis_client = redis_client - self.cfg.load_toml(toml_cfg) - - -ctx = Context() diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py deleted file mode 100644 index d1b3b0cfe..000000000 --- a/searx/botdetection/_helpers.py +++ /dev/null @@ -1,129 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=missing-module-docstring, invalid-name -from __future__ import annotations - -import logging -from ipaddress import ( - IPv4Network, - IPv6Network, - IPv4Address, - IPv6Address, - ip_network, -) -import flask -import werkzeug - -from . 
import config - -logger = logging.getLogger('botdetection') - - -def dump_request(request: flask.Request): - return ( - request.path - + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') - + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') - + " || form: %s" % request.form - + " || Accept: %s" % request.headers.get('Accept') - + " || Accept-Language: %s" % request.headers.get('Accept-Language') - + " || Accept-Encoding: %s" % request.headers.get('Accept-Encoding') - + " || Content-Type: %s" % request.headers.get('Content-Type') - + " || Content-Length: %s" % request.headers.get('Content-Length') - + " || Connection: %s" % request.headers.get('Connection') - + " || User-Agent: %s" % request.headers.get('User-Agent') - ) - - -def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: - """Returns a HTTP 429 response object and writes a ERROR message to the - 'botdetection' logger. This function is used in part by the filter methods - to return the default ``Too Many Requests`` response. - - """ - - logger.debug("BLOCK %s: %s", network.compressed, log_msg) - return flask.make_response(('Too Many Requests', 429)) - - -def get_network(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> IPv4Network | IPv6Network: - """Returns the (client) network of whether the real_ip is part of.""" - - if real_ip.version == 6: - prefix = cfg['real_ip.ipv6_prefix'] - else: - prefix = cfg['real_ip.ipv4_prefix'] - network = ip_network(f"{real_ip}/{prefix}", strict=False) - # logger.debug("get_network(): %s", network.compressed) - return network - - -_logged_errors = [] - - -def _log_error_only_once(err_msg): - if err_msg not in _logged_errors: - logger.error(err_msg) - _logged_errors.append(err_msg) - - -def get_real_ip(request: flask.Request) -> str: - """Returns real IP of the request. Since not all proxies set all the HTTP - headers and incoming headers can be faked it may happen that the IP cannot - be determined correctly. - - .. sidebar:: :py:obj:`flask.Request.remote_addr` - - SearXNG uses Werkzeug's ProxyFix_ (with it default ``x_for=1``). - - This function tries to get the remote IP in the order listed below, - additional some tests are done and if inconsistencies or errors are - detected, they are logged. - - The remote IP of the request is taken from (first match): - - - X-Forwarded-For_ header - - `X-real-IP header `__ - - :py:obj:`flask.Request.remote_addr` - - .. _ProxyFix: - https://werkzeug.palletsprojects.com/middleware/proxy_fix/ - - .. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - - """ - - forwarded_for = request.headers.get("X-Forwarded-For") - real_ip = request.headers.get('X-Real-IP') - remote_addr = request.remote_addr - # logger.debug( - # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr - # ) - - if not forwarded_for: - _log_error_only_once("X-Forwarded-For header is not set!") - else: - from . 
import ctx # pylint: disable=import-outside-toplevel, cyclic-import - - forwarded_for = [x.strip() for x in forwarded_for.split(',')] - x_for: int = ctx.cfg['real_ip.x_for'] # type: ignore - forwarded_for = forwarded_for[-min(len(forwarded_for), x_for)] - - if not real_ip: - _log_error_only_once("X-Real-IP header is not set!") - - if forwarded_for and real_ip and forwarded_for != real_ip: - logger.warning("IP from X-Real-IP (%s) is not equal to IP from X-Forwarded-For (%s)", real_ip, forwarded_for) - - if forwarded_for and remote_addr and forwarded_for != remote_addr: - logger.warning( - "IP from WSGI environment (%s) is not equal to IP from X-Forwarded-For (%s)", remote_addr, forwarded_for - ) - - if real_ip and remote_addr and real_ip != remote_addr: - logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) - - request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' - # logger.debug("get_real_ip() -> %s", request_ip) - return request_ip diff --git a/searx/botdetection/config.py b/searx/botdetection/config.py deleted file mode 100644 index c85696092..000000000 --- a/searx/botdetection/config.py +++ /dev/null @@ -1,378 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Configuration class :py:class:`Config` with deep-update, schema validation -and deprecated names. - -The :py:class:`Config` class implements a configuration that is based on -structured dictionaries. The configuration schema is defined in a dictionary -structure and the configuration data is given in a dictionary structure. -""" -from __future__ import annotations -from typing import Any - -import copy -import typing -import logging -import pathlib -import pytomlpp as toml - -__all__ = ['Config', 'UNSET', 'SchemaIssue'] - -log = logging.getLogger(__name__) - - -class FALSE: - """Class of ``False`` singelton""" - - # pylint: disable=multiple-statements - def __init__(self, msg): - self.msg = msg - - def __bool__(self): - return False - - def __str__(self): - return self.msg - - __repr__ = __str__ - - -UNSET = FALSE('') - - -class SchemaIssue(ValueError): - """Exception to store and/or raise a message from a schema issue.""" - - def __init__(self, level: typing.Literal['warn', 'invalid'], msg: str): - self.level = level - super().__init__(msg) - - def __str__(self): - return f"[cfg schema {self.level}] {self.args[0]}" - - -class Config: - """Base class used for configuration""" - - UNSET = UNSET - - @classmethod - def from_toml(cls, schema_file: pathlib.Path, cfg_file: pathlib.Path | None, deprecated: dict) -> Config: - # init schema - log.debug("load schema file: %s", schema_file) - cfg = cls(cfg_schema=toml.load(schema_file), deprecated=deprecated) - if cfg_file is None: - return cfg - if not cfg_file.exists(): - log.warning("missing config file: %s", cfg_file) - return cfg - # load configuration from toml file - cfg.load_toml(cfg_file) - return cfg - - def load_toml(self, cfg_file: pathlib.Path): - log.debug("load config file: %s", cfg_file) - try: - upd_cfg = toml.load(cfg_file) - except toml.DecodeError as exc: - msg = str(exc).replace('\t', '').replace('\n', ' ') - log.error("%s: %s", cfg_file, msg) - raise - - is_valid, issue_list = self.validate(upd_cfg) - for msg in issue_list: - log.error(str(msg)) - if not is_valid: - raise TypeError(f"schema of {cfg_file} is invalid!") - self.update(upd_cfg) - - def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]): - """Construtor of class Config. 
- - :param cfg_schema: Schema of the configuration - :param deprecated: dictionary that maps deprecated configuration names to a messages - - These values are needed for validation, see :py:obj:`validate`. - - """ - self.cfg_schema = cfg_schema - self.deprecated = deprecated - self.cfg = copy.deepcopy(cfg_schema) - - def __getitem__(self, key: str) -> Any: - return self.get(key) - - def validate(self, cfg: dict): - """Validation of dictionary ``cfg`` on :py:obj:`Config.SCHEMA`. - Validation is done by :py:obj:`validate`.""" - - return validate(self.cfg_schema, cfg, self.deprecated) - - def update(self, upd_cfg: dict): - """Update this configuration by ``upd_cfg``.""" - - dict_deepupdate(self.cfg, upd_cfg) - - def default(self, name: str): - """Returns default value of field ``name`` in ``self.cfg_schema``.""" - return value(name, self.cfg_schema) - - def get(self, name: str, default: Any = UNSET, replace: bool = True) -> Any: - """Returns the value to which ``name`` points in the configuration. - - If there is no such ``name`` in the config and the ``default`` is - :py:obj:`UNSET`, a :py:obj:`KeyError` is raised. - """ - - parent = self._get_parent_dict(name) - val = parent.get(name.split('.')[-1], UNSET) - if val is UNSET: - if default is UNSET: - raise KeyError(name) - val = default - - if replace and isinstance(val, str): - val = val % self - return val - - def set(self, name: str, val): - """Set the value to which ``name`` points in the configuration. - - If there is no such ``name`` in the config, a :py:obj:`KeyError` is - raised. - """ - parent = self._get_parent_dict(name) - parent[name.split('.')[-1]] = val - - def _get_parent_dict(self, name): - parent_name = '.'.join(name.split('.')[:-1]) - if parent_name: - parent = value(parent_name, self.cfg) - else: - parent = self.cfg - if (parent is UNSET) or (not isinstance(parent, dict)): - raise KeyError(parent_name) - return parent - - def path(self, name: str, default: Any = UNSET): - """Get a :py:class:`pathlib.Path` object from a config string.""" - - val = self.get(name, default) - if val is UNSET: - if default is UNSET: - raise KeyError(name) - return default - return pathlib.Path(str(val)) - - def pyobj(self, name, default: Any = UNSET): - """Get python object refered by full qualiffied name (FQN) in the config - string.""" - - fqn = self.get(name, default) - if fqn is UNSET: - if default is UNSET: - raise KeyError(name) - return default - (modulename, name) = str(fqn).rsplit('.', 1) - m = __import__(modulename, {}, {}, [name], 0) - return getattr(m, name) - - -# working with dictionaries - - -def value(name: str, data_dict: dict): - """Returns the value to which ``name`` points in the ``dat_dict``. - - .. code: python - - >>> data_dict = { - "foo": {"bar": 1 }, - "bar": {"foo": 2 }, - "foobar": [1, 2, 3], - } - >>> value('foobar', data_dict) - [1, 2, 3] - >>> value('foo.bar', data_dict) - 1 - >>> value('foo.bar.xxx', data_dict) - - - """ - - ret_val = data_dict - for part in name.split('.'): - if isinstance(ret_val, dict): - ret_val = ret_val.get(part, UNSET) - if ret_val is UNSET: - break - return ret_val - - -def validate( - schema_dict: typing.Dict, data_dict: typing.Dict, deprecated: typing.Dict[str, str] -) -> typing.Tuple[bool, list]: - - """Deep validation of dictionary in ``data_dict`` against dictionary in - ``schema_dict``. Argument deprecated is a dictionary that maps deprecated - configuration names to a messages:: - - deprecated = { - "foo.bar" : "config 'foo.bar' is deprecated, use 'bar.foo'", - "..." : "..." 
- } - - The function returns a python tuple ``(is_valid, issue_list)``: - - ``is_valid``: - A bool value indicating ``data_dict`` is valid or not. - - ``issue_list``: - A list of messages (:py:obj:`SchemaIssue`) from the validation:: - - [schema warn] data_dict: deprecated 'fontlib.foo': - [schema invalid] data_dict: key unknown 'fontlib.foo' - [schema invalid] data_dict: type mismatch 'fontlib.foo': expected ..., is ... - - If ``schema_dict`` or ``data_dict`` is not a dictionary type a - :py:obj:`SchemaIssue` is raised. - - """ - names = [] - is_valid = True - issue_list = [] - - if not isinstance(schema_dict, dict): - raise SchemaIssue('invalid', "schema_dict is not a dict type") - if not isinstance(data_dict, dict): - raise SchemaIssue('invalid', f"data_dict issue{'.'.join(names)} is not a dict type") - - is_valid, issue_list = _validate(names, issue_list, schema_dict, data_dict, deprecated) - return is_valid, issue_list - - -def _validate( - names: typing.List, - issue_list: typing.List, - schema_dict: typing.Dict, - data_dict: typing.Dict, - deprecated: typing.Dict[str, str], -) -> typing.Tuple[bool, typing.List]: - - is_valid = True - - for key, data_value in data_dict.items(): - - names.append(key) - name = '.'.join(names) - - deprecated_msg = deprecated.get(name) - # print("XXX %s: key %s // data_value: %s" % (name, key, data_value)) - if deprecated_msg: - issue_list.append(SchemaIssue('warn', f"data_dict '{name}': deprecated - {deprecated_msg}")) - - schema_value = value(name, schema_dict) - # print("YYY %s: key %s // schema_value: %s" % (name, key, schema_value)) - if schema_value is UNSET: - if not deprecated_msg: - issue_list.append(SchemaIssue('invalid', f"data_dict '{name}': key unknown in schema_dict")) - is_valid = False - - elif type(schema_value) != type(data_value): # pylint: disable=unidiomatic-typecheck - issue_list.append( - SchemaIssue( - 'invalid', - (f"data_dict: type mismatch '{name}':" f" expected {type(schema_value)}, is: {type(data_value)}"), - ) - ) - is_valid = False - - elif isinstance(data_value, dict): - _valid, _ = _validate(names, issue_list, schema_dict, data_value, deprecated) - is_valid = is_valid and _valid - names.pop() - - return is_valid, issue_list - - -def dict_deepupdate(base_dict: dict, upd_dict: dict, names=None): - """Deep-update of dictionary in ``base_dict`` by dictionary in ``upd_dict``. - - For each ``upd_key`` & ``upd_val`` pair in ``upd_dict``: - - 0. If types of ``base_dict[upd_key]`` and ``upd_val`` do not match raise a - :py:obj:`TypeError`. - - 1. If ``base_dict[upd_key]`` is a dict: recursively deep-update it by ``upd_val``. - - 2. If ``base_dict[upd_key]`` not exist: set ``base_dict[upd_key]`` from a - (deep-) copy of ``upd_val``. - - 3. If ``upd_val`` is a list, extend list in ``base_dict[upd_key]`` by the - list in ``upd_val``. - - 4. If ``upd_val`` is a set, update set in ``base_dict[upd_key]`` by set in - ``upd_val``. 
- """ - # pylint: disable=too-many-branches - if not isinstance(base_dict, dict): - raise TypeError("argument 'base_dict' is not a ditionary type") - if not isinstance(upd_dict, dict): - raise TypeError("argument 'upd_dict' is not a ditionary type") - - if names is None: - names = [] - - for upd_key, upd_val in upd_dict.items(): - # For each upd_key & upd_val pair in upd_dict: - - if isinstance(upd_val, dict): - - if upd_key in base_dict: - # if base_dict[upd_key] exists, recursively deep-update it - if not isinstance(base_dict[upd_key], dict): - raise TypeError(f"type mismatch {'.'.join(names)}: is not a dict type in base_dict") - dict_deepupdate( - base_dict[upd_key], - upd_val, - names - + [ - upd_key, - ], - ) - - else: - # if base_dict[upd_key] not exist, set base_dict[upd_key] from deepcopy of upd_val - base_dict[upd_key] = copy.deepcopy(upd_val) - - elif isinstance(upd_val, list): - - if upd_key in base_dict: - # if base_dict[upd_key] exists, base_dict[up_key] is extended by - # the list from upd_val - if not isinstance(base_dict[upd_key], list): - raise TypeError(f"type mismatch {'.'.join(names)}: is not a list type in base_dict") - base_dict[upd_key].extend(upd_val) - - else: - # if base_dict[upd_key] doesn't exists, set base_dict[key] from a deepcopy of the - # list in upd_val. - base_dict[upd_key] = copy.deepcopy(upd_val) - - elif isinstance(upd_val, set): - - if upd_key in base_dict: - # if base_dict[upd_key] exists, base_dict[up_key] is updated by the set in upd_val - if not isinstance(base_dict[upd_key], set): - raise TypeError(f"type mismatch {'.'.join(names)}: is not a set type in base_dict") - base_dict[upd_key].update(upd_val.copy()) - - else: - # if base_dict[upd_key] doesn't exists, set base_dict[upd_key] from a copy of the - # set in upd_val - base_dict[upd_key] = upd_val.copy() - - else: - # for any other type of upd_val replace or add base_dict[upd_key] by a copy - # of upd_val - base_dict[upd_key] = copy.copy(upd_val) diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py deleted file mode 100644 index b1f524593..000000000 --- a/searx/botdetection/http_accept.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_accept`` ----------------------- - -The ``http_accept`` method evaluates a request as the request of a bot if the -Accept_ header .. - -- did not contain ``text/html`` - -.. _Accept: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from . import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - if 'text/html' not in request.accept_mimetypes: - return too_many_requests(network, "HTTP header Accept did not contain text/html") - return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py deleted file mode 100644 index e0c03cc73..000000000 --- a/searx/botdetection/http_accept_encoding.py +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_accept_encoding`` -------------------------------- - -The ``http_accept_encoding`` method evaluates a request as the request of a -bot if the Accept-Encoding_ header .. 
- -- did not contain ``gzip`` AND ``deflate`` (if both values are missed) -- did not contain ``text/html`` - -.. _Accept-Encoding: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Encoding - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from . import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] - if not ('gzip' in accept_list or 'deflate' in accept_list): - return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") - return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py deleted file mode 100644 index aaef81cc4..000000000 --- a/searx/botdetection/http_accept_language.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_accept_language`` -------------------------------- - -The ``http_accept_language`` method evaluates a request as the request of a bot -if the Accept-Language_ header is unset. - -.. _Accept-Language: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent - -""" -# pylint: disable=unused-argument -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from . import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - if request.headers.get('Accept-Language', '').strip() == '': - return too_many_requests(network, "missing HTTP header Accept-Language") - return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py deleted file mode 100644 index a32877158..000000000 --- a/searx/botdetection/http_connection.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_connection`` --------------------------- - -The ``http_connection`` method evaluates a request as the request of a bot if -the Connection_ header is set to ``close``. - -.. _Connection: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Connection - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from . 
import config -from ._helpers import too_many_requests - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - if request.headers.get('Connection', '').strip() == 'close': - return too_many_requests(network, "HTTP header 'Connection=close") - return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py deleted file mode 100644 index 9e45c7f9d..000000000 --- a/searx/botdetection/http_user_agent.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``http_user_agent`` --------------------------- - -The ``http_user_agent`` method evaluates a request as the request of a bot if -the User-Agent_ header is unset or matches the regular expression -:py:obj:`USER_AGENT`. - -.. _User-Agent: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -import re -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from . import config -from ._helpers import too_many_requests - - -USER_AGENT = ( - r'(' - + r'unknown' - + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' - + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' - + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' - + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' - + r'|ZmEu|BLEXBot|bitlybot|HeadlessChrome' - # unmaintained Farside instances - + r'|' - + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)') - # other bots and client to block - + '|.*PetalBot.*' - + r')' -) -"""Regular expression that matches to User-Agent_ from known *bots*""" - -_regexp = None - - -def regexp_user_agent(): - global _regexp # pylint: disable=global-statement - if not _regexp: - _regexp = re.compile(USER_AGENT) - return _regexp - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - user_agent = request.headers.get('User-Agent', 'unknown') - if regexp_user_agent().match(user_agent): - return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}") - return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py deleted file mode 100644 index d87acb5fc..000000000 --- a/searx/botdetection/ip_limit.py +++ /dev/null @@ -1,163 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _botdetection.ip_limit: - -Method ``ip_limit`` -------------------- - -The ``ip_limit`` method counts request from an IP in *sliding windows*. If -there are to many requests in a sliding window, the request is evaluated as a -bot request. This method requires a redis DB and needs a HTTP X-Forwarded-For_ -header. To take privacy only the hash value of an IP is stored in the redis DB -and at least for a maximum of 10 minutes. - -The :py:obj:`.link_token` method can be used to investigate whether a request is -*suspicious*. To activate the :py:obj:`.link_token` method in the -:py:obj:`.ip_limit` method add the following configuration: - -.. 
code:: toml - - [botdetection.ip_limit] - link_token = true - -If the :py:obj:`.link_token` method is activated and a request is *suspicious* -the request rates are reduced: - -- :py:obj:`BURST_MAX` -> :py:obj:`BURST_MAX_SUSPICIOUS` -- :py:obj:`LONG_MAX` -> :py:obj:`LONG_MAX_SUSPICIOUS` - -To intercept bots that get their IPs from a range of IPs, there is a -:py:obj:`SUSPICIOUS_IP_WINDOW`. In this window the suspicious IPs are stored -for a longer time. IPs stored in this sliding window have a maximum of -:py:obj:`SUSPICIOUS_IP_MAX` accesses before they are blocked. As soon as the IP -makes a request that is not suspicious, the sliding window for this IP is -dropped. - -.. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - - -Config -~~~~~~ - -.. code:: toml - - [botdetection.ip_limit] - - # To get unlimited access in a local network, by default link-lokal addresses - # (networks) are not monitored by the ip_limit - filter_link_local = false - - # activate link_token method in the ip_limit method - link_token = false - -Implementations -~~~~~~~~~~~~~~~ - -""" -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, -) - -import flask -import werkzeug - -from . import ctx -from .redislib import incr_sliding_window, drop_counter -from . import link_token -from . import config -from ._helpers import ( - too_many_requests, - logger, -) - - -logger = logger.getChild('ip_limit') - -BURST_WINDOW = 20 -"""Time (sec) before sliding window for *burst* requests expires.""" - -BURST_MAX = 15 -"""Maximum requests from one IP in the :py:obj:`BURST_WINDOW`""" - -BURST_MAX_SUSPICIOUS = 2 -"""Maximum of suspicious requests from one IP in the :py:obj:`BURST_WINDOW`""" - -LONG_WINDOW = 600 -"""Time (sec) before the longer sliding window expires.""" - -LONG_MAX = 150 -"""Maximum requests from one IP in the :py:obj:`LONG_WINDOW`""" - -LONG_MAX_SUSPICIOUS = 10 -"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`""" - -API_WINDOW = 3600 -"""Time (sec) before sliding window for API requests (format != html) expires.""" - -API_MAX = 4 -"""Maximum requests from one IP in the :py:obj:`API_WINDOW`""" - -SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30 -"""Time (sec) before sliding window for one suspicious IP expires.""" - -SUSPICIOUS_IP_MAX = 3 -"""Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" - - -def filter_request( - network: IPv4Network | IPv6Network, - request: flask.Request, - cfg: config.Config, -) -> werkzeug.Response | None: - - # pylint: disable=too-many-return-statements - - if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: - logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) - return None - - if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(ctx.redis_client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW) - if c > API_MAX: - return too_many_requests(network, "too many request in API_WINDOW") - - if cfg['botdetection.ip_limit.link_token']: - - suspicious = link_token.is_suspicious(network, request, True) - - if not suspicious: - # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(ctx.redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed) - return None - - # this IP is suspicious: count requests from this IP - c = incr_sliding_window( - ctx.redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, 
SUSPICIOUS_IP_WINDOW - ) - if c > SUSPICIOUS_IP_MAX: - logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network) - return flask.redirect(flask.url_for('index'), code=302) - - c = incr_sliding_window(ctx.redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) - if c > BURST_MAX_SUSPICIOUS: - return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - - c = incr_sliding_window(ctx.redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) - if c > LONG_MAX_SUSPICIOUS: - return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") - - return None - - # vanilla limiter without extensions counts BURST_MAX and LONG_MAX - c = incr_sliding_window(ctx.redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) - if c > BURST_MAX: - return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)") - - c = incr_sliding_window(ctx.redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) - if c > LONG_MAX: - return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)") - - return None diff --git a/searx/botdetection/ip_lists.py b/searx/botdetection/ip_lists.py deleted file mode 100644 index 34fde6467..000000000 --- a/searx/botdetection/ip_lists.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""".. _botdetection.ip_lists: - -Method ``ip_lists`` -------------------- - -The ``ip_lists`` method implements IP :py:obj:`block- ` and -:py:obj:`pass-lists `. - - -Config -~~~~~~ - -.. code:: toml - - [botdetection.ip_lists] - - pass_ip = [ - '167.235.158.251', # IPv4 of check.searx.space - '192.168.0.0/16', # IPv4 private network - 'fe80::/10' # IPv6 linklocal - ] - block_ip = [ - '93.184.216.34', # IPv4 of example.org - '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class - ] - - -Implementations -~~~~~~~~~~~~~~~ - -""" -# pylint: disable=unused-argument - -from __future__ import annotations -from typing import Tuple -from ipaddress import ( - ip_network, - IPv4Address, - IPv6Address, -) - -from . import config -from ._helpers import logger - -logger = logger.getChild('ip_limit') - - -def pass_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]: - """Checks if the IP on the subnet is in one of the members of the - ``botdetection.ip_lists.pass_ip`` list. - """ - return ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.pass_ip', cfg) - - -def block_ip(real_ip: IPv4Address | IPv6Address, cfg: config.Config) -> Tuple[bool, str]: - """Checks if the IP on the subnet is in one of the members of the - ``botdetection.ip_lists.block_ip`` list. - """ - - block, msg = ip_is_subnet_of_member_in_list(real_ip, 'botdetection.ip_lists.block_ip', cfg) - if block: - msg += " To remove IP from list, please contact the maintainer of the service." - return block, msg - - -def ip_is_subnet_of_member_in_list( - real_ip: IPv4Address | IPv6Address, list_name: str, cfg: config.Config -) -> Tuple[bool, str]: - - for net in cfg.get(list_name, default=[]): - try: - net = ip_network(net, strict=False) - except ValueError: - logger.error("invalid IP %s in %s", net, list_name) - continue - if real_ip.version == net.version and real_ip in net: - return True, f"IP matches {net.compressed} in {list_name}." 
- return False, f"IP is not a member of an item in the f{list_name} list" diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py deleted file mode 100644 index 9f2adf766..000000000 --- a/searx/botdetection/link_token.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -""" -Method ``link_token`` ---------------------- - -The ``link_token`` method evaluates a request as :py:obj:`suspicious -` if the URL ``/client.css`` is not requested by the -client. By adding a random component (the token) in the URL, a bot can not send -a ping by request a static URL. - -.. note:: - - This method requires a redis DB and needs a HTTP X-Forwarded-For_ header. - -.. _X-Forwarded-For: - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For - -To get in use of this method a flask URL route needs to be added: - -.. code:: python - - @app.route('/client.css', methods=['GET', 'POST']) - def client_token(token=None): - link_token.ping(request, token) - return Response('', mimetype='text/css') - -And in the HTML template from flask a stylesheet link is needed (the value of -``link_token`` comes from :py:obj:`get_token`): - -.. code:: html - - - - -Config -~~~~~~ - -.. code:: toml - - [botdetection.link_token] - # Livetime (sec) of limiter's CSS token. - TOKEN_LIVE_TIME = 600 - - # Livetime (sec) of the ping-key from a client (request) - PING_LIVE_TIME = 3600 - - # Prefix of all ping-keys generated by link_token.get_ping_key - PING_KEY = 'botdetection.link_token.PING_KEY' - - # Key for which the current token is stored in the DB - TOKEN_KEY = 'botdetection.link_token.TOKEN_KEY' - - -Implementations -~~~~~~~~~~~~~~~ - -""" - -from __future__ import annotations -from ipaddress import ( - IPv4Network, - IPv6Network, - ip_address, -) - -import string -import random -import flask - -from . import ctx -from .redislib import secret_hash - -from ._helpers import ( - logger, - get_network, - get_real_ip, -) - - -logger = logger.getChild('link_token') - - -PING_KEY = 'botdetection.link_token.PING_KEY' -"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`""" - -TOKEN_KEY = 'botdetection.link_token.TOKEN_KEY' -"""Key for which the current token is stored in the DB""" - - -def _cfg(name): - return ctx.cfg.get(f'botdetection.link_token.{name}') - - -def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): - """Checks whether a valid ping is exists for this (client) network, if not - this request is rated as *suspicious*. If a valid ping exists and argument - ``renew`` is ``True`` the expire time of this ping is reset to - ``PING_LIVE_TIME``. - - """ - if not ctx.redis_client: - return False - - ping_key = get_ping_key(network, request) - if not ctx.redis_client.get(ping_key): - logger.info("missing ping (IP: %s) / request: %s", network.compressed, ping_key) - return True - - if renew: - ctx.redis_client.set(ping_key, 1, ex=_cfg('PING_LIVE_TIME')) - - logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) - return False - - -def ping(request: flask.Request, token: str): - """This function is called by a request to URL ``/client.css``. If - ``token`` is valid a :py:obj:`PING_KEY` for the client is stored in the DB. - The expire time of this ping-key is ``PING_LIVE_TIME``. 
- - """ - if not ctx.redis_client: - return - if not token_is_valid(token): - return - - real_ip = ip_address(get_real_ip(request)) - network = get_network(real_ip, ctx.cfg) - - ping_key = get_ping_key(network, request) - logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) - - ctx.redis_client.set(ping_key, 1, ex=_cfg('PING_LIVE_TIME')) - - -def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: - """Generates a hashed key that fits (more or less) to a *WEB-browser - session* in a network.""" - return ( - PING_KEY - + "[" - + secret_hash( - network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') - ) - + "]" - ) - - -def token_is_valid(token) -> bool: - valid = token == get_token() - logger.debug("token is valid --> %s", valid) - return valid - - -def get_token() -> str: - """Returns current token. If there is no currently active token a new token - is generated randomly and stored in the redis DB. - - Config: - - - ``TOKEN_LIVE_TIME`` - - ``TOKEN_KEY`` - - """ - if not ctx.redis_client: - # This function is also called when limiter is inactive / no redis DB - # (see render function in webapp.py) - return '12345678' - token_key = _cfg('TOKEN_KEY') - token = ctx.redis_client.get(token_key) - if token: - token = token.decode('UTF-8') - else: - token = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(16)) - ctx.redis_client.set(token_key, token, ex=_cfg('TOKEN_LIVE_TIME')) - return token diff --git a/searx/botdetection/redislib.py b/searx/botdetection/redislib.py deleted file mode 100644 index 74342d4a6..000000000 --- a/searx/botdetection/redislib.py +++ /dev/null @@ -1,263 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""A collection of convenient functions and redis/lua scripts. - -This code was partial inspired by the `Bullet-Proofing Lua Scripts in RedisPy`_ -article. - -.. _Bullet-Proofing Lua Scripts in RedisPy: - https://redis.com/blog/bullet-proofing-lua-scripts-in-redispy/ - -Config -~~~~~~ - -.. code:: toml - - [botdetection.redis] - - # FQDN of a function definition. A function with which the DB keys of the Redis - # DB are to be annonymized. - secret_hash = '' - - # A prefix to all keys store by the botdetection in the redis DB - REDIS_KEY_PREFIX = 'botdetection_' - - -Implementations -~~~~~~~~~~~~~~~ -""" - -from __future__ import annotations - -from . import ctx - -REDIS_KEY_PREFIX = 'botdetection' -"""A prefix applied to all keys store by the botdetection in the redis DB.""" - -LUA_SCRIPT_STORAGE = {} -"""A global dictionary to cache client's ``Script`` objects, used by -:py:obj:`lua_script_storage`""" - - -def secret_hash(name: str) -> str: - """Returns a annonymized name if ``secret_hash`` is configured, otherwise - the ``name`` is returned unchanged.""" - func = ctx.cfg.pyobj('botdetection.redis.secret_hash', default=None) # type: ignore - if not func: - return name - return func(name) - - -def _prefix(val: str | None = None) -> str: - if val is None: - val = ctx.cfg.get('botdetection.redis.REDIS_KEY_PREFIX', default=REDIS_KEY_PREFIX) # type: ignore - return str(val) - - -def lua_script_storage(client, script): - """Returns a redis :py:obj:`Script - ` instance. - - Due to performance reason the ``Script`` object is instantiated only once - for a client (``client.register_script(..)``) and is cached in - :py:obj:`LUA_SCRIPT_STORAGE`. 
- - """ - - # redis connection can be closed, lets use the id() of the redis connector - # as key in the script-storage: - client_id = id(client) - - if LUA_SCRIPT_STORAGE.get(client_id) is None: - LUA_SCRIPT_STORAGE[client_id] = {} - - if LUA_SCRIPT_STORAGE[client_id].get(script) is None: - LUA_SCRIPT_STORAGE[client_id][script] = client.register_script(script) - - return LUA_SCRIPT_STORAGE[client_id][script] - - -PURGE_BY_PREFIX = """ -local prefix = tostring(ARGV[1]) -for i, name in ipairs(redis.call('KEYS', prefix .. '*')) do - redis.call('EXPIRE', name, 0) -end -""" - - -def purge_by_prefix(client, prefix: str | None): - """Purge all keys with ``prefix`` from database. - - Queries all keys in the database by the given prefix and set expire time to - zero. The default prefix will drop all keys which has been set by - :py:obj:`REDIS_KEY_PREFIX`. - - The implementation is the lua script from string :py:obj:`PURGE_BY_PREFIX`. - The lua script uses EXPIRE_ instead of DEL_: if there are a lot keys to - delete and/or their values are big, `DEL` could take more time and blocks - the command loop while `EXPIRE` turns back immediate. - - :param prefix: prefix of the key to delete (default: :py:obj:`REDIS_KEY_PREFIX`) - :type name: str - - .. _EXPIRE: https://redis.io/commands/expire/ - .. _DEL: https://redis.io/commands/del/ - - """ - script = lua_script_storage(client, PURGE_BY_PREFIX) - script(args=[_prefix(prefix)]) - - -INCR_COUNTER = """ -local limit = tonumber(ARGV[1]) -local expire = tonumber(ARGV[2]) -local c_name = KEYS[1] - -local c = redis.call('GET', c_name) - -if not c then - c = redis.call('INCR', c_name) - if expire > 0 then - redis.call('EXPIRE', c_name, expire) - end -else - c = tonumber(c) - if limit == 0 or c < limit then - c = redis.call('INCR', c_name) - end -end -return c -""" - - -def incr_counter(client, name: str, limit: int = 0, expire: int = 0): - """Increment a counter and return the new value. - - If counter with redis key :py:obj:`REDIS_KEY_PREFIX` + ``counter_`` - does not exists it is created with initial value 1 returned. The - replacement ```` is a *secret hash* of the value from argument - ``name`` (see :py:func:`secret_hash`). - - The implementation of the redis counter is the lua script from string - :py:obj:`INCR_COUNTER`. - - :param name: name of the counter - :type name: str - - :param expire: live-time of the counter in seconds (default ``None`` means - infinite). - :type expire: int / see EXPIRE_ - - :param limit: limit where the counter stops to increment (default ``None``) - :type limit: int / limit is 2^64 see INCR_ - - :return: value of the incremented counter - :type return: int - - .. _EXPIRE: https://redis.io/commands/expire/ - .. _INCR: https://redis.io/commands/incr/ - - A simple demo of a counter with expire time and limit:: - - >>> for i in range(6): - ... i, incr_counter(client, "foo", 3, 5) # max 3, duration 5 sec - ... time.sleep(1) # from the third call on max has been reached - ... - (0, 1) - (1, 2) - (2, 3) - (3, 3) - (4, 3) - (5, 1) - - """ - script = lua_script_storage(client, INCR_COUNTER) - name = _prefix() + "counter_" + secret_hash(name) - c = script(args=[limit, expire], keys=[name]) - return c - - -def drop_counter(client, name): - """Drop counter with redis key :py:obj:`REDIS_KEY_PREFIX` + - ``counter_`` - - The replacement ```` is a *secret hash* of the value from argument - ``name`` (see :py:func:`incr_counter` and :py:func:`incr_sliding_window`). 
- - """ - name = _prefix() + "counter_" + secret_hash(name) - client.delete(name) - - -INCR_SLIDING_WINDOW = """ -local expire = tonumber(ARGV[1]) -local name = KEYS[1] -local current_time = redis.call('TIME') - -redis.call('ZREMRANGEBYSCORE', name, 0, current_time[1] - expire) -redis.call('ZADD', name, current_time[1], current_time[1] .. current_time[2]) -local result = redis.call('ZCOUNT', name, 0, current_time[1] + 1) -redis.call('EXPIRE', name, expire) -return result -""" - - -def incr_sliding_window(client, name: str, duration: int): - """Increment a sliding-window counter and return the new value. - - If counter with redis key :py:obj:`REDIS_KEY_PREFIX` + ``counter_`` - does not exists it is created with initial value 1 returned. The - replacement ```` is a *secret hash* of the value from argument - ``name`` (see :py:func:`secret_hash`). - - :param name: name of the counter - :type name: str - - :param duration: live-time of the sliding window in seconds - :typeduration: int - - :return: value of the incremented counter - :type return: int - - The implementation of the redis counter is the lua script from string - :py:obj:`INCR_SLIDING_WINDOW`. The lua script uses `sorted sets in Redis`_ - to implement a sliding window for the redis key :py:obj:`REDIS_KEY_PREFIX` + - ``counter_`` (ZADD_). The current TIME_ is used to score the items in - the sorted set and the time window is moved by removing items with a score - lower current time minus *duration* time (ZREMRANGEBYSCORE_). - - The EXPIRE_ time (the duration of the sliding window) is refreshed on each - call (increment) and if there is no call in this duration, the sorted - set expires from the redis DB. - - The return value is the amount of items in the sorted set (ZCOUNT_), what - means the number of calls in the sliding window. - - .. _Sorted sets in Redis: - https://redis.com/ebook/part-1-getting-started/chapter-1-getting-to-know-redis/1-2-what-redis-data-structures-look-like/1-2-5-sorted-sets-in-redis/ - .. _TIME: https://redis.io/commands/time/ - .. _ZADD: https://redis.io/commands/zadd/ - .. _EXPIRE: https://redis.io/commands/expire/ - .. _ZREMRANGEBYSCORE: https://redis.io/commands/zremrangebyscore/ - .. _ZCOUNT: https://redis.io/commands/zcount/ - - A simple demo of the sliding window:: - - >>> for i in range(5): - ... incr_sliding_window(client, "foo", 3) # duration 3 sec - ... time.sleep(1) # from the third call (second) on the window is moved - ... - 1 - 2 - 3 - 3 - 3 - >>> time.sleep(3) # wait until expire - >>> incr_sliding_window(client, "foo", 3) - 1 - - """ - script = lua_script_storage(client, INCR_SLIDING_WINDOW) - name = _prefix() + "counter_" + secret_hash(name) - c = script(args=[duration], keys=[name]) - return c diff --git a/searx/botdetection/schema.toml b/searx/botdetection/schema.toml deleted file mode 100644 index 97a13b1aa..000000000 --- a/searx/botdetection/schema.toml +++ /dev/null @@ -1,58 +0,0 @@ -[real_ip] - -# Number of values to trust for X-Forwarded-For. - -x_for = 1 - -# The prefix defines the number of leading bits in an address that are compared -# to determine whether or not an address is part of a (client) network. - -ipv4_prefix = 32 -ipv6_prefix = 48 - -[botdetection.redis] - -# FQDN of a function definition. A function with which the DB keys of the Redis -# DB are to be annonymized. 
-secret_hash = '' - -# A prefix to all keys store by the botdetection in the redis DB -REDIS_KEY_PREFIX = 'botdetection_' - -[botdetection.ip_limit] - -# To get unlimited access in a local network, by default link-lokal addresses -# (networks) are not monitored by the ip_limit -filter_link_local = false - -# activate link_token method in the ip_limit method -link_token = false - -[botdetection.link_token] -# Livetime (sec) of limiter's CSS token. -TOKEN_LIVE_TIME = 600 - -# Livetime (sec) of the ping-key from a client (request) -PING_LIVE_TIME = 3600 - -# Prefix of all ping-keys generated by link_token.get_ping_key -PING_KEY = 'botdetection.link_token.PING_KEY' - -# Key for which the current token is stored in the DB -TOKEN_KEY = 'botdetection.link_token.TOKEN_KEY' - -[botdetection.ip_lists] - -# In the limiter, the ip_lists method has priority over all other methods -> if -# an IP is in the pass_ip list, it has unrestricted access and it is also not -# checked if e.g. the "user agent" suggests a bot (e.g. curl). - -block_ip = [ - # '93.184.216.34', # IPv4 of example.org - # '257.1.1.1', # invalid IP --> will be ignored, logged in ERROR class -] - -pass_ip = [ - # '192.168.0.0/16', # IPv4 private network - # 'fe80::/10' # IPv6 linklocal / wins over botdetection.ip_limit.filter_link_local -] diff --git a/searx/limiter.py b/searx/limiter.py index 033614de9..c1b67096a 100644 --- a/searx/limiter.py +++ b/searx/limiter.py @@ -101,12 +101,8 @@ from ipaddress import ip_address import flask import werkzeug -from searx import ( - logger, - redisdb, -) -from searx import botdetection -from searx.botdetection import ( +import botdetection +from botdetection import ( http_accept, http_accept_encoding, http_accept_language, @@ -118,6 +114,11 @@ from searx.botdetection import ( dump_request, ) +from searx import ( + logger, + redisdb, +) + # the configuration are limiter.toml and "limiter" in settings.yml so, for # coherency, the logger is "limiter" logger = logger.getChild('limiter') diff --git a/searx/plugins/self_info.py b/searx/plugins/self_info.py index 8079ee0d4..64e63136c 100644 --- a/searx/plugins/self_info.py +++ b/searx/plugins/self_info.py @@ -5,7 +5,7 @@ import re from flask_babel import gettext -from searx.botdetection._helpers import get_real_ip +from botdetection._helpers import get_real_ip name = gettext('Self Information') description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".') diff --git a/searx/search/checker/scheduler.py b/searx/search/checker/scheduler.py index 89d0b04f5..cedcca73a 100644 --- a/searx/search/checker/scheduler.py +++ b/searx/search/checker/scheduler.py @@ -17,8 +17,9 @@ import time import importlib from typing import Callable +from botdetection.redislib import lua_script_storage + from searx.redisdb import client as get_redis_client -from searx.botdetection.redislib import lua_script_storage logger = logging.getLogger('searx.search.checker') diff --git a/searx/webapp.py b/searx/webapp.py index 9c2ef8146..ea49e2d3e 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -49,6 +49,8 @@ from flask_babel import ( format_decimal, ) +import botdetection + from searx import ( logger, get_setting, @@ -58,7 +60,6 @@ from searx import ( from searx import infopage from searx import limiter -from searx import botdetection from searx.data import ENGINE_DESCRIPTIONS from searx.results import Timing