From 281e36f4b7848374535d5e953050ae73423191ca Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 1 Jun 2023 15:41:48 +0200 Subject: [PATCH] [fix] limiter: replace real_ip by IPv4/v6 network Closes: https://github.com/searxng/searxng/issues/2477 Signed-off-by: Markus Heiser --- searx/botdetection/__init__.py | 1 + searx/botdetection/_helpers.py | 42 ++++++++++++--- searx/botdetection/http_accept.py | 16 ++++-- searx/botdetection/http_accept_encoding.py | 16 ++++-- searx/botdetection/http_accept_language.py | 14 +++-- searx/botdetection/http_connection.py | 16 ++++-- searx/botdetection/http_user_agent.py | 16 ++++-- searx/botdetection/ip_limit.py | 49 ++++++++++------- searx/botdetection/limiter.py | 61 ++++++++++------------ searx/botdetection/limiter.toml | 22 ++++++-- searx/botdetection/link_token.py | 54 +++++++++++-------- searx/plugins/limiter.py | 7 +-- 12 files changed, 208 insertions(+), 106 deletions(-) diff --git a/searx/botdetection/__init__.py b/searx/botdetection/__init__.py index c903b0bb4..fcd8e5630 100644 --- a/searx/botdetection/__init__.py +++ b/searx/botdetection/__init__.py @@ -24,3 +24,4 @@ X-Forwarded-For from ._helpers import dump_request from ._helpers import get_real_ip +from ._helpers import too_many_requests diff --git a/searx/botdetection/_helpers.py b/searx/botdetection/_helpers.py index b034b980b..8e0156d6e 100644 --- a/searx/botdetection/_helpers.py +++ b/searx/botdetection/_helpers.py @@ -1,11 +1,19 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint # pylint: disable=missing-module-docstring, invalid-name +from __future__ import annotations -from typing import Optional +from ipaddress import ( + IPv4Network, + IPv6Network, + IPv6Address, + ip_address, + ip_network, +) import flask import werkzeug +from searx.tools import config from searx import logger logger = logger.getChild('botdetection') @@ -13,7 +21,7 @@ logger = logger.getChild('botdetection') def dump_request(request: flask.Request): return ( - "%s: %s" % 
(get_real_ip(request), request.path) + request.path + " || X-Forwarded-For: %s" % request.headers.get('X-Forwarded-For') + " || X-Real-IP: %s" % request.headers.get('X-Real-IP') + " || form: %s" % request.form @@ -27,12 +35,30 @@ def dump_request(request: flask.Request): ) -def too_many_requests(request: flask.Request, log_msg: str) -> Optional[werkzeug.Response]: - log_prefix = 'BLOCK %s: ' % get_real_ip(request) - logger.debug(log_prefix + log_msg) +def too_many_requests(network: IPv4Network | IPv6Network, log_msg: str) -> werkzeug.Response | None: + """Returns an HTTP 429 response object and writes a DEBUG message to the + 'botdetection' logger. This function is used in part by the filter methods + to return the default ``Too Many Requests`` response. + + """ + + logger.debug("BLOCK %s: %s", network.compressed, log_msg) return flask.make_response(('Too Many Requests', 429)) +def get_network(real_ip: str, cfg: config.Config) -> IPv4Network | IPv6Network: + """Returns the (client) network the real_ip is part of.""" + + ip = ip_address(real_ip) + if isinstance(ip, IPv6Address): + prefix = cfg['real_ip.ipv6_prefix'] + else: + prefix = cfg['real_ip.ipv4_prefix'] + network = ip_network(f"{real_ip}/{prefix}", strict=False) + # logger.debug("get_network(): %s", network.compressed) + return network + + def get_real_ip(request: flask.Request) -> str: """Returns real IP of the request. 
Since not all proxies set all the HTTP headers and incoming headers can be faked it may happen that the IP cannot @@ -63,7 +89,9 @@ def get_real_ip(request: flask.Request) -> str: forwarded_for = request.headers.get("X-Forwarded-For") real_ip = request.headers.get('X-Real-IP') remote_addr = request.remote_addr - logger.debug("X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr) + # logger.debug( + # "X-Forwarded-For: %s || X-Real-IP: %s || request.remote_addr: %s", forwarded_for, real_ip, remote_addr + # ) if not forwarded_for: logger.error("X-Forwarded-For header is not set!") @@ -89,5 +117,5 @@ def get_real_ip(request: flask.Request) -> str: logger.warning("IP from WSGI environment (%s) is not equal to IP from X-Real-IP (%s)", remote_addr, real_ip) request_ip = forwarded_for or real_ip or remote_addr or '0.0.0.0' - logger.debug("get_real_ip() -> %s", request_ip) + # logger.debug("get_real_ip() -> %s", request_ip) return request_ip diff --git a/searx/botdetection/http_accept.py b/searx/botdetection/http_accept.py index 60e2330ae..b78a86278 100644 --- a/searx/botdetection/http_accept.py +++ b/searx/botdetection/http_accept.py @@ -15,7 +15,12 @@ Accept_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -23,7 +28,12 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if 'text/html' not in request.accept_mimetypes: - return too_many_requests(request, "HTTP header Accept did not contain text/html") + return too_many_requests(network, "HTTP header Accept did not contain text/html") return None diff --git a/searx/botdetection/http_accept_encoding.py b/searx/botdetection/http_accept_encoding.py index 5301c5d9d..60718a4ca 100644 --- a/searx/botdetection/http_accept_encoding.py +++ b/searx/botdetection/http_accept_encoding.py @@ -16,7 +16,12 @@ bot if the Accept-Encoding_ header .. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -24,8 +29,13 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + accept_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] if not ('gzip' in accept_list or 'deflate' in accept_list): - return too_many_requests(request, "HTTP header Accept-Encoding did not contain gzip nor deflate") + return too_many_requests(network, "HTTP header Accept-Encoding did not contain gzip nor deflate") return None diff --git a/searx/botdetection/http_accept_language.py b/searx/botdetection/http_accept_language.py index 060f67ec0..395d28bfd 100644 --- a/searx/botdetection/http_accept_language.py +++ b/searx/botdetection/http_accept_language.py @@ -12,8 +12,12 @@ if the Accept-Language_ header is unset. 
""" # pylint: disable=unused-argument +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) -from typing import Optional import flask import werkzeug @@ -21,7 +25,11 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: if request.headers.get('Accept-Language', '').strip() == '': - return too_many_requests(request, "missing HTTP header Accept-Language") + return too_many_requests(network, "missing HTTP header Accept-Language") return None diff --git a/searx/botdetection/http_connection.py b/searx/botdetection/http_connection.py index e718dfe3f..ee0d80a23 100644 --- a/searx/botdetection/http_connection.py +++ b/searx/botdetection/http_connection.py @@ -13,7 +13,12 @@ the Connection_ header is set to ``close``. 
""" # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -21,7 +26,12 @@ from searx.tools import config from ._helpers import too_many_requests -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + if request.headers.get('Connection', '').strip() == 'close': - return too_many_requests(request, "HTTP header 'Connection=close") + return too_many_requests(network, "HTTP header 'Connection=close") return None diff --git a/searx/botdetection/http_user_agent.py b/searx/botdetection/http_user_agent.py index 70309e975..17025f68b 100644 --- a/searx/botdetection/http_user_agent.py +++ b/searx/botdetection/http_user_agent.py @@ -14,8 +14,13 @@ the User-Agent_ header is unset or matches the regular expression """ # pylint: disable=unused-argument -from typing import Optional +from __future__ import annotations import re +from ipaddress import ( + IPv4Network, + IPv6Network, +) + import flask import werkzeug @@ -50,8 +55,13 @@ def regexp_user_agent(): return _regexp -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + user_agent = request.headers.get('User-Agent', 'unknown') if regexp_user_agent().match(user_agent): - return too_many_requests(request, f"bot detected, HTTP header User-Agent: {user_agent}") + return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}") return None diff --git a/searx/botdetection/ip_limit.py b/searx/botdetection/ip_limit.py index 268285dd9..46e026371 100644 --- a/searx/botdetection/ip_limit.py +++ 
b/searx/botdetection/ip_limit.py @@ -38,8 +38,12 @@ droped. https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For """ +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) -from typing import Optional import flask import werkzeug from searx.tools import config @@ -49,7 +53,7 @@ from searx import logger from searx.redislib import incr_sliding_window, drop_counter from . import link_token -from ._helpers import too_many_requests, get_real_ip +from ._helpers import too_many_requests logger = logger.getChild('botdetection.ip_limit') @@ -85,49 +89,58 @@ SUSPICIOUS_IP_MAX = 3 """Maximum requests from one suspicious IP in the :py:obj:`SUSPICIOUS_IP_WINDOW`.""" -def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]: +def filter_request( + network: IPv4Network | IPv6Network, + request: flask.Request, + cfg: config.Config, +) -> werkzeug.Response | None: + # pylint: disable=too-many-return-statements redis_client = redisdb.client() - client_ip = get_real_ip(request) + if network.is_link_local and not cfg['botdetection.ip_limit.filter_link_local']: + logger.debug("network %s is link-local -> not monitored by ip_limit method", network.compressed) + return None if request.args.get('format', 'html') != 'html': - c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + client_ip, API_WONDOW) + c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW) if c > API_MAX: - return too_many_requests(request, "too many request in API_WINDOW") + return too_many_requests(network, "too many request in API_WINDOW") if cfg['botdetection.ip_limit.link_token']: - suspicious = link_token.is_suspicious(request, True) + suspicious = link_token.is_suspicious(network, request, True) if not suspicious: # this IP is no longer suspicious: release ip again / delete the counter of this IP - drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip) + 
drop_counter(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed) return None # this IP is suspicious: count requests from this IP - c = incr_sliding_window(redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + client_ip, SUSPICIOUS_IP_WINDOW) + c = incr_sliding_window( + redis_client, 'ip_limit.SUSPICIOUS_IP_WINDOW' + network.compressed, SUSPICIOUS_IP_WINDOW + ) if c > SUSPICIOUS_IP_MAX: - logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", client_ip) + logger.error("BLOCK: too many request from %s in SUSPICIOUS_IP_WINDOW (redirect to /)", network) return flask.redirect(flask.url_for('index'), code=302) - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) if c > BURST_MAX_SUSPICIOUS: - return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX_SUSPICIOUS)") - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) if c > LONG_MAX_SUSPICIOUS: - return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX_SUSPICIOUS)") return None # vanilla limiter without extensions counts BURST_MAX and LONG_MAX - c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + client_ip, BURST_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.BURST_WINDOW' + network.compressed, BURST_WINDOW) if c > BURST_MAX: - return too_many_requests(request, "too many request in BURST_WINDOW (BURST_MAX)") + return too_many_requests(network, "too many request in BURST_WINDOW (BURST_MAX)") - c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + client_ip, 
LONG_WINDOW) + c = incr_sliding_window(redis_client, 'ip_limit.LONG_WINDOW' + network.compressed, LONG_WINDOW) if c > LONG_MAX: - return too_many_requests(request, "too many request in LONG_WINDOW (LONG_MAX)") + return too_many_requests(network, "too many request in LONG_WINDOW (LONG_MAX)") return None diff --git a/searx/botdetection/limiter.py b/searx/botdetection/limiter.py index 93826684f..18ffc8407 100644 --- a/searx/botdetection/limiter.py +++ b/searx/botdetection/limiter.py @@ -37,14 +37,16 @@ and set the redis-url connection. Check the value, it depends on your redis DB """ -from typing import Optional, Tuple +from __future__ import annotations + from pathlib import Path import flask -import pytomlpp as toml +import werkzeug -from searx import logger from searx.tools import config -from searx.botdetection import ( +from searx import logger + +from . import ( http_accept, http_accept_encoding, http_accept_language, @@ -53,6 +55,16 @@ from searx.botdetection import ( ip_limit, ) +from ._helpers import ( + get_network, + get_real_ip, + dump_request, +) + +logger = logger.getChild('botdetection.limiter') + +CFG: config.Config = None # type: ignore + LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml" """Base configuration (schema) of the botdetection.""" @@ -63,40 +75,21 @@ CFG_DEPRECATED = { # "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config." 
} -CFG = None - def get_cfg() -> config.Config: + global CFG # pylint: disable=global-statement if CFG is None: - init_cfg(logger) + CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED) return CFG -def init_cfg(log): - global CFG # pylint: disable=global-statement - CFG = config.Config(cfg_schema=toml.load(LIMITER_CFG_SCHEMA), deprecated=CFG_DEPRECATED) +def filter_request(request: flask.Request) -> werkzeug.Response | None: - if not LIMITER_CFG.exists(): - log.warning("missing config file: %s", LIMITER_CFG) - return - - log.info("load config file: %s", LIMITER_CFG) - try: - upd_cfg = toml.load(LIMITER_CFG) - except toml.DecodeError as exc: - msg = str(exc).replace('\t', '').replace('\n', ' ') - log.error("%s: %s", LIMITER_CFG, msg) - raise - - is_valid, issue_list = CFG.validate(upd_cfg) - for msg in issue_list: - log.error(str(msg)) - if not is_valid: - raise TypeError(f"schema of {LIMITER_CFG} is invalid, can't cutomize limiter configuration from!") - CFG.update(upd_cfg) - - -def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: + cfg = get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + if network.is_link_local: + return None if request.path == '/healthz': return None @@ -104,7 +97,7 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: for func in [ http_user_agent, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val @@ -118,8 +111,8 @@ def filter_request(request: flask.Request) -> Optional[Tuple[int, str]]: http_user_agent, ip_limit, ]: - val = func.filter_request(request, CFG) + val = func.filter_request(network, request, cfg) if val is not None: return val - + logger.debug(f"OK {network}: %s", dump_request(flask.request)) return None diff --git a/searx/botdetection/limiter.toml b/searx/botdetection/limiter.toml index af797d32c..71a231e8f 100644 --- a/searx/botdetection/limiter.toml +++ 
b/searx/botdetection/limiter.toml @@ -1,8 +1,22 @@ -[botdetection.ip_limit] - -link_token = false - [real_ip] # Number of values to trust for X-Forwarded-For. + x_for = 1 + +# The prefix defines the number of leading bits in an address that are compared +# to determine whether or not an address is part of a (client) network. + +ipv4_prefix = 32 +ipv6_prefix = 48 + +[botdetection.ip_limit] + +# To get unlimited access in a local network, by default link-local addresses +# (networks) are not monitored by the ip_limit +filter_link_local = false + +# activate link_token method in the ip_limit method +link_token = false + + diff --git a/searx/botdetection/link_token.py b/searx/botdetection/link_token.py index a83214a33..11a6a56b5 100644 --- a/searx/botdetection/link_token.py +++ b/searx/botdetection/link_token.py @@ -6,7 +6,7 @@ Method ``link_token`` The ``link_token`` method evaluates a request as :py:obj:`suspicious ` if the URL ``/client.css`` is not requested by the -client. By adding a random component (the token) in the URL a bot can not send +client. By adding a random component (the token) in the URL, a bot can not send a ping by request a static URL. .. 
note:: @@ -35,6 +35,11 @@ And in the HTML template from flask a stylesheet link is needed (the value of https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For """ +from __future__ import annotations +from ipaddress import ( + IPv4Network, + IPv6Network, +) import string import random @@ -43,7 +48,11 @@ import flask from searx import logger from searx import redisdb from searx.redislib import secret_hash -from ._helpers import get_real_ip + +from ._helpers import ( + get_network, + get_real_ip, +) TOKEN_LIVE_TIME = 600 """Livetime (sec) of limiter's CSS token.""" @@ -60,29 +69,26 @@ TOKEN_KEY = 'SearXNG_limiter.token' logger = logger.getChild('botdetection.link_token') -def is_suspicious(request: flask.Request, renew: bool = False): - """Checks if there is a valid ping for this request, if not this request is - rated as *suspicious*. If a valid ping exists and argument ``renew`` is - ``True`` the expire time of this ping is reset to :py:obj:`PING_LIVE_TIME`. +def is_suspicious(network: IPv4Network | IPv6Network, request: flask.Request, renew: bool = False): + """Checks whether a valid ping exists for this (client) network, if not + this request is rated as *suspicious*. If a valid ping exists and argument + ``renew`` is ``True`` the expire time of this ping is reset to + :py:obj:`PING_LIVE_TIME`. 
""" redis_client = redisdb.client() if not redis_client: return False - ping_key = get_ping_key(request) + ping_key = get_ping_key(network, request) if not redis_client.get(ping_key): - logger.warning( - "missing ping (IP: %s) / request: %s", - get_real_ip(request), - ping_key, - ) + logger.warning("missing ping (IP: %s) / request: %s", network.compressed, ping_key) return True if renew: redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) - logger.debug("found ping for client request: %s", ping_key) + logger.debug("found ping for (client) network %s -> %s", network.compressed, ping_key) return False @@ -92,27 +98,31 @@ def ping(request: flask.Request, token: str): The expire time of this ping-key is :py:obj:`PING_LIVE_TIME`. """ + from . import limiter # pylint: disable=import-outside-toplevel, cyclic-import + redis_client = redisdb.client() if not redis_client: return if not token_is_valid(token): return - ping_key = get_ping_key(request) - logger.debug("store ping for: %s", ping_key) + + cfg = limiter.get_cfg() + real_ip = get_real_ip(request) + network = get_network(real_ip, cfg) + + ping_key = get_ping_key(network, request) + logger.debug("store ping_key for (client) network %s (IP %s) -> %s", network.compressed, real_ip, ping_key) redis_client.set(ping_key, 1, ex=PING_LIVE_TIME) -def get_ping_key(request: flask.Request): - """Generates a hashed key that fits (more or less) to a client (request). - At least X-Forwarded-For_ is needed to be able to assign the request to an - IP. 
- - """ +def get_ping_key(network: IPv4Network | IPv6Network, request: flask.Request) -> str: + """Generates a hashed key that fits (more or less) to a *WEB-browser + session* in a network.""" return ( PING_KEY + "[" + secret_hash( - get_real_ip(request) + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') + network.compressed + request.headers.get('Accept-Language', '') + request.headers.get('User-Agent', '') ) + "]" ) diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py index 7edbb1ce0..a8beb5e88 100644 --- a/searx/plugins/limiter.py +++ b/searx/plugins/limiter.py @@ -8,7 +8,6 @@ import flask from searx import redisdb from searx.plugins import logger from searx.botdetection import limiter -from searx.botdetection import dump_request name = "Request limiter" description = "Limit the number of request" @@ -20,10 +19,7 @@ logger = logger.getChild('limiter') def pre_request(): """See :ref:`flask.Flask.before_request`""" - ret_val = limiter.filter_request(flask.request) - if ret_val is None: - logger.debug("OK: %s" % dump_request(flask.request)) - return ret_val + return limiter.filter_request(flask.request) def init(app: flask.Flask, settings) -> bool: @@ -32,6 +28,5 @@ def init(app: flask.Flask, settings) -> bool: if not redisdb.client(): logger.error("The limiter requires Redis") return False - limiter.init_cfg(logger) app.before_request(pre_request) return True