mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	- counting requests in LONG_WINDOW and BURST_WINDOW is not needed when the request is validated by the link_token method [1] - renew a ping-key on validation [2], this is needed for infinite scrolling, where no new token (CSS) is loaded. / this does not fix the BURST_MAX issue in the vanilla limiter - normalize the counter names of the ip_limit method to 'ip_limit.*' - just integrate the ip_limit method straight forward in the limiter plugin / non intermediate code --> ip_limit now returns None or a werkzeug.Response object that can be passed by the plugin to the flask application / non intermediate code that returns a tuple [1] https://github.com/searxng/searxng/pull/2357#issuecomment-1566113277 [2] https://github.com/searxng/searxng/pull/2357#discussion_r1208542206 [3] https://github.com/searxng/searxng/pull/2357#issuecomment-1566125979 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
		
			
				
	
	
		
			57 lines
		
	
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			57 lines
		
	
	
	
		
			1.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
						|
# lint: pylint
 | 
						|
"""
 | 
						|
Method ``http_user_agent``
 | 
						|
--------------------------
 | 
						|
 | 
						|
The ``http_user_agent`` method evaluates a request as the request of a bot if
 | 
						|
the User-Agent_ header is unset or matches the regular expression
 | 
						|
:py:obj:`USER_AGENT`.
 | 
						|
 | 
						|
.. _User-Agent:
 | 
						|
   https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
 | 
						|
 | 
						|
"""
 | 
						|
# pylint: disable=unused-argument
 | 
						|
 | 
						|
from typing import Optional
 | 
						|
import re
 | 
						|
import flask
 | 
						|
import werkzeug
 | 
						|
 | 
						|
from searx.tools import config
 | 
						|
from ._helpers import too_many_requests
 | 
						|
 | 
						|
 | 
						|
USER_AGENT = (
    r'('
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    # the dot of "archive.org_bot" is escaped: it has to match a literal "." only
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive\.org_bot|msnbot'
    + r'|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot'
    # unmaintained Farside instances (the literal agent string is escaped as a whole)
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and clients to block: PetalBot anywhere in the agent string
    + r'|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches the User-Agent_ of known *bots*.

The alternatives form one single group.  ``unknown`` is the placeholder value
used when a request comes without a User-Agent_ header.
"""
 | 
						|
 | 
						|
# Cache for the compiled USER_AGENT pattern, filled on the first call of
# regexp_user_agent().
_regexp = None


def regexp_user_agent():
    """Return the compiled :py:obj:`USER_AGENT` regular expression.

    The pattern is compiled on the first call only and stored in the module
    level ``_regexp``; later calls return that same pattern object.
    """
    global _regexp  # pylint: disable=global-statement
    if _regexp is not None:
        return _regexp
    _regexp = re.compile(USER_AGENT)
    return _regexp
 | 
						|
 | 
						|
 | 
						|
def filter_request(request: flask.Request, cfg: config.Config) -> Optional[werkzeug.Response]:
    """Check the User-Agent_ header of ``request`` against the bot pattern.

    :param request: the incoming request; only its ``User-Agent`` header is read.
    :param cfg: not used by this method.
    :return: ``None`` when the header does not match, otherwise the result of
        ``too_many_requests``.

    A missing header is handled as the agent string ``unknown``.  The pattern
    is applied with ``match()``, i.e. it is tested at the beginning of the
    agent string.
    """
    agent = request.headers.get('User-Agent', 'unknown')
    if regexp_user_agent().match(agent) is None:
        # agent string is not in the list of known bots: nothing to block
        return None
    return too_many_requests(request, f"bot detected, HTTP header User-Agent: {agent}")
 |