From 122a9568de6a07ab77fb97025734033499fc5c9f Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 02:09:29 +0000 Subject: [PATCH 1/7] [network]: Add redundant parallel proxy requests. Anecdotally, using SearX over unreliable proxies, like tor, seems to be quite error prone. SearX puts quite an effort to measure the performance and reliability of engines, most likely owing to those aspects being of significant concern. The patch here proposes to mitigate related problems, by issuing concurrent redundant requests through the specified proxies at once, returning the first response that is not an error. The functionality is enabled using the `proxy_request_redundancy` parameter within the outgoing network settings or the engine settings. Example: ```yaml outgoing: request_timeout: 8.0 proxies: "all://": - socks5h://tor:9050 - socks5h://tor1:9050 - socks5h://tor2:9050 - socks5h://tor3:9050 proxy_request_redundancy: 4 ``` In this example, each network request will be sent 4 times, once through every proxy. The first (non-error) response wins. In my testing environment using several tor proxy end-points, this approach almost entirely removes engine errors related to timeouts and denied requests. The latency of the network system is also improved. The implementation uses an `AsyncParallelTransport(httpx.AsyncBaseTransport)` wrapper to wrap multiple sub-transports, and `asyncio.wait` to wait on the first completed request. The existing implementation of the network proxy cycling has also been moved into the `AsyncParallelTransport` class, which should improve network client memoization and performance. TESTED: - unit tests for the new functions and classes. - tested on desktop PC with 10+ upstream proxies and comparable request redundancy. 
--- AUTHORS.rst | 1 + docs/admin/settings/settings_engine.rst | 5 +- docs/admin/settings/settings_outgoing.rst | 15 +- searx/enginelib/__init__.py | 4 + searx/network/client.py | 194 ++++++++++++++++++++-- searx/network/network.py | 55 +++--- searx/settings.yml | 9 +- searx/settings_defaults.py | 1 + tests/unit/network/test_client.py | 128 ++++++++++++++ tests/unit/network/test_network.py | 29 ++-- 10 files changed, 382 insertions(+), 59 deletions(-) create mode 100644 tests/unit/network/test_client.py diff --git a/AUTHORS.rst b/AUTHORS.rst index 011735b55..8177f1c7e 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -173,3 +173,4 @@ features or generally made searx better: - Austin Olacsi `` - @micsthepick - Daniel Kukula `` +- @czaky `` diff --git a/docs/admin/settings/settings_engine.rst b/docs/admin/settings/settings_engine.rst index 78c400ccf..b668b2b45 100644 --- a/docs/admin/settings/settings_engine.rst +++ b/docs/admin/settings/settings_engine.rst @@ -47,6 +47,7 @@ engine is shown. Most of the options have a default value or even are optional. max_keepalive_connections: 10 keepalive_expiry: 5.0 using_tor_proxy: false + proxy_request_redundancy: 1 proxies: http: - http://proxy1:8080 @@ -154,6 +155,9 @@ engine is shown. Most of the options have a default value or even are optional. ``proxies`` : Overwrites proxy settings from :ref:`settings outgoing`. +``proxy_request_redundancy`` : + Overwrites proxy settings from :ref:`settings outgoing`. + ``using_tor_proxy`` : Using tor proxy (``true``) or not (``false``) for this engine. The default is taken from ``using_tor_proxy`` of the :ref:`settings outgoing`. @@ -241,4 +245,3 @@ Example configuration in settings.yml for a German and English speaker: When searching, the default google engine will return German results and "google english" will return English results. 
- diff --git a/docs/admin/settings/settings_outgoing.rst b/docs/admin/settings/settings_outgoing.rst index 7d49ab789..96b3466cd 100644 --- a/docs/admin/settings/settings_outgoing.rst +++ b/docs/admin/settings/settings_outgoing.rst @@ -22,9 +22,9 @@ Communication with search engines. # and https://www.python-httpx.org/compatibility/#ssl-configuration # verify: ~/.mitmproxy/mitmproxy-ca-cert.cer # - # uncomment below section if you want to use a proxyq see: SOCKS proxies + # Uncomment below section if you want to use a proxy. See: # https://2.python-requests.org/en/latest/user/advanced/#proxies - # are also supported: see + # SOCKS proxies are also supported. See: # https://2.python-requests.org/en/latest/user/advanced/#socks # # proxies: @@ -34,6 +34,11 @@ Communication with search engines. # # using_tor_proxy: true # + # Uncomment below if you want to make multiple request in parallel + # through all the proxies at once: + # + # proxy_request_redundancy: 10 + # # Extra seconds to add in order to account for the time taken by the proxy # # extra_proxy_timeout: 10.0 @@ -70,6 +75,10 @@ Communication with search engines. If there are more than one proxy for one protocol (http, https), requests to the engines are distributed in a round-robin fashion. +``proxy_request_redundancy`` : + Cycle the proxies (``1``) on by one or use them in parallel (``> 1``) for all engines. + The default is ``1`` and can be overwritten in the :ref:`settings engine` + ``source_ips`` : If you use multiple network interfaces, define from which IP the requests must be made. Example: @@ -106,5 +115,3 @@ Communication with search engines. ``using_tor_proxy`` : Using tor proxy (``true``) or not (``false``) for all engines. 
The default is ``false`` and can be overwritten in the :ref:`settings engine` - - diff --git a/searx/enginelib/__init__.py b/searx/enginelib/__init__.py index 6e6c24cb7..b4ec1ab68 100644 --- a/searx/enginelib/__init__.py +++ b/searx/enginelib/__init__.py @@ -110,6 +110,10 @@ class Engine: # pylint: disable=too-few-public-methods https: socks5://proxy:port """ + proxy_request_redundancy: int + """Cycle proxies one by one (``1``) or + use them in parallel at once (``> 1``) for this engine.""" + disabled: bool """To disable by default the engine, but not deleting it. It will allow the user to manually activate it in the settings.""" diff --git a/searx/network/client.py b/searx/network/client.py index 32bc5af42..25113a189 100644 --- a/searx/network/client.py +++ b/searx/network/client.py @@ -1,14 +1,18 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # pylint: disable=missing-module-docstring, global-statement +from __future__ import annotations + import asyncio +import contextlib import logging import random from ssl import SSLContext import threading -from typing import Any, Dict +from typing import Any, Dict, Iterable import httpx +import httpcore from httpx_socks import AsyncProxyTransport from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError @@ -112,7 +116,8 @@ class AsyncProxyTransportFixed(AsyncProxyTransport): raise httpx.ProxyError("ProxyError: " + e.args[0], request=request) from e -def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries): +def get_socks_transport(verify, http2, local_address, proxy_url, limit, retries): + """Return an AsyncProxyTransport.""" # support socks5h (requests compatibility): # https://requests.readthedocs.io/en/master/user/advanced/#socks # socks5:// hostname is resolved on client side @@ -141,7 +146,8 @@ def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit ) -def get_transport(verify, http2, local_address, proxy_url, limit, 
retries): +def get_http_transport(verify, http2, local_address, proxy_url, limit, retries): + """Return an AsyncHTTPTransport.""" verify = get_sslcontexts(None, None, verify, True, http2) if verify is True else verify return httpx.AsyncHTTPTransport( # pylint: disable=protected-access @@ -154,6 +160,166 @@ def get_transport(verify, http2, local_address, proxy_url, limit, retries): ) +def get_single_transport( + limit: httpx.Limits | None = None, + proxy_url: str | None = None, + local_address: str | None = None, + retries: int = 0, + *, + verify: bool = True, + http2: bool = True, +) -> httpx.AsyncBaseTransport: + """Generate a single, non-parallel transport. + + Parameters + ---------- + limit : httpx.Limits + Limits applied to the to the transport. + proxy_url : str | None, optional + Proxy to use for the transport. + local_address : str | None, optional + local address to specify in the connection. + retries : int, optional + how many times to retry the request, by default 0 + verify : bool, optional + Verify the certificates, by default True + http2 : bool, optional + Enable HTTP2 protocol, by default True + + Returns + ------- + httpx.AsyncBaseTransport + An async transport object. + """ + limit = limit or httpx.Limits() + if proxy_url and proxy_url.startswith(('socks4://', 'socks5://', 'socks5h://')): + return get_socks_transport(verify, http2, local_address, proxy_url, limit, retries) + return get_http_transport(verify, http2, local_address, proxy_url, limit, retries) + + +class AsyncParallelTransport(httpx.AsyncBaseTransport): + """Fan out request to multiple base transports.""" + + def __init__( + self, + transports: Iterable[httpx.AsyncBaseTransport], + proxy_request_redundancy: int, + network_logger: logging.Logger, + ) -> None: + """Init the parallel transport using a list of base `transports`.""" + self._transports = list(transports) + if len(self._transports) == 0: + msg = "Got an empty list of (proxy) transports." 
+ raise ValueError(msg) + if proxy_request_redundancy < 1: + logger.warning("Invalid proxy_request_redundancy specified: %d", proxy_request_redundancy) + proxy_request_redundancy = 1 + self._proxy_request_redundancy = proxy_request_redundancy + self._index = random.randrange(len(self._transports)) # noqa: S311 + self._logger = network_logger or logger + + async def handle_async_request( + self, + request: httpx.Request, + ) -> httpx.Response: + """Issue parallel requests to all sub-transports. + + Return the response of the first completed. + + Parameters + ---------- + request : httpx.Request + Request to pass to the transports. + + Returns + ------- + httpx.Response + Response from the first completed request. + + """ + response = None # non-error response, taking precedence + error_response = None # any error response + request_error = None # any request related exception + tcount = len(self._transports) + redundancy = self._proxy_request_redundancy + pending = [ + asyncio.create_task(self._transports[i % tcount].handle_async_request(request)) + for i in range(self._index, self._index + redundancy) + ] + self._index = (self._index + redundancy) % tcount + while pending: + if len(pending) == 1: + return await pending.pop() + done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) + for task in done: + try: + result = task.result() + if not result.is_error or result.status_code == 404: + response = result + elif not error_response: + self._logger.warning("Error response: %s for %s", result.status_code, request.url) + error_response = result + except ( + httpx.HTTPError, + httpcore.ProtocolError, + httpcore.NetworkError, + httpcore.TimeoutException, + # Low level semaphore errors. 
+ ValueError, + ) as e: + if not request_error: + self._logger.warning("Request error: %s for %s", e, request.url) + request_error = e + if response: + break + if pending: + with contextlib.suppress(asyncio.exceptions.CancelledError): + gather = asyncio.gather(*pending) + gather.cancel() + self._logger.debug("Cancelling %d/%d redundant proxy requests.", len(pending), redundancy) + await gather + if response: + return response + if error_response: + return error_response + msg = "No valid response." + if request_error: + raise httpx.RequestError(msg) from request_error + raise httpx.RequestError(msg) + + async def aclose(self) -> None: + """Close all the transports.""" + for transport in self._transports: + await transport.aclose() + + +def get_transport( + proxy_urls: list, + limit: httpx.Limits | None = None, + local_address: str | None = None, + proxy_request_redundancy: int = 1, + retries: int = 0, + network_logger: logging.Logger = logger, + *, + verify: bool = True, + http2: bool = True, +) -> httpx.AsyncBaseTransport: + """Return a single http/proxy transport or the parallel version of those.""" + limit = limit or httpx.Limits() + # pylint: disable=unnecessary-lambda-assignment + transport = lambda proxy_url: get_single_transport( + verify=verify, + http2=http2, + local_address=local_address, + proxy_url=proxy_url, + limit=limit, + retries=retries, + ) + if len(proxy_urls or []) <= 1: + return transport(proxy_urls[0] if proxy_urls else None) + return AsyncParallelTransport(map(transport, proxy_urls), proxy_request_redundancy, network_logger) + + def new_client( # pylint: disable=too-many-arguments enable_http, @@ -163,10 +329,12 @@ def new_client( max_keepalive_connections, keepalive_expiry, proxies, + proxy_request_redundancy, local_address, retries, max_redirects, hook_log_response, + network_logger, ): limit = httpx.Limits( max_connections=max_connections, @@ -175,20 +343,24 @@ def new_client( ) # See https://www.python-httpx.org/advanced/#routing mounts = 
{} - for pattern, proxy_url in proxies.items(): + for pattern, proxy_urls in proxies.items(): if not enable_http and pattern.startswith('http://'): continue - if proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://'): - mounts[pattern] = get_transport_for_socks_proxy( - verify, enable_http2, local_address, proxy_url, limit, retries - ) - else: - mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries) + mounts[pattern] = get_transport( + verify=verify, + http2=enable_http2, + local_address=local_address, + proxy_urls=proxy_urls, + proxy_request_redundancy=proxy_request_redundancy, + limit=limit, + retries=retries, + network_logger=network_logger, + ) if not enable_http: mounts['http://'] = AsyncHTTPTransportNoHttp() - transport = get_transport(verify, enable_http2, local_address, None, limit, retries) + transport = get_http_transport(verify, enable_http2, local_address, None, limit, retries) event_hooks = None if hook_log_response: diff --git a/searx/network/network.py b/searx/network/network.py index 453c8d2fc..4941349b5 100644 --- a/searx/network/network.py +++ b/searx/network/network.py @@ -2,10 +2,11 @@ # pylint: disable=global-statement # pylint: disable=missing-module-docstring, missing-class-docstring +from __future__ import annotations + import atexit import asyncio import ipaddress -from itertools import cycle from typing import Dict import httpx @@ -46,12 +47,14 @@ class Network: 'keepalive_expiry', 'local_addresses', 'proxies', + 'proxy_request_redundancy', 'using_tor_proxy', 'max_redirects', 'retries', 'retry_on_http_error', '_local_addresses_cycle', '_proxies_cycle', + '_proxies_by_pattern', '_clients', '_logger', ) @@ -68,6 +71,7 @@ class Network: max_keepalive_connections=None, keepalive_expiry=None, proxies=None, + proxy_request_redundancy=1, using_tor_proxy=False, local_addresses=None, retries=0, @@ -83,13 +87,15 @@ class Network: 
self.max_keepalive_connections = max_keepalive_connections self.keepalive_expiry = keepalive_expiry self.proxies = proxies + self.proxy_request_redundancy = proxy_request_redundancy self.using_tor_proxy = using_tor_proxy self.local_addresses = local_addresses self.retries = retries self.retry_on_http_error = retry_on_http_error self.max_redirects = max_redirects self._local_addresses_cycle = self.get_ipaddress_cycle() - self._proxies_cycle = self.get_proxy_cycles() + # Contains a dictionary with a list of proxies by pattern. + self._proxies_by_pattern = dict(self.iter_proxies()) self._clients = {} self._logger = logger.getChild(logger_name) if logger_name else logger self.check_parameters() @@ -132,21 +138,16 @@ class Network: return # https://www.python-httpx.org/compatibility/#proxy-keys if isinstance(self.proxies, str): - yield 'all://', [self.proxies] - else: - for pattern, proxy_url in self.proxies.items(): + yield 'all://', (self.proxies,) + elif isinstance(self.proxies, dict): + for pattern, proxy_urls in self.proxies.items(): pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern) - if isinstance(proxy_url, str): - proxy_url = [proxy_url] - yield pattern, proxy_url - - def get_proxy_cycles(self): - proxy_settings = {} - for pattern, proxy_urls in self.iter_proxies(): - proxy_settings[pattern] = cycle(proxy_urls) - while True: - # pylint: disable=stop-iteration-return - yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items()) + if isinstance(proxy_urls, str): + yield pattern, (proxy_urls,) + else: + yield pattern, tuple(proxy_urls) + else: + raise ValueError("`proxies` need to be either a string or a patthern to url dictionary.") async def log_response(self, response: httpx.Response): request = response.request @@ -181,10 +182,11 @@ class Network: verify = self.verify if verify is None else verify max_redirects = self.max_redirects if max_redirects is None else max_redirects local_address = 
next(self._local_addresses_cycle) - proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key - key = (verify, max_redirects, local_address, proxies) hook_log_response = self.log_response if searx_debug else None - if key not in self._clients or self._clients[key].is_closed: + proxies = self._proxies_by_pattern + key = (verify, max_redirects, local_address) + client = self._clients.get(key) + if not client or client.is_closed: client = new_client( self.enable_http, verify, @@ -192,17 +194,19 @@ class Network: self.max_connections, self.max_keepalive_connections, self.keepalive_expiry, - dict(proxies), + proxies, + self.proxy_request_redundancy, local_address, 0, max_redirects, hook_log_response, + self._logger, ) if self.using_tor_proxy and not await self.check_tor_proxy(client, proxies): await client.aclose() raise httpx.ProxyError('Network configuration problem: not using Tor') self._clients[key] = client - return self._clients[key] + return client async def aclose(self): async def close_client(client): @@ -340,13 +344,13 @@ def initialize(settings_engines=None, settings_outgoing=None): 'local_addresses': settings_outgoing['source_ips'], 'using_tor_proxy': settings_outgoing['using_tor_proxy'], 'proxies': settings_outgoing['proxies'], + 'proxy_request_redundancy': settings_outgoing['proxy_request_redundancy'], 'max_redirects': settings_outgoing['max_redirects'], 'retries': settings_outgoing['retries'], 'retry_on_http_error': None, } def new_network(params, logger_name=None): - nonlocal default_params result = {} result.update(default_params) result.update(params) @@ -354,8 +358,7 @@ def initialize(settings_engines=None, settings_outgoing=None): result['logger_name'] = logger_name return Network(**result) - def iter_networks(): - nonlocal settings_engines + def iter_engine_networks(): for engine_spec in settings_engines: engine_name = engine_spec['name'] engine = engines.get(engine_name) @@ -376,7 +379,7 @@ def initialize(settings_engines=None, 
settings_outgoing=None): NETWORKS[network_name] = new_network(network, logger_name=network_name) # define networks from engines.[i].network (except references) - for engine_name, engine, network in iter_networks(): + for engine_name, engine, network in iter_engine_networks(): if network is None: network = {} for attribute_name, attribute_value in default_params.items(): @@ -389,7 +392,7 @@ def initialize(settings_engines=None, settings_outgoing=None): NETWORKS[engine_name] = new_network(network, logger_name=engine_name) # define networks from engines.[i].network (references) - for engine_name, engine, network in iter_networks(): + for engine_name, engine, network in iter_engine_networks(): if isinstance(network, str): NETWORKS[engine_name] = NETWORKS[network] diff --git a/searx/settings.yml b/searx/settings.yml index b5f1a2b36..620cfb759 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -178,9 +178,9 @@ outgoing: # and https://www.python-httpx.org/compatibility/#ssl-configuration # verify: ~/.mitmproxy/mitmproxy-ca-cert.cer # - # uncomment below section if you want to use a proxyq see: SOCKS proxies + # Uncomment below section if you want to use a proxy. See: # https://2.python-requests.org/en/latest/user/advanced/#proxies - # are also supported: see + # SOCKS proxies are also supported. 
See: # https://2.python-requests.org/en/latest/user/advanced/#socks # # proxies: @@ -190,6 +190,11 @@ outgoing: # # using_tor_proxy: true # + # Uncomment below if you want to make multiple request in parallel + # through all the proxies at once: + # + # proxy_request_redundancy: 10 + # # Extra seconds to add in order to account for the time taken by the proxy # # extra_proxy_timeout: 10 diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py index 93b04257c..63ea0dfe1 100644 --- a/searx/settings_defaults.py +++ b/searx/settings_defaults.py @@ -221,6 +221,7 @@ SCHEMA = { 'max_redirects': SettingsValue(int, 30), 'retries': SettingsValue(int, 0), 'proxies': SettingsValue((None, str, dict), None), + 'proxy_request_redundancy': SettingsValue(int, 1), 'source_ips': SettingsValue((None, str, list), None), # Tor configuration 'using_tor_proxy': SettingsValue(bool, False), diff --git a/tests/unit/network/test_client.py b/tests/unit/network/test_client.py new file mode 100644 index 000000000..a37ce4400 --- /dev/null +++ b/tests/unit/network/test_client.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Test module for the client and proxy handling code.""" + +from unittest.mock import patch, Mock + +import httpx + +from searx.network import client +from tests import SearxTestCase + + +class TestClient(SearxTestCase): + """Tests for the client and proxy handling code.""" + + def test_get_single_transport(self): + t = client.get_single_transport(proxy_url="socks4://local:1080") + assert isinstance(t, client.AsyncProxyTransportFixed) + t = client.get_single_transport(proxy_url="socks5://local:1080") + assert isinstance(t, client.AsyncProxyTransportFixed) + t = client.get_single_transport(proxy_url="socks5h://local:1080") + assert isinstance(t, client.AsyncProxyTransportFixed) + t = client.get_single_transport(proxy_url="https://local:8080") + assert isinstance(t, httpx.AsyncHTTPTransport) + + def test_get_parallel_transport(self): + t = 
client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + ) + assert isinstance(t, client.AsyncParallelTransport) + + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + side_effect=[httpx.Response(200, html=""), httpx.Response(301, html="")], + ) + async def test_parallel_transport_ok(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + ) + request = httpx.Request(url="http://wiki.com", method="GET") + response = await t.handle_async_request(request) + assert response.status_code == 200 + handler_mock.assert_called_once_with(request) + + response = await t.handle_async_request(request) + assert response.status_code == 301 + assert handler_mock.call_count == 2 + + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + side_effect=[httpx.Response(403, html=""), httpx.Response(200, html="")], + ) + async def test_parallel_transport_403(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + proxy_request_redundancy=2, + ) + assert isinstance(t, client.AsyncParallelTransport) + request = httpx.Request(url="http://wiki.com", method="GET") + response = await t.handle_async_request(request) + handler_mock.assert_called_with(request) + assert response.status_code == 200 + assert handler_mock.call_count == 2 + + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + side_effect=[httpx.Response(404, html=""), httpx.Response(200, html="")], + ) + async def test_parallel_transport_404(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + proxy_request_redundancy=2, + ) + assert isinstance(t, client.AsyncParallelTransport) + request = httpx.Request(url="http://wiki.com", method="GET") + response = await t.handle_async_request(request) + 
handler_mock.assert_called_with(request) + assert response.status_code == 404 + assert handler_mock.call_count == 2 + + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + side_effect=[httpx.Response(403, html=""), httpx.Response(403, html="")], + ) + async def test_parallel_transport_403_403(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + proxy_request_redundancy=2, + ) + assert isinstance(t, client.AsyncParallelTransport) + request = httpx.Request(url="http://wiki.com", method="GET") + response = await t.handle_async_request(request) + handler_mock.assert_called_with(request) + assert response.status_code == 403 + assert handler_mock.call_count == 2 + + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + side_effect=[httpx.RequestError("OMG!"), httpx.Response(200, html="")], + ) + async def test_parallel_transport_ex_ok(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + proxy_request_redundancy=2, + ) + assert isinstance(t, client.AsyncParallelTransport) + request = httpx.Request(url="http://wiki.com", method="GET") + response = await t.handle_async_request(request) + handler_mock.assert_called_with(request) + assert response.status_code == 200 + assert handler_mock.call_count == 2 + + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + side_effect=[httpx.RequestError("OMG!"), httpx.RequestError("OMG!")], + ) + async def test_parallel_transport_ex_ex(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + proxy_request_redundancy=2, + ) + assert isinstance(t, client.AsyncParallelTransport) + request = httpx.Request(url="http://wiki.com", method="GET") + response = None + with self.assertRaises(httpx.RequestError): + response = await t.handle_async_request(request) + 
handler_mock.assert_called_with(request) + assert not response + assert handler_mock.call_count == 2 diff --git a/tests/unit/network/test_network.py b/tests/unit/network/test_network.py index eabb23082..c5312491c 100644 --- a/tests/unit/network/test_network.py +++ b/tests/unit/network/test_network.py @@ -17,7 +17,7 @@ class TestNetwork(SearxTestCase): # pylint: disable=missing-class-docstring network = Network() self.assertEqual(next(network._local_addresses_cycle), None) - self.assertEqual(next(network._proxies_cycle), ()) + self.assertEqual(network._proxies_by_pattern, {}) def test_ipaddress_cycle(self): network = NETWORKS['ipv6'] @@ -47,27 +47,26 @@ class TestNetwork(SearxTestCase): # pylint: disable=missing-class-docstring with self.assertRaises(ValueError): Network(local_addresses=['not_an_ip_address']) - def test_proxy_cycles(self): + def test_proxies_by_patterns(self): network = Network(proxies='http://localhost:1337') - self.assertEqual(next(network._proxies_cycle), (('all://', 'http://localhost:1337'),)) + assert network._proxies_by_pattern == {'all://': ('http://localhost:1337',)} network = Network(proxies={'https': 'http://localhost:1337', 'http': 'http://localhost:1338'}) - self.assertEqual( - next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')) - ) - self.assertEqual( - next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')) - ) + assert network._proxies_by_pattern == { + 'https://': ('http://localhost:1337',), + 'http://': ('http://localhost:1338',), + } network = Network( proxies={'https': ['http://localhost:1337', 'http://localhost:1339'], 'http': 'http://localhost:1338'} ) - self.assertEqual( - next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')) - ) - self.assertEqual( - next(network._proxies_cycle), (('https://', 'http://localhost:1339'), ('http://', 'http://localhost:1338')) - ) + 
assert network._proxies_by_pattern == { + 'https://': ( + 'http://localhost:1337', + 'http://localhost:1339', + ), + 'http://': ('http://localhost:1338',), + } with self.assertRaises(ValueError): Network(proxies=1) From 8fe19d54efb22c85e4e1911a8fe37be444ef1a19 Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 12:04:00 +0000 Subject: [PATCH 2/7] Using conservative unittest asserts --- tests/unit/network/test_client.py | 46 +++++++++++++++--------------- tests/unit/network/test_network.py | 30 +++++++++++-------- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/tests/unit/network/test_client.py b/tests/unit/network/test_client.py index a37ce4400..5fa08187e 100644 --- a/tests/unit/network/test_client.py +++ b/tests/unit/network/test_client.py @@ -14,19 +14,19 @@ class TestClient(SearxTestCase): def test_get_single_transport(self): t = client.get_single_transport(proxy_url="socks4://local:1080") - assert isinstance(t, client.AsyncProxyTransportFixed) + self.assertTrue(isinstance(t, client.AsyncProxyTransportFixed)) t = client.get_single_transport(proxy_url="socks5://local:1080") - assert isinstance(t, client.AsyncProxyTransportFixed) + self.assertTrue(isinstance(t, client.AsyncProxyTransportFixed)) t = client.get_single_transport(proxy_url="socks5h://local:1080") - assert isinstance(t, client.AsyncProxyTransportFixed) + self.assertTrue(isinstance(t, client.AsyncProxyTransportFixed)) t = client.get_single_transport(proxy_url="https://local:8080") - assert isinstance(t, httpx.AsyncHTTPTransport) + self.assertTrue(isinstance(t, httpx.AsyncHTTPTransport)) def test_get_parallel_transport(self): t = client.get_transport( proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], ) - assert isinstance(t, client.AsyncParallelTransport) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', @@ -38,12 +38,12 @@ class TestClient(SearxTestCase): ) request = 
httpx.Request(url="http://wiki.com", method="GET") response = await t.handle_async_request(request) - assert response.status_code == 200 + self.assertEqual(response.status_code, 200) handler_mock.assert_called_once_with(request) response = await t.handle_async_request(request) - assert response.status_code == 301 - assert handler_mock.call_count == 2 + self.assertEqual(handler_mock.call_count, 2) + self.assertEqual(response.status_code, 301) @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', @@ -54,12 +54,12 @@ class TestClient(SearxTestCase): proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], proxy_request_redundancy=2, ) - assert isinstance(t, client.AsyncParallelTransport) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) request = httpx.Request(url="http://wiki.com", method="GET") response = await t.handle_async_request(request) handler_mock.assert_called_with(request) - assert response.status_code == 200 - assert handler_mock.call_count == 2 + self.assertEqual(handler_mock.call_count, 2) + self.assertEqual(response.status_code, 200) @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', @@ -70,12 +70,12 @@ class TestClient(SearxTestCase): proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], proxy_request_redundancy=2, ) - assert isinstance(t, client.AsyncParallelTransport) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) request = httpx.Request(url="http://wiki.com", method="GET") response = await t.handle_async_request(request) handler_mock.assert_called_with(request) - assert response.status_code == 404 - assert handler_mock.call_count == 2 + self.assertEqual(handler_mock.call_count, 2) + self.assertEqual(response.status_code, 404) @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', @@ -86,12 +86,12 @@ class TestClient(SearxTestCase): proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], proxy_request_redundancy=2, ) - assert 
isinstance(t, client.AsyncParallelTransport) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) request = httpx.Request(url="http://wiki.com", method="GET") response = await t.handle_async_request(request) handler_mock.assert_called_with(request) - assert response.status_code == 403 - assert handler_mock.call_count == 2 + self.assertEqual(handler_mock.call_count, 2) + self.assertEqual(response.status_code, 403) @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', @@ -102,12 +102,12 @@ class TestClient(SearxTestCase): proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], proxy_request_redundancy=2, ) - assert isinstance(t, client.AsyncParallelTransport) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) request = httpx.Request(url="http://wiki.com", method="GET") response = await t.handle_async_request(request) handler_mock.assert_called_with(request) - assert response.status_code == 200 - assert handler_mock.call_count == 2 + self.assertEqual(response.status_code, 200) + self.assertEqual(handler_mock.call_count, 2) @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', @@ -118,11 +118,11 @@ class TestClient(SearxTestCase): proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], proxy_request_redundancy=2, ) - assert isinstance(t, client.AsyncParallelTransport) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) request = httpx.Request(url="http://wiki.com", method="GET") response = None with self.assertRaises(httpx.RequestError): response = await t.handle_async_request(request) handler_mock.assert_called_with(request) - assert not response - assert handler_mock.call_count == 2 + self.assertFalse(response) + self.assertEqual(handler_mock.call_count, 2) diff --git a/tests/unit/network/test_network.py b/tests/unit/network/test_network.py index c5312491c..f44408c67 100644 --- a/tests/unit/network/test_network.py +++ b/tests/unit/network/test_network.py @@ -49,24 +49,30 @@ 
class TestNetwork(SearxTestCase): # pylint: disable=missing-class-docstring def test_proxies_by_patterns(self): network = Network(proxies='http://localhost:1337') - assert network._proxies_by_pattern == {'all://': ('http://localhost:1337',)} + self.assertEqual(network._proxies_by_pattern, {'all://': ('http://localhost:1337',)}) network = Network(proxies={'https': 'http://localhost:1337', 'http': 'http://localhost:1338'}) - assert network._proxies_by_pattern == { - 'https://': ('http://localhost:1337',), - 'http://': ('http://localhost:1338',), - } + self.assertEqual( + network._proxies_by_pattern, + { + 'https://': ('http://localhost:1337',), + 'http://': ('http://localhost:1338',), + }, + ) network = Network( proxies={'https': ['http://localhost:1337', 'http://localhost:1339'], 'http': 'http://localhost:1338'} ) - assert network._proxies_by_pattern == { - 'https://': ( - 'http://localhost:1337', - 'http://localhost:1339', - ), - 'http://': ('http://localhost:1338',), - } + self.assertEqual( + network._proxies_by_pattern, + { + 'https://': ( + 'http://localhost:1337', + 'http://localhost:1339', + ), + 'http://': ('http://localhost:1338',), + }, + ) with self.assertRaises(ValueError): Network(proxies=1) From c302f113b05d0f8c7703841181e509cc04aab010 Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 12:45:46 +0000 Subject: [PATCH 3/7] add on workflow_dispatch to integration to verify why this fails remotely --- .github/workflows/integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index c6e74eaef..7d0f1db74 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -5,6 +5,7 @@ on: branches: ["master"] pull_request: branches: ["master"] + workflow_dispatch: permissions: contents: read From 5fe7e42d1a40d005279577d44fafa3382e9e8988 Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 12:55:00 +0000 Subject: [PATCH 4/7] Add preference for non 
404 responses fix test RC. Fixed race condition with the 404 test. --- searx/network/client.py | 5 ++++- tests/unit/network/test_client.py | 20 ++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/searx/network/client.py b/searx/network/client.py index 25113a189..f68248976 100644 --- a/searx/network/client.py +++ b/searx/network/client.py @@ -222,6 +222,7 @@ class AsyncParallelTransport(httpx.AsyncBaseTransport): self, request: httpx.Request, ) -> httpx.Response: + # pylint: disable=too-many-branches """Issue parallel requests to all sub-transports. Return the response of the first completed. @@ -254,7 +255,9 @@ class AsyncParallelTransport(httpx.AsyncBaseTransport): for task in done: try: result = task.result() - if not result.is_error or result.status_code == 404: + if not result.is_error: + response = result + elif result.status_code == 404 and response is None: response = result elif not error_response: self._logger.warning("Error response: %s for %s", result.status_code, request.url) diff --git a/tests/unit/network/test_client.py b/tests/unit/network/test_client.py index 5fa08187e..77323ce4d 100644 --- a/tests/unit/network/test_client.py +++ b/tests/unit/network/test_client.py @@ -63,9 +63,9 @@ class TestClient(SearxTestCase): @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', - side_effect=[httpx.Response(404, html=""), httpx.Response(200, html="")], + side_effect=[httpx.Response(404, html=""), httpx.Response(404, html="")], ) - async def test_parallel_transport_404(self, handler_mock: Mock): + async def test_parallel_transport_404_404(self, handler_mock: Mock): t = client.get_transport( proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], proxy_request_redundancy=2, @@ -77,6 +77,22 @@ class TestClient(SearxTestCase): self.assertEqual(handler_mock.call_count, 2) self.assertEqual(response.status_code, 404) + @patch( + 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', + 
side_effect=[httpx.Response(200, html=""), httpx.Response(404, html="")], + ) + async def test_parallel_transport_404_200(self, handler_mock: Mock): + t = client.get_transport( + proxy_urls=["socks5h://local:1080", "socks5h://local:1180"], + proxy_request_redundancy=2, + ) + self.assertTrue(isinstance(t, client.AsyncParallelTransport)) + request = httpx.Request(url="http://wiki.com", method="GET") + response = await t.handle_async_request(request) + handler_mock.assert_called_with(request) + self.assertEqual(handler_mock.call_count, 2) + self.assertEqual(response.status_code, 200) + @patch( 'searx.network.client.AsyncProxyTransportFixed.handle_async_request', side_effect=[httpx.Response(403, html=""), httpx.Response(403, html="")], From 6c375c7d282dd599f52131642203c5d59c2c45ea Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 23:34:39 +0000 Subject: [PATCH 5/7] Revert "add on workflow_dispatch to integration" This reverts commit c302f113b05d0f8c7703841181e509cc04aab010. --- .github/workflows/integration.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 7d0f1db74..c6e74eaef 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -5,7 +5,6 @@ on: branches: ["master"] pull_request: branches: ["master"] - workflow_dispatch: permissions: contents: read From e82516fb8d2efbc50933be6d020b6f1052de6c0c Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 23:51:24 +0000 Subject: [PATCH 6/7] cleanup --- docs/admin/settings/settings_outgoing.rst | 2 +- searx/network/client.py | 6 +++--- searx/network/network.py | 8 +++++--- searx/settings.yml | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/admin/settings/settings_outgoing.rst b/docs/admin/settings/settings_outgoing.rst index 96b3466cd..c912c64fa 100644 --- a/docs/admin/settings/settings_outgoing.rst +++ b/docs/admin/settings/settings_outgoing.rst @@ -37,7 +37,7 @@ 
Communication with search engines. # Uncomment below if you want to make multiple request in parallel # through all the proxies at once: # - # proxy_request_redundancy: 10 + # proxy_request_redundancy: 4 # # Extra seconds to add in order to account for the time taken by the proxy # diff --git a/searx/network/client.py b/searx/network/client.py index f68248976..699a2cc4d 100644 --- a/searx/network/client.py +++ b/searx/network/client.py @@ -207,16 +207,16 @@ class AsyncParallelTransport(httpx.AsyncBaseTransport): network_logger: logging.Logger, ) -> None: """Init the parallel transport using a list of base `transports`.""" + self._logger = network_logger or logger self._transports = list(transports) if len(self._transports) == 0: msg = "Got an empty list of (proxy) transports." raise ValueError(msg) if proxy_request_redundancy < 1: - logger.warning("Invalid proxy_request_redundancy specified: %d", proxy_request_redundancy) + self._logger.warning("Invalid proxy_request_redundancy specified: %d", proxy_request_redundancy) proxy_request_redundancy = 1 self._proxy_request_redundancy = proxy_request_redundancy self._index = random.randrange(len(self._transports)) # noqa: S311 - self._logger = network_logger or logger async def handle_async_request( self, @@ -258,7 +258,7 @@ class AsyncParallelTransport(httpx.AsyncBaseTransport): if not result.is_error: response = result elif result.status_code == 404 and response is None: - response = result + error_response = response = result elif not error_response: self._logger.warning("Error response: %s for %s", result.status_code, request.url) error_response = result diff --git a/searx/network/network.py b/searx/network/network.py index 4941349b5..b42eeb617 100644 --- a/searx/network/network.py +++ b/searx/network/network.py @@ -351,6 +351,7 @@ def initialize(settings_engines=None, settings_outgoing=None): } def new_network(params, logger_name=None): + nonlocal default_params result = {} result.update(default_params) 
result.update(params) @@ -358,7 +359,8 @@ def initialize(settings_engines=None, settings_outgoing=None): result['logger_name'] = logger_name return Network(**result) - def iter_engine_networks(): + def iter_networks(): + nonlocal settings_engines for engine_spec in settings_engines: engine_name = engine_spec['name'] engine = engines.get(engine_name) @@ -379,7 +381,7 @@ def initialize(settings_engines=None, settings_outgoing=None): NETWORKS[network_name] = new_network(network, logger_name=network_name) # define networks from engines.[i].network (except references) - for engine_name, engine, network in iter_engine_networks(): + for engine_name, engine, network in iter_networks(): if network is None: network = {} for attribute_name, attribute_value in default_params.items(): @@ -392,7 +394,7 @@ def initialize(settings_engines=None, settings_outgoing=None): NETWORKS[engine_name] = new_network(network, logger_name=engine_name) # define networks from engines.[i].network (references) - for engine_name, engine, network in iter_engine_networks(): + for engine_name, engine, network in iter_networks(): if isinstance(network, str): NETWORKS[engine_name] = NETWORKS[network] diff --git a/searx/settings.yml b/searx/settings.yml index 620cfb759..1743d2554 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -193,7 +193,7 @@ outgoing: # Uncomment below if you want to make multiple request in parallel # through all the proxies at once: # - # proxy_request_redundancy: 10 + # proxy_request_redundancy: 4 # # Extra seconds to add in order to account for the time taken by the proxy # From 9441c97a2682f2a0defc7d45cd3a966906bf5c98 Mon Sep 17 00:00:00 2001 From: czaky Date: Fri, 17 May 2024 23:51:24 +0000 Subject: [PATCH 7/7] cleanup --- docs/admin/settings/settings_outgoing.rst | 2 +- searx/network/client.py | 6 +++--- searx/network/network.py | 11 +++++++---- searx/settings.yml | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git 
a/docs/admin/settings/settings_outgoing.rst b/docs/admin/settings/settings_outgoing.rst index 96b3466cd..c912c64fa 100644 --- a/docs/admin/settings/settings_outgoing.rst +++ b/docs/admin/settings/settings_outgoing.rst @@ -37,7 +37,7 @@ Communication with search engines. # Uncomment below if you want to make multiple request in parallel # through all the proxies at once: # - # proxy_request_redundancy: 10 + # proxy_request_redundancy: 4 # # Extra seconds to add in order to account for the time taken by the proxy # diff --git a/searx/network/client.py b/searx/network/client.py index f68248976..699a2cc4d 100644 --- a/searx/network/client.py +++ b/searx/network/client.py @@ -207,16 +207,16 @@ class AsyncParallelTransport(httpx.AsyncBaseTransport): network_logger: logging.Logger, ) -> None: """Init the parallel transport using a list of base `transports`.""" + self._logger = network_logger or logger self._transports = list(transports) if len(self._transports) == 0: msg = "Got an empty list of (proxy) transports." 
raise ValueError(msg) if proxy_request_redundancy < 1: - logger.warning("Invalid proxy_request_redundancy specified: %d", proxy_request_redundancy) + self._logger.warning("Invalid proxy_request_redundancy specified: %d", proxy_request_redundancy) proxy_request_redundancy = 1 self._proxy_request_redundancy = proxy_request_redundancy self._index = random.randrange(len(self._transports)) # noqa: S311 - self._logger = network_logger or logger async def handle_async_request( self, @@ -258,7 +258,7 @@ class AsyncParallelTransport(httpx.AsyncBaseTransport): if not result.is_error: response = result elif result.status_code == 404 and response is None: - response = result + error_response = response = result elif not error_response: self._logger.warning("Error response: %s for %s", result.status_code, request.url) error_response = result diff --git a/searx/network/network.py b/searx/network/network.py index 4941349b5..9613384c3 100644 --- a/searx/network/network.py +++ b/searx/network/network.py @@ -147,7 +147,8 @@ class Network: else: yield pattern, tuple(proxy_urls) else: - raise ValueError("`proxies` need to be either a string or a patthern to url dictionary.") + msg = "`proxies` need to be either a string or a patthern to url dictionary." 
+ raise ValueError(msg) async def log_response(self, response: httpx.Response): request = response.request @@ -351,6 +352,7 @@ def initialize(settings_engines=None, settings_outgoing=None): } def new_network(params, logger_name=None): + nonlocal default_params result = {} result.update(default_params) result.update(params) @@ -358,7 +360,8 @@ def initialize(settings_engines=None, settings_outgoing=None): result['logger_name'] = logger_name return Network(**result) - def iter_engine_networks(): + def iter_networks(): + nonlocal settings_engines for engine_spec in settings_engines: engine_name = engine_spec['name'] engine = engines.get(engine_name) @@ -379,7 +382,7 @@ def initialize(settings_engines=None, settings_outgoing=None): NETWORKS[network_name] = new_network(network, logger_name=network_name) # define networks from engines.[i].network (except references) - for engine_name, engine, network in iter_engine_networks(): + for engine_name, engine, network in iter_networks(): if network is None: network = {} for attribute_name, attribute_value in default_params.items(): @@ -392,7 +395,7 @@ def initialize(settings_engines=None, settings_outgoing=None): NETWORKS[engine_name] = new_network(network, logger_name=engine_name) # define networks from engines.[i].network (references) - for engine_name, engine, network in iter_engine_networks(): + for engine_name, engine, network in iter_networks(): if isinstance(network, str): NETWORKS[engine_name] = NETWORKS[network] diff --git a/searx/settings.yml b/searx/settings.yml index 620cfb759..1743d2554 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -193,7 +193,7 @@ outgoing: # Uncomment below if you want to make multiple request in parallel # through all the proxies at once: # - # proxy_request_redundancy: 10 + # proxy_request_redundancy: 4 # # Extra seconds to add in order to account for the time taken by the proxy #