mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
[network]: Add redundant parallel proxy requests.
Anecdotally, using SearX over unreliable proxies,
like Tor, seems to be quite error-prone.
SearX puts considerable effort into measuring the
performance and reliability of engines, most
likely owing to those aspects being of
significant concern.
The patch here proposes to mitigate related
problems, by issuing concurrent redundant requests
through the specified proxies at once, returning
the first response that is not an error.
The functionality is enabled using the
`proxy_request_redundancy` parameter within the
outgoing network settings or the engine settings.
Example:
```yaml
outgoing:
request_timeout: 8.0
proxies:
"all://":
- socks5h://tor:9050
- socks5h://tor1:9050
- socks5h://tor2:9050
- socks5h://tor3:9050
proxy_request_redundancy: 4
```
In this example, each network request will be
sent 4 times, once through every proxy. The
first (non-error) response wins.
In my testing environment using several tor proxy
end-points, this approach almost entirely
removes engine errors related to timeouts
and denied requests. The latency of the
network system is also improved.
The implementation uses an
`AsyncParallelTransport(httpx.AsyncBaseTransport)`
wrapper to wrap multiple sub-transports,
and `asyncio.wait` to wait on the first completed
request.
The existing implementation of the network
proxy cycling has also been moved into the
`AsyncParallelTransport` class, which should
improve network client memoization and
performance.
TESTED:
- unit tests for the new functions and classes.
- tested on desktop PC with 10+ upstream proxies
and comparable request redundancy.
This commit is contained in:
parent
2f2d93b292
commit
122a9568de
10 changed files with 382 additions and 59 deletions
|
|
@ -2,10 +2,11 @@
|
|||
# pylint: disable=global-statement
|
||||
# pylint: disable=missing-module-docstring, missing-class-docstring
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import atexit
|
||||
import asyncio
|
||||
import ipaddress
|
||||
from itertools import cycle
|
||||
from typing import Dict
|
||||
|
||||
import httpx
|
||||
|
|
@ -46,12 +47,14 @@ class Network:
|
|||
'keepalive_expiry',
|
||||
'local_addresses',
|
||||
'proxies',
|
||||
'proxy_request_redundancy',
|
||||
'using_tor_proxy',
|
||||
'max_redirects',
|
||||
'retries',
|
||||
'retry_on_http_error',
|
||||
'_local_addresses_cycle',
|
||||
'_proxies_cycle',
|
||||
'_proxies_by_pattern',
|
||||
'_clients',
|
||||
'_logger',
|
||||
)
|
||||
|
|
@ -68,6 +71,7 @@ class Network:
|
|||
max_keepalive_connections=None,
|
||||
keepalive_expiry=None,
|
||||
proxies=None,
|
||||
proxy_request_redundancy=1,
|
||||
using_tor_proxy=False,
|
||||
local_addresses=None,
|
||||
retries=0,
|
||||
|
|
@ -83,13 +87,15 @@ class Network:
|
|||
self.max_keepalive_connections = max_keepalive_connections
|
||||
self.keepalive_expiry = keepalive_expiry
|
||||
self.proxies = proxies
|
||||
self.proxy_request_redundancy = proxy_request_redundancy
|
||||
self.using_tor_proxy = using_tor_proxy
|
||||
self.local_addresses = local_addresses
|
||||
self.retries = retries
|
||||
self.retry_on_http_error = retry_on_http_error
|
||||
self.max_redirects = max_redirects
|
||||
self._local_addresses_cycle = self.get_ipaddress_cycle()
|
||||
self._proxies_cycle = self.get_proxy_cycles()
|
||||
# Contains a dictionary with a list of proxies by pattern.
|
||||
self._proxies_by_pattern = dict(self.iter_proxies())
|
||||
self._clients = {}
|
||||
self._logger = logger.getChild(logger_name) if logger_name else logger
|
||||
self.check_parameters()
|
||||
|
|
@ -132,21 +138,16 @@ class Network:
|
|||
return
|
||||
# https://www.python-httpx.org/compatibility/#proxy-keys
|
||||
if isinstance(self.proxies, str):
|
||||
yield 'all://', [self.proxies]
|
||||
else:
|
||||
for pattern, proxy_url in self.proxies.items():
|
||||
yield 'all://', (self.proxies,)
|
||||
elif isinstance(self.proxies, dict):
|
||||
for pattern, proxy_urls in self.proxies.items():
|
||||
pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern)
|
||||
if isinstance(proxy_url, str):
|
||||
proxy_url = [proxy_url]
|
||||
yield pattern, proxy_url
|
||||
|
||||
def get_proxy_cycles(self):
|
||||
proxy_settings = {}
|
||||
for pattern, proxy_urls in self.iter_proxies():
|
||||
proxy_settings[pattern] = cycle(proxy_urls)
|
||||
while True:
|
||||
# pylint: disable=stop-iteration-return
|
||||
yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items())
|
||||
if isinstance(proxy_urls, str):
|
||||
yield pattern, (proxy_urls,)
|
||||
else:
|
||||
yield pattern, tuple(proxy_urls)
|
||||
else:
|
||||
raise ValueError("`proxies` need to be either a string or a patthern to url dictionary.")
|
||||
|
||||
async def log_response(self, response: httpx.Response):
|
||||
request = response.request
|
||||
|
|
@ -181,10 +182,11 @@ class Network:
|
|||
verify = self.verify if verify is None else verify
|
||||
max_redirects = self.max_redirects if max_redirects is None else max_redirects
|
||||
local_address = next(self._local_addresses_cycle)
|
||||
proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key
|
||||
key = (verify, max_redirects, local_address, proxies)
|
||||
hook_log_response = self.log_response if searx_debug else None
|
||||
if key not in self._clients or self._clients[key].is_closed:
|
||||
proxies = self._proxies_by_pattern
|
||||
key = (verify, max_redirects, local_address)
|
||||
client = self._clients.get(key)
|
||||
if not client or client.is_closed:
|
||||
client = new_client(
|
||||
self.enable_http,
|
||||
verify,
|
||||
|
|
@ -192,17 +194,19 @@ class Network:
|
|||
self.max_connections,
|
||||
self.max_keepalive_connections,
|
||||
self.keepalive_expiry,
|
||||
dict(proxies),
|
||||
proxies,
|
||||
self.proxy_request_redundancy,
|
||||
local_address,
|
||||
0,
|
||||
max_redirects,
|
||||
hook_log_response,
|
||||
self._logger,
|
||||
)
|
||||
if self.using_tor_proxy and not await self.check_tor_proxy(client, proxies):
|
||||
await client.aclose()
|
||||
raise httpx.ProxyError('Network configuration problem: not using Tor')
|
||||
self._clients[key] = client
|
||||
return self._clients[key]
|
||||
return client
|
||||
|
||||
async def aclose(self):
|
||||
async def close_client(client):
|
||||
|
|
@ -340,13 +344,13 @@ def initialize(settings_engines=None, settings_outgoing=None):
|
|||
'local_addresses': settings_outgoing['source_ips'],
|
||||
'using_tor_proxy': settings_outgoing['using_tor_proxy'],
|
||||
'proxies': settings_outgoing['proxies'],
|
||||
'proxy_request_redundancy': settings_outgoing['proxy_request_redundancy'],
|
||||
'max_redirects': settings_outgoing['max_redirects'],
|
||||
'retries': settings_outgoing['retries'],
|
||||
'retry_on_http_error': None,
|
||||
}
|
||||
|
||||
def new_network(params, logger_name=None):
|
||||
nonlocal default_params
|
||||
result = {}
|
||||
result.update(default_params)
|
||||
result.update(params)
|
||||
|
|
@ -354,8 +358,7 @@ def initialize(settings_engines=None, settings_outgoing=None):
|
|||
result['logger_name'] = logger_name
|
||||
return Network(**result)
|
||||
|
||||
def iter_networks():
|
||||
nonlocal settings_engines
|
||||
def iter_engine_networks():
|
||||
for engine_spec in settings_engines:
|
||||
engine_name = engine_spec['name']
|
||||
engine = engines.get(engine_name)
|
||||
|
|
@ -376,7 +379,7 @@ def initialize(settings_engines=None, settings_outgoing=None):
|
|||
NETWORKS[network_name] = new_network(network, logger_name=network_name)
|
||||
|
||||
# define networks from engines.[i].network (except references)
|
||||
for engine_name, engine, network in iter_networks():
|
||||
for engine_name, engine, network in iter_engine_networks():
|
||||
if network is None:
|
||||
network = {}
|
||||
for attribute_name, attribute_value in default_params.items():
|
||||
|
|
@ -389,7 +392,7 @@ def initialize(settings_engines=None, settings_outgoing=None):
|
|||
NETWORKS[engine_name] = new_network(network, logger_name=engine_name)
|
||||
|
||||
# define networks from engines.[i].network (references)
|
||||
for engine_name, engine, network in iter_networks():
|
||||
for engine_name, engine, network in iter_engine_networks():
|
||||
if isinstance(network, str):
|
||||
NETWORKS[engine_name] = NETWORKS[network]
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue