This commit is contained in:
czaky 2024-06-13 10:46:50 +02:00 committed by GitHub
commit c331cf64cf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 401 additions and 50 deletions

View file

@ -173,3 +173,4 @@ features or generally made searx better:
- Austin Olacsi `<https://github.com/Austin-Olacsi>`
- @micsthepick
- Daniel Kukula `<https://github.com/dkuku>`
- @czaky `<https://github.com/czaky>`

View file

@ -47,6 +47,7 @@ engine is shown. Most of the options have a default value or even are optional.
max_keepalive_connections: 10
keepalive_expiry: 5.0
using_tor_proxy: false
proxy_request_redundancy: 1
proxies:
http:
- http://proxy1:8080
@ -154,6 +155,9 @@ engine is shown. Most of the options have a default value or even are optional.
``proxies`` :
Overwrites proxy settings from :ref:`settings outgoing`.
``proxy_request_redundancy`` :
Overwrites ``proxy_request_redundancy`` from :ref:`settings outgoing`.
``using_tor_proxy`` :
Using tor proxy (``true``) or not (``false``) for this engine. The default is
taken from ``using_tor_proxy`` of the :ref:`settings outgoing`.
@ -241,4 +245,3 @@ Example configuration in settings.yml for a German and English speaker:
When searching, the default google engine will return German results and
"google english" will return English results.

View file

@ -22,9 +22,9 @@ Communication with search engines.
# and https://www.python-httpx.org/compatibility/#ssl-configuration
# verify: ~/.mitmproxy/mitmproxy-ca-cert.cer
#
# uncomment below section if you want to use a proxyq see: SOCKS proxies
# Uncomment below section if you want to use a proxy. See:
# https://2.python-requests.org/en/latest/user/advanced/#proxies
# are also supported: see
# SOCKS proxies are also supported. See:
# https://2.python-requests.org/en/latest/user/advanced/#socks
#
# proxies:
@ -34,6 +34,11 @@ Communication with search engines.
#
# using_tor_proxy: true
#
# Uncomment below if you want to make multiple requests in parallel
# through all the proxies at once:
#
# proxy_request_redundancy: 4
#
# Extra seconds to add in order to account for the time taken by the proxy
#
# extra_proxy_timeout: 10.0
@ -70,6 +75,10 @@ Communication with search engines.
If there are more than one proxy for one protocol (http, https),
requests to the engines are distributed in a round-robin fashion.
``proxy_request_redundancy`` :
Cycle the proxies one by one (``1``) or use them in parallel (``> 1``) for all engines.
The default is ``1`` and can be overwritten in the :ref:`settings engine`.
``source_ips`` :
If you use multiple network interfaces, define from which IP the requests must
be made. Example:
@ -106,5 +115,3 @@ Communication with search engines.
``using_tor_proxy`` :
Using tor proxy (``true``) or not (``false``) for all engines. The default is
``false`` and can be overwritten in the :ref:`settings engine`

View file

@ -110,6 +110,10 @@ class Engine: # pylint: disable=too-few-public-methods
https: socks5://proxy:port
"""
proxy_request_redundancy: int
"""Cycle proxies one by one (``1``) or
use them in parallel at once (``> 1``) for this engine."""
disabled: bool
"""To disable by default the engine, but not deleting it. It will allow the
user to manually activate it in the settings."""

View file

@ -1,14 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, global-statement
from __future__ import annotations
import asyncio
import contextlib
import logging
import random
from ssl import SSLContext
import threading
from typing import Any, Dict
from typing import Any, Dict, Iterable
import httpx
import httpcore
from httpx_socks import AsyncProxyTransport
from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError
@ -112,7 +116,8 @@ class AsyncProxyTransportFixed(AsyncProxyTransport):
raise httpx.ProxyError("ProxyError: " + e.args[0], request=request) from e
def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries):
def get_socks_transport(verify, http2, local_address, proxy_url, limit, retries):
"""Return an AsyncProxyTransport."""
# support socks5h (requests compatibility):
# https://requests.readthedocs.io/en/master/user/advanced/#socks
# socks5:// hostname is resolved on client side
@ -141,7 +146,8 @@ def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit
)
def get_transport(verify, http2, local_address, proxy_url, limit, retries):
def get_http_transport(verify, http2, local_address, proxy_url, limit, retries):
"""Return an AsyncHTTPTransport."""
verify = get_sslcontexts(None, None, verify, True, http2) if verify is True else verify
return httpx.AsyncHTTPTransport(
# pylint: disable=protected-access
@ -154,6 +160,169 @@ def get_transport(verify, http2, local_address, proxy_url, limit, retries):
)
def get_single_transport(
    limit: httpx.Limits | None = None,
    proxy_url: str | None = None,
    local_address: str | None = None,
    retries: int = 0,
    *,
    verify: bool = True,
    http2: bool = True,
) -> httpx.AsyncBaseTransport:
    """Build one plain (non-parallel) transport.

    A SOCKS transport is created when `proxy_url` starts with ``socks4://``,
    ``socks5://`` or ``socks5h://``.  Any other value, including no proxy at
    all, yields an HTTP transport.

    Parameters
    ----------
    limit : httpx.Limits | None, optional
        Limits applied to the transport.  A default ``httpx.Limits()`` is
        used when omitted.
    proxy_url : str | None, optional
        Proxy to use for the transport.
    local_address : str | None, optional
        Local address to specify in the connection.
    retries : int, optional
        How many times to retry the request, by default 0.
    verify : bool, optional
        Verify the certificates, by default True.
    http2 : bool, optional
        Enable the HTTP2 protocol, by default True.

    Returns
    -------
    httpx.AsyncBaseTransport
        An async transport object.
    """
    socks_schemes = ('socks4://', 'socks5://', 'socks5h://')
    if not limit:
        limit = httpx.Limits()
    uses_socks = bool(proxy_url) and proxy_url.startswith(socks_schemes)
    build = get_socks_transport if uses_socks else get_http_transport
    return build(verify, http2, local_address, proxy_url, limit, retries)
class AsyncParallelTransport(httpx.AsyncBaseTransport):
    """Fan out request to multiple base transports.

    Every request is handed to `proxy_request_redundancy` consecutive
    transports, picked round-robin from the list given at construction time.
    With a redundancy of ``1`` this is a plain round-robin over the transports.
    """

    def __init__(
        self,
        transports: Iterable[httpx.AsyncBaseTransport],
        proxy_request_redundancy: int,
        network_logger: logging.Logger | None = None,
    ) -> None:
        """Init the parallel transport using a list of base `transports`.

        Parameters
        ----------
        transports : Iterable[httpx.AsyncBaseTransport]
            Transports to distribute the requests over.  Must not be empty.
        proxy_request_redundancy : int
            Number of transports used for each request.  Values below ``1``
            are logged and replaced by ``1``.
        network_logger : logging.Logger | None, optional
            Logger to report to.  The module logger is used when omitted.

        Raises
        ------
        ValueError
            If `transports` is empty.
        """
        self._logger = network_logger or logger
        self._transports = list(transports)
        if not self._transports:
            msg = "Got an empty list of (proxy) transports."
            raise ValueError(msg)
        if proxy_request_redundancy < 1:
            self._logger.warning("Invalid proxy_request_redundancy specified: %d", proxy_request_redundancy)
            proxy_request_redundancy = 1
        self._proxy_request_redundancy = proxy_request_redundancy
        # The round-robin starts at a random position in the transport list.
        self._index = random.randrange(len(self._transports))  # noqa: S311

    async def handle_async_request(
        self,
        request: httpx.Request,
    ) -> httpx.Response:
        # pylint: disable=too-many-branches
        """Issue parallel requests to the selected sub-transports.

        The outcome does not depend on the order in which the sub-requests
        complete:

        * a non-error response wins and ends the fan-out,
        * a 404 is accepted as the answer when no non-error response has been
          seen yet,
        * any other error response is only returned once every sub-request
          has finished without a better result,
        * an exception is raised only if no sub-request produced a response.

        Parameters
        ----------
        request : httpx.Request
            Request to pass to the transports.

        Returns
        -------
        httpx.Response
            The selected response, see above.

        Raises
        ------
        httpx.RequestError
            If none of the sub-requests returned a response.  With a
            redundancy of ``1`` the exception of the single transport is
            propagated unchanged.
        """
        tcount = len(self._transports)
        redundancy = self._proxy_request_redundancy
        first = self._index
        self._index = (first + redundancy) % tcount

        if redundancy == 1:
            # Plain round-robin: nothing to aggregate, so the response or the
            # exception of the transport is handed to the caller untouched.
            return await self._transports[first % tcount].handle_async_request(request)

        response = None  # non-error response, taking precedence
        error_response = None  # any error response
        request_error = None  # first request related exception
        pending = {
            asyncio.create_task(self._transports[i % tcount].handle_async_request(request))
            for i in range(first, first + redundancy)
        }
        try:
            while pending and response is None:
                done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
                for task in done:
                    try:
                        result = task.result()
                    except (
                        httpx.HTTPError,
                        httpcore.ProtocolError,
                        httpcore.NetworkError,
                        httpcore.TimeoutException,
                        # Low level semaphore errors.
                        ValueError,
                    ) as e:
                        if request_error is None:
                            self._logger.warning("Request error: %s for %s", e, request.url)
                            request_error = e
                        continue
                    if not result.is_error:
                        response = result
                    elif result.status_code == 404 and response is None:
                        error_response = response = result
                    elif error_response is None:
                        self._logger.warning("Error response: %s for %s", result.status_code, request.url)
                        error_response = result
        finally:
            # Also reached when the caller is cancelled or an unexpected
            # exception escapes, so no sub-request is left running.
            await self._cancel_pending(pending, redundancy)

        if response is not None:
            return response
        if error_response is not None:
            return error_response
        msg = "No valid response."
        raise httpx.RequestError(msg, request=request) from request_error

    async def _cancel_pending(self, pending, redundancy: int) -> None:
        """Cancel the sub-requests in `pending` and wait until they are finished."""
        if not pending:
            return
        self._logger.debug("Cancelling %d/%d redundant proxy requests.", len(pending), redundancy)
        for task in pending:
            task.cancel()
        # `return_exceptions=True` collects whatever the cancelled tasks end
        # with, so their failures can not replace the outcome decided by the
        # caller of this helper.
        with contextlib.suppress(asyncio.exceptions.CancelledError):
            await asyncio.gather(*pending, return_exceptions=True)

    async def aclose(self) -> None:
        """Close all the transports.

        Every transport is closed even if closing one of them fails.  The
        first failure is re-raised afterwards.
        """
        results = await asyncio.gather(
            *(transport.aclose() for transport in self._transports),
            return_exceptions=True,
        )
        for result in results:
            if isinstance(result, BaseException):
                raise result
def get_transport(
    proxy_urls: list,
    limit: httpx.Limits | None = None,
    local_address: str | None = None,
    proxy_request_redundancy: int = 1,
    retries: int = 0,
    network_logger: logging.Logger = logger,
    *,
    verify: bool = True,
    http2: bool = True,
) -> httpx.AsyncBaseTransport:
    """Return a single http/proxy transport or the parallel version of those.

    With no proxy URL or exactly one, the result is the transport built by
    `get_single_transport`.  With several URLs, one transport is built per URL
    and all of them are wrapped in an `AsyncParallelTransport`.
    """
    shared_limit = limit or httpx.Limits()

    def build(proxy_url):
        # All transports share the same limits and connection settings.
        return get_single_transport(
            limit=shared_limit,
            proxy_url=proxy_url,
            local_address=local_address,
            retries=retries,
            verify=verify,
            http2=http2,
        )

    urls = proxy_urls or []
    if len(urls) > 1:
        return AsyncParallelTransport(map(build, proxy_urls), proxy_request_redundancy, network_logger)
    return build(urls[0] if urls else None)
def new_client(
# pylint: disable=too-many-arguments
enable_http,
@ -163,10 +332,12 @@ def new_client(
max_keepalive_connections,
keepalive_expiry,
proxies,
proxy_request_redundancy,
local_address,
retries,
max_redirects,
hook_log_response,
network_logger,
):
limit = httpx.Limits(
max_connections=max_connections,
@ -175,20 +346,24 @@ def new_client(
)
# See https://www.python-httpx.org/advanced/#routing
mounts = {}
for pattern, proxy_url in proxies.items():
for pattern, proxy_urls in proxies.items():
if not enable_http and pattern.startswith('http://'):
continue
if proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://'):
mounts[pattern] = get_transport_for_socks_proxy(
verify, enable_http2, local_address, proxy_url, limit, retries
)
else:
mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries)
mounts[pattern] = get_transport(
verify=verify,
http2=enable_http2,
local_address=local_address,
proxy_urls=proxy_urls,
proxy_request_redundancy=proxy_request_redundancy,
limit=limit,
retries=retries,
network_logger=network_logger,
)
if not enable_http:
mounts['http://'] = AsyncHTTPTransportNoHttp()
transport = get_transport(verify, enable_http2, local_address, None, limit, retries)
transport = get_http_transport(verify, enable_http2, local_address, None, limit, retries)
event_hooks = None
if hook_log_response:

View file

@ -2,10 +2,11 @@
# pylint: disable=global-statement
# pylint: disable=missing-module-docstring, missing-class-docstring
from __future__ import annotations
import atexit
import asyncio
import ipaddress
from itertools import cycle
from typing import Dict
import httpx
@ -46,12 +47,14 @@ class Network:
'keepalive_expiry',
'local_addresses',
'proxies',
'proxy_request_redundancy',
'using_tor_proxy',
'max_redirects',
'retries',
'retry_on_http_error',
'_local_addresses_cycle',
'_proxies_cycle',
'_proxies_by_pattern',
'_clients',
'_logger',
)
@ -68,6 +71,7 @@ class Network:
max_keepalive_connections=None,
keepalive_expiry=None,
proxies=None,
proxy_request_redundancy=1,
using_tor_proxy=False,
local_addresses=None,
retries=0,
@ -83,13 +87,15 @@ class Network:
self.max_keepalive_connections = max_keepalive_connections
self.keepalive_expiry = keepalive_expiry
self.proxies = proxies
self.proxy_request_redundancy = proxy_request_redundancy
self.using_tor_proxy = using_tor_proxy
self.local_addresses = local_addresses
self.retries = retries
self.retry_on_http_error = retry_on_http_error
self.max_redirects = max_redirects
self._local_addresses_cycle = self.get_ipaddress_cycle()
self._proxies_cycle = self.get_proxy_cycles()
# Contains a dictionary with a list of proxies by pattern.
self._proxies_by_pattern = dict(self.iter_proxies())
self._clients = {}
self._logger = logger.getChild(logger_name) if logger_name else logger
self.check_parameters()
@ -132,21 +138,17 @@ class Network:
return
# https://www.python-httpx.org/compatibility/#proxy-keys
if isinstance(self.proxies, str):
yield 'all://', [self.proxies]
else:
for pattern, proxy_url in self.proxies.items():
yield 'all://', (self.proxies,)
elif isinstance(self.proxies, dict):
for pattern, proxy_urls in self.proxies.items():
pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern)
if isinstance(proxy_url, str):
proxy_url = [proxy_url]
yield pattern, proxy_url
def get_proxy_cycles(self):
proxy_settings = {}
for pattern, proxy_urls in self.iter_proxies():
proxy_settings[pattern] = cycle(proxy_urls)
while True:
# pylint: disable=stop-iteration-return
yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items())
if isinstance(proxy_urls, str):
yield pattern, (proxy_urls,)
else:
yield pattern, tuple(proxy_urls)
else:
msg = "`proxies` need to be either a string or a patthern to url dictionary."
raise ValueError(msg)
async def log_response(self, response: httpx.Response):
request = response.request
@ -181,10 +183,11 @@ class Network:
verify = self.verify if verify is None else verify
max_redirects = self.max_redirects if max_redirects is None else max_redirects
local_address = next(self._local_addresses_cycle)
proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key
key = (verify, max_redirects, local_address, proxies)
hook_log_response = self.log_response if searx_debug else None
if key not in self._clients or self._clients[key].is_closed:
proxies = self._proxies_by_pattern
key = (verify, max_redirects, local_address)
client = self._clients.get(key)
if not client or client.is_closed:
client = new_client(
self.enable_http,
verify,
@ -192,17 +195,19 @@ class Network:
self.max_connections,
self.max_keepalive_connections,
self.keepalive_expiry,
dict(proxies),
proxies,
self.proxy_request_redundancy,
local_address,
0,
max_redirects,
hook_log_response,
self._logger,
)
if self.using_tor_proxy and not await self.check_tor_proxy(client, proxies):
await client.aclose()
raise httpx.ProxyError('Network configuration problem: not using Tor')
self._clients[key] = client
return self._clients[key]
return client
async def aclose(self):
async def close_client(client):
@ -340,6 +345,7 @@ def initialize(settings_engines=None, settings_outgoing=None):
'local_addresses': settings_outgoing['source_ips'],
'using_tor_proxy': settings_outgoing['using_tor_proxy'],
'proxies': settings_outgoing['proxies'],
'proxy_request_redundancy': settings_outgoing['proxy_request_redundancy'],
'max_redirects': settings_outgoing['max_redirects'],
'retries': settings_outgoing['retries'],
'retry_on_http_error': None,

View file

@ -178,9 +178,9 @@ outgoing:
# and https://www.python-httpx.org/compatibility/#ssl-configuration
# verify: ~/.mitmproxy/mitmproxy-ca-cert.cer
#
# uncomment below section if you want to use a proxyq see: SOCKS proxies
# Uncomment below section if you want to use a proxy. See:
# https://2.python-requests.org/en/latest/user/advanced/#proxies
# are also supported: see
# SOCKS proxies are also supported. See:
# https://2.python-requests.org/en/latest/user/advanced/#socks
#
# proxies:
@ -190,6 +190,11 @@ outgoing:
#
# using_tor_proxy: true
#
# Uncomment below if you want to make multiple requests in parallel
# through all the proxies at once:
#
# proxy_request_redundancy: 4
#
# Extra seconds to add in order to account for the time taken by the proxy
#
# extra_proxy_timeout: 10

View file

@ -221,6 +221,7 @@ SCHEMA = {
'max_redirects': SettingsValue(int, 30),
'retries': SettingsValue(int, 0),
'proxies': SettingsValue((None, str, dict), None),
'proxy_request_redundancy': SettingsValue(int, 1),
'source_ips': SettingsValue((None, str, list), None),
# Tor configuration
'using_tor_proxy': SettingsValue(bool, False),

View file

@ -0,0 +1,144 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Test module for the client and proxy handling code."""
from unittest.mock import patch, Mock
import httpx
from searx.network import client
from tests import SearxTestCase
class TestClient(SearxTestCase):
    """Tests for the client and proxy handling code."""

    # Target patched by the tests below: the request handler of the SOCKS transport.
    HANDLER = 'searx.network.client.AsyncProxyTransportFixed.handle_async_request'
    PROXY_URLS = ("socks5h://local:1080", "socks5h://local:1180")

    def _parallel_transport(self, redundancy):
        """Return a parallel transport over `PROXY_URLS` with the given redundancy."""
        t = client.get_transport(
            proxy_urls=list(self.PROXY_URLS),
            proxy_request_redundancy=redundancy,
        )
        self.assertIsInstance(t, client.AsyncParallelTransport)
        return t

    @staticmethod
    def _request():
        """Return the request sent through the transports under test."""
        return httpx.Request(url="http://wiki.com", method="GET")

    def test_get_single_transport(self):
        for scheme in ("socks4", "socks5", "socks5h"):
            with self.subTest(scheme=scheme):
                t = client.get_single_transport(proxy_url=f"{scheme}://local:1080")
                self.assertIsInstance(t, client.AsyncProxyTransportFixed)
        t = client.get_single_transport(proxy_url="https://local:8080")
        self.assertIsInstance(t, httpx.AsyncHTTPTransport)

    def test_get_parallel_transport(self):
        t = client.get_transport(proxy_urls=list(self.PROXY_URLS))
        self.assertIsInstance(t, client.AsyncParallelTransport)

    @patch(
        HANDLER,
        side_effect=[httpx.Response(200, html="<html/>"), httpx.Response(301, html="<html/>")],
    )
    async def test_parallel_transport_ok(self, handler_mock: Mock):
        # Redundancy 1: every request goes to exactly one transport.
        t = self._parallel_transport(redundancy=1)
        request = self._request()
        response = await t.handle_async_request(request)
        self.assertEqual(response.status_code, 200)
        handler_mock.assert_called_once_with(request)

        response = await t.handle_async_request(request)
        self.assertEqual(handler_mock.call_count, 2)
        self.assertEqual(response.status_code, 301)

    @patch(
        HANDLER,
        side_effect=[httpx.Response(403, html="<html/>"), httpx.Response(200, html="<html/>")],
    )
    async def test_parallel_transport_403(self, handler_mock: Mock):
        t = self._parallel_transport(redundancy=2)
        request = self._request()
        response = await t.handle_async_request(request)
        handler_mock.assert_called_with(request)
        self.assertEqual(handler_mock.call_count, 2)
        self.assertEqual(response.status_code, 200)

    @patch(
        HANDLER,
        side_effect=[httpx.Response(404, html="<html/>"), httpx.Response(404, html="<html/>")],
    )
    async def test_parallel_transport_404_404(self, handler_mock: Mock):
        t = self._parallel_transport(redundancy=2)
        request = self._request()
        response = await t.handle_async_request(request)
        handler_mock.assert_called_with(request)
        self.assertEqual(handler_mock.call_count, 2)
        self.assertEqual(response.status_code, 404)

    @patch(
        HANDLER,
        side_effect=[httpx.Response(200, html="<html/>"), httpx.Response(404, html="<html/>")],
    )
    async def test_parallel_transport_404_200(self, handler_mock: Mock):
        t = self._parallel_transport(redundancy=2)
        request = self._request()
        response = await t.handle_async_request(request)
        handler_mock.assert_called_with(request)
        self.assertEqual(handler_mock.call_count, 2)
        self.assertEqual(response.status_code, 200)

    @patch(
        HANDLER,
        side_effect=[httpx.Response(403, html="<html/>"), httpx.Response(403, html="<html/>")],
    )
    async def test_parallel_transport_403_403(self, handler_mock: Mock):
        t = self._parallel_transport(redundancy=2)
        request = self._request()
        response = await t.handle_async_request(request)
        handler_mock.assert_called_with(request)
        self.assertEqual(handler_mock.call_count, 2)
        self.assertEqual(response.status_code, 403)

    @patch(
        HANDLER,
        side_effect=[httpx.RequestError("OMG!"), httpx.Response(200, html="<html/>")],
    )
    async def test_parallel_transport_ex_ok(self, handler_mock: Mock):
        t = self._parallel_transport(redundancy=2)
        request = self._request()
        response = await t.handle_async_request(request)
        handler_mock.assert_called_with(request)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(handler_mock.call_count, 2)

    @patch(
        HANDLER,
        side_effect=[httpx.RequestError("OMG!"), httpx.RequestError("OMG!")],
    )
    async def test_parallel_transport_ex_ex(self, handler_mock: Mock):
        t = self._parallel_transport(redundancy=2)
        request = self._request()
        response = None
        # Only the raising call belongs inside the context manager: anything
        # placed after it in the `with` body would never be executed.
        with self.assertRaises(httpx.RequestError):
            response = await t.handle_async_request(request)
        handler_mock.assert_called_with(request)
        self.assertIsNone(response)
        self.assertEqual(handler_mock.call_count, 2)

View file

@ -17,7 +17,7 @@ class TestNetwork(SearxTestCase): # pylint: disable=missing-class-docstring
network = Network()
self.assertEqual(next(network._local_addresses_cycle), None)
self.assertEqual(next(network._proxies_cycle), ())
self.assertEqual(network._proxies_by_pattern, {})
def test_ipaddress_cycle(self):
network = NETWORKS['ipv6']
@ -47,26 +47,31 @@ class TestNetwork(SearxTestCase): # pylint: disable=missing-class-docstring
with self.assertRaises(ValueError):
Network(local_addresses=['not_an_ip_address'])
def test_proxy_cycles(self):
def test_proxies_by_patterns(self):
network = Network(proxies='http://localhost:1337')
self.assertEqual(next(network._proxies_cycle), (('all://', 'http://localhost:1337'),))
self.assertEqual(network._proxies_by_pattern, {'all://': ('http://localhost:1337',)})
network = Network(proxies={'https': 'http://localhost:1337', 'http': 'http://localhost:1338'})
self.assertEqual(
next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))
)
self.assertEqual(
next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))
network._proxies_by_pattern,
{
'https://': ('http://localhost:1337',),
'http://': ('http://localhost:1338',),
},
)
network = Network(
proxies={'https': ['http://localhost:1337', 'http://localhost:1339'], 'http': 'http://localhost:1338'}
)
self.assertEqual(
next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))
)
self.assertEqual(
next(network._proxies_cycle), (('https://', 'http://localhost:1339'), ('http://', 'http://localhost:1338'))
network._proxies_by_pattern,
{
'https://': (
'http://localhost:1337',
'http://localhost:1339',
),
'http://': ('http://localhost:1338',),
},
)
with self.assertRaises(ValueError):