forked from zaclys/searxng
551 lines
18 KiB
Python
551 lines
18 KiB
Python
import atexit
|
|
import sys
|
|
import threading
|
|
import asyncio
|
|
import logging
|
|
import concurrent.futures
|
|
from time import time
|
|
from itertools import cycle
|
|
|
|
import httpcore
|
|
import httpx
|
|
import h2.exceptions
|
|
from httpx_socks import AsyncProxyTransport
|
|
from python_socks import parse_proxy_url
|
|
import python_socks._errors
|
|
|
|
from searx import settings
|
|
from searx import logger
|
|
from searx.raise_for_httperror import raise_for_httperror
|
|
|
|
# Optional uvloop (support Python 3.6)
|
|
try:
|
|
import uvloop
|
|
except ImportError:
|
|
pass
|
|
else:
|
|
uvloop.install()
|
|
|
|
# queue.SimpleQueue: Support Python 3.6
|
|
try:
|
|
from queue import SimpleQueue
|
|
except ImportError:
|
|
from queue import Empty
|
|
from collections import deque
|
|
|
|
class SimpleQueue:
|
|
"""Minimal backport of queue.SimpleQueue"""
|
|
|
|
def __init__(self):
|
|
self._queue = deque()
|
|
self._count = threading.Semaphore(0)
|
|
|
|
def put(self, item):
|
|
self._queue.append(item)
|
|
self._count.release()
|
|
|
|
def get(self, timeout=None):
|
|
if not self._count.acquire(True, timeout):
|
|
raise Empty
|
|
return self._queue.popleft()
|
|
|
|
|
|
logger = logger.getChild('poolrequests')
|
|
|
|
|
|
try:
|
|
import ssl
|
|
if ssl.OPENSSL_VERSION_INFO[0:3] < (1, 0, 2):
|
|
# https://github.com/certifi/python-certifi#1024-bit-root-certificates
|
|
logger.critical('You are using an old openssl version({0}), please upgrade above 1.0.2!'
|
|
.format(ssl.OPENSSL_VERSION))
|
|
sys.exit(1)
|
|
except ImportError:
|
|
ssl = None
|
|
if not getattr(ssl, "HAS_SNI", False):
|
|
try:
|
|
import OpenSSL # pylint: disable=unused-import
|
|
except ImportError:
|
|
logger.critical("ssl doesn't support SNI and the pyopenssl module is not installed.\n"
|
|
"Some HTTPS connections will fail")
|
|
sys.exit(1)
|
|
|
|
|
|
LOOP = None
|
|
CLIENTS = dict()
|
|
THREADLOCAL = threading.local()
|
|
LIMITS = httpx.Limits(
|
|
# Magic number kept from previous code
|
|
max_connections=settings['outgoing'].get('pool_connections', 100),
|
|
# Picked from constructor
|
|
max_keepalive_connections=settings['outgoing'].get('pool_maxsize', 10),
|
|
#
|
|
keepalive_expiry=settings['outgoing'].get('keepalive_expiry', 5.0)
|
|
)
|
|
# default parameters for AsyncHTTPTransport
|
|
# see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # noqa
|
|
TRANSPORT_KWARGS = {
|
|
'http2': settings['outgoing'].get('http2', False),
|
|
'retries': 0,
|
|
'trust_env': False,
|
|
'backend': 'asyncio'
|
|
}
|
|
# requests compatibility when reading proxy settings from settings.yml
|
|
PROXY_PATTERN_MAPPING = {
|
|
'http': 'https://',
|
|
'https:': 'https://'
|
|
}
|
|
# default maximum redirect
|
|
# from https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55
|
|
DEFAULT_REDIRECT_LIMIT = 30
|
|
|
|
|
|
if settings['outgoing'].get('source_ips'):
|
|
LOCAL_ADDRESS_CYCLE = cycle(settings['outgoing'].get('source_ips'))
|
|
else:
|
|
LOCAL_ADDRESS_CYCLE = cycle((None, ))
|
|
|
|
|
|
def set_timeout_for_thread(timeout, start_time=None):
|
|
THREADLOCAL.timeout = timeout
|
|
THREADLOCAL.start_time = start_time
|
|
|
|
|
|
def set_enable_http_protocol(enable_http):
|
|
THREADLOCAL.enable_http = enable_http
|
|
|
|
|
|
def get_enable_http_protocol():
|
|
try:
|
|
return THREADLOCAL.enable_http
|
|
except AttributeError:
|
|
return False
|
|
|
|
|
|
def reset_time_for_thread():
|
|
THREADLOCAL.total_time = 0
|
|
|
|
|
|
def get_time_for_thread():
|
|
return THREADLOCAL.total_time
|
|
|
|
|
|
def get_proxy_cycles(proxy_settings):
|
|
if not proxy_settings:
|
|
return None
|
|
# Backwards compatibility for single proxy in settings.yml
|
|
for protocol, proxy in proxy_settings.items():
|
|
if isinstance(proxy, str):
|
|
proxy_settings[protocol] = [proxy]
|
|
|
|
for protocol in proxy_settings:
|
|
proxy_settings[protocol] = cycle(proxy_settings[protocol])
|
|
return proxy_settings
|
|
|
|
|
|
GLOBAL_PROXY_CYCLES = get_proxy_cycles(settings['outgoing'].get('proxies'))
|
|
|
|
|
|
def get_proxies(proxy_cycles):
|
|
if proxy_cycles:
|
|
return {protocol: next(proxy_cycle) for protocol, proxy_cycle in proxy_cycles.items()}
|
|
return None
|
|
|
|
|
|
def get_global_proxies():
|
|
return get_proxies(GLOBAL_PROXY_CYCLES)
|
|
|
|
|
|
async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool, url: httpcore._utils.URL):
|
|
origin = httpcore._utils.url_to_origin(url)
|
|
logger.debug('Drop connections for %r', origin)
|
|
connections_to_close = connection_pool._connections_for_origin(origin)
|
|
for connection in connections_to_close:
|
|
await connection_pool._remove_from_pool(connection)
|
|
try:
|
|
await connection.aclose()
|
|
except httpcore.NetworkError as e:
|
|
logger.warning('Error closing an existing connection', exc_info=e)
|
|
|
|
|
|
class AsyncHTTPTransportNoHttp(httpcore.AsyncHTTPTransport):
|
|
"""Block HTTP request"""
|
|
|
|
async def arequest(self, method, url, headers=None, stream=None, ext=None):
|
|
raise httpcore.UnsupportedProtocol("HTTP protocol is disabled")
|
|
|
|
|
|
class AsyncProxyTransportFixed(AsyncProxyTransport):
|
|
"""Fix httpx_socks.AsyncProxyTransport
|
|
|
|
Map python_socks exceptions to httpcore.ProxyError
|
|
|
|
Map socket.gaierror to httpcore.ConnectError
|
|
|
|
Note: keepalive_expiry is ignored, AsyncProxyTransport should call:
|
|
* self._keepalive_sweep()
|
|
* self._response_closed(self, connection)
|
|
|
|
Note: AsyncProxyTransport inherit from AsyncConnectionPool
|
|
|
|
Note: the API is going to change on httpx 0.18.0
|
|
see https://github.com/encode/httpx/pull/1522
|
|
"""
|
|
|
|
async def arequest(self, method, url, headers=None, stream=None, ext=None):
|
|
retry = 2
|
|
while retry > 0:
|
|
retry -= 1
|
|
try:
|
|
return await super().arequest(method, url, headers, stream, ext)
|
|
except (python_socks._errors.ProxyConnectionError,
|
|
python_socks._errors.ProxyTimeoutError,
|
|
python_socks._errors.ProxyError) as e:
|
|
raise httpcore.ProxyError(e)
|
|
except OSError as e:
|
|
# socket.gaierror when DNS resolution fails
|
|
raise httpcore.NetworkError(e)
|
|
except httpcore.RemoteProtocolError as e:
|
|
# in case of httpcore.RemoteProtocolError: Server disconnected
|
|
await close_connections_for_url(self, url)
|
|
logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
|
|
# retry
|
|
except (httpcore.NetworkError, httpcore.ProtocolError) as e:
|
|
# httpcore.WriteError on HTTP/2 connection leaves a new opened stream
|
|
# then each new request creates a new stream and raise the same WriteError
|
|
await close_connections_for_url(self, url)
|
|
raise e
|
|
|
|
|
|
class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport):
|
|
"""Fix httpx.AsyncHTTPTransport"""
|
|
|
|
async def arequest(self, method, url, headers=None, stream=None, ext=None):
|
|
retry = 2
|
|
while retry > 0:
|
|
retry -= 1
|
|
try:
|
|
return await super().arequest(method, url, headers, stream, ext)
|
|
except OSError as e:
|
|
# socket.gaierror when DNS resolution fails
|
|
raise httpcore.ConnectError(e)
|
|
except httpcore.CloseError as e:
|
|
# httpcore.CloseError: [Errno 104] Connection reset by peer
|
|
# raised by _keepalive_sweep()
|
|
# from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198 # noqa
|
|
await close_connections_for_url(self._pool, url)
|
|
logger.warning('httpcore.CloseError: retry', exc_info=e)
|
|
# retry
|
|
except httpcore.RemoteProtocolError as e:
|
|
# in case of httpcore.RemoteProtocolError: Server disconnected
|
|
await close_connections_for_url(self._pool, url)
|
|
logger.warning('httpcore.RemoteProtocolError: retry', exc_info=e)
|
|
# retry
|
|
except (httpcore.ProtocolError, httpcore.NetworkError) as e:
|
|
await close_connections_for_url(self._pool, url)
|
|
raise e
|
|
|
|
|
|
def get_transport_for_socks_proxy(verify, local_address, proxy_url):
|
|
global LOOP, LIMITS, TRANSPORT_KWARGS
|
|
# support socks5h (requests compatibility):
|
|
# https://requests.readthedocs.io/en/master/user/advanced/#socks
|
|
# socks5:// hostname is resolved on client side
|
|
# socks5h:// hostname is resolved on proxy side
|
|
rdns = False
|
|
socks5h = 'socks5h://'
|
|
if proxy_url.startswith(socks5h):
|
|
proxy_url = 'socks5://' + proxy_url[len(socks5h):]
|
|
rdns = True
|
|
|
|
proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url)
|
|
|
|
return AsyncProxyTransportFixed(proxy_type=proxy_type, proxy_host=proxy_host, proxy_port=proxy_port,
|
|
username=proxy_username, password=proxy_password,
|
|
rdns=rdns,
|
|
loop=LOOP,
|
|
verify=verify,
|
|
local_address=local_address,
|
|
max_connections=LIMITS.max_connections,
|
|
max_keepalive_connections=LIMITS.max_keepalive_connections,
|
|
keepalive_expiry=LIMITS.keepalive_expiry,
|
|
**TRANSPORT_KWARGS)
|
|
|
|
|
|
def get_transport(verify, local_address, proxy_url):
|
|
global LIMITS
|
|
return AsyncHTTPTransportFixed(verify=verify,
|
|
local_address=local_address,
|
|
limits=LIMITS,
|
|
proxy=httpx._config.Proxy(proxy_url) if proxy_url else None,
|
|
**TRANSPORT_KWARGS)
|
|
|
|
|
|
def iter_proxies(proxies):
|
|
# https://www.python-httpx.org/compatibility/#proxy-keys
|
|
if isinstance(proxies, str):
|
|
yield 'all://', proxies
|
|
elif isinstance(proxies, dict):
|
|
for pattern, proxy_url in proxies.items():
|
|
pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern)
|
|
yield pattern, proxy_url
|
|
|
|
|
|
def new_client(verify, local_address, proxies, max_redirects, enable_http):
|
|
# See https://www.python-httpx.org/advanced/#routing
|
|
mounts = {}
|
|
for pattern, proxy_url in iter_proxies(proxies):
|
|
if not enable_http and (pattern == 'http' or pattern.startswith('http://')):
|
|
continue
|
|
if proxy_url.startswith('socks4://') \
|
|
or proxy_url.startswith('socks5://') \
|
|
or proxy_url.startswith('socks5h://'):
|
|
mounts[pattern] = get_transport_for_socks_proxy(verify, local_address, proxy_url)
|
|
else:
|
|
mounts[pattern] = get_transport(verify, local_address, proxy_url)
|
|
|
|
if not enable_http:
|
|
mounts['http://'] = AsyncHTTPTransportNoHttp()
|
|
|
|
transport = get_transport(verify, local_address, None)
|
|
return httpx.AsyncClient(transport=transport, mounts=mounts, max_redirects=max_redirects)
|
|
|
|
|
|
def get_client(verify, local_address, proxies, max_redirects, allow_http):
|
|
global CLIENTS
|
|
key = (verify, local_address, repr(proxies), max_redirects, allow_http)
|
|
if key not in CLIENTS:
|
|
CLIENTS[key] = new_client(verify, local_address, proxies, max_redirects, allow_http)
|
|
return CLIENTS[key]
|
|
|
|
|
|
async def send_request(method, url, enable_http, kwargs):
|
|
if isinstance(url, bytes):
|
|
url = url.decode()
|
|
|
|
verify = kwargs.pop('verify', True)
|
|
local_address = next(LOCAL_ADDRESS_CYCLE)
|
|
proxies = kwargs.pop('proxies', None) or get_global_proxies()
|
|
max_redirects = kwargs.pop('max_redirects', DEFAULT_REDIRECT_LIMIT)
|
|
|
|
client = get_client(verify, local_address, proxies, max_redirects, enable_http)
|
|
response = await client.request(method.upper(), url, **kwargs)
|
|
|
|
# requests compatibility
|
|
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
|
|
response.ok = not response.is_error
|
|
|
|
return response
|
|
|
|
|
|
def request(method, url, **kwargs):
|
|
"""same as requests/requests/api.py request(...)"""
|
|
time_before_request = time()
|
|
|
|
# timeout
|
|
if 'timeout' in kwargs:
|
|
timeout = kwargs['timeout']
|
|
else:
|
|
timeout = getattr(THREADLOCAL, 'timeout', None)
|
|
if timeout is not None:
|
|
kwargs['timeout'] = timeout
|
|
|
|
# raise_for_error
|
|
check_for_httperror = True
|
|
if 'raise_for_httperror' in kwargs:
|
|
check_for_httperror = kwargs['raise_for_httperror']
|
|
del kwargs['raise_for_httperror']
|
|
|
|
# do request
|
|
future = asyncio.run_coroutine_threadsafe(send_request(method, url, get_enable_http_protocol(), kwargs), LOOP)
|
|
try:
|
|
if timeout:
|
|
timeout += 0.2 # overhead
|
|
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
|
|
if start_time:
|
|
timeout -= time() - start_time
|
|
|
|
response = future.result(timeout or 120)
|
|
except concurrent.futures.TimeoutError as e:
|
|
raise httpx.TimeoutException('Timeout', request=None) from e
|
|
|
|
# update total_time.
|
|
# See get_time_for_thread() and reset_time_for_thread()
|
|
if hasattr(THREADLOCAL, 'total_time'):
|
|
time_after_request = time()
|
|
THREADLOCAL.total_time += time_after_request - time_before_request
|
|
|
|
# raise an exception
|
|
if check_for_httperror:
|
|
raise_for_httperror(response)
|
|
|
|
return response
|
|
|
|
|
|
async def stream_chunk_to_queue(method, url, q, **kwargs):
|
|
verify = kwargs.pop('verify', True)
|
|
local_address = next(LOCAL_ADDRESS_CYCLE)
|
|
proxies = kwargs.pop('proxies', None) or get_global_proxies()
|
|
# "30" from requests:
|
|
# https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55
|
|
max_redirects = kwargs.pop('max_redirects', 30)
|
|
client = get_client(verify, local_address, proxies, max_redirects, True)
|
|
try:
|
|
async with client.stream(method, url, **kwargs) as response:
|
|
q.put(response)
|
|
async for chunk in response.aiter_bytes(65536):
|
|
if len(chunk) > 0:
|
|
q.put(chunk)
|
|
except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
|
|
q.put(e)
|
|
finally:
|
|
q.put(None)
|
|
|
|
|
|
def stream(method, url, **kwargs):
|
|
"""Replace httpx.stream.
|
|
|
|
Usage:
|
|
stream = poolrequests.stream(...)
|
|
response = next(stream)
|
|
for chunk in stream:
|
|
...
|
|
|
|
httpx.Client.stream requires to write the httpx.HTTPTransport version of the
|
|
the httpx.AsyncHTTPTransport declared above.
|
|
"""
|
|
q = SimpleQueue()
|
|
future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(method, url, q, **kwargs), LOOP)
|
|
chunk_or_exception = q.get(timeout=60)
|
|
while chunk_or_exception is not None:
|
|
if isinstance(chunk_or_exception, Exception):
|
|
raise chunk_or_exception
|
|
yield chunk_or_exception
|
|
chunk_or_exception = q.get(timeout=60)
|
|
return future.result()
|
|
|
|
|
|
def get(url, **kwargs):
|
|
kwargs.setdefault('allow_redirects', True)
|
|
return request('get', url, **kwargs)
|
|
|
|
|
|
def options(url, **kwargs):
|
|
kwargs.setdefault('allow_redirects', True)
|
|
return request('options', url, **kwargs)
|
|
|
|
|
|
def head(url, **kwargs):
|
|
kwargs.setdefault('allow_redirects', False)
|
|
return request('head', url, **kwargs)
|
|
|
|
|
|
def post(url, data=None, **kwargs):
|
|
return request('post', url, data=data, **kwargs)
|
|
|
|
|
|
def put(url, data=None, **kwargs):
|
|
return request('put', url, data=data, **kwargs)
|
|
|
|
|
|
def patch(url, data=None, **kwargs):
|
|
return request('patch', url, data=data, **kwargs)
|
|
|
|
|
|
def delete(url, **kwargs):
|
|
return request('delete', url, **kwargs)
|
|
|
|
|
|
def init():
|
|
# log
|
|
for logger_name in ('hpack.hpack', 'hpack.table'):
|
|
logging.getLogger(logger_name).setLevel(logging.WARNING)
|
|
|
|
# loop
|
|
def loop_thread():
|
|
global LOOP
|
|
LOOP = asyncio.new_event_loop()
|
|
LOOP.run_forever()
|
|
|
|
th = threading.Thread(
|
|
target=loop_thread,
|
|
name='asyncio_loop',
|
|
daemon=True,
|
|
)
|
|
th.start()
|
|
|
|
|
|
@atexit.register
|
|
def done():
|
|
"""Close all HTTP client
|
|
|
|
Avoid a warning at exit
|
|
see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785
|
|
"""
|
|
global LOOP
|
|
|
|
async def close_client(client):
|
|
try:
|
|
await client.aclose()
|
|
except httpx.HTTPError:
|
|
pass
|
|
|
|
async def close_clients():
|
|
await asyncio.gather(*[close_client(client) for client in CLIENTS.values()], return_exceptions=False)
|
|
future = asyncio.run_coroutine_threadsafe(close_clients(), LOOP)
|
|
# wait 3 seconds to close the HTTP clients
|
|
future.result(3)
|
|
|
|
|
|
init()
|
|
|
|
|
|
# ## TEMPORARY DEBUG ##
|
|
|
|
|
|
def debug_connection(connection):
|
|
now = LOOP.time()
|
|
expired = (connection.state == httpcore._async.base.ConnectionState.IDLE
|
|
and connection.expires_at is not None
|
|
and now >= connection.expires_at)
|
|
return connection.info()\
|
|
+ (', connect_failed' if connection.connect_failed else '')\
|
|
+ (', expired' if expired else '')
|
|
|
|
|
|
def debug_origin(origin):
|
|
return origin[0].decode() + '://' + origin[1].decode() + ':' + str(origin[2])
|
|
|
|
|
|
def debug_transport(transport):
|
|
result = {
|
|
'__class__': str(transport.__class__.__name__)
|
|
}
|
|
if isinstance(transport, (httpx.AsyncHTTPTransport, AsyncHTTPTransportFixed)):
|
|
pool = transport._pool
|
|
result['__pool_class__'] = str(pool.__class__.__name__)
|
|
if isinstance(pool, httpcore.AsyncConnectionPool):
|
|
for origin, connections in pool._connections.items():
|
|
result[debug_origin(origin)] = [debug_connection(connection) for connection in connections]
|
|
return result
|
|
elif isinstance(transport, AsyncProxyTransportFixed):
|
|
for origin, connections in transport._connections.items():
|
|
result[debug_origin(origin)] = [debug_connection(connection) for connection in connections]
|
|
return result
|
|
return result
|
|
|
|
|
|
def debug_asyncclient(client, key=None):
|
|
result = {}
|
|
if key:
|
|
result['__key__'] = [k if isinstance(k, (str, int, float, bool, type(None))) else repr(k) for k in key]
|
|
result['__default__'] = debug_transport(client._transport)
|
|
for urlpattern, transport in client._mounts.items():
|
|
result[urlpattern.pattern] = debug_transport(transport)
|
|
return result
|
|
|
|
|
|
def debug_asyncclients():
|
|
global CLIENTS
|
|
return [debug_asyncclient(client, key) for key, client in CLIENTS.items()]
|