# Source file: searxng/searx/network/client.py
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pyright: basic
"""Implement various ABCHTTPClient
* OneHTTPClient wrapper around httpx.Client
* BaseHTTPClient httpx.Client accept the verify and max_redirects parameter only in the constructor.
BaseHTTPClient allows to pass these parameter in each query by creating multiple OneHTTPClient.
* HTTPClient Inherit from BaseHTTPClient, raise an error according to retry_on_http_error parameter.
* TorHTTPClient Inherit from HTTPClientSoftError, check Tor connectivity
"""
import random
from abc import ABC, abstractmethod
from collections import namedtuple
from ssl import SSLContext
from typing import Any, Dict, Optional, Tuple, Union
import httpx
from httpx_socks import SyncProxyTransport
from python_socks import ProxyConnectionError, ProxyError, ProxyTimeoutError, parse_proxy_url
from .raise_for_httperror import raise_for_httperror
# Type alias mirroring httpx's "cert" argument: a certfile path alone, or a
# tuple with an optional keyfile and optional key password.
CertTypes = Union[
    # certfile
    str,
    # (certfile, keyfile)
    Tuple[str, Optional[str]],
    # (certfile, keyfile, password)
    Tuple[str, Optional[str], Optional[str]],
]
# Module-level cache of SSLContext objects, keyed by the creation parameters
# (see _get_sslcontexts). Contexts are shared because each SSLContext is
# expensive to keep around (see _HTTPTransportNoHttp docstring below).
SSLCONTEXTS: Dict[Any, SSLContext] = {}
class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
    Replace the None value, allow explicitly pass None as a function argument"""


# Sentinel singleton: lets callers distinguish "argument not given" from an
# explicit None (or other falsy) value. Compare against it with "is".
NOTSET = _NotSetClass()
class SoftRetryHTTPException(Exception):
    """Raised by a client implementation to tell the NetworkContext that a
    response must be treated as invalid even though no HTTP exception occurred.

    This exception is INTERNAL to searx.network and must not be seen outside.
    See HTTPClient, which checks the HTTP response according to the
    raise_for_httperror parameter.
    """

    def __init__(self, response):
        super().__init__("SoftRetryHTTPException, you should not see this error")
        # keep the offending response so the caller can inspect / close it
        self.response = response
def _shuffle_ciphers(ssl_context):
    """Shuffle httpx's default ciphers of a SSL context randomly.

    From `What Is TLS Fingerprint and How to Bypass It`_

    > When implementing TLS fingerprinting, servers can't operate based on a
    > locked-in whitelist database of fingerprints. New fingerprints appear
    > when web clients or TLS libraries release new versions. So, they have to
    > live off a blocklist database instead.
    > ...
    > It's safe to leave the first three as is but shuffle the remaining ciphers
    > and you can bypass the TLS fingerprint check.

    .. _What Is TLS Fingerprint and How to Bypass It:
       https://www.zenrows.com/blog/what-is-tls-fingerprint#how-to-bypass-tls-fingerprinting
    """
    ciphers = httpx._config.DEFAULT_CIPHERS.split(':')  # pylint: disable=protected-access
    # keep the first three ciphers in their original position, shuffle the rest
    fixed_part, shuffled_part = ciphers[:3], ciphers[3:]
    random.shuffle(shuffled_part)
    ssl_context.set_ciphers(":".join(fixed_part + shuffled_part))
def _get_sslcontexts(
    local_address: str,
    proxy_url: Optional[str],
    cert: Optional[CertTypes],
    verify: Union[str, bool],
    trust_env: bool,
    http2: bool,
):
    """Return a cached SSLContext for the given parameters.

    On a cache miss a new context is created with httpx.create_ssl_context and
    its cipher order is randomized once (see _shuffle_ciphers).
    """
    cache_key = (local_address, proxy_url, cert, verify, trust_env, http2)
    if cache_key not in SSLCONTEXTS:
        SSLCONTEXTS[cache_key] = httpx.create_ssl_context(cert, verify, trust_env, http2)
        _shuffle_ciphers(SSLCONTEXTS[cache_key])
    return SSLCONTEXTS[cache_key]
### Transport
class _HTTPTransportNoHttp(httpx.HTTPTransport):
    """Block HTTP request

    The constructor is blank because httpx.HTTPTransport.__init__ creates an SSLContext unconditionally:
    https://github.com/encode/httpx/blob/0f61aa58d66680c239ce43c8cdd453e7dc532bfc/httpx/_transports/default.py#L271

    Each SSLContext consumes more than 500kb of memory, since there is about one network per engine.

    In consequence, this class overrides all public methods

    For reference: https://github.com/encode/httpx/issues/2298
    """

    def __init__(self, *args, **kwargs):
        # pylint: disable=super-init-not-called
        # deliberately do NOT call the base constructor: it would allocate an
        # SSLContext this transport never uses
        pass

    def handle_request(self, request):
        # every request routed through this transport is rejected
        raise httpx.UnsupportedProtocol('HTTP protocol is disabled')

    def close(self) -> None:
        # nothing was allocated, nothing to release
        pass

    def __enter__(self):  # Use generics for subclass support.
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:  # pylint: disable=signature-differs
        # avoid to import the various type for the signature, but pylint is not happy
        pass
class _CustomSyncProxyTransport(SyncProxyTransport):
    """Inherit from httpx_socks.SyncProxyTransport

    Map python_socks exceptions to httpx.ProxyError exceptions
    """

    def handle_request(self, request):
        # NOTE: except-clause order matters — ProxyConnectionError and
        # ProxyTimeoutError must be tested before their base class ProxyError.
        try:
            return super().handle_request(request)
        except ProxyConnectionError as e:
            message = "ProxyConnectionError: " + e.strerror
            raise httpx.ProxyError(message, request=request) from e
        except ProxyTimeoutError as e:
            message = "ProxyTimeoutError: " + e.args[0]
            raise httpx.ProxyError(message, request=request) from e
        except ProxyError as e:
            message = "ProxyError: " + e.args[0]
            raise httpx.ProxyError(message, request=request) from e
def _get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries):
    """Build a _CustomSyncProxyTransport for a socks4/socks5/socks5h proxy URL."""
    # support socks5h (requests compatibility):
    # https://requests.readthedocs.io/en/master/user/advanced/#socks
    # socks5:// hostname is resolved on client side
    # socks5h:// hostname is resolved on proxy side
    socks5h_scheme = 'socks5h://'
    rdns = proxy_url.startswith(socks5h_scheme)
    if rdns:
        proxy_url = 'socks5://' + proxy_url[len(socks5h_scheme):]

    proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url)

    # About verify: in ProxyTransportFixed, verify is of type httpx._types.VerifyTypes
    if verify is True:
        verify = _get_sslcontexts(local_address, proxy_url, None, verify, True, http2)

    return _CustomSyncProxyTransport(
        proxy_type=proxy_type,
        proxy_host=proxy_host,
        proxy_port=proxy_port,
        username=proxy_username,
        password=proxy_password,
        rdns=rdns,
        verify=verify,  # type: ignore
        http2=http2,
        local_address=local_address,
        limits=limit,
        retries=retries,
    )
def _get_transport(verify, http2, local_address, proxy_url, limit, retries):
    """Build a plain httpx.HTTPTransport, optionally routed through an HTTP proxy."""
    if verify is True:
        verify = _get_sslcontexts(local_address, None, None, verify, True, http2)
    # pylint: disable=protected-access
    proxy = httpx._config.Proxy(proxy_url) if proxy_url else None
    return httpx.HTTPTransport(
        verify=verify,
        http2=http2,
        limits=limit,
        proxy=proxy,
        local_address=local_address,
        retries=retries,
    )
### Clients
class ABCHTTPClient(ABC):
    """Abstract HTTP client

    Multiple implementations are defined below.

    They are like an onion: each implementation relies on the previous one
    and brings new features.
    """

    @abstractmethod
    def send(self, stream: bool, method: str, url: str, **kwargs) -> httpx.Response:
        # Core entry point: perform the request; when stream is True the
        # response body is sent back without being read eagerly.
        pass

    @abstractmethod
    def close(self):
        # Release the underlying connection pool(s).
        pass

    @property
    @abstractmethod
    def is_closed(self) -> bool:
        # True once close() has taken effect on the underlying client(s).
        pass

    def request(self, method, url, **kwargs) -> httpx.Response:
        # Convenience wrapper: non-streaming send.
        return self.send(False, method, url, **kwargs)

    def stream(self, method, url, **kwargs) -> httpx.Response:
        # Convenience wrapper: streaming send.
        return self.send(True, method, url, **kwargs)
class OneHTTPClient(ABCHTTPClient):
    """Wrap a httpx.Client

    Use httpx_socks for socks proxies.

    Deal with httpx.RemoteProtocolError exception: httpx raises this exception when the
    HTTP/2 server disconnects. It is expected to reconnect.
    Related to https://github.com/encode/httpx/issues/1478
    Perhaps it can be removed now : TODO check in production.

    To be backward compatible with Request:

    * In Response, "ok" is set to "not response.is_error()"
      See https://www.python-httpx.org/compatibility/#checking-for-success-and-failure-responses
    * allow_redirects is accepted
      See https://www.python-httpx.org/compatibility/#redirects
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        verify=True,
        enable_http=True,
        enable_http2=False,
        max_connections=None,
        max_keepalive_connections=None,
        keepalive_expiry=None,
        proxies=None,
        local_addresses=None,
        max_redirects=30,
        hook_log_response=None,
        log_trace=None,
        allow_redirects=True,
        logger=None,
    ):
        self.enable_http = enable_http
        self.verify = verify
        self.enable_http2 = enable_http2
        self.max_connections = max_connections
        self.max_keepalive_connections = max_keepalive_connections
        self.keepalive_expiry = keepalive_expiry
        self.proxies = proxies or {}
        self.local_address = local_addresses
        self.max_redirects = max_redirects
        self.hook_log_response = hook_log_response
        self.allow_redirects = allow_redirects
        self.logger = logger
        # httpx request extensions; only "trace" is used, for request tracing
        self.extensions = None
        if log_trace:
            self.extensions = {"trace": log_trace}
        self._new_client()

    def send(self, stream, method, url, timeout=None, **kwargs):
        # Perform the request, retrying at most once on RemoteProtocolError
        # (server closed an HTTP/2 connection) with a freshly built client.
        self._patch_request(kwargs)
        retry = 1
        response = None
        while retry >= 0:  # pragma: no cover
            retry -= 1
            try:
                if stream:
                    # from https://www.python-httpx.org/async/#streaming-responses
                    # > For situations when context block usage is not practical,
                    # > it is possible to enter "manual mode" by sending a Request
                    # > instance using client.send(..., stream=True).
                    request = self.client.build_request(
                        method=method,
                        url=url,
                        content=kwargs.get("content"),
                        data=kwargs.get("data"),
                        files=kwargs.get("files"),
                        json=kwargs.get("json"),
                        params=kwargs.get("params"),
                        headers=kwargs.get("headers"),
                        cookies=kwargs.get("cookies"),
                        timeout=timeout,
                        extensions=self.extensions,
                    )
                    response = self.client.send(
                        request,
                        stream=True,
                        follow_redirects=kwargs.get("follow_redirects", False),
                        auth=kwargs.get("auth"),
                    )
                else:
                    response = self.client.request(method, url, extensions=self.extensions, timeout=timeout, **kwargs)
                self._patch_response(response)
                return response
            except httpx.RemoteProtocolError as e:
                if response:
                    response.close()
                if retry >= 0:
                    # the server has closed the connection:
                    # retry once more with a new HTTP client
                    self._reconnect_client()
                    if self.logger:
                        self.logger.warning('httpx.RemoteProtocolError: the server has disconnected, retrying')
                    continue
                raise e
            except (httpx.RequestError, httpx.HTTPStatusError) as e:
                # make sure a partially received response is closed before
                # propagating the error
                if response:
                    response.close()
                raise e
        # unreachable in practice: the loop either returns or raises
        return response  # type: ignore

    def close(self):
        self.client.close()

    @property
    def is_closed(self) -> bool:
        return self.client.is_closed

    def _new_client(self):
        # Build the wrapped httpx.Client, mounting one transport per proxy
        # pattern and a default transport for everything else.
        limit = httpx.Limits(
            max_connections=self.max_connections,
            max_keepalive_connections=self.max_keepalive_connections,
            keepalive_expiry=self.keepalive_expiry,
        )
        # See https://www.python-httpx.org/advanced/#routing
        mounts = {}
        for pattern, proxy_url in self.proxies.items():
            # when plain HTTP is disabled, http:// patterns are handled by the
            # blocking _HTTPTransportNoHttp mounted below
            if not self.enable_http and pattern.startswith('http://'):
                continue
            if (
                proxy_url.startswith('socks4://')
                or proxy_url.startswith('socks5://')
                or proxy_url.startswith('socks5h://')
            ):
                mounts[pattern] = _get_transport_for_socks_proxy(
                    self.verify, self.enable_http2, self.local_address, proxy_url, limit, 0
                )
            else:
                mounts[pattern] = _get_transport(
                    self.verify, self.enable_http2, self.local_address, proxy_url, limit, 0
                )
        if not self.enable_http:
            mounts['http://'] = _HTTPTransportNoHttp()
        transport = _get_transport(self.verify, self.enable_http2, self.local_address, None, limit, 0)
        event_hooks = None
        if self.hook_log_response:
            event_hooks = {'response': [self.hook_log_response]}
        self.client = httpx.Client(
            transport=transport,
            mounts=mounts,
            max_redirects=self.max_redirects,
            event_hooks=event_hooks,
        )

    def _reconnect_client(self):
        # Drop the current client and rebuild it with the same settings.
        self.client.close()
        self._new_client()

    def _patch_request(self, kwargs):
        # Translate the requests-style allow_redirects kwarg into httpx's
        # follow_redirects; see https://www.python-httpx.org/compatibility/#redirects
        follow_redirects = self.allow_redirects
        if 'allow_redirects' in kwargs:
            # see https://github.com/encode/httpx/pull/1808
            follow_redirects = kwargs.pop('allow_redirects')
        kwargs['follow_redirects'] = follow_redirects

    def _patch_response(self, response):
        # Add a requests-compatible "ok" attribute to the response.
        if isinstance(response, httpx.Response):
            # requests compatibility (response is not streamed)
            # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
            response.ok = not response.is_error  # type: ignore
        return response
# Per-request configuration that httpx.Client only accepts at construction
# time; BaseHTTPClient keeps one OneHTTPClient per distinct tuple of values.
_HTTPMultiClientConf = namedtuple('HTTPMultiClientConf', ['verify', 'max_redirects'])
class BaseHTTPClient(ABCHTTPClient):
    """Some parameters like verify and max_redirects are defined at the client
    level, not at the request level.

    This class allows specifying these parameters at the request level.
    The implementation uses multiple instances of OneHTTPClient, one per
    distinct (verify, max_redirects) pair.

    This class does not deal with the retry_on_http_error parameter.
    """

    def __init__(
        self,
        **default_kwargs,
    ):
        # set the default values
        self.default = _HTTPMultiClientConf(True, 30)
        # extract the values from the constructor kwargs;
        # the line above is mandatory so self._extract_kwargs_clients can fall
        # back on it while keeping the other arguments
        self.default, self.default_kwargs = self._extract_kwargs_clients(default_kwargs)
        # cache of OneHTTPClient keyed by _HTTPMultiClientConf
        self.clients: Dict[Tuple, OneHTTPClient] = {}

    def close(self):
        for client in self.clients.values():
            client.close()

    @property
    def is_closed(self) -> bool:
        return all(client.is_closed for client in self.clients.values())

    def send(self, stream, method, url, timeout=None, **kwargs):
        client = self._get_client_and_update_kwargs(kwargs)
        return client.send(stream, method, url, timeout, **kwargs)

    def _get_client_and_update_kwargs(self, kwargs) -> OneHTTPClient:
        # extract _HTTPMultiClientConf using the parameters in the request,
        # falling back to the parameters defined in the constructor
        # (= the parameters set in the network settings);
        # creates the matching OneHTTPClient on first use
        http_multi_client_conf, kwargs = self._extract_kwargs_clients(kwargs)
        if http_multi_client_conf not in self.clients:
            self.clients[http_multi_client_conf] = OneHTTPClient(
                verify=http_multi_client_conf.verify,
                max_redirects=http_multi_client_conf.max_redirects,
                **self.default_kwargs,
            )
        return self.clients[http_multi_client_conf]

    def _extract_kwargs_clients(self, kwargs) -> Tuple[_HTTPMultiClientConf, Dict]:
        """Pop verify / max_redirects from *kwargs* and return them as a
        _HTTPMultiClientConf together with the remaining kwargs.

        See https://www.python-httpx.org/compatibility/#ssl-configuration
        """
        verify = kwargs.pop('verify', NOTSET)
        max_redirects = kwargs.pop('max_redirects', NOTSET)
        # NOTSET is a sentinel: compare by identity ("is"), not equality,
        # so user-supplied values with a custom __eq__ can never match it.
        if verify is NOTSET:
            verify = self.default.verify
        if max_redirects is NOTSET:
            max_redirects = self.default.max_redirects
        return _HTTPMultiClientConf(verify, max_redirects), kwargs
class HTTPClient(BaseHTTPClient):
    """Inherit from BaseHTTPClient, raise an exception according to the retry_on_http_error parameter"""

    def __init__(self, retry_on_http_error=None, **kwargs):
        super().__init__(**kwargs)
        # True: any 4xx/5xx triggers a soft retry; an int: only that status
        # code; a list: any listed status code; None: never retry.
        self.retry_on_http_error = retry_on_http_error
        self._check_configuration()

    def _check_configuration(self):
        # make sure we can create at least an OneHTTPClient without exception
        self._get_client_and_update_kwargs({})

    def send(self, stream, method, url, timeout=None, **kwargs):
        """Send the request through the underlying client.

        Raises for HTTP errors unless the caller passes
        raise_for_httperror=False, and raises SoftRetryHTTPException when the
        response status matches retry_on_http_error.

        Note: the original code wrapped this body in
        ``except (httpx.RequestError, httpx.HTTPStatusError) as e: raise e``,
        which was a no-op and has been removed.
        """
        do_raise_for_httperror = self._extract_do_raise_for_httperror(kwargs)
        response = super().send(stream, method, url, timeout=timeout, **kwargs)
        if do_raise_for_httperror:
            raise_for_httperror(response)
        if self._is_error_but_retry(response):
            raise SoftRetryHTTPException(response)
        return response

    def _is_error_but_retry(self, response):
        # pylint: disable=too-many-boolean-expressions
        return (
            (self.retry_on_http_error is True and 400 <= response.status_code <= 599)
            or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error)
            or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error)
        )

    @staticmethod
    def _extract_do_raise_for_httperror(kwargs):
        # pop the flag so it is not forwarded to the underlying client;
        # default is True (raise on HTTP errors)
        return kwargs.pop('raise_for_httperror', True)

    def __repr__(self):
        keys_values = " ".join([f"{k}={v!r}" for k, v in self.default_kwargs.items()])
        return f"<{self.__class__.__name__} retry_on_http_error={self.retry_on_http_error!r} {keys_values}>"
class TorHTTPClient(HTTPClient):
    """Extend the HTTPClient class. To use with Tor configuration.

    The class checks if the client is really connected through Tor.
    """

    # class-level cache: (local_addresses, tuple(proxies.items())) -> bool
    _TOR_CHECK_RESULT = {}

    def __init__(self, proxies=None, local_addresses=None, **kwargs):
        self.proxies = proxies
        self.local_addresses = local_addresses
        super().__init__(proxies=proxies, local_addresses=local_addresses, **kwargs)

    def _check_configuration(self):
        # overrides HTTPClient._check_configuration (invoked from __init__):
        # refuse to build a client that is not routed through Tor
        if not self._is_connected_through_tor(self.proxies, self.local_addresses):
            self.close()
            raise httpx.HTTPError('Network configuration problem: not using Tor')

    def _is_connected_through_tor(self, proxies, local_addresses) -> bool:
        """TODO : rewrite to check the proxies variable instead of checking the HTTPTransport ?"""
        if proxies is None:
            return False

        cache_key = (local_addresses, tuple(proxies.items()))
        if cache_key in TorHTTPClient._TOR_CHECK_RESULT:
            return TorHTTPClient._TOR_CHECK_RESULT[cache_key]

        # use_local_dns stays False when the client uses the DNS of the proxy
        use_local_dns = False

        # get one httpx client through _get_client_and_update_kwargs
        one_http_client = self._get_client_and_update_kwargs({"verify": True})
        httpx_client = one_http_client.client
        # ignore client._transport because it is not used with all://
        for transport in httpx_client._mounts.values():  # pylint: disable=protected-access
            if isinstance(transport, _HTTPTransportNoHttp):
                # ignore the NO HTTP transport
                continue
            if isinstance(transport, _CustomSyncProxyTransport) and not getattr(
                transport._pool, "_rdns", False  # pylint: disable=protected-access # type: ignore
            ):
                # socks5:// with local DNS
                # expect socks5h:// with remote DNS to resolve .onion domain.
                use_local_dns = True
                break
        #
        if use_local_dns:
            # no test: with local DNS, .onion domains cannot resolve, so the
            # configuration is considered not-Tor without contacting the check
            result = False
        else:
            # actual check against the Tor project's connectivity endpoint
            response = one_http_client.request("GET", "https://check.torproject.org/api/ip", timeout=60)
            if response.status_code != 200:
                result = False
            else:
                result = bool(response.json().get("IsTor", False))
        TorHTTPClient._TOR_CHECK_RESULT[cache_key] = result
        return result

    @staticmethod
    def _clear_cache():
        """Only for the tests"""
        TorHTTPClient._TOR_CHECK_RESULT = {}