searxng/searx/network/utils.py
2021-09-02 17:52:11 +02:00

113 lines
3.9 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
import typing
import re
from yarl import URL
class URLPattern:
"""
A utility class currently used for making lookups against proxy keys...
# Wildcard matching...
>>> pattern = URLPattern("all")
>>> pattern.matches(yarl.URL("http://example.com"))
True
# Witch scheme matching...
>>> pattern = URLPattern("https")
>>> pattern.matches(yarl.URL("https://example.com"))
True
>>> pattern.matches(yarl.URL("http://example.com"))
False
# With domain matching...
>>> pattern = URLPattern("https://example.com")
>>> pattern.matches(yarl.URL("https://example.com"))
True
>>> pattern.matches(yarl.URL("http://example.com"))
False
>>> pattern.matches(yarl.URL("https://other.com"))
False
# Wildcard scheme, with domain matching...
>>> pattern = URLPattern("all://example.com")
>>> pattern.matches(yarl.URL("https://example.com"))
True
>>> pattern.matches(yarl.URL("http://example.com"))
True
>>> pattern.matches(yarl.URL("https://other.com"))
False
# With port matching...
>>> pattern = URLPattern("https://example.com:1234")
>>> pattern.matches(yarl.URL("https://example.com:1234"))
True
>>> pattern.matches(yarl.URL("https://example.com"))
False
"""
def __init__(self, pattern: str) -> None:
if pattern and ":" not in pattern:
raise ValueError(
f"Proxy keys should use proper URL forms rather "
f"than plain scheme strings. "
f'Instead of "{pattern}", use "{pattern}://"'
)
url = URL(pattern)
self.pattern = pattern
self.scheme = "" if url.scheme == "all" else url.scheme
self.host = "" if url.host == "*" else url.host
self.port = url.port
if not url.host or url.host == "*":
self.host_regex: typing.Optional[typing.Pattern[str]] = None
else:
if url.host.startswith("*."):
# *.example.com should match "www.example.com", but not "example.com"
domain = re.escape(url.host[2:])
self.host_regex = re.compile(f"^.+\\.{domain}$")
elif url.host.startswith("*"):
# *example.com should match "www.example.com" and "example.com"
domain = re.escape(url.host[1:])
self.host_regex = re.compile(f"^(.+\\.)?{domain}$")
else:
# example.com should match "example.com" but not "www.example.com"
domain = re.escape(url.host)
self.host_regex = re.compile(f"^{domain}$")
def matches(self, other: URL) -> bool:
if self.scheme and self.scheme != other.scheme:
return False
if (
self.host
and self.host_regex is not None
and not self.host_regex.match(other.host or '')
):
return False
if self.port is not None and self.port != other.port:
return False
return True
@property
def priority(self) -> tuple:
"""
The priority allows URLPattern instances to be sortable, so that
we can match from most specific to least specific.
"""
# URLs with a port should take priority over URLs without a port.
port_priority = 0 if self.port is not None else 1
# Longer hostnames should match first.
host_priority = -len(self.host or '')
# Longer schemes should match first.
scheme_priority = -len(self.scheme)
return (port_priority, host_priority, scheme_priority)
def __hash__(self) -> int:
return hash(self.pattern)
def __lt__(self, other: "URLPattern") -> bool:
return self.priority < other.priority
def __eq__(self, other: typing.Any) -> bool:
return isinstance(other, URLPattern) and self.pattern == other.pattern
def __repr__(self) -> str:
return f"<URLPattern pattern=\"{self.pattern}\">"