mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
113 lines
3.9 KiB
Python
113 lines
3.9 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
import typing
|
|
import re
|
|
|
|
|
|
from yarl import URL
|
|
|
|
|
|
class URLPattern:
    """
    Match URLs against a proxy-key pattern made of scheme, host and port.

    A utility class currently used for making lookups against proxy keys...

    Patterns must be written in URL form: a non-empty pattern without a
    ``:`` (for example the plain scheme string ``"https"``) is rejected with
    a ``ValueError``.  The scheme ``all`` and the host ``*`` are wildcards,
    ``*.example.com`` matches sub-domains only and ``*example.com`` matches
    the domain as well as its sub-domains.

    # Wildcard matching...
    >>> pattern = URLPattern("all://")
    >>> pattern.matches(URL("http://example.com"))
    True

    # With scheme matching...
    >>> pattern = URLPattern("https://")
    >>> pattern.matches(URL("https://example.com"))
    True
    >>> pattern.matches(URL("http://example.com"))
    False

    # With domain matching...
    >>> pattern = URLPattern("https://example.com")
    >>> pattern.matches(URL("https://example.com"))
    True
    >>> pattern.matches(URL("http://example.com"))
    False
    >>> pattern.matches(URL("https://other.com"))
    False

    # Wildcard scheme, with domain matching...
    >>> pattern = URLPattern("all://example.com")
    >>> pattern.matches(URL("https://example.com"))
    True
    >>> pattern.matches(URL("http://example.com"))
    True
    >>> pattern.matches(URL("https://other.com"))
    False

    # With port matching...
    >>> pattern = URLPattern("https://example.com:1234")
    >>> pattern.matches(URL("https://example.com:1234"))
    True
    >>> pattern.matches(URL("https://example.com"))
    False
    """

    def __init__(self, pattern: str) -> None:
        """
        Parse *pattern* and pre-compile the host matcher.

        :param pattern: proxy key in URL form, e.g. ``"all://"`` or
            ``"https://*.example.com:8080"``.
        :raises ValueError: if *pattern* is non-empty and contains no ``:``.
        """
        if pattern and ":" not in pattern:
            raise ValueError(
                "Proxy keys should use proper URL forms rather "
                "than plain scheme strings. "
                f'Instead of "{pattern}", use "{pattern}://"'
            )

        url = URL(pattern)
        host = url.host

        self.pattern = pattern
        # An empty scheme / host means "no constraint" in matches().
        self.scheme = "" if url.scheme == "all" else url.scheme
        self.host = "" if host == "*" else host
        # NOTE(review): yarl's URL.port may report the scheme's default port
        # when the pattern gives none -- confirm that this is the intended
        # port constraint (and priority) for patterns like "https://host".
        self.port = url.port

        if not host or host == "*":
            self.host_regex: typing.Optional[typing.Pattern[str]] = None
        elif host.startswith("*."):
            # *.example.com should match "www.example.com", but not "example.com"
            domain = re.escape(host[2:])
            self.host_regex = re.compile(rf"^.+\.{domain}$")
        elif host.startswith("*"):
            # *example.com should match "www.example.com" and "example.com"
            domain = re.escape(host[1:])
            self.host_regex = re.compile(rf"^(.+\.)?{domain}$")
        else:
            # example.com should match "example.com" but not "www.example.com"
            domain = re.escape(host)
            self.host_regex = re.compile(rf"^{domain}$")

    def matches(self, other: URL) -> bool:
        """
        Return ``True`` if *other* satisfies every constraint of the pattern.

        A constraint that is unset (empty scheme, empty host, ``None`` port)
        is skipped; the remaining ones must all hold.
        """
        if self.scheme and self.scheme != other.scheme:
            return False
        if (
            self.host
            and self.host_regex is not None
            # fullmatch(): with match() the "$" anchor would also accept a
            # host that carries a trailing newline.
            and not self.host_regex.fullmatch(other.host or "")
        ):
            return False
        if self.port is not None and self.port != other.port:
            return False
        return True

    @property
    def priority(self) -> typing.Tuple[int, int, int]:
        """
        The priority allows URLPattern instances to be sortable, so that
        we can match from most specific to least specific.

        Smaller tuples sort first, i.e. are more specific.
        """
        # URLs with a port should take priority over URLs without a port.
        port_priority = 0 if self.port is not None else 1
        # Longer hostnames should match first.
        host_priority = -len(self.host or "")
        # Longer schemes should match first.
        scheme_priority = -len(self.scheme)
        return (port_priority, host_priority, scheme_priority)

    def __hash__(self) -> int:
        """Hash on the pattern string, in line with ``__eq__``."""
        return hash(self.pattern)

    def __lt__(self, other: "URLPattern") -> bool:
        """Order patterns by ``priority`` (most specific first)."""
        if not isinstance(other, URLPattern):
            # Let Python try the reflected operation / raise TypeError.
            return NotImplemented
        return self.priority < other.priority

    def __eq__(self, other: typing.Any) -> bool:
        """Two patterns are equal when their pattern strings are equal."""
        return isinstance(other, URLPattern) and self.pattern == other.pattern

    def __repr__(self) -> str:
        """Return a debugging representation showing the pattern string."""
        return f"<URLPattern pattern=\"{self.pattern}\">"
|