mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00

The intention of this patch is to improve the modularization and documentation of the implementations of the *result* items. This patch does not contain any functional change! Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
80 lines
2.2 KiB
Python
80 lines
2.2 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
# lint: pylint
|
|
|
|
"""Core methods
|
|
"""
|
|
# pylint: disable=too-few-public-methods
|
|
|
|
import re
|
|
from urllib.parse import unquote
|
|
from typing import NamedTuple
|
|
|
|
from searx.engines import engines
|
|
|
|
# Characters that do not count towards the "meaningful" length of a content
# string: , ; : ! ? . / \ space ( ) _ and -
# The hyphen is deliberately the LAST character of the class so that it is a
# literal.  Written as ``()-_`` it forms the range ``)``..``_`` which also
# matches digits, uppercase ASCII letters and ``* + < = > @ [ ] ^``.
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?./\\ ()_-]', re.M | re.U)
# One or more consecutive spaces, tabs or newlines.
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
|
|
|
|
|
|
class Timing(NamedTuple):
    """Immutable timing record associated with one engine.

    NOTE(review): the unit and exact meaning of ``total`` and ``load`` are not
    visible in this module -- presumably durations in seconds; confirm against
    the code that creates these records.
    """

    # identifier of the engine (presumably its name -- confirm with callers)
    engine: str
    total: float
    load: float
|
|
|
|
|
|
class UnresponsiveEngine(NamedTuple):
    """Immutable record describing an engine that did not respond.

    NOTE(review): the set of possible ``error_type`` values and the precise
    meaning of ``suspended`` are defined by the callers, not in this module.
    """

    # identifier of the engine (presumably its name -- confirm with callers)
    engine: str
    error_type: str
    suspended: bool
|
|
|
|
|
|
def result_content_len(content):
    """Return the meaningful length of the content for a result.

    The length is measured after every character matched by
    ``CONTENT_LEN_IGNORED_CHARS_REGEX`` has been removed.  Anything that is
    not a ``str`` (``None``, bytes, numbers, ...) has a length of ``0``.
    """
    if not isinstance(content, str):
        return 0
    meaningful = CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content)
    return len(meaningful)
|
|
|
|
|
|
def result_score(result):
    """Compute the score of a (merged) result.

    The weights of all engines listed in ``result['engines']`` are multiplied
    together; an engine without a ``weight`` attribute does not change the
    product.  Each entry of ``result['positions']`` then contributes
    ``(number of positions * weight) / position`` to the score.

    Args:
        result (dict): mapping with the keys ``'engines'`` (names that are
            looked up in the ``engines`` registry) and ``'positions'``
            (numbers; a position of ``0`` raises ``ZeroDivisionError``).

    Returns:
        The sum of all contributions (``0`` when there are no positions).
    """
    combined_weight = 1.0
    for engine_name in result['engines']:
        engine = engines[engine_name]
        if hasattr(engine, 'weight'):
            combined_weight *= float(engine.weight)

    positions = result['positions']
    # the numerator is identical for every position, compute it only once
    numerator = len(positions) * combined_weight
    return sum(numerator / pos for pos in positions)
|
|
|
|
|
|
def compare_urls(url_a, url_b):
    """Lazily compare two parsed URLs.

    The comparison ignores the scheme, one leading ``www.`` of the host, one
    trailing ``/`` of the path and the percent-encoding of the path:

    - "www.example.com" and "example.com" are equal.
    - "www.example.com/path/" and "www.example.com/path" are equal.
    - "https://www.example.com/" and "http://www.example.com/" are equal.

    Query and fragment have to match exactly.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are considered equal
    """

    def _bare_host(netloc):
        # ignore a single leading "www." in the comparison
        return netloc[len('www.'):] if netloc.startswith('www.') else netloc

    def _trimmed_path(path):
        # drop one trailing "/" (if any) and decode percent-escapes
        if path.endswith('/'):
            path = path[:-1]
        return unquote(path)

    if _bare_host(url_a.netloc) != _bare_host(url_b.netloc):
        return False
    if url_a.query != url_b.query:
        return False
    if url_a.fragment != url_b.fragment:
        return False

    return _trimmed_path(url_a.path) == _trimmed_path(url_b.path)
|