Add searx.results module

Based on #1412
This commit is contained in:
Alexandre Flament 2022-07-26 21:43:20 +02:00 committed by Markus Heiser
parent 675dc04917
commit e242005916
5 changed files with 295 additions and 25 deletions

View file

@ -0,0 +1,47 @@
.. _results models:

====================
searx.results.models
====================


Main Results
------------

.. autoclass:: searx.results.models.MainResult
   :members:

.. autoclass:: searx.results.models.UrlResult
   :members:

.. autoclass:: searx.results.models.KeyValueResult
   :members:

Infoboxes
---------

.. autoclass:: searx.results.models.Infobox
   :members:

.. autoclass:: searx.results.models.InfoboxAttribute
   :members:

.. autoclass:: searx.results.models.InfoboxImage
   :members:

.. autoclass:: searx.results.models.InfoboxUrl
   :members:

.. autoclass:: searx.results.models.InfoboxRelatedTopic
   :members:

Others
------

.. autoclass:: searx.results.models.Answer
   :members:

.. autoclass:: searx.results.models.Correction
   :members:

.. autoclass:: searx.results.models.Suggestion
   :members:

View file

@ -68,6 +68,8 @@ class Engine: # pylint: disable=too-few-public-methods
safesearch: bool safesearch: bool
time_range_support: bool time_range_support: bool
timeout: float timeout: float
weight: float
display_error_messages: bool
# Defaults for the namespace of an engine module, see :py:func:`load_engine` # Defaults for the namespace of an engine module, see :py:func:`load_engine`

View file

@ -1,13 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pyright: basic
import re import re
from collections import defaultdict from collections import defaultdict
from operator import itemgetter from operator import itemgetter
from threading import RLock from threading import RLock
from typing import List, NamedTuple, Set from typing import NamedTuple, Optional, List, Set, Dict, cast
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
from searx import logger from searx import logger
from searx.engines import engines from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error from searx.metrics import histogram_observe, counter_add, count_error
from . import models
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@ -55,7 +59,7 @@ def compare_urls(url_a, url_b):
return unquote(path_a) == unquote(path_b) return unquote(path_a) == unquote(path_b)
def merge_two_infoboxes(infobox1, infobox2): def merge_two_infoboxes(infobox1: models.Infobox, infobox2: models.Infobox):
# get engines weights # get engines weights
if hasattr(engines[infobox1['engine']], 'weight'): if hasattr(engines[infobox1['engine']], 'weight'):
weight1 = engines[infobox1['engine']].weight weight1 = engines[infobox1['engine']].weight
@ -91,8 +95,8 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['urls'] = urls1 infobox1['urls'] = urls1
if 'img_src' in infobox2: if infobox2.get('img_src') is not None:
img1 = infobox1.get('img_src', None) img1 = infobox1.get('img_src')
img2 = infobox2.get('img_src') img2 = infobox2.get('img_src')
if img1 is None: if img1 is None:
infobox1['img_src'] = img2 infobox1['img_src'] = img2
@ -127,7 +131,7 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['content'] = content2 infobox1['content'] = content2
def result_score(result): def result_score(result: models.Result):
weight = 1.0 weight = 1.0
for result_engine in result['engines']: for result_engine in result['engines']:
@ -173,18 +177,18 @@ class ResultContainer:
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self._merged_results = [] self._merged_results: List[models.MainResult] = []
self.infoboxes = [] self.infoboxes: List[models.Infobox] = []
self.suggestions = set() self.suggestions: Set[models.Suggestion] = set()
self.answers = {} self.answers: Dict[str, models.Answer] = {}
self.corrections = set() self.corrections: Set[models.Correction] = set()
self._number_of_results = [] self._number_of_results = []
self.engine_data = defaultdict(dict) self.engine_data = defaultdict(dict)
self._closed = False self._closed: bool = False
self.paging = False self.paging: bool = False
self.unresponsive_engines: Set[UnresponsiveEngine] = set() self.unresponsive_engines: Set[UnresponsiveEngine] = set()
self.timings: List[Timing] = [] self.timings: List[Timing] = []
self.redirect_url = None self.redirect_url: Optional[str] = None
self.on_result = lambda _: True self.on_result = lambda _: True
self._lock = RLock() self._lock = RLock()
@ -193,7 +197,7 @@ class ResultContainer:
return return
standard_result_count = 0 standard_result_count = 0
error_msgs = set() error_msgs: Set[str] = set()
for result in list(results): for result in list(results):
result['engine'] = engine_name result['engine'] = engine_name
if 'suggestion' in result and self.on_result(result): if 'suggestion' in result and self.on_result(result):
@ -234,7 +238,7 @@ class ResultContainer:
if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging: if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging:
self.paging = True self.paging = True
def _merge_infobox(self, infobox): def _merge_infobox(self, infobox: models.Infobox):
add_infobox = True add_infobox = True
infobox_id = infobox.get('id', None) infobox_id = infobox.get('id', None)
infobox['engines'] = set([infobox['engine']]) infobox['engines'] = set([infobox['engine']])
@ -249,7 +253,7 @@ class ResultContainer:
if add_infobox: if add_infobox:
self.infoboxes.append(infobox) self.infoboxes.append(infobox)
def _is_valid_url_result(self, result, error_msgs): def _is_valid_url_result(self, result: models.UrlResult, error_msgs: Set[str]) -> bool:
if 'url' in result: if 'url' in result:
if not isinstance(result['url'], str): if not isinstance(result['url'], str):
logger.debug('result: invalid URL: %s', str(result)) logger.debug('result: invalid URL: %s', str(result))
@ -269,7 +273,7 @@ class ResultContainer:
return True return True
def _normalize_url_result(self, result): def _normalize_url_result(self, result: models.UrlResult):
"""Return True if the result is valid""" """Return True if the result is valid"""
result['parsed_url'] = urlparse(result['url']) result['parsed_url'] = urlparse(result['url'])
@ -288,9 +292,9 @@ class ResultContainer:
# strip multiple spaces and carriage returns from content # strip multiple spaces and carriage returns from content
if result.get('content'): if result.get('content'):
result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) # type: ignore
def __merge_url_result(self, result, position): def __merge_url_result(self, result: models.UrlResult, position: int):
result['engines'] = set([result['engine']]) result['engines'] = set([result['engine']])
with self._lock: with self._lock:
duplicated = self.__find_duplicated_http_result(result) duplicated = self.__find_duplicated_http_result(result)
@ -302,11 +306,12 @@ class ResultContainer:
result['positions'] = [position] result['positions'] = [position]
self._merged_results.append(result) self._merged_results.append(result)
def __find_duplicated_http_result(self, result): def __find_duplicated_http_result(self, result: models.UrlResult) -> Optional[models.UrlResult]:
result_template = result.get('template') result_template = result.get('template')
for merged_result in self._merged_results: for merged_result in self._merged_results:
if 'parsed_url' not in merged_result: if 'parsed_url' not in merged_result:
continue continue
merged_result = cast(models.UrlResult, merged_result)
if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get( if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
'template' 'template'
): ):
@ -320,10 +325,10 @@ class ResultContainer:
return merged_result return merged_result
return None return None
def __merge_duplicated_http_result(self, duplicated, result, position): def __merge_duplicated_http_result(self, duplicated: models.UrlResult, result: models.UrlResult, position: int):
# using content with more text # using content with more text
if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')): if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
duplicated['content'] = result['content'] duplicated['content'] = result['content'] # type: ignore
# merge all result's parameters not found in duplicate # merge all result's parameters not found in duplicate
for key in result.keys(): for key in result.keys():
@ -341,11 +346,11 @@ class ResultContainer:
duplicated['url'] = result['parsed_url'].geturl() duplicated['url'] = result['parsed_url'].geturl()
duplicated['parsed_url'] = result['parsed_url'] duplicated['parsed_url'] = result['parsed_url']
def __merge_result_no_url(self, result, position): def __merge_result_no_url(self, result: models.KeyValueResult, position: int):
result['engines'] = set([result['engine']]) result['engines'] = set([result['engine']])
result['positions'] = [position] result['positions'] = [position]
with self._lock: with self._lock:
self._merged_results.append(result) self._merged_results.append(result) # type: ignore
def close(self): def close(self):
self._closed = True self._closed = True

213
searx/results/models.py Normal file
View file

@ -0,0 +1,213 @@
from typing import List, Dict, Set
from typing_extensions import TypedDict, NotRequired, Required, Literal
from urllib.parse import ParseResult
from datetime import datetime
# Public API of the module: every result TypedDict declared below.
# ``MainResult`` and ``KeyValueResult`` are part of it as well: they are
# referenced from outside this module as ``models.MainResult`` and
# ``models.KeyValueResult``.
__all__ = [
    'Result',
    'MainResult',
    'UrlResult',
    'KeyValueResult',
    'Answer',
    'Correction',
    'Suggestion',
    'Infobox',
    'InfoboxUrl',
    'InfoboxImage',
    'InfoboxAttribute',
    'InfoboxRelatedTopic',
]
class Result(TypedDict):
    """A result of any type.

    Base dictionary type of the result items declared in this module (main
    results, answers, corrections, suggestions and infoboxes).  All the fields
    declared here are flagged as internal.
    """

    engine: str
    """Internal field. DO NOT USE"""

    weight: float
    """Internal field. DO NOT USE"""

    engines: Set[str]
    """Internal field. DO NOT USE"""

    category: str
    """Internal field. DO NOT USE"""

    positions: List[int]
    """Internal field. DO NOT USE"""

    score: float
    """Internal field. DO NOT USE"""
class MainResult(Result):
    """Result that is going to be displayed as a "main" result.

    Base class of :py:class:`UrlResult` and :py:class:`KeyValueResult`.
    """

    template: NotRequired[str]
    """Template to display the result.  The default value is ``default.html``.
    See the templates in ``searx/templates/simple/result_templates``."""
class UrlResult(MainResult):
    """Typical main result: a URL, a title and a short description."""

    title: str
    """Title of the result"""

    url: str
    """URL of the result"""

    # NOTE(review): declared as a required key although the description says
    # engines do not have to provide it -- confirm whether ``NotRequired`` is
    # the intended declaration.
    parsed_url: ParseResult
    """Engines don't have to set this value: it is automatically initialized from the url field.
    However, plugins have to manually update this field when they change the url field"""

    content: NotRequired[str]
    """Content of the result"""

    iframe_src: NotRequired[str]
    """URL of an iframe to add to the result."""

    audio_src: NotRequired[str]
    """URL of <audio> element"""

    img_src: NotRequired[str]
    """URL of an image to include in the result"""

    thumbnail: NotRequired[str]
    """URL of a thumbnail"""

    publishedDate: NotRequired[datetime]
    """Publication date"""

    length: NotRequired[str]
    """Length of the content (for audio or video)"""

    author: NotRequired[str]
    """Author of the content (for audio, video, image, ...)"""

    metadata: NotRequired[Dict]
    """Dictionary to allow paging"""
class KeyValueResult(MainResult):
    """A set of key/value pairs to display, useful for the DB engines.

    The template field must be "key-value.html", all other values are
    displayed.
    """

    template: Literal["key-value.html"]
    """template must be equal to `key-value.html`"""
class Answer(Result):
    """Answer item in the result list.  The answer result item is used in
    the :origin:`results.html <searx/templates/simple/results.html>` template.

    An answer item is a dictionary type with dedicated keys and values."""

    answer: Required[str]
    """The answer string appended by the engine."""

    url: NotRequired[str]
    """A link that is related to the answer (e.g. the origin of the answer)."""
class Correction(Result):
    """Correction item in the result list.  The correction result item is used in
    the :origin:`results.html <searx/templates/simple/results.html>` template.

    A correction item is a dictionary type with dedicated keys and values."""

    url: str
    """The SearXNG search URL for the correction term."""

    title: str
    """The 'correction' string appended by the engine."""
class Suggestion(Result):
    """Suggestion item in the result list.  The suggestion result item is used in
    the :origin:`results.html <searx/templates/simple/results.html>` template.

    A suggestion item is a dictionary type with dedicated keys and values."""

    # NOTE(review): the description below reads like the one of
    # ``Correction.url``; the field name suggests that the value is the
    # suggested term itself rather than a URL -- confirm against the engines.
    suggestion: Required[str]
    """The SearXNG search URL for the suggestion term."""
class InfoboxUrl(TypedDict):
    """A link shown in the infobox.

    A **url** item in the ``infobox.urls`` list is a dictionary.
    """

    title: str
    """Title of the URL"""

    url: str
    """URL by itself: https://..."""

    # NOTE(review): ``entity`` and ``official`` are described as set by *some*
    # engines only, yet they are declared as required keys -- confirm whether
    # ``NotRequired`` is the intended declaration.
    entity: str
    """set by some engines but unused"""

    official: bool
    """set by some engines but unused (oscar)"""
class InfoboxImage(TypedDict):
    """Image of an infobox attribute (type of the ``image`` field of
    :py:class:`InfoboxAttribute`)."""

    src: str
    """URL of the image"""

    alt: str
    """description of the image / alt attribute"""
class InfoboxAttribute(TypedDict):
    """An **attribute** item in the ``infobox.attributes`` list is a dictionary"""

    label: str
    """Label of the attribute"""

    # NOTE(review): ``value`` and ``image`` are documented as alternatives
    # (each one is ignored when the other is set) but both are declared as
    # required keys -- confirm whether ``NotRequired`` is the intended
    # declaration.
    value: str
    """Value of the attribute. If set, the image field is ignored"""

    image: InfoboxImage
    """Image for this attribute. Ignored if the field value is set"""

    entity: str
    """set by some engines but unused"""
class InfoboxRelatedTopic(TypedDict):
    """A **topic** item in the ``infobox.relatedTopics`` list is a dictionary."""

    suggestion: str
    """suggested search query"""

    name: str
    """set by some engines but unused"""
class Infobox(Result):
    """Infobox item in the result list.  The infobox result item is used in the
    :origin:`infobox.html <searx/templates/simple/infobox.html>` template.

    An infobox item is a dictionary type with dedicated keys and values.
    """

    infobox: Required[str]
    """Name of the infobox (mandatory)."""

    # NOTE(review): only ``infobox`` is described as mandatory, but the
    # fields below are declared as required keys too -- confirm whether
    # ``NotRequired`` is the intended declaration for them.
    id: str
    """URL of the infobox. Will be used to merge infoboxes."""

    content: str
    """Content of the infobox (the description)"""

    img_src: str
    """URL of the image to show in the infobox"""

    urls: List[InfoboxUrl]
    """A list of dictionaries with links shown in the infobox."""

    attributes: List[InfoboxAttribute]
    """A list of dictionaries with attributes shown in the infobox"""

    relatedTopics: List[InfoboxRelatedTopic]
    """A list of dictionaries with related topics shown in the infobox"""

View file

@ -710,6 +710,7 @@ def search():
# output # output
for result in results: for result in results:
result = typing.cast(dict, result)
if output_format == 'html': if output_format == 'html':
if 'content' in result and result['content']: if 'content' in result and result['content']:
result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query) result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
@ -763,7 +764,9 @@ def search():
keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type') keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
csv.writerow(keys) csv.writerow(keys)
for row in results: for row in results:
row['host'] = row['parsed_url'].netloc row = typing.cast(dict, row)
if 'parsed_url' in row:
row['host'] = row['parsed_url'].netloc
row['type'] = 'result' row['type'] = 'result'
csv.writerow([row.get(key, '') for key in keys]) csv.writerow([row.get(key, '') for key in keys])
for a in result_container.answers: for a in result_container.answers: