Add searx.results module

Based on #1412
This commit is contained in:
Alexandre Flament 2022-07-26 21:43:20 +02:00 committed by Markus Heiser
parent 675dc04917
commit e242005916
5 changed files with 295 additions and 25 deletions

View file

@ -0,0 +1,47 @@
.. _results models:
====================
searx.results.models
====================
Main Results
------------
.. autoclass:: searx.results.models.MainResult
:members:
.. autoclass:: searx.results.models.UrlResult
:members:
.. autoclass:: searx.results.models.KeyValueResult
:members:
Infoboxes
---------
.. autoclass:: searx.results.models.Infobox
:members:
.. autoclass:: searx.results.models.InfoboxAttribute
:members:
.. autoclass:: searx.results.models.InfoboxImage
:members:
.. autoclass:: searx.results.models.InfoboxUrl
:members:
.. autoclass:: searx.results.models.InfoboxRelatedTopic
:members:
Others
------
.. autoclass:: searx.results.models.Answer
:members:
.. autoclass:: searx.results.models.Correction
:members:
.. autoclass:: searx.results.models.Suggestion
:members:

View file

@ -68,6 +68,8 @@ class Engine: # pylint: disable=too-few-public-methods
safesearch: bool
time_range_support: bool
timeout: float
weight: float
display_error_messages: bool
# Defaults for the namespace of an engine module, see :py:func:`load_engine`

View file

@ -1,13 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pyright: basic
import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from typing import NamedTuple, Optional, List, Set, Dict, cast
from urllib.parse import urlparse, unquote
from searx import logger
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error
from . import models
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@ -55,7 +59,7 @@ def compare_urls(url_a, url_b):
return unquote(path_a) == unquote(path_b)
def merge_two_infoboxes(infobox1, infobox2):
def merge_two_infoboxes(infobox1: models.Infobox, infobox2: models.Infobox):
# get engines weights
if hasattr(engines[infobox1['engine']], 'weight'):
weight1 = engines[infobox1['engine']].weight
@ -91,8 +95,8 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['urls'] = urls1
if 'img_src' in infobox2:
img1 = infobox1.get('img_src', None)
if infobox2.get('img_src') is not None:
img1 = infobox1.get('img_src')
img2 = infobox2.get('img_src')
if img1 is None:
infobox1['img_src'] = img2
@ -127,7 +131,7 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['content'] = content2
def result_score(result):
def result_score(result: models.Result):
weight = 1.0
for result_engine in result['engines']:
@ -173,18 +177,18 @@ class ResultContainer:
def __init__(self):
super().__init__()
self._merged_results = []
self.infoboxes = []
self.suggestions = set()
self.answers = {}
self.corrections = set()
self._merged_results: List[models.MainResult] = []
self.infoboxes: List[models.Infobox] = []
self.suggestions: Set[models.Suggestion] = set()
self.answers: Dict[str, models.Answer] = {}
self.corrections: Set[models.Correction] = set()
self._number_of_results = []
self.engine_data = defaultdict(dict)
self._closed = False
self.paging = False
self._closed: bool = False
self.paging: bool = False
self.unresponsive_engines: Set[UnresponsiveEngine] = set()
self.timings: List[Timing] = []
self.redirect_url = None
self.redirect_url: Optional[str] = None
self.on_result = lambda _: True
self._lock = RLock()
@ -193,7 +197,7 @@ class ResultContainer:
return
standard_result_count = 0
error_msgs = set()
error_msgs: Set[str] = set()
for result in list(results):
result['engine'] = engine_name
if 'suggestion' in result and self.on_result(result):
@ -234,7 +238,7 @@ class ResultContainer:
if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging:
self.paging = True
def _merge_infobox(self, infobox):
def _merge_infobox(self, infobox: models.Infobox):
add_infobox = True
infobox_id = infobox.get('id', None)
infobox['engines'] = set([infobox['engine']])
@ -249,7 +253,7 @@ class ResultContainer:
if add_infobox:
self.infoboxes.append(infobox)
def _is_valid_url_result(self, result, error_msgs):
def _is_valid_url_result(self, result: models.UrlResult, error_msgs: Set[str]) -> bool:
if 'url' in result:
if not isinstance(result['url'], str):
logger.debug('result: invalid URL: %s', str(result))
@ -269,7 +273,7 @@ class ResultContainer:
return True
def _normalize_url_result(self, result):
def _normalize_url_result(self, result: models.UrlResult):
"""Return True if the result is valid"""
result['parsed_url'] = urlparse(result['url'])
@ -288,9 +292,9 @@ class ResultContainer:
# strip multiple spaces and carriage returns from content
if result.get('content'):
result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) # type: ignore
def __merge_url_result(self, result, position):
def __merge_url_result(self, result: models.UrlResult, position: int):
result['engines'] = set([result['engine']])
with self._lock:
duplicated = self.__find_duplicated_http_result(result)
@ -302,11 +306,12 @@ class ResultContainer:
result['positions'] = [position]
self._merged_results.append(result)
def __find_duplicated_http_result(self, result):
def __find_duplicated_http_result(self, result: models.UrlResult) -> Optional[models.UrlResult]:
result_template = result.get('template')
for merged_result in self._merged_results:
if 'parsed_url' not in merged_result:
continue
merged_result = cast(models.UrlResult, merged_result)
if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
'template'
):
@ -320,10 +325,10 @@ class ResultContainer:
return merged_result
return None
def __merge_duplicated_http_result(self, duplicated, result, position):
def __merge_duplicated_http_result(self, duplicated: models.UrlResult, result: models.UrlResult, position: int):
# using content with more text
if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
duplicated['content'] = result['content']
duplicated['content'] = result['content'] # type: ignore
# merge all result's parameters not found in duplicate
for key in result.keys():
@ -341,11 +346,11 @@ class ResultContainer:
duplicated['url'] = result['parsed_url'].geturl()
duplicated['parsed_url'] = result['parsed_url']
def __merge_result_no_url(self, result, position):
def __merge_result_no_url(self, result: models.KeyValueResult, position: int):
result['engines'] = set([result['engine']])
result['positions'] = [position]
with self._lock:
self._merged_results.append(result)
self._merged_results.append(result) # type: ignore
def close(self):
self._closed = True

213
searx/results/models.py Normal file
View file

@ -0,0 +1,213 @@
from typing import List, Dict, Set
from typing_extensions import TypedDict, NotRequired, Required, Literal
from urllib.parse import ParseResult
from datetime import datetime
# Public API of this module.  Keep in sync with the ``autoclass`` directives
# in the documentation (docs "results models" page documents MainResult and
# KeyValueResult too, so they belong here as well).
__all__ = [
    'Result',
    'MainResult',
    'UrlResult',
    'KeyValueResult',
    'Answer',
    'Correction',
    'Suggestion',
    'Infobox',
    'InfoboxUrl',
    'InfoboxImage',
    'InfoboxAttribute',
    'InfoboxRelatedTopic',
]
class Result(TypedDict):
    """Base type shared by every result kind.

    All fields below are bookkeeping values maintained by the result
    container while results are merged and scored; engines and plugins
    must not set them.
    """

    engine: str
    """Internal field. DO NOT USE"""

    weight: float
    """Internal field. DO NOT USE"""

    engines: Set[str]
    """Internal field. DO NOT USE"""

    category: str
    """Internal field. DO NOT USE"""

    positions: List[int]
    """Internal field. DO NOT USE"""

    score: float
    """Internal field. DO NOT USE"""
class MainResult(Result):
    """Result that is going to be displayed as a "main" result.

    See :py:class:`UrlResult` and :py:class:`KeyValueResult` for the
    concrete shapes engines usually return.
    """

    template: NotRequired[str]
    """Template to display the result. The default value is "default.html".
    see searx/templates/simple/result_templates"""
class UrlResult(MainResult):
    """Typical main result: a URL, a title and a short description."""

    title: str
    """Title of the result"""

    url: str
    """URL of the result"""

    parsed_url: ParseResult
    """Engines don't have to set this value: it is automatically initialized from the url field.
    However, plugins have to manually update this field when they change the url field"""

    content: NotRequired[str]
    """Content of the result"""

    iframe_src: NotRequired[str]
    """URL of an iframe to add to the result."""

    audio_src: NotRequired[str]
    """URL of <audio> element"""

    img_src: NotRequired[str]
    """URL of an image to include in the result"""

    thumbnail: NotRequired[str]
    """URL of a thumbnail"""

    publishedDate: NotRequired[datetime]
    """Publication date"""

    length: NotRequired[str]
    """Length of the content (for audio or video)"""

    author: NotRequired[str]
    """Author of the content (for audio, video, image, ...)"""

    metadata: NotRequired[Dict]
    """Dictionary to allow paging"""
class KeyValueResult(MainResult):
    """A set of key/value pairs to display, useful for the DB engines.

    The template field must be "key-value.html"; all other values are
    displayed.
    """

    template: Literal["key-value.html"]
    """template must be equal to `key-value.html`"""
class Answer(Result):
    """Answer item in the result list.  The answer result item is used in
    the :origin:`results.html <searx/templates/simple/results.html>` template.

    An answer item is a dictionary type with dedicated keys and values."""

    answer: Required[str]
    """The answer string appended by the engine."""

    url: NotRequired[str]
    """A link that is related to the answer (e.g. the origin of the answer)."""
class Correction(Result):
    """Correction item in the result list.  The correction result item is used
    in the :origin:`results.html <searx/templates/simple/results.html>` template.

    A correction item is a dictionary type with dedicated keys and values."""

    url: str
    """The SearXNG search URL for the correction term."""

    title: str
    """The 'correction' string appended by the engine."""
class Suggestion(Result):
    """Suggestion item in the result list.  The suggestion result item is used
    in the :origin:`results.html <searx/templates/simple/results.html>` template.

    A suggestion item is a dictionary type with dedicated keys and values."""

    suggestion: Required[str]
    """The suggestion string appended by the engine (the suggested search
    term, not a URL)."""
class InfoboxUrl(TypedDict):
    """A **url** item in the ``infobox.urls`` list — a dictionary describing
    one of the links shown in the infobox.
    """

    title: str
    """Title of the URL"""

    url: str
    """URL by itself : https:/..."""

    entity: str
    """set by some engines but unused"""

    official: bool
    """set by some engines but unused (oscar)"""
class InfoboxImage(TypedDict):
    """An image description, used by :py:class:`InfoboxAttribute`."""

    src: str
    """URL of the image"""

    alt: str
    """description of the image / alt attribute"""
class InfoboxAttribute(TypedDict):
    """An **attribute** item in the ``infobox.attributes`` list is a dictionary"""

    label: str
    """Label of the attribute"""

    value: str
    """Value of the attribute. If set, the image field is ignored"""

    image: InfoboxImage
    """Image for this attribute. Ignored if the field value is set"""

    entity: str
    """set by some engines but unused"""
class InfoboxRelatedTopic(TypedDict):
    """A **topic** item in the ``infobox.relatedTopics`` list is a dictionary."""

    suggestion: str
    """suggested search query"""

    name: str
    """set by some engines but unused"""
class Infobox(Result):
    """Infobox item in the result list.  The infobox result item is used in the
    :origin:`infobox.html <searx/templates/simple/infobox.html>` template.

    An infobox item is a dictionary type with dedicated keys and values.
    """

    infobox: Required[str]
    """Name of the infobox (mandatory)."""

    id: str
    """URL of the infobox. Will be used to merge infoboxes."""

    content: str
    """Content of the infobox (the description)"""

    img_src: str
    """URL of the image to show in the infobox"""

    urls: List[InfoboxUrl]
    """A list of dictionaries with links shown in the infobox."""

    attributes: List[InfoboxAttribute]
    """A list of dictionaries with attributes shown in the infobox"""

    relatedTopics: List[InfoboxRelatedTopic]
    """A list of dictionaries with related topics shown in the infobox"""

View file

@ -710,6 +710,7 @@ def search():
# output
for result in results:
result = typing.cast(dict, result)
if output_format == 'html':
if 'content' in result and result['content']:
result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
@ -763,7 +764,9 @@ def search():
keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
csv.writerow(keys)
for row in results:
row['host'] = row['parsed_url'].netloc
row = typing.cast(dict, row)
if 'parsed_url' in row:
row['host'] = row['parsed_url'].netloc
row['type'] = 'result'
csv.writerow([row.get(key, '') for key in keys])
for a in result_container.answers: