mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
parent
675dc04917
commit
e242005916
5 changed files with 295 additions and 25 deletions
47
docs/src/searx.results.models.rst
Normal file
47
docs/src/searx.results.models.rst
Normal file
|
@ -0,0 +1,47 @@
|
|||
.. _results models:
|
||||
|
||||
====================
|
||||
searx.results.models
|
||||
====================
|
||||
|
||||
Main Results
|
||||
------------
|
||||
|
||||
.. autoclass:: searx.results.models.MainResult
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.UrlResult
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.KeyValueResult
|
||||
:members:
|
||||
|
||||
Infoboxes
|
||||
---------
|
||||
|
||||
.. autoclass:: searx.results.models.Infobox
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.InfoboxAttribute
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.InfoboxImage
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.InfoboxUrl
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.InfoboxRelatedTopic
|
||||
:members:
|
||||
|
||||
Others
|
||||
------
|
||||
|
||||
.. autoclass:: searx.results.models.Answer
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.Correction
|
||||
:members:
|
||||
|
||||
.. autoclass:: searx.results.models.Suggestion
|
||||
:members:
|
|
@ -68,6 +68,8 @@ class Engine: # pylint: disable=too-few-public-methods
|
|||
safesearch: bool
|
||||
time_range_support: bool
|
||||
timeout: float
|
||||
weight: float
|
||||
display_error_messages: bool
|
||||
|
||||
|
||||
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# pyright: basic
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from operator import itemgetter
|
||||
from threading import RLock
|
||||
from typing import List, NamedTuple, Set
|
||||
from typing import NamedTuple, Optional, List, Set, Dict, cast
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
from searx import logger
|
||||
from searx.engines import engines
|
||||
from searx.metrics import histogram_observe, counter_add, count_error
|
||||
from . import models
|
||||
|
||||
|
||||
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
|
||||
|
@ -55,7 +59,7 @@ def compare_urls(url_a, url_b):
|
|||
return unquote(path_a) == unquote(path_b)
|
||||
|
||||
|
||||
def merge_two_infoboxes(infobox1, infobox2):
|
||||
def merge_two_infoboxes(infobox1: models.Infobox, infobox2: models.Infobox):
|
||||
# get engines weights
|
||||
if hasattr(engines[infobox1['engine']], 'weight'):
|
||||
weight1 = engines[infobox1['engine']].weight
|
||||
|
@ -91,8 +95,8 @@ def merge_two_infoboxes(infobox1, infobox2):
|
|||
|
||||
infobox1['urls'] = urls1
|
||||
|
||||
if 'img_src' in infobox2:
|
||||
img1 = infobox1.get('img_src', None)
|
||||
if infobox2.get('img_src') is not None:
|
||||
img1 = infobox1.get('img_src')
|
||||
img2 = infobox2.get('img_src')
|
||||
if img1 is None:
|
||||
infobox1['img_src'] = img2
|
||||
|
@ -127,7 +131,7 @@ def merge_two_infoboxes(infobox1, infobox2):
|
|||
infobox1['content'] = content2
|
||||
|
||||
|
||||
def result_score(result):
|
||||
def result_score(result: models.Result):
|
||||
weight = 1.0
|
||||
|
||||
for result_engine in result['engines']:
|
||||
|
@ -173,18 +177,18 @@ class ResultContainer:
|
|||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._merged_results = []
|
||||
self.infoboxes = []
|
||||
self.suggestions = set()
|
||||
self.answers = {}
|
||||
self.corrections = set()
|
||||
self._merged_results: List[models.MainResult] = []
|
||||
self.infoboxes: List[models.Infobox] = []
|
||||
self.suggestions: Set[models.Suggestion] = set()
|
||||
self.answers: Dict[str, models.Answer] = {}
|
||||
self.corrections: Set[models.Correction] = set()
|
||||
self._number_of_results = []
|
||||
self.engine_data = defaultdict(dict)
|
||||
self._closed = False
|
||||
self.paging = False
|
||||
self._closed: bool = False
|
||||
self.paging: bool = False
|
||||
self.unresponsive_engines: Set[UnresponsiveEngine] = set()
|
||||
self.timings: List[Timing] = []
|
||||
self.redirect_url = None
|
||||
self.redirect_url: Optional[str] = None
|
||||
self.on_result = lambda _: True
|
||||
self._lock = RLock()
|
||||
|
||||
|
@ -193,7 +197,7 @@ class ResultContainer:
|
|||
return
|
||||
|
||||
standard_result_count = 0
|
||||
error_msgs = set()
|
||||
error_msgs: Set[str] = set()
|
||||
for result in list(results):
|
||||
result['engine'] = engine_name
|
||||
if 'suggestion' in result and self.on_result(result):
|
||||
|
@ -234,7 +238,7 @@ class ResultContainer:
|
|||
if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging:
|
||||
self.paging = True
|
||||
|
||||
def _merge_infobox(self, infobox):
|
||||
def _merge_infobox(self, infobox: models.Infobox):
|
||||
add_infobox = True
|
||||
infobox_id = infobox.get('id', None)
|
||||
infobox['engines'] = set([infobox['engine']])
|
||||
|
@ -249,7 +253,7 @@ class ResultContainer:
|
|||
if add_infobox:
|
||||
self.infoboxes.append(infobox)
|
||||
|
||||
def _is_valid_url_result(self, result, error_msgs):
|
||||
def _is_valid_url_result(self, result: models.UrlResult, error_msgs: Set[str]) -> bool:
|
||||
if 'url' in result:
|
||||
if not isinstance(result['url'], str):
|
||||
logger.debug('result: invalid URL: %s', str(result))
|
||||
|
@ -269,7 +273,7 @@ class ResultContainer:
|
|||
|
||||
return True
|
||||
|
||||
def _normalize_url_result(self, result):
|
||||
def _normalize_url_result(self, result: models.UrlResult):
|
||||
"""Return True if the result is valid"""
|
||||
result['parsed_url'] = urlparse(result['url'])
|
||||
|
||||
|
@ -288,9 +292,9 @@ class ResultContainer:
|
|||
|
||||
# strip multiple spaces and carriage returns from content
|
||||
if result.get('content'):
|
||||
result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
|
||||
result['content'] = WHITESPACE_REGEX.sub(' ', result['content']) # type: ignore
|
||||
|
||||
def __merge_url_result(self, result, position):
|
||||
def __merge_url_result(self, result: models.UrlResult, position: int):
|
||||
result['engines'] = set([result['engine']])
|
||||
with self._lock:
|
||||
duplicated = self.__find_duplicated_http_result(result)
|
||||
|
@ -302,11 +306,12 @@ class ResultContainer:
|
|||
result['positions'] = [position]
|
||||
self._merged_results.append(result)
|
||||
|
||||
def __find_duplicated_http_result(self, result):
|
||||
def __find_duplicated_http_result(self, result: models.UrlResult) -> Optional[models.UrlResult]:
|
||||
result_template = result.get('template')
|
||||
for merged_result in self._merged_results:
|
||||
if 'parsed_url' not in merged_result:
|
||||
continue
|
||||
merged_result = cast(models.UrlResult, merged_result)
|
||||
if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
|
||||
'template'
|
||||
):
|
||||
|
@ -320,10 +325,10 @@ class ResultContainer:
|
|||
return merged_result
|
||||
return None
|
||||
|
||||
def __merge_duplicated_http_result(self, duplicated, result, position):
|
||||
def __merge_duplicated_http_result(self, duplicated: models.UrlResult, result: models.UrlResult, position: int):
|
||||
# using content with more text
|
||||
if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
|
||||
duplicated['content'] = result['content']
|
||||
duplicated['content'] = result['content'] # type: ignore
|
||||
|
||||
# merge all result's parameters not found in duplicate
|
||||
for key in result.keys():
|
||||
|
@ -341,11 +346,11 @@ class ResultContainer:
|
|||
duplicated['url'] = result['parsed_url'].geturl()
|
||||
duplicated['parsed_url'] = result['parsed_url']
|
||||
|
||||
def __merge_result_no_url(self, result, position):
|
||||
def __merge_result_no_url(self, result: models.KeyValueResult, position: int):
|
||||
result['engines'] = set([result['engine']])
|
||||
result['positions'] = [position]
|
||||
with self._lock:
|
||||
self._merged_results.append(result)
|
||||
self._merged_results.append(result) # type: ignore
|
||||
|
||||
def close(self):
|
||||
self._closed = True
|
213
searx/results/models.py
Normal file
213
searx/results/models.py
Normal file
|
@ -0,0 +1,213 @@
|
|||
from typing import List, Dict, Set
|
||||
from typing_extensions import TypedDict, NotRequired, Required, Literal
|
||||
from urllib.parse import ParseResult
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Public API of this module: every result model declared below.
# ``MainResult`` and ``KeyValueResult`` are listed too, they are part of the
# documented models and are referenced by the result container.
__all__ = [
    'Result',
    'MainResult',
    'UrlResult',
    'KeyValueResult',
    'Answer',
    'Correction',
    'Suggestion',
    'Infobox',
    'InfoboxUrl',
    'InfoboxImage',
    'InfoboxAttribute',
    'InfoboxRelatedTopic',
]
|
||||
|
||||
|
||||
class Result(TypedDict):
    """Base type shared by every kind of result.

    All keys declared here are internal fields.
    """

    engine: str
    """Internal field -- do not use."""

    weight: float
    """Internal field -- do not use."""

    engines: Set[str]
    """Internal field -- do not use."""

    category: str
    """Internal field -- do not use."""

    positions: List[int]
    """Internal field -- do not use."""

    score: float
    """Internal field -- do not use."""
|
||||
|
||||
|
||||
class MainResult(Result):
    """A result that is displayed in the list of "main" results."""

    template: NotRequired[str]
    """Name of the template used to render the result.  When the key is
    missing, ``default.html`` is used.  The available templates are located in
    ``searx/templates/simple/result_templates``."""
|
||||
|
||||
|
||||
class UrlResult(MainResult):
    """The typical main result: a URL, a title and a short description."""

    title: str
    """Title of the result."""

    url: str
    """URL of the result."""

    parsed_url: ParseResult
    """Parsed form of the ``url`` field.  Engines don't have to set this value,
    it is initialized automatically from the ``url`` field.  Plugins, however,
    have to update this field manually when they change the ``url`` field."""

    content: NotRequired[str]
    """Content of the result."""

    iframe_src: NotRequired[str]
    """URL of an iframe to add to the result."""

    audio_src: NotRequired[str]
    """URL of an ``<audio>`` element."""

    img_src: NotRequired[str]
    """URL of an image to include in the result."""

    thumbnail: NotRequired[str]
    """URL of a thumbnail."""

    publishedDate: NotRequired[datetime]
    """Publication date."""

    length: NotRequired[str]
    """Length of the content (for audio or video)."""

    author: NotRequired[str]
    """Author of the content (for audio, video, image, ...)."""

    metadata: NotRequired[Dict]
    """Dictionary to allow paging."""
|
||||
|
||||
|
||||
class KeyValueResult(MainResult):
    """A set of key/value pairs to display, useful for the DB engines.

    The ``template`` field must be ``key-value.html``; all other values are
    displayed.
    """

    template: Literal["key-value.html"]
    """Must be equal to ``key-value.html``."""
|
||||
|
||||
|
||||
class Answer(Result):
    """Answer item in the result list.  The answer result item is used in the
    :origin:`results.html <searx/templates/simple/results.html>` template.
    An answer item is a dictionary type with dedicated keys and values."""

    answer: Required[str]
    """The answer string appended by the engine."""

    url: NotRequired[str]
    """A link related to the answer (e.g. the origin of the answer)."""
|
||||
|
||||
|
||||
class Correction(Result):
    """Correction item in the result list.  The correction result item is used
    in the :origin:`results.html <searx/templates/simple/results.html>`
    template.  A correction item is a dictionary type with dedicated keys and
    values."""

    url: str
    """The SearXNG search URL for the correction term."""

    title: str
    """The 'correction' string appended by the engine."""
|
||||
|
||||
|
||||
class Suggestion(Result):
    """Suggestion item in the result list.  The suggestion result item is used
    in the :origin:`results.html <searx/templates/simple/results.html>`
    template.  A suggestion item is a dictionary type with dedicated keys and
    values."""

    # NOTE(review): the description below reads like the one of
    # ``Correction.url``; confirm whether this key holds the suggested term
    # itself rather than a search URL.
    suggestion: Required[str]
    """The SearXNG search URL for the suggestion term."""
|
||||
|
||||
|
||||
class InfoboxUrl(TypedDict):
|
||||
"""A list of dictionaries with links shown in the infobox.
|
||||
A **url** item in the ``infobox.urls`` list is a dicticonary
|
||||
"""
|
||||
|
||||
title: str
|
||||
"""Title of the URL"""
|
||||
|
||||
url: str
|
||||
"""URL by itself : https:/..."""
|
||||
|
||||
entity: str
|
||||
"""set by some engines but unused"""
|
||||
|
||||
official: bool
|
||||
"""set by some engines but unused (oscar)"""
|
||||
|
||||
|
||||
class InfoboxImage(TypedDict):
    """An image of an infobox attribute (see ``InfoboxAttribute.image``), it is
    a dictionary"""

    src: str
    """URL of the image"""

    alt: str
    """description of the image / alt attribute"""
|
||||
|
||||
|
||||
class InfoboxAttribute(TypedDict):
    """A **attribute** item in the ``infobox.attributes`` list is a dictionary.

    ``value`` and ``image`` are alternatives (``value`` takes precedence), so
    neither of them is a mandatory key.
    """

    label: str
    """Label of the attribute"""

    value: NotRequired[str]
    """Value of the attribute. If set, the image field is ignored"""

    image: NotRequired[InfoboxImage]
    """Image for this attribute. Ignored if the field value is set"""

    entity: NotRequired[str]
    """set by some engines but unused"""
|
||||
|
||||
|
||||
class InfoboxRelatedTopic(TypedDict):
|
||||
"""A **topic** item in the ``infobox.relatedTopics`` list is a dictionary"""
|
||||
|
||||
suggestion: str
|
||||
"""suggested search query"""
|
||||
|
||||
name: str
|
||||
"""set by some engines but unused"""
|
||||
|
||||
|
||||
class Infobox(Result):
    """Infobox item in the result list.  The infobox result item is used in the
    :origin:`infobox.html <searx/templates/simple/infobox.html>` template.
    An infobox item is a dictionary type with dedicated keys and values.

    Only the ``infobox`` key is mandatory.  The remaining keys are declared
    ``NotRequired`` because the result container reads them with
    ``dict.get()``, i.e. it expects that they may be missing.
    """

    infobox: Required[str]
    """Name of the infobox (mandatory)."""

    id: NotRequired[str]
    """URL of the infobox. Will be used to merge infoboxes."""

    content: NotRequired[str]
    """Content of the infobox (the description)"""

    img_src: NotRequired[str]
    """URL of the image to show in the infobox"""

    urls: NotRequired[List[InfoboxUrl]]
    """A list of dictionaries with links shown in the infobox."""

    attributes: NotRequired[List[InfoboxAttribute]]
    """A list of dictionaries with attributes shown in the infobox"""

    relatedTopics: NotRequired[List[InfoboxRelatedTopic]]
    """A list of dictionaries with related topics shown in the infobox"""
|
|
@ -710,6 +710,7 @@ def search():
|
|||
|
||||
# output
|
||||
for result in results:
|
||||
result = typing.cast(dict, result)
|
||||
if output_format == 'html':
|
||||
if 'content' in result and result['content']:
|
||||
result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
|
||||
|
@ -763,7 +764,9 @@ def search():
|
|||
keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
|
||||
csv.writerow(keys)
|
||||
for row in results:
|
||||
row['host'] = row['parsed_url'].netloc
|
||||
row = typing.cast(dict, row)
|
||||
if 'parsed_url' in row:
|
||||
row['host'] = row['parsed_url'].netloc
|
||||
row['type'] = 'result'
|
||||
csv.writerow([row.get(key, '') for key in keys])
|
||||
for a in result_container.answers:
|
||||
|
|
Loading…
Add table
Reference in a new issue