searxng/searx/results/infobox.py
Markus Heiser 3d473a773d [mod] mudularize & document searx.results
The intention of this patch is to improve modularization & documentation of the
implementations about the *result* items.

  This patch does not contain any functional change!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2022-08-06 18:58:43 +02:00

157 lines
4.5 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Infobox item in the result list. The infobox result item is used in the
:origin:`infobox.html <searx/templates/simple/infobox.html>` template.
An infobox item is a dictionary type with dedicated keys and values. In the
result list an infobox item is identified by the existence of the key ``infobox``.
.. code:: python
results.append({
'infobox' : str,
'id' : str,
'content' : str,
'img_src' : str,
'urls' : [url, ...],
'attributes' : [attribute, ...],
'relatedTopics' : [topic, ...],
'engine' : engine,
})
infobox : ``str``
Name of the infobox (mandatory).
id : ``str``
URL of the infobox. Will be used to merge infoboxes.
content : ``str``
Content of the infobox (the description)
img_src : ``str``
URL of the image to show in the infobox
urls : ``[url, ...]``
A list of dictionaries with links shown in the infobox. A **url** item in the
``infobox.urls`` list is a dictionary:
.. code:: python
url = {
'title' : str,
'url' : str,
'entity' : str, # set by some engines but unused
'official' : bool, # set by some engines but unused (oscar)
}
attributes : ``[attribute, ...]``
An **attribute** item in the ``infobox.attributes`` list is a dictionary:
.. code:: python
attribute = {
'label' : str,
'value' : str,
'image' : {
'src': str,
'alt': str,
},
'entity' : str, # set by some engines but unused
}
relatedTopics : ``[topic, ...]``
A **topic** item in the ``infobox.relatedTopics`` list is a dictionary:
.. code:: python
topic = {
'suggestion' : str,
'name' : str, # set by some engines but unused
}
"""
from urllib.parse import urlparse
from searx.engines import engines
from .core import (
result_content_len,
compare_urls,
)
class Infoboxes(list):
    """A ``list`` subclass collecting the infobox items of the
    :py:obj:`.container.ResultContainer`.
    """
def merge_two_infoboxes(infobox1, infobox2):
    """Merge the infobox item ``infobox2`` into ``infobox1``.

    ``infobox1`` is modified in place, nothing is returned.  Both arguments are
    infobox dictionaries; the keys ``engine`` and ``engines`` are required in
    both of them, all other keys are optional.

    The merge rules are:

    ``engine``
      The engine with the higher ``weight`` wins.  The weight is read from the
      engine object in :py:obj:`searx.engines.engines`, an engine without a
      ``weight`` attribute counts as ``1``.

    ``engines``
      ``infobox2['engines']`` is merged into ``infobox1['engines']`` using the
      ``|=`` operator.

    ``urls``
      A url item of ``infobox2`` is appended unless ``infobox1`` already has a
      url item with the same (not ``None``) ``entity`` or a url for which
      ``compare_urls`` reports a match.

    ``img_src``
      Taken from ``infobox2`` if ``infobox1`` has no image or if the engine of
      ``infobox2`` has the higher weight.

    ``attributes``
      An attribute of ``infobox2`` is appended unless its ``label`` or its
      ``entity`` is already used by one of the attributes of ``infobox1``.  A
      missing ``label`` / ``entity`` never counts as a match.

    ``content``
      Taken from ``infobox2`` if ``infobox1`` has no content or if
      ``result_content_len`` rates the content of ``infobox2`` as longer.
    """
    # pylint: disable=too-many-branches

    # get engines weights (engines without a weight attribute count as 1)
    weight1 = getattr(engines[infobox1['engine']], 'weight', 1)
    weight2 = getattr(engines[infobox2['engine']], 'weight', 1)

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls')
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            # urls1 grows while looping, by this a url that is repeated within
            # infobox2 is added only once
            already_known = any(
                (entity_url2 is not None and url1.get('entity') == entity_url2)
                or compare_urls(urlparse(url1.get('url', '')), parsed_url2)
                for url1 in urls1
            )
            if not already_known:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        if infobox1.get('img_src') is None or weight2 > weight1:
            infobox1['img_src'] = infobox2.get('img_src')

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        # Labels and entities already used in infobox1.  Missing values (None)
        # are not collected: otherwise one attribute without an 'entity' in
        # infobox1 would block all attributes without an 'entity' of infobox2.
        known = set()
        for attribute in attributes1:
            for key in ('label', 'entity'):
                value = attribute.get(key)
                if value is not None:
                    known.add(value)

        for attribute in infobox2.get('attributes', []):
            label = attribute.get('label')
            entity = attribute.get('entity')
            label_is_new = label is None or label not in known
            entity_is_new = entity is None or entity not in known
            if label_is_new and entity_is_new:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content')
        content2 = infobox2.get('content', '')
        if content1 is None or result_content_len(content2) > result_content_len(content1):
            infobox1['content'] = content2