mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
[fix] hostname_replace: modify URLs in the infobox
Closes: https://github.com/searxng/searxng/issues/1348 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
069e1d7fb4
commit
3e69a98f80
4 changed files with 166 additions and 19 deletions
9
docs/src/searx.plugins.hostname_replace.rst
Normal file
9
docs/src/searx.plugins.hostname_replace.rst
Normal file
|
@ -0,0 +1,9 @@
|
|||
.. _hostname_replace plugin:
|
||||
|
||||
=======================
|
||||
Hostname-Replace Plugin
|
||||
=======================
|
||||
|
||||
.. automodule:: searx.plugins.hostname_replace
|
||||
:members:
|
||||
|
|
@ -1,10 +1,44 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Rewrite result hostnames or remove results based on the hostname.
|
||||
|
||||
``/etc/searxng/settings.yml``
|
||||
Deactivated by default; activate the plugin and append entries to the
|
||||
list ``hostname_replace``
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
enabled_plugins:
|
||||
- 'Hostname replace' # see hostname_replace configuration below
|
||||
# ...
|
||||
|
||||
.. _#911: https://github.com/searxng/searxng/discussions/911
|
||||
.. _#970: https://github.com/searxng/searxng/discussions/970
|
||||
|
||||
Configuration of the replacements (`#911`_, `#970`_)
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
hostname_replace:
|
||||
# to ignore result from codegrepper.com
|
||||
'(.*\\.)?codegrepper\\.com': false
|
||||
|
||||
# redirect youtube links to an Invidious instance
|
||||
'(.*\\.)?youtube\\.com$': 'invidio.xamh.de'
|
||||
'(.*\\.)?youtube-nocookie\\.com$': 'invidio.xamh.de'
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
from urllib.parse import urlunparse, urlparse
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx import settings
|
||||
from searx.plugins import logger
|
||||
from flask_babel import gettext
|
||||
from searx.results.container import is_standard_result
|
||||
from searx.results.infobox import infobox_modify_url, is_infobox
|
||||
from searx.results.answer import answer_modify_url, is_answer
|
||||
|
||||
|
||||
name = gettext('Hostname replace')
|
||||
description = gettext('Rewrite result hostnames or remove results based on the hostname')
|
||||
|
@ -17,30 +51,46 @@ replacements = {re.compile(p): r for (p, r) in settings[plugin_id].items()} if p
|
|||
|
||||
logger = logger.getChild(plugin_id)
|
||||
parsed = 'parsed_url'
|
||||
_url_fields = ['iframe_src', 'audio_src']
|
||||
|
||||
|
||||
def on_result(_request, _search, result):
    """Rewrite or drop URLs in ``result`` according to ``replacements``.

    For every ``(pattern, replacement)`` pair of the module-level
    ``replacements`` mapping the hostname (netloc) of the URLs found in the
    result is matched against ``pattern``:

    - infobox results are delegated to ``infobox_modify_url``,
    - answer results are delegated to ``answer_modify_url``,
    - for standard results the 'parsed_url' / 'url' pair and the optional
      'iframe_src' and 'audio_src' fields are rewritten in place.

    :param _request: unused.
    :param _search: unused.
    :param dict result: the result item, modified in place.

    :returns: ``False`` if the hostname of a standard result's 'parsed_url'
        matches a pattern whose replacement is falsy (the result is to be
        removed from the result list), otherwise ``True``.
    """

    for (pattern, replacement) in replacements.items():
        # modify_url is only called within the iteration that defines it, so
        # closing over the loop variables is safe here.
        # pylint: disable=cell-var-from-loop

        def modify_url(url):
            """Return ``url`` with replaced hostname, ``None`` to drop it."""
            url_src = urlparse(url)
            if not pattern.search(url_src.netloc):
                return url
            if not replacement:
                return None
            url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
            return urlunparse(url_src)

        if is_infobox(result):
            infobox_modify_url(modify_url, result)
            continue

        if is_answer(result):
            answer_modify_url(modify_url, result)
            continue

        if is_standard_result(result):
            if parsed in result:
                if pattern.search(result[parsed].netloc):
                    # to keep or remove this result from the result list depends
                    # (only) on the 'parsed_url'
                    if not replacement:
                        return False
                    result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
                    result['url'] = urlunparse(result[parsed])

            for url_field in ('iframe_src', 'audio_src'):
                url = result.get(url_field)
                if url:
                    _url = modify_url(url)
                    if _url is None:
                        # drop the field itself (keyed by field name, not by URL)
                        del result[url_field]
                    elif _url != url:
                        # store the rewritten URL, not the original one
                        result[url_field] = _url

    return True
|
||||
|
|
|
@ -35,3 +35,30 @@ class Answers(dict):
|
|||
|
||||
def add(self, result):
|
||||
self[result['answer']] = result
|
||||
|
||||
|
||||
def answer_modify_url(modify_url_func, result):
    """Rewrite or drop the 'url' field of an answer-result.

    :param func modify_url_func: A callable taking one argument, the current
        'url' value of the ``result`` item.  It returns the URL to use instead
        (which may be the unmodified input) or ``None`` to have the 'url'
        field dropped from the result.

    :param dict result: The result item, modified in place.  Items that are
        not answers or that have no (non-empty) 'url' are left untouched.
    """
    if is_answer(result):
        current = result.get('url')
        if current:
            updated = modify_url_func(current)
            if updated is None:
                del result['url']
            elif updated != current:
                result['url'] = updated
|
||||
|
|
|
@ -163,3 +163,64 @@ def merge_two_infoboxes(infobox1, infobox2):
|
|||
infobox1['content'] = content2
|
||||
else:
|
||||
infobox1['content'] = content2
|
||||
|
||||
|
||||
def _modify_url_in_items(modify_url_func, items, key):
    """Apply ``modify_url_func`` to ``item[key]`` of every dict in ``items``.

    Items for which the function returns ``None`` are removed from ``items``
    (in place); items without a non-empty ``key`` value are kept unchanged.
    """
    kept = []
    for item in items:
        old_url = item.get(key)
        if old_url:
            new_url = modify_url_func(old_url)
            if new_url is None:
                continue
            if new_url != old_url:
                item[key] = new_url
        kept.append(item)
    # Rebuild the list in one step instead of calling remove() while
    # iterating, which would skip the element following each removed one.
    if len(kept) != len(items):
        items[:] = kept


def infobox_modify_url(modify_url_func, result):
    """Modify URL fields in the infobox-result.

    :param func modify_url_func: A function that gets one argument; a *url*
        field of the ``result`` item.  The function returns the URL to use
        instead (even if the URL is not modified).  To drop the URL from the
        result the function returns ``None``.  This function is called for
        each field that contains a URL.

    :param dict result: The result item, modified in place.  Nothing is done
        if it is not an infobox.

    The fields handled are:

    - ``img_src``: replaced, or deleted when the function returns ``None``
    - ``urls``: list of items with 'title' and 'url'; an item is removed from
      the list when the function returns ``None`` for its 'url'
    - ``attributes``: list of items with 'label' and 'image'; an item is
      removed from the list when the function returns ``None`` for its 'image'
    """

    if not is_infobox(result):
        return

    # infobox.img_src
    img_src = result.get('img_src')
    if img_src:
        _img_src = modify_url_func(img_src)
        if _img_src is None:
            del result['img_src']
        elif _img_src != img_src:
            result['img_src'] = _img_src

    # A 'url' item in the infobox.urls list has these attributes:
    #
    #   'title', 'url'
    _modify_url_in_items(modify_url_func, result.get('urls') or [], 'url')

    # An 'attr' item in the infobox.attributes list has these attributes:
    #
    #   'label', 'image'
    _modify_url_in_items(modify_url_func, result.get('attributes') or [], 'image')
|
||||
|
|
Loading…
Add table
Reference in a new issue