From 3e69a98f8017f4e4087f21dd39896f135c6cb569 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sat, 2 Jul 2022 19:43:14 +0200
Subject: [PATCH] [fix] hostname_replace: modify URLs in the infobox

Closes: https://github.com/searxng/searxng/issues/1348
Signed-off-by: Markus Heiser
---
Review notes (this section is ignored by git-am):

  * hostname_replace.on_result: drop / rewrite the *field* (``url_field``)
    instead of using the URL value as dict key, and store the rewritten URL
    (``_url``) instead of writing the unmodified one back.
  * hostname_replace.on_result: the netloc rewrite of 'parsed_url' is now
    explicitly inside the ``pattern.search`` branch, a ``false`` replacement
    must never reach ``pattern.sub``.
  * infobox_modify_url: test ``_image`` (the result of the callback) and
    iterate over copies of the lists while items are removed.
  * The blob ids in the ``index`` lines were not regenerated.

 docs/src/searx.plugins.hostname_replace.rst |  9 +++
 searx/plugins/hostname_replace.py           | 92 ++++++++++++++++-----
 searx/results/answer.py                     | 27 +++++++
 searx/results/infobox.py                    | 61 ++++++++++++++
 4 files changed, 168 insertions(+), 21 deletions(-)
 create mode 100644 docs/src/searx.plugins.hostname_replace.rst

diff --git a/docs/src/searx.plugins.hostname_replace.rst b/docs/src/searx.plugins.hostname_replace.rst
new file mode 100644
index 000000000..5f17c20a8
--- /dev/null
+++ b/docs/src/searx.plugins.hostname_replace.rst
@@ -0,0 +1,9 @@
+.. _hostname_replace plugin:
+
+=======================
+Hostname-Replace Plugin
+=======================
+
+.. automodule:: searx.plugins.hostname_replace
+  :members:
+
diff --git a/searx/plugins/hostname_replace.py b/searx/plugins/hostname_replace.py
index 039aadb91..0202381ed 100644
--- a/searx/plugins/hostname_replace.py
+++ b/searx/plugins/hostname_replace.py
@@ -1,10 +1,44 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Rewrite result hostnames or remove results based on the hostname.
+
+``/etc/searxng/settings.yml``
+  Deactivate by default, activate plugin and append entries to the
+  list ``hostname_replace``
+
+.. code-block:: yaml
+
+   enabled_plugins:
+     - 'Hostname replace'  # see hostname_replace configuration below
+     # ...
+
+.. _#911: https://github.com/searxng/searxng/discussions/911
+.. _#970: https://github.com/searxng/searxng/discussions/970
+
+Configuration of the replacements (`#911`_, `#970`_)
+
+.. code-block:: yaml
+
+   hostname_replace:
+     # to ignore result from codegrepper.com
+     '(.*\\.)?codegrepper\\.com': false
+
+     # redirect youtube links to a invidio instance
+     '(.*\\.)?youtube\\.com$': 'invidio.xamh.de'
+     '(.*\\.)?youtube-nocookie\\.com$': 'invidio.xamh.de'
+
+"""
 
 import re
 from urllib.parse import urlunparse, urlparse
+from flask_babel import gettext
+
 from searx import settings
 from searx.plugins import logger
-from flask_babel import gettext
+from searx.results.container import is_standard_result
+from searx.results.infobox import infobox_modify_url, is_infobox
+from searx.results.answer import answer_modify_url, is_answer
+
 
 name = gettext('Hostname replace')
 description = gettext('Rewrite result hostnames or remove results based on the hostname')
@@ -17,30 +51,46 @@ replacements = {re.compile(p): r for (p, r) in settings[plugin_id].items()} if p
 
 logger = logger.getChild(plugin_id)
 parsed = 'parsed_url'
-_url_fields = ['iframe_src', 'audio_src']
 
 
-def on_result(request, search, result):
+def on_result(_request, _search, result):
 
     for (pattern, replacement) in replacements.items():
+        # pylint: disable=cell-var-from-loop
 
-        if parsed in result:
-            if pattern.search(result[parsed].netloc):
-                # to keep or remove this result from the result list depends
-                # (only) on the 'parsed_url'
-                if not replacement:
-                    return False
-                result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
-                result['url'] = urlunparse(result[parsed])
+        def modify_url(url):
+            url_src = urlparse(url)
+            if not pattern.search(url_src.netloc):
+                return url
+            if not replacement:
+                return None
+            url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
+            return urlunparse(url_src)
+
+        if is_infobox(result):
+            infobox_modify_url(modify_url, result)
+            continue
+
+        if is_answer(result):
+            answer_modify_url(modify_url, result)
+            continue
+
+        if is_standard_result(result):
+            if parsed in result:
+                if pattern.search(result[parsed].netloc):
+                    # to keep or remove this result from the result list depends
+                    # (only) on the 'parsed_url'
+                    if not replacement:
+                        return False
+                    result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
+                    result['url'] = urlunparse(result[parsed])
 
-        for url_field in _url_fields:
-            if result.get(url_field):
-                url_src = urlparse(result[url_field])
-                if pattern.search(url_src.netloc):
-                    if not replacement:
-                        del result[url_field]
-                    else:
-                        url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
-                        result[url_field] = urlunparse(url_src)
-
+            for url_field in ['iframe_src', 'audio_src']:
+                url = result.get(url_field)
+                if url:
+                    _url = modify_url(url)
+                    if _url is None:
+                        del result[url_field]
+                    elif _url != url:
+                        result[url_field] = _url
     return True
diff --git a/searx/results/answer.py b/searx/results/answer.py
index 42f4cc756..4d92a1a14 100644
--- a/searx/results/answer.py
+++ b/searx/results/answer.py
@@ -35,3 +35,30 @@ class Answers(dict):
 
     def add(self, result):
         self[result['answer']] = result
+
+
+def answer_modify_url(modify_url_func, result):
+    """Modify 'url' field in the answer-result.
+
+    :param func modify_url_func: A function that gets one argument; the 'url'
+       field of the ``result`` item.  The function returns the URL to use
+       instead (even if the URL is not modified).  To drop the 'url' field from
+       the result the function returns ``None``.
+
+    :param dict result: The result item.
+    """
+
+    if not is_answer(result):
+        return
+
+    url = result.get('url')
+    if not url:
+        return
+
+    _url = modify_url_func(url)
+    if _url is None:
+        # logger.debug("answer: remove url from %s", url)
+        del result['url']
+    elif _url != url:
+        # logger.debug("answer: redirect url %s", _url)
+        result['url'] = _url
diff --git a/searx/results/infobox.py b/searx/results/infobox.py
index 9e3b8a08f..08326f0b5 100644
--- a/searx/results/infobox.py
+++ b/searx/results/infobox.py
@@ -163,3 +163,64 @@ def merge_two_infoboxes(infobox1, infobox2):
             infobox1['content'] = content2
     else:
         infobox1['content'] = content2
+
+
+def infobox_modify_url(modify_url_func, result):
+    """Modify URL fields in the infobox-result.
+
+    :param func modify_url_func: A function that gets one argument; a *url*
+       field of the ``result`` item.  The function returns the URL to use
+       instead (even if the URL is not modified).  To drop the 'url' field from
+       the result the function returns ``None``.  This function is called for
+       each field that contains URL.
+
+    :param dict result: The result item.
+
+    """
+
+    if not is_infobox(result):
+        return
+
+    img_src = result.get('img_src')
+    urls = result.get('urls', [])
+    attributes = result.get('attributes', [])
+
+    # infobox.img_src
+    if img_src:
+        _img_src = modify_url_func(img_src)
+        if _img_src is None:
+            # logger.debug("infobox: remove img_src from %s", infobox_name)
+            del result['img_src']
+        elif _img_src != img_src:
+            # logger.debug("infobox: redirect img_src %s", _img_src)
+            result['img_src'] = _img_src
+
+    # A 'url' item in the infobox.urls list has these attributes:
+    #
+    #     'title', 'url'
+
+    for url in list(urls):  # iterate over a copy, items are removed from urls
+        url_url = url.get('url')
+        if url_url:
+            _url_url = modify_url_func(url_url)
+            if _url_url is None:
+                # logger.debug("infobox: remove url %s", url)
+                urls.remove(url)
+            elif _url_url != url_url:
+                # logger.debug("infobox: redirect url %s", _url_url)
+                url['url'] = _url_url
+
+    # A 'attr' item in the infobox.attributes list has these attributes:
+    #
+    #     'label', 'image'
+
+    for attr in list(attributes):  # iterate over a copy, items are removed
+        image = attr.get('image')
+        if image:
+            _image = modify_url_func(image)
+            if _image is None:
+                # logger.debug("infobox: remove image %s", attr)
+                attributes.remove(attr)
+            elif _image != image:
+                # logger.debug("infobox: redirect %s", _image)
+                attr['image'] = _image