r"""
The **URL Rewrite plugin** allows modification of search result URLs based on regular expression patterns.
URLs can be rewritten to different destinations, blocked entirely, or have their priority adjusted in the
results list.

The plugin can be enabled by adding it to the `enabled_plugins` **list** in the `settings.yml`:

.. code:: yaml

  enabled_plugins:
    - 'URL Rewrite'
    ...

Configuration options in `settings.yml`:

- ``url_rewrite.rules``: A **list** of rules that define how URLs should be handled. Each rule
  can contain the following keys:

  - ``pattern``: Regular expression pattern to match against result URLs
  - ``repl``: Replacement string for matched URLs, or `false` to remove matching results
  - ``priority``: Optional priority adjustment ('high' or 'low') for matching results
  - ``replace_url``: Optional, defaults to `true`. When set to `false` the result's ``parsed_url``
    is left untouched and only its ``url`` is rewritten

Example configuration:

.. code:: yaml

  url_rewrite:
    rules:
      - pattern: '^(?P<url>https://(.*\.)?cnn\.com/.*)$'
        repl: 'https://reader.example.com/?url=\g<url>'
      - pattern: '^https?://(?:www\.)?facebook\.com/.*'
        repl: false
      - pattern: '^(?P<url>https://news\.example\.com/.*)$'
        repl: 'https://proxy.example.com/?url=\g<url>'
        priority: high

In this example:

- CNN articles are redirected through a reader service
- Facebook URLs are removed from results
- News site URLs are proxied and given higher priority

The ``pattern`` field supports Python regular expressions with named capture groups.
The ``repl`` field can reference captured groups using ``\g<name>`` syntax.
Setting ``repl`` to ``false`` will remove matching results entirely.
The optional ``priority`` field can be set to 'high' or 'low' to adjust result ranking.

Rules are evaluated in the order they are listed and only the first rule whose ``pattern``
matches a result is applied.  Rules without a ``pattern`` or without a ``repl`` are skipped,
as are rules whose ``pattern`` or ``repl`` is not valid for Python's :py:mod:`re` module
(an error is logged for those).
"""
import functools
import re
from urllib.parse import urlparse

from flask_babel import gettext

from searx import settings

# NOTE(review): get_yaml_cfg is not referenced anywhere in this module -- confirm whether it can be dropped.
from searx.settings_loader import get_yaml_cfg
from searx.plugins import logger


name = gettext("URL Rewrite")
description = gettext("Rewrite URLs of search results")
default_on = True
preference_section = 'general'
plugin_id = 'url_rewrite'


logger = logger.getChild(plugin_id)
# ``or`` fallbacks: a key that is present but left empty in the YAML file loads as None.
config = settings.get(plugin_id) or {}
rules = config.get("rules") or []


@functools.lru_cache(maxsize=None)
def _compile_pattern(pattern):
    """Compile *pattern*, returning ``None`` when it is not a valid regular expression.

    The result is cached per pattern string, so a broken pattern is reported once
    instead of once per search result.
    """
    try:
        return re.compile(pattern)
    except re.error as exc:
        logger.error("Ignoring url_rewrite rule with invalid pattern %r: %s", pattern, exc)
        return None


def on_result(request, search, result):
    """Apply the first matching rewrite rule to *result*.

    The rules from the ``url_rewrite.rules`` setting are tried in order; the first
    rule whose ``pattern`` is found in ``result['url']`` decides the outcome and no
    further rule is looked at.

    :param request: not used by this plugin
    :param search: not used by this plugin
    :param result: mapping describing one search result; ``url``, ``parsed_url``
        and ``priority`` may be updated in place
    :returns: ``False`` when the matching rule has ``repl: false`` (the module
        docstring describes this as removing the result), ``True`` in every
        other case
    """
    if not rules:
        logger.debug("No url rewrite rules found in settings")
        return True

    if 'url' not in result:
        logger.debug("No url found in result")
        return True

    url = result['url']
    if not isinstance(url, str):
        # a url that is present but not a string (e.g. None) cannot be matched against a pattern
        logger.debug("No url found in result")
        return True

    for rewrite in rules:
        if not isinstance(rewrite, dict):
            # malformed entry in settings.yml (e.g. a bare string instead of a mapping)
            continue

        pattern = rewrite.get('pattern')
        repl = rewrite.get('repl')

        if not pattern or not isinstance(pattern, str):
            continue

        if repl is None:
            logger.debug("No repl found for pattern %s, skipping", pattern)
            continue

        regex = _compile_pattern(pattern)
        if regex is None or regex.search(url) is None:
            continue

        if repl is False:
            logger.info("Dropping %s - matched %s", url, pattern)
            return False

        try:
            new_url = regex.sub(repl, url)
        except (re.error, IndexError, TypeError) as exc:
            # bad replacement template (unknown group, stray backslash) or a repl that is not a string
            logger.error("Ignoring url_rewrite rule %r with invalid repl %r: %s", pattern, repl, exc)
            continue

        result['url'] = new_url

        if rewrite.get('replace_url', True):
            result['parsed_url'] = urlparse(new_url)

        priority = rewrite.get('priority')
        if priority:
            result['priority'] = priority
            logger.info("Set priority to %s for %s", priority, new_url)

        logger.info("Rewrote %s to %s using pattern %s", url, new_url, pattern)
        break

    return True