mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
Add first iteration of a URL rewrite plugin
This commit is contained in:
parent
0245e82bd2
commit
4d28669beb
1 changed files with 109 additions and 0 deletions
109
searx/plugins/url_rewrite.py
Normal file
109
searx/plugins/url_rewrite.py
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
"""
|
||||||
|
The **URL Rewrite plugin** allows modification of search result URLs based on regular expression patterns.
|
||||||
|
URLs can be rewritten to different destinations, blocked entirely, or have their priority adjusted in the
|
||||||
|
results list.
|
||||||
|
|
||||||
|
The plugin can be enabled by adding it to the `enabled_plugins` **list** in the `settings.yml`:
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
enabled_plugins:
|
||||||
|
- 'URL Rewrite'
|
||||||
|
...
|
||||||
|
|
||||||
|
Configuration options in `settings.yml`:
|
||||||
|
|
||||||
|
- ``url_rewrite.rules``: A **list** of rules that define how URLs should be handled. Each rule
|
||||||
|
can contain the following keys:
|
||||||
|
|
||||||
|
- ``pattern``: Regular expression pattern searched for in result URLs (``re.search``, so use ``^``/``$`` to anchor it to the whole URL)
|
||||||
|
- ``repl``: Replacement string for matched URLs, or ``false`` to remove matching results. Rules without ``repl`` are skipped.
|
||||||
|
- ``priority``: Optional priority adjustment ('high' or 'low') for matching results
|
||||||
|
|
||||||
|
Example configuration:
|
||||||
|
|
||||||
|
.. code:: yaml
|
||||||
|
|
||||||
|
url_rewrite:
|
||||||
|
rules:
|
||||||
|
- pattern: '^(?P<url>https://(.*\.)?cnn\.com/.*)$'
|
||||||
|
repl: 'https://reader.example.com/?url=\g<url>'
|
||||||
|
- pattern: '^https?://(?:www\.)?facebook\.com/.*'
|
||||||
|
repl: false
|
||||||
|
- pattern: '^(?P<url>https://news\.example\.com/.*)$'
|
||||||
|
repl: 'https://proxy.example.com/?url=\g<url>'
|
||||||
|
priority: high
|
||||||
|
|
||||||
|
In this example:
|
||||||
|
- CNN articles are redirected through a reader service
|
||||||
|
- Facebook URLs are removed from results
|
||||||
|
- News site URLs are proxied and given higher priority
|
||||||
|
|
||||||
|
The ``pattern`` field supports Python regular expressions with named capture groups.
|
||||||
|
The ``repl`` field can reference captured groups using ``\g<name>`` syntax.
|
||||||
|
Setting ``repl`` to ``false`` will remove matching results entirely.
|
||||||
|
The optional ``priority`` field can be set to 'high' or 'low' to adjust result ranking.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from flask_babel import gettext
|
||||||
|
|
||||||
|
from searx import settings
|
||||||
|
from searx.settings_loader import get_yaml_cfg
|
||||||
|
from searx.plugins import logger
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin metadata (module-level attributes of the plugin).
name = gettext("URL Rewrite")
description = gettext("Rewrite URLs of search results")
default_on = True
preference_section = 'general'
plugin_id = 'url_rewrite'

# Plugin-specific child of the shared plugin logger.
logger = logger.getChild(plugin_id)

# ``or`` fallbacks instead of ``.get(key, default)``: a key that is present but
# empty in settings.yml (e.g. a bare ``url_rewrite:``) is loaded as ``None``,
# and ``None.get(...)`` would raise AttributeError at import time.
config = settings.get(plugin_id) or {}
rules = config.get("rules") or []
|
||||||
|
|
||||||
|
|
||||||
|
def on_result(request, search, result):
    """Apply the configured URL rewrite rules to a single search result.

    Rules are tried in order; the first rule whose ``pattern`` is found in
    ``result['url']`` (``re.search``) decides the outcome and ends the loop:

    - ``repl`` is ``False``: the result is dropped.
    - otherwise the URL is rewritten with ``re.sub(pattern, repl, url)`` and,
      if the rule has a truthy ``priority``, ``result['priority']`` is set.

    Rules without a ``pattern`` or without a ``repl`` are skipped.  Rules with
    an invalid pattern or replacement template are logged and skipped so that
    one broken rule cannot break result processing.

    :param request: unused.
    :param search: unused.
    :param result: mapping describing the result; modified in place.
    :return: ``False`` if the result must be dropped, ``True`` otherwise.
    """
    if not rules:
        logger.debug("No url rewrite rules found in settings")
        return True

    if 'url' not in result:
        logger.debug("No url found in result")
        return True

    url = result['url']
    if not isinstance(url, str):
        # e.g. ``url: None`` -- nothing to match against, keep the result as is.
        logger.debug("No url found in result")
        return True

    for rewrite in rules:
        pattern = rewrite.get('pattern')
        repl = rewrite.get('repl')
        priority = rewrite.get('priority')
        replace_url = rewrite.get('replace_url', True)

        if not pattern:
            continue

        if repl is None:
            logger.debug('No repl found for pattern %s, skipping', pattern)
            continue

        try:
            matched = re.search(pattern, url)
        except (re.error, TypeError) as exc:
            # Malformed or non-string pattern in the settings.
            logger.warning('Invalid pattern %r, skipping rule: %s', pattern, exc)
            continue

        if not matched:
            continue

        if repl is False:
            logger.info('Dropping %s - matched %s', url, pattern)
            return False

        try:
            new_url = re.sub(pattern, repl, url)
        except (re.error, IndexError, TypeError) as exc:
            # re.error: malformed template; IndexError: unknown group name;
            # TypeError: ``repl`` is not a string.
            logger.warning('Invalid repl %r for pattern %r, skipping rule: %s', repl, pattern, exc)
            continue

        result['url'] = new_url

        # NOTE(review): despite its name, ``replace_url`` only controls whether
        # ``parsed_url`` is refreshed; ``url`` itself is always rewritten.
        # Behavior kept as is -- confirm the intent.
        if replace_url:
            result['parsed_url'] = urlparse(new_url)

        if priority:
            result['priority'] = priority
            logger.info('Set priority to %s for %s', priority, new_url)

        logger.info('Rewrote %s to %s using pattern %s', url, new_url, pattern)
        break

    return True
|
Loading…
Add table
Reference in a new issue