Merge branch 'searxng:master' into elasticsearch-custom-query

frob 2024-11-29 02:32:55 +01:00 committed by GitHub
commit 82d1544a6b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
341 changed files with 29669 additions and 12534 deletions


@ -5,6 +5,7 @@
# pylint: disable=use-dict-literal
import json
import html
from urllib.parse import urlencode, quote_plus
import lxml
@ -162,7 +163,7 @@ def stract(query, _lang):
if not resp.ok:
return []
return [suggestion['raw'] for suggestion in resp.json()]
return [html.unescape(suggestion['raw']) for suggestion in resp.json()]
def startpage(query, sxng_locale):
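
The new ``html.unescape()`` wrapper matters because the raw suggestion strings may contain HTML entities. A minimal sketch of the effect (the sample strings are made up):

    import html

    # hypothetical raw suggestions as an API might return them
    raw = ["tom &amp; jerry", "caf&eacute; au lait"]
    print([html.unescape(s) for s in raw])
    # -> ['tom & jerry', 'café au lait']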


@ -14,17 +14,7 @@ import typing
import logging
import pathlib
try:
import tomllib
pytomlpp = None
USE_TOMLLIB = True
except ImportError:
import pytomlpp
tomllib = None
USE_TOMLLIB = False
from ..compat import tomllib
__all__ = ['Config', 'UNSET', 'SchemaIssue']
@ -32,7 +22,7 @@ log = logging.getLogger(__name__)
class FALSE:
"""Class of ``False`` singelton"""
"""Class of ``False`` singleton"""
# pylint: disable=multiple-statements
def __init__(self, msg):
@ -91,7 +81,7 @@ class Config:
return cfg
def __init__(self, cfg_schema: typing.Dict, deprecated: typing.Dict[str, str]):
"""Construtor of class Config.
"""Constructor of class Config.
:param cfg_schema: Schema of the configuration
:param deprecated: dictionary that maps deprecated configuration names to a messages
@ -169,7 +159,7 @@ class Config:
return pathlib.Path(str(val))
def pyobj(self, name, default=UNSET):
"""Get python object refered by full qualiffied name (FQN) in the config
"""Get python object referred by full qualiffied name (FQN) in the config
string."""
fqn = self.get(name, default)
@ -183,19 +173,10 @@ class Config:
def toml_load(file_name):
if USE_TOMLLIB:
# Python >= 3.11
try:
with open(file_name, "rb") as f:
return tomllib.load(f)
except tomllib.TOMLDecodeError as exc:
msg = str(exc).replace('\t', '').replace('\n', ' ')
log.error("%s: %s", file_name, msg)
raise
# fallback to pytomlpp for Python < 3.11
try:
return pytomlpp.load(file_name)
except pytomlpp.DecodeError as exc:
with open(file_name, "rb") as f:
return tomllib.load(f)
except tomllib.TOMLDecodeError as exc:
msg = str(exc).replace('\t', '').replace('\n', ' ')
log.error("%s: %s", file_name, msg)
raise


@ -76,11 +76,11 @@ LONG_MAX = 150
LONG_MAX_SUSPICIOUS = 10
"""Maximum suspicious requests from one IP in the :py:obj:`LONG_WINDOW`"""
API_WONDOW = 3600
API_WINDOW = 3600
"""Time (sec) before sliding window for API requests (format != html) expires."""
API_MAX = 4
"""Maximum requests from one IP in the :py:obj:`API_WONDOW`"""
"""Maximum requests from one IP in the :py:obj:`API_WINDOW`"""
SUSPICIOUS_IP_WINDOW = 3600 * 24 * 30
"""Time (sec) before sliding window for one suspicious IP expires."""
@ -103,7 +103,7 @@ def filter_request(
return None
if request.args.get('format', 'html') != 'html':
c = incr_sliding_window(redis_client, 'ip_limit.API_WONDOW:' + network.compressed, API_WONDOW)
c = incr_sliding_window(redis_client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW)
if c > API_MAX:
return too_many_requests(network, "too many request in API_WINDOW")
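
The renamed constants drive a per-IP sliding window: every non-HTML (API) request increments a counter that expires after ``API_WINDOW`` seconds, and requests are rejected once the count exceeds ``API_MAX``. Below is a standalone sketch of such a counter with redis-py; the real helper is ``searx.redislib.incr_sliding_window``, this version only illustrates the idea:

    import time
    import redis

    API_WINDOW = 3600  # window size in seconds
    API_MAX = 4        # allowed API requests per IP and window

    def incr_sliding_window(client: redis.Redis, key: str, window: int) -> int:
        # record the current request in a per-key sorted set and return
        # how many requests fall inside the last `window` seconds
        now = time.time()
        pipe = client.pipeline()
        pipe.zadd(key, {str(now): now})              # record this request
        pipe.zremrangebyscore(key, 0, now - window)  # drop expired entries
        pipe.zcard(key)                              # count what is left
        pipe.expire(key, window)                     # clean up idle keys
        return pipe.execute()[2]

    # c = incr_sliding_window(client, 'ip_limit.API_WINDOW:' + network.compressed, API_WINDOW)
    # if c > API_MAX: return too_many_requests(...)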


@ -28,7 +28,7 @@ And in the HTML template from flask a stylesheet link is needed (the value of
<link rel="stylesheet"
href="{{ url_for('client_token', token=link_token) }}"
type="text/css" />
type="text/css" >
.. _X-Forwarded-For:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
@ -55,10 +55,10 @@ from ._helpers import (
)
TOKEN_LIVE_TIME = 600
"""Livetime (sec) of limiter's CSS token."""
"""Lifetime (sec) of limiter's CSS token."""
PING_LIVE_TIME = 3600
"""Livetime (sec) of the ping-key from a client (request)"""
"""Lifetime (sec) of the ping-key from a client (request)"""
PING_KEY = 'SearXNG_limiter.ping'
"""Prefix of all ping-keys generated by :py:obj:`get_ping_key`"""

searx/compat.py (new file, 18 lines)

@ -0,0 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Compatibility with older versions"""
# pylint: disable=unused-import
__all__ = [
"tomllib",
]
import sys
# TOML (lib) compatibility
# ------------------------
if sys.version_info >= (3, 11):
import tomllib
else:
import tomli as tomllib
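
With this shim, call sites can drop their own try/except import dance: they import ``tomllib`` from ``searx.compat`` and get the stdlib module on Python >= 3.11 or the API-compatible ``tomli`` backport otherwise. A minimal usage sketch mirroring the simplified ``toml_load()`` above:

    from searx.compat import tomllib  # stdlib tomllib or the tomli backport

    def toml_load(file_name: str) -> dict:
        # tomllib and tomli both require the file opened in binary mode
        with open(file_name, "rb") as f:
            return tomllib.load(f)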

Five file diffs suppressed because they are too large or one or more lines are too long.


@ -5,7 +5,7 @@
],
"ua": "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}",
"versions": [
"126.0",
"125.0"
"132.0",
"131.0"
]
}

File diff suppressed because it is too large.


@ -0,0 +1,229 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Adobe Stock`_ is a service that gives access to millions of royalty-free
assets. Asset types include photos, vectors, illustrations, templates, 3D
assets, videos, motion graphics templates and audio tracks.
.. _Adobe Stock: https://stock.adobe.com/
Configuration
=============
The engine has the following mandatory setting:
- SearXNG's :ref:`engine categories`
- Adobe-Stock's :py:obj:`adobe_order`
- Adobe-Stock's :py:obj:`adobe_content_types`
.. code:: yaml
- name: adobe stock
engine: adobe_stock
shortcut: asi
categories: [images]
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
- name: adobe stock video
engine: adobe_stock
network: adobe stock
shortcut: asi
categories: [videos]
adobe_order: relevance
adobe_content_types: ["video"]
Implementation
==============
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import isodate
if TYPE_CHECKING:
import logging
logger: logging.Logger
about = {
"website": "https://stock.adobe.com/",
"wikidata_id": "Q5977430",
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
categories = []
paging = True
send_accept_language_header = True
results_per_page = 10
base_url = "https://stock.adobe.com"
adobe_order: str = ""
"""Sort order, can be one of:
- ``relevance`` or
- ``featured`` or
- ``creation`` (most recent) or
- ``nb_downloads`` (number of downloads)
"""
ADOBE_VALID_TYPES = ["photo", "illustration", "zip_vector", "video", "template", "3d", "audio", "image"]
adobe_content_types: list = []
"""A list of of content types. The following content types are offered:
- Images: ``image``
- Videos: ``video``
- Templates: ``template``
- 3D: ``3d``
- Audio ``audio``
Additional subcategories:
- Photos: ``photo``
- Illustrations: ``illustration``
- Vectors: ``zip_vector`` (Vectors),
"""
# Do we need support for "free_collection" and "include_stock_enterprise"?
def init(_):
if not categories:
raise ValueError("adobe_stock engine: categories is unset")
# adobe_order
if not adobe_order:
raise ValueError("adobe_stock engine: adobe_order is unset")
if adobe_order not in ["relevance", "featured", "creation", "nb_downloads"]:
raise ValueError(f"unsupported adobe_order: {adobe_order}")
# adobe_content_types
if not adobe_content_types:
raise ValueError("adobe_stock engine: adobe_content_types is unset")
if isinstance(adobe_content_types, list):
for t in adobe_content_types:
if t not in ADOBE_VALID_TYPES:
raise ValueError("adobe_stock engine: adobe_content_types: '%s' is invalid" % t)
else:
raise ValueError(
"adobe_stock engine: adobe_content_types must be a list of strings not %s" % type(adobe_content_types)
)
def request(query, params):
args = {
"k": query,
"limit": results_per_page,
"order": adobe_order,
"search_page": params["pageno"],
"search_type": "pagination",
}
for content_type in ADOBE_VALID_TYPES:
args[f"filters[content_type:{content_type}]"] = 1 if content_type in adobe_content_types else 0
params["url"] = f"{base_url}/de/Ajax/Search?{urlencode(args)}"
# headers required to bypass bot-detection
if params["searxng_locale"] == "all":
params["headers"]["Accept-Language"] = "en-US,en;q=0.5"
return params
def parse_image_item(item):
return {
"template": "images.html",
"url": item["content_url"],
"title": item["title"],
"content": item["asset_type"],
"img_src": item["content_thumb_extra_large_url"],
"thumbnail_src": item["thumbnail_url"],
"resolution": f"{item['content_original_width']}x{item['content_original_height']}",
"img_format": item["format"],
"author": item["author"],
}
def parse_video_item(item):
# in video items, the title is more or less a "content description", we try
# to reduce the length of the title ..
title = item["title"]
content = ""
if "." in title.strip()[:-1]:
content = title
title = title.split(".", 1)[0]
elif "," in title:
content = title
title = title.split(",", 1)[0]
elif len(title) > 50:
content = title
title = ""
for w in content.split(" "):
title += f" {w}"
if len(title) > 50:
title = title.strip() + "\u2026"
break
return {
"template": "videos.html",
"url": item["content_url"],
"title": title,
"content": content,
# https://en.wikipedia.org/wiki/ISO_8601#Durations
"length": isodate.parse_duration(item["time_duration"]),
"publishedDate": datetime.strptime(item["creation_date"], "%Y-%m-%d"),
"thumbnail": item["thumbnail_url"],
"iframe_src": item["video_small_preview_url"],
"metadata": item["asset_type"],
}
def parse_audio_item(item):
audio_data = item["audio_data"]
content = audio_data.get("description") or ""
if audio_data.get("album"):
content = audio_data["album"] + " - " + content
return {
"url": item["content_url"],
"title": item["title"],
"content": content,
# "thumbnail": base_url + item["thumbnail_url"],
"iframe_src": audio_data["preview"]["url"],
"publishedDate": datetime.fromisoformat(audio_data["release_date"]) if audio_data["release_date"] else None,
"length": timedelta(seconds=round(audio_data["duration"] / 1000)) if audio_data["duration"] else None,
"author": item.get("artist_name"),
}
def response(resp):
results = []
json_resp = resp.json()
if isinstance(json_resp["items"], list):
return None
for item in json_resp["items"].values():
if item["asset_type"].lower() in ["image", "premium-image", "illustration", "vector"]:
result = parse_image_item(item)
elif item["asset_type"].lower() == "video":
result = parse_video_item(item)
elif item["asset_type"].lower() == "audio":
result = parse_audio_item(item)
else:
logger.error("no handle for %s --> %s", item["asset_type"], item)
continue
results.append(result)
return results
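
The title-shortening branch in ``parse_video_item()`` rebuilds a long, sentence-less title word by word and cuts it off with an ellipsis once it passes 50 characters. A standalone rendering of that branch with a made-up title:

    title, content = "", "Aerial drone footage of a mountain lake at sunrise with mist over the water"

    for w in content.split(" "):
        title += f" {w}"
        if len(title) > 50:
            title = title.strip() + "\u2026"
            break

    print(title)  # -> Aerial drone footage of a mountain lake at sunrise…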


@ -0,0 +1,83 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Alpine Linux binary packages`_. `Alpine Linux`_ is a Linux-based operation
system designed to be small, simple and secure. Contrary to many other Linux
distributions, it uses musl, BusyBox and OpenRC. Alpine is mostly used on
servers and for Docker images.
.. _Alpine Linux binary packages: https://pkgs.alpinelinux.org
.. _Alpine Linux: https://www.alpinelinux.org
"""
import re
from urllib.parse import urlencode
from lxml import html
from dateutil import parser
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://www.alpinelinux.org',
'wikidata_id': 'Q4033826',
'use_official_api': False,
'official_api_documentation': None,
'require_api_key': False,
'results': 'HTML',
}
paging = True
categories = ['packages', 'it']
base_url = "https://pkgs.alpinelinux.org"
alpine_arch = 'x86_64'
"""Kernel architecture: ``x86_64``, ``x86``, ``aarch64``, ``armhf``,
``ppc64le``, ``s390x``, ``armv7`` or ``riscv64``"""
ARCH_RE = re.compile("x86_64|x86|aarch64|armhf|ppc64le|s390x|armv7|riscv64")
"""Regular expression to match supported architectures in the query string."""
def request(query, params):
query_arch = ARCH_RE.search(query)
if query_arch:
query_arch = query_arch.group(0)
query = query.replace(query_arch, '').strip()
args = {
# use wildcards to match more than just packages with the exact same
# name as the query
'name': f"*{query}*",
'page': params['pageno'],
'arch': query_arch or alpine_arch,
}
params['url'] = f"{base_url}/packages?{urlencode(args)}"
return params
def response(resp):
results = []
doc = html.fromstring(resp.text)
for result in eval_xpath_list(doc, "//table/tbody/tr"):
if len(result.xpath("./td")) < 9:
# skip invalid entries in the result table
# e.g. the "No item found..." message
continue
results.append(
{
'template': 'packages.html',
'url': base_url + extract_text(eval_xpath(result, './td[contains(@class, "package")]/a/@href')),
'title': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
'package_name': extract_text(eval_xpath(result, './td[contains(@class, "package")]')),
'publishedDate': parser.parse(extract_text(eval_xpath(result, './td[contains(@class, "bdate")]'))),
'version': extract_text(eval_xpath(result, './td[contains(@class, "version")]')),
'homepage': extract_text(eval_xpath(result, './td[contains(@class, "url")]/a/@href')),
'maintainer': extract_text(eval_xpath(result, './td[contains(@class, "maintainer")]')),
'license_name': extract_text(eval_xpath(result, './td[contains(@class, "license")]')),
'tags': [extract_text(eval_xpath(result, './td[contains(@class, "repo")]'))],
}
)
return results
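
``request()`` above pulls an optional architecture token out of the query and falls back to the configured ``alpine_arch``. For example (reusing the module's ``ARCH_RE``):

    import re

    ARCH_RE = re.compile("x86_64|x86|aarch64|armhf|ppc64le|s390x|armv7|riscv64")

    query = "zsh aarch64"
    match = ARCH_RE.search(query)
    arch = match.group(0) if match else "x86_64"   # -> "aarch64"
    name = f"*{query.replace(arch, '').strip()}*"  # -> "*zsh*"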


@ -34,10 +34,10 @@ Implementations
"""
from typing import List, Dict, Any, Optional
from urllib.parse import quote
from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.utils import extract_text, eval_xpath, eval_xpath_getindex, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
@ -53,7 +53,7 @@ about: Dict[str, Any] = {
# engine dependent config
categories: List[str] = ["files"]
paging: bool = False
paging: bool = True
# search-url
base_url: str = "https://annas-archive.org"
@ -99,9 +99,18 @@ def init(engine_settings=None): # pylint: disable=unused-argument
def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
q = quote(query)
lang = traits.get_language(params["language"], traits.all_locale) # type: ignore
params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}"
args = {
'lang': lang,
'content': aa_content,
'ext': aa_ext,
'sort': aa_sort,
'q': query,
'page': params['pageno'],
}
# filter out None and empty values
filtered_args = dict((k, v) for k, v in args.items() if v)
params["url"] = f"{base_url}/search?{urlencode(filtered_args)}"
return params
@ -128,12 +137,12 @@ def response(resp) -> List[Dict[str, Optional[str]]]:
def _get_result(item):
return {
'template': 'paper.html',
'url': base_url + item.xpath('./@href')[0],
'url': base_url + extract_text(eval_xpath_getindex(item, './@href', 0)),
'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
'thumbnail': item.xpath('.//img/@src')[0],
'thumbnail': extract_text(eval_xpath_getindex(item, './/img/@src', 0, default=None), allow_none=True),
}
@ -184,3 +193,8 @@ def fetch_traits(engine_traits: EngineTraits):
for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
engine_traits.custom['sort'].append(x.get("value"))
# for better diff; sort the persistence of these traits
engine_traits.custom['content'].sort()
engine_traits.custom['ext'].sort()
engine_traits.custom['sort'].sort()
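
Building the URL from a dict and dropping falsy values keeps unset filters out of the query string entirely, where the old f-string always emitted empty parameters. A short sketch:

    from urllib.parse import urlencode

    args = {'lang': None, 'content': '', 'ext': 'pdf', 'sort': '', 'q': 'the raven', 'page': 2}
    filtered_args = dict((k, v) for k, v in args.items() if v)
    print(urlencode(filtered_args))  # -> ext=pdf&q=the+raven&page=2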


@ -31,7 +31,7 @@ paging = True
number_of_results = 10
# shortcuts for advanced search
shorcut_dict = {
shortcut_dict = {
# user-friendly keywords
'format:': 'dcformat:',
'author:': 'dccreator:',
@ -55,7 +55,7 @@ shorcut_dict = {
def request(query, params):
# replace shortcuts with API advanced search keywords
for key, val in shorcut_dict.items():
for key, val in shortcut_dict.items():
query = re.sub(key, val, query)
# basic search
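
The renamed ``shortcut_dict`` maps user-friendly prefixes onto BASE's Dublin Core query fields before the query is sent, e.g.:

    import re

    shortcut_dict = {'format:': 'dcformat:', 'author:': 'dccreator:'}

    query = 'author:doyle format:pdf'
    for key, val in shortcut_dict.items():
        query = re.sub(key, val, query)
    print(query)  # -> dccreator:doyle dcformat:pdf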


@ -9,6 +9,8 @@ import string
from urllib.parse import urlencode
from datetime import datetime, timedelta
from searx import utils
# Engine metadata
about = {
"website": "https://www.bilibili.com",
@ -56,6 +58,8 @@ def request(query, params):
# Format the video duration
def format_duration(duration):
if not ":" in duration:
return None
minutes, seconds = map(int, duration.split(":"))
total_seconds = minutes * 60 + seconds
@ -70,7 +74,7 @@ def response(resp):
results = []
for item in search_res.get("data", {}).get("result", []):
title = item["title"]
title = utils.html_to_text(item["title"])
url = item["arcurl"]
thumbnail = item["pic"]
description = item["description"]


On the `preference page`_ Bing offers a lot of languages and regions (see section
LANGUAGE and COUNTRY/REGION). The Language is the language of the UI, we need
in SearXNG to get the translations of data such as *"published last week"*.
There is a description of the offical search-APIs_, unfortunately this is not
There is a description of the official search-APIs_, unfortunately this is not
the API we can use or that bing itself would use. You can look up some things
in the API to get a better picture of bing, but the value specifications like
the market codes are usually outdated or at least no longer used by bing itself.
@ -91,7 +91,7 @@ def request(query, params):
page = params.get('pageno', 1)
query_params = {
'q': query,
# if arg 'pq' is missed, somtimes on page 4 we get results from page 1,
# if arg 'pq' is missed, sometimes on page 4 we get results from page 1,
# don't ask why it is only sometimes / its M$ and they have never been
# deterministic ;)
'pq': query,
@ -177,7 +177,7 @@ def response(resp):
logger.debug('result error :\n%s', e)
if result_len and _page_offset(resp.search_params.get("pageno", 0)) > result_len:
# Avoid reading more results than avalaible.
# Avoid reading more results than available.
# For example, if there is 100 results from some search and we try to get results from 120 to 130,
# Bing will send back the results from 0 to 10 and no error.
# If we compare results count with the first parameter of the request we can avoid this "invalid" results.


@ -99,7 +99,7 @@ def response(resp):
'url': metadata['purl'],
'thumbnail_src': metadata['turl'],
'img_src': metadata['murl'],
'content': metadata['desc'],
'content': metadata.get('desc'),
'title': title,
'source': source,
'resolution': img_format[0],


@ -123,7 +123,9 @@ def response(resp):
thumbnail = None
imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
if imagelink is not None:
thumbnail = 'https://www.bing.com/' + imagelink.attrib.get('src')
thumbnail = imagelink.attrib.get('src')
if not thumbnail.startswith("https://www.bing.com"):
thumbnail = 'https://www.bing.com/' + thumbnail
results.append(
{


@ -123,7 +123,6 @@ from typing import Any, TYPE_CHECKING
from urllib.parse import (
urlencode,
urlparse,
parse_qs,
)
from dateutil import parser
@ -137,6 +136,7 @@ from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
js_variable_to_python,
get_embeded_stream_url,
)
from searx.enginelib.traits import EngineTraits
@ -311,7 +311,7 @@ def _parse_search(resp):
# In my tests a video tag in the WEB search was most often not a
# video, except the ones from youtube ..
iframe_src = _get_iframe_src(url)
iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
item['template'] = 'videos.html'
@ -328,15 +328,6 @@ def _parse_search(resp):
return result_list
def _get_iframe_src(url):
parsed_url = urlparse(url)
if parsed_url.path == '/watch' and parsed_url.query:
video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
if video_id:
return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
return None
def _parse_news(json_resp):
result_list = []
@ -392,7 +383,7 @@ def _parse_videos(json_resp):
if result['thumbnail'] is not None:
item['thumbnail'] = result['thumbnail']['src']
iframe_src = _get_iframe_src(url)
iframe_src = get_embeded_stream_url(url)
if iframe_src:
item['iframe_src'] = iframe_src
@ -426,14 +417,15 @@ def fetch_traits(engine_traits: EngineTraits):
print("ERROR: response from Brave is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for option in dom.xpath('//div[@id="language-select"]//option'):
for option in dom.xpath('//section//option[@value="en-us"]/../option'):
ui_lang = option.get('value')
try:
if '-' in ui_lang:
l = babel.Locale.parse(ui_lang, sep='-')
if l.territory:
sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
else:
sxng_tag = language_tag(babel.Locale.parse(ui_lang))
sxng_tag = language_tag(babel.Locale.parse(ui_lang, sep='-'))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
@ -453,7 +445,7 @@ def fetch_traits(engine_traits: EngineTraits):
if not resp.ok: # type: ignore
print("ERROR: response from Brave is not OK.")
country_js = resp.text[resp.text.index("options:{all") + len('options:') :]
country_js = resp.text[resp.text.index("options:{all") + len('options:') :] # type: ignore
country_js = country_js[: country_js.index("},k={default")]
country_tags = js_variable_to_python(country_js)
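
``get_embeded_stream_url()`` from ``searx.utils`` replaces the local ``_get_iframe_src()`` helper removed above. For YouTube watch URLs the removed helper derived a privacy-friendly embed URL as shown below; presumably the shared helper covers this case among others:

    from urllib.parse import urlparse, parse_qs

    def youtube_embed_url(url):
        # logic of the removed _get_iframe_src(): map .../watch?v=<id>
        # to the youtube-nocookie embed player
        parsed_url = urlparse(url)
        if parsed_url.path == '/watch' and parsed_url.query:
            video_id = parse_qs(parsed_url.query).get('v', [])
            if video_id:
                return 'https://www.youtube-nocookie.com/embed/' + video_id[0]
        return None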


@ -54,7 +54,6 @@ def response(resp):
excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]
content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)
# it is better to emit <br/> instead of |, but html tags are verboten
content = content.strip().replace('\n', ' | ')
content = ' '.join(content.split())


@ -0,0 +1,68 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Cloudflare AI engine"""
from json import loads, dumps
from searx.exceptions import SearxEngineAPIException
about = {
"website": 'https://ai.cloudflare.com',
"wikidata_id": None,
"official_api_documentation": 'https://developers.cloudflare.com/workers-ai',
"use_official_api": True,
"require_api_key": True,
"results": 'JSON',
}
cf_account_id = ''
cf_ai_api = ''
cf_ai_gateway = ''
cf_ai_model = ''
cf_ai_model_display_name = 'Cloudflare AI'
# Assistant messages hint to the AI about the desired output format. Not all models support this role.
cf_ai_model_assistant = 'Keep your answers as short and effective as possible.'
# System messages define the AI's personality. You can use them to set rules and how you expect the AI to behave.
cf_ai_model_system = 'You are a self-aware language model who is honest and direct about any question from the user.'
def request(query, params):
params['query'] = query
params['url'] = f'https://gateway.ai.cloudflare.com/v1/{cf_account_id}/{cf_ai_gateway}/workers-ai/{cf_ai_model}'
params['method'] = 'POST'
params['headers']['Authorization'] = f'Bearer {cf_ai_api}'
params['headers']['Content-Type'] = 'application/json'
params['data'] = dumps(
{
'messages': [
{'role': 'assistant', 'content': cf_ai_model_assistant},
{'role': 'system', 'content': cf_ai_model_system},
{'role': 'user', 'content': params['query']},
]
}
).encode('utf-8')
return params
def response(resp):
results = []
json = loads(resp.text)
if 'error' in json:
raise SearxEngineAPIException('Cloudflare AI error: ' + json['error'])
if 'result' in json:
results.append(
{
'content': json['result']['response'],
'infobox': cf_ai_model_display_name,
}
)
return results


@ -10,6 +10,8 @@ engine offers some additional settings:
- :py:obj:`api_order`
- :py:obj:`search_endpoint`
- :py:obj:`show_avatar`
- :py:obj:`api_key`
- :py:obj:`api_username`
Example
=======
@ -27,6 +29,20 @@ for the ``paddling.com`` forum:
categories: ['social media', 'sports']
show_avatar: true
If the forum is private, you need to add an API key and username for the search:
.. code:: yaml
- name: paddling
engine: discourse
shortcut: paddle
base_url: 'https://forums.paddling.com/'
api_order: views
categories: ['social media', 'sports']
show_avatar: true
api_key: '<KEY>'
api_username: 'system'
Implementations
===============
@ -65,6 +81,12 @@ api_order = 'likes'
show_avatar = False
"""Show avatar of the user who send the post."""
api_key = ''
"""API key of the Discourse forum."""
api_username = ''
"""API username of the Discourse forum."""
paging = True
time_range_support = True
@ -98,6 +120,12 @@ def request(query, params):
'X-Requested-With': 'XMLHttpRequest',
}
if api_key != '':
params['headers']['Api-Key'] = api_key
if api_username != '':
params['headers']['Api-Username'] = api_username
return params


@ -1,12 +1,14 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo Lite
~~~~~~~~~~~~~~~
DuckDuckGo WEB
~~~~~~~~~~~~~~
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
from urllib.parse import urlencode, quote_plus
import json
import babel
import lxml.html
@ -18,13 +20,13 @@ from searx import (
)
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
extr,
extract_text,
)
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx import redisdb
from searx.enginelib.traits import EngineTraits
from searx.utils import extr
from searx.exceptions import SearxEngineCaptchaException
if TYPE_CHECKING:
import logging
@ -42,7 +44,7 @@ about = {
}
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
"""DuckDuckGo-Lite tries to guess user's preferred language from the HTTP
``Accept-Language``. Optionally the user can select a region filter (but not a
language).
"""
@ -53,47 +55,37 @@ paging = True
time_range_support = True
safesearch = True # user can't select but the results are filtered
url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l'
url = "https://html.duckduckgo.com/html"
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
__CACHE = []
def cache_vqd(query, value):
def _cache_key(query: str, region: str):
return 'SearXNG_ddg_web_vqd' + redislib.secret_hash(f"{query}//{region}")
def cache_vqd(query: str, region: str, value: str):
"""Caches a ``vqd`` value from a query."""
c = redisdb.client()
if c:
logger.debug("cache vqd value: %s", value)
key = 'SearXNG_ddg_web_vqd' + redislib.secret_hash(query)
c.set(key, value, ex=600)
logger.debug("VALKEY cache vqd value: %s (%s)", value, region)
c.set(_cache_key(query, region), value, ex=600)
else:
logger.debug("MEM cache vqd value: %s (%s)", value, region)
if len(__CACHE) > 100: # cache vqd from last 100 queries
__CACHE.pop(0)
__CACHE.append((_cache_key(query, region), value))
def get_vqd(query):
"""Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
response.
def get_vqd(query: str, region: str, force_request: bool = False):
"""Returns the ``vqd`` that fits to the *query*.
.. hint::
If an empty string is returned there are no results for the ``query`` and
therefore no ``vqd`` value.
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
(such as extremely long search terms that are often sent by bots), no ``vqd``
value can be determined.
If SearXNG cannot determine a ``vqd`` value, then no request should go out
to DDG:
A request with a wrong ``vqd`` value leads to DDG temporarily putting
SearXNG's IP on a block list.
Requests from IPs in this block list run into timeouts.
Not sure, but it seems the block list is a sliding window: to get my IP rid
from the bot list I had to cool down my IP for 1h (send no requests from
that IP to DDG).
:param query: The query term
:param region: DDG's region code
:param force_request: force a request to get a vqd value from DDG
TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
by all requests to DDG:
@ -104,29 +96,47 @@ def get_vqd(query):
- DuckDuckGo Videos: ``https://duckduckgo.com/v.js??q=...&vqd=...``
- DuckDuckGo News: ``https://duckduckgo.com/news.js??q=...&vqd=...``
DDG's bot detection is sensitive to the ``vqd`` value. For some search terms
(such as extremely long search terms that are often sent by bots), no ``vqd``
value can be determined.
If SearXNG cannot determine a ``vqd`` value, then no request should go out
to DDG.
.. attention::
A request with a wrong ``vqd`` value leads to DDG temporarily putting
SearXNG's IP on a block list.
Requests from IPs in this block list run into timeouts. Not sure, but it
seems the block list is a sliding window: to get my IP rid from the bot list
I had to cool down my IP for 1h (send no requests from that IP to DDG).
"""
value = None
key = _cache_key(query, region)
c = redisdb.client()
if c:
key = 'SearXNG_ddg_web_vqd' + redislib.secret_hash(query)
value = c.get(key)
if value or value == b'':
value = value.decode('utf-8')
logger.debug("re-use cached vqd value: %s", value)
value = value.decode('utf-8') # type: ignore
logger.debug("re-use CACHED vqd value: %s", value)
return value
query_url = 'https://duckduckgo.com/?' + urlencode({'q': query})
res = get(query_url)
doc = lxml.html.fromstring(res.text)
for script in doc.xpath("//script[@type='text/javascript']"):
script = script.text
if 'vqd="' in script:
value = extr(script, 'vqd="', '"')
break
logger.debug("new vqd value: '%s'", value)
if value is not None:
cache_vqd(query, value)
return value
for k, value in __CACHE:
if k == key:
logger.debug("MEM re-use CACHED vqd value: %s", value)
return value
if force_request:
resp = get(f'https://duckduckgo.com/?q={quote_plus(query)}')
if resp.status_code == 200: # type: ignore
value = extr(resp.text, 'vqd="', '"') # type: ignore
if value:
logger.debug("vqd value from DDG request: %s", value)
cache_vqd(query, region, value)
return value
return None
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
@ -154,9 +164,10 @@ def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
.. hint::
`DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
selection to the user, only a region can be selected by the user
(``eng_region`` from the example above). DDG-lite stores the selected
`DDG-lite <https://lite.duckduckgo.com/lite>`__ and the *no Javascript*
page https://html.duckduckgo.com/html do not offer a language selection
to the user, only a region can be selected by the user (``eng_region``
from the example above). DDG-lite and *no Javascript* store the selected
region in a cookie::
params['cookies']['kl'] = eng_region # 'ar-es'
@ -240,10 +251,27 @@ def request(query, params):
query = quote_ddg_bangs(query)
# request needs a vqd argument
vqd = get_vqd(query)
if len(query) >= 500:
# DDG does not accept queries with more than 499 chars
params["url"] = None
return
# Advanced search syntax ends in CAPTCHA
# https://duckduckgo.com/duckduckgo-help-pages/results/syntax/
query = " ".join(
[
x.removeprefix("site:").removeprefix("intitle:").removeprefix("inurl:").removeprefix("filetype:")
for x in query.split()
]
)
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
if eng_region == "wt-wt":
# https://html.duckduckgo.com/html sets an empty value for "all".
eng_region = ""
params['data']['kl'] = eng_region
params['cookies']['kl'] = eng_region
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
params['url'] = url
@ -251,45 +279,79 @@ def request(query, params):
params['data']['q'] = query
# The API is not documented, so we do some reverse engineering and emulate
# what https://lite.duckduckgo.com/lite/ does when you press "next Page"
# link again and again ..
# what https://html.duckduckgo.com/html does when you press "next Page" link
# again and again ..
params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
params['data']['vqd'] = vqd
# initial page does not have an offset
params['headers']['Sec-Fetch-Dest'] = "document"
params['headers']['Sec-Fetch-Mode'] = "navigate" # at least this one is used by ddg's bot detection
params['headers']['Sec-Fetch-Site'] = "same-origin"
params['headers']['Sec-Fetch-User'] = "?1"
# Form of the initial search page does have empty values in the form
if params['pageno'] == 1:
params['data']['b'] = ""
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
params['cookies']['df'] = time_range_dict[params['time_range']]
if params['pageno'] == 2:
# second page does have an offset of 20
offset = (params['pageno'] - 1) * 20
params['data']['s'] = offset
params['data']['dc'] = offset + 1
elif params['pageno'] > 2:
# third and following pages do have an offset of 20 + n*50
offset = 20 + (params['pageno'] - 2) * 50
params['data']['s'] = offset
params['data']['dc'] = offset + 1
# initial page does not have additional data in the input form
if params['pageno'] > 1:
# initial page does not have these additional data in the input form
params['data']['o'] = form_data.get('o', 'json')
params['data']['api'] = form_data.get('api', 'd.js')
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
params['headers']['Referer'] = 'https://lite.duckduckgo.com/'
params['headers']['Referer'] = url
params['data']['kl'] = eng_region
params['cookies']['kl'] = eng_region
vqd = get_vqd(query, eng_region, force_request=False)
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
params['data']['df'] = time_range_dict[params['time_range']]
params['cookies']['df'] = time_range_dict[params['time_range']]
# Certain conditions must be met in order to call up one of the
# following pages ...
if vqd:
params['data']['vqd'] = vqd # follow up pages / requests needs a vqd argument
else:
# Don't try to call follow up pages without a vqd value. DDG
# recognizes this as a request from a bot. This lowers the
# reputation of the SearXNG IP and DDG starts to activate CAPTCHAs.
params["url"] = None
return
if params['searxng_locale'].startswith("zh"):
# Some locales (at least China) do not have a "next page" button and ddg
# will return a HTTP/2 403 Forbidden for a request of such a page.
params["url"] = None
return
logger.debug("param data: %s", params['data'])
logger.debug("param cookies: %s", params['cookies'])
return params
def is_ddg_captcha(dom):
"""In case of CAPTCHA ddg response its own *not a Robot* dialog and is not
redirected to a CAPTCHA page."""
return bool(eval_xpath(dom, "//form[@id='challenge-form']"))
def response(resp):
@ -300,38 +362,40 @@ def response(resp):
results = []
doc = lxml.html.fromstring(resp.text)
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
if is_ddg_captcha(doc):
# setting suspend time to zero is OK --> ddg does not block the IP
raise SearxEngineCaptchaException(suspended_time=0, message=f"CAPTCHA ({resp.search_params['data'].get('kl')})")
if len(result_table) == 2:
# some locales (at least China) do not have a "next page" button and
# the layout of the HTML tables is different.
result_table = result_table[1]
elif not len(result_table) >= 3:
# no more results
return []
else:
result_table = result_table[2]
# update form data from response
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
if len(form):
form = eval_xpath(doc, '//input[@name="vqd"]/..')
if len(form):
# some locales (at least China) do not have a "next page" button
form = form[0]
form_vqd = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
form = form[0]
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
logger.debug('form_data: %s', form_data)
cache_vqd(resp.search_params['data']['q'], resp.search_params['data']['kl'], form_vqd)
tr_rows = eval_xpath(result_table, './/tr')
# In the last <tr> is the form of the 'previous/next page' links
tr_rows = tr_rows[:-1]
# just select "web-result" and ignore results of class "result--ad result--ad--small"
for div_result in eval_xpath(doc, '//div[@id="links"]/div[contains(@class, "web-result")]'):
len_tr_rows = len(tr_rows)
offset = 0
item = {}
title = eval_xpath(div_result, './/h2/a')
if not title:
# this is the "No results." item in the result list
continue
item["title"] = extract_text(title)
item["url"] = eval_xpath(div_result, './/h2/a/@href')[0]
item["content"] = extract_text(eval_xpath(div_result, './/a[contains(@class, "result__snippet")]')[0])
zero_click_info_xpath = '//html/body/form/div/table[2]/tr[2]/td/text()'
zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip()
results.append(item)
if zero_click and "Your IP address is" not in zero_click:
zero_click_info_xpath = '//div[@id="zero_click_abstract"]'
zero_click = extract_text(eval_xpath(doc, zero_click_info_xpath)).strip() # type: ignore
if zero_click and (
"Your IP address is" not in zero_click
and "Your user agent:" not in zero_click
and "URL Decoded:" not in zero_click
):
current_query = resp.search_params["data"].get("q")
results.append(
@ -341,33 +405,6 @@ def response(resp):
}
)
while len_tr_rows >= offset + 4:
# assemble table rows we need to scrap
tr_title = tr_rows[offset]
tr_content = tr_rows[offset + 1]
offset += 4
# ignore sponsored Adds <tr class="result-sponsored">
if tr_content.get('class') == 'result-sponsored':
continue
a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
if a_tag is None:
continue
td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
if td_content is None:
continue
results.append(
{
'title': a_tag.text_content(),
'content': extract_text(td_content),
'url': a_tag.get('href'),
}
)
return results
@ -375,7 +412,7 @@ def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages & regions from DuckDuckGo.
SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no
DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
sense in a SearXNG request since SearXNG's ``all`` will not add a
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
@ -405,7 +442,7 @@ def fetch_traits(engine_traits: EngineTraits):
if not resp.ok: # type: ignore
print("ERROR: response from DuckDuckGo is not OK.")
js_code = extr(resp.text, 'regions:', ',snippetLengths')
js_code = extr(resp.text, 'regions:', ',snippetLengths') # type: ignore
regions = json.loads(js_code)
for eng_tag, name in regions.items():
@ -439,7 +476,7 @@ def fetch_traits(engine_traits: EngineTraits):
engine_traits.custom['lang_region'] = {}
js_code = extr(resp.text, 'languages:', ',regions')
js_code = extr(resp.text, 'languages:', ',regions') # type: ignore
languages = js_variable_to_python(js_code)
for eng_lang, name in languages.items():
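
The ``vqd`` value is now cached per (query, region) pair: the cache key is a salted hash of ``"{query}//{region}"`` and the value itself is scraped out of DDG's HTML with ``extr()``. A condensed sketch of both steps; the HTML snippet is fake and plain ``sha256`` stands in for the salted ``redislib.secret_hash()``:

    import hashlib

    def extr(txt: str, begin: str, end: str) -> str:
        # minimal stand-in for searx.utils.extr()
        b = txt.index(begin) + len(begin)
        return txt[b:txt.index(end, b)]

    html_text = 'DDG.deep.initialize(...); vqd="4-1234567890";'  # fake response snippet
    vqd = extr(html_text, 'vqd="', '"')  # -> '4-1234567890'

    def _cache_key(query: str, region: str) -> str:
        return 'SearXNG_ddg_web_vqd' + hashlib.sha256(f"{query}//{region}".encode()).hexdigest()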


@ -4,15 +4,15 @@ DuckDuckGo Extra (images, videos, news)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from searx.utils import get_embeded_stream_url
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import (
get_ddg_lang,
get_vqd,
)
from searx.engines.duckduckgo import get_ddg_lang, get_vqd
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
@ -47,15 +47,16 @@ search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
def request(query, params):
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
# request needs a vqd argument
vqd = get_vqd(query)
vqd = get_vqd(query, eng_region, force_request=True)
if not vqd:
# some search terms do not have results and therefore no vqd value
params['url'] = None
return params
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
args = {
@ -85,6 +86,12 @@ def request(query, params):
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
# sending these two headers prevents rate limiting for the query
params['headers'] = {
'Referer': 'https://duckduckgo.com/',
'X-Requested-With': 'XMLHttpRequest',
}
return params
@ -108,7 +115,7 @@ def _video_result(result):
'title': result['title'],
'content': result['description'],
'thumbnail': result['images'].get('small') or result['images'].get('medium'),
'iframe_src': result['embed_url'],
'iframe_src': get_embeded_stream_url(result['content']),
'source': result['provider'],
'length': result['duration'],
'metadata': result.get('uploader'),


@ -35,8 +35,8 @@ def response(resp):
results = []
for item in search_res:
img = 'https://findthatmeme.us-southeast-1.linodeobjects.com/' + item['image_path']
thumb = 'https://findthatmeme.us-southeast-1.linodeobjects.com/thumb/' + item.get('thumbnail', '')
img = 'https://s3.thehackerblog.com/findthatmeme/' + item['image_path']
thumb = 'https://s3.thehackerblog.com/findthatmeme/thumb/' + item.get('thumbnail', '')
date = datetime.strptime(item["updated_at"].split("T")[0], "%Y-%m-%d")
formatted_date = datetime.utcfromtimestamp(date.timestamp())

searx/engines/geizhals.py (new file, 97 lines)

@ -0,0 +1,97 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Geizhals is a German website to compare the price of a product on the
most common German shopping sites and find the lowest price.
The sorting of the search results can be influenced by the following additions
to the search term:
``asc`` or ``price``
To sort by price in ascending order.
``desc``
To sort by price in descending order.
"""
import re
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, eval_xpath_list, extract_text
about = {
'website': 'https://geizhals.de',
'wikidata_id': 'Q15977657',
'use_official_api': False,
'official_api_documentation': None,
'require_api_key': False,
'results': 'HTML',
'language': 'de',
}
paging = True
categories = ['shopping']
base_url = "https://geizhals.de"
sort_order = 'relevance'
SORT_RE = re.compile(r"sort:(\w+)")
sort_order_map = {
'relevance': None,
'price': 'p',
'asc': 'p',
'desc': '-p',
}
def request(query, params):
sort = None
sort_order_path = SORT_RE.search(query)
if sort_order_path:
sort = sort_order_map.get(sort_order_path.group(1))
query = SORT_RE.sub("", query)
logger.debug(query)
args = {
'fs': query,
'pg': params['pageno'],
'toggle_all': 1, # load item specs
'sort': sort,
}
params['url'] = f"{base_url}/?{urlencode(args)}"
return params
def response(resp):
results = []
dom = html.fromstring(resp.text)
for result in eval_xpath_list(dom, "//article[contains(@class, 'listview__item')]"):
content = []
for spec in eval_xpath_list(result, ".//div[contains(@class, 'specs-grid__item')]"):
content.append(f"{extract_text(eval_xpath(spec, './dt'))}: {extract_text(eval_xpath(spec, './dd'))}")
metadata = [
extract_text(eval_xpath(result, ".//div[contains(@class, 'stars-rating-label')]")),
extract_text(eval_xpath(result, ".//div[contains(@class, 'listview__offercount')]")),
]
item = {
'template': 'products.html',
'url': (
base_url + "/" + extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__name-link')]/@href"))
),
'title': extract_text(eval_xpath(result, ".//h3[contains(@class, 'listview__name')]")),
'content': ' | '.join(content),
'thumbnail': extract_text(eval_xpath(result, ".//img[contains(@class, 'listview__image')]/@src")),
'metadata': ', '.join(item for item in metadata if item),
}
best_price = extract_text(eval_xpath(result, ".//a[contains(@class, 'listview__price-link')]")).split(" ")
if len(best_price) > 1:
item["price"] = f"Bestes Angebot: {best_price[1]}"
results.append(item)
return results
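
The sort additions from the docstring are parsed out of the query with ``SORT_RE`` and mapped onto Geizhals' ``sort`` parameter, e.g.:

    import re

    SORT_RE = re.compile(r"sort:(\w+)")
    sort_order_map = {'relevance': None, 'price': 'p', 'asc': 'p', 'desc': '-p'}

    query = "curved monitor sort:desc"
    match = SORT_RE.search(query)
    sort = sort_order_map.get(match.group(1)) if match else None  # -> '-p'
    query = SORT_RE.sub("", query).strip()                        # -> 'curved monitor'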


@ -1,125 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Gentoo Wiki
"""
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text
# about
about = {
"website": 'https://wiki.gentoo.org/',
"wikidata_id": 'Q1050637',
"official_api_documentation": 'https://wiki.gentoo.org/api.php',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine dependent config
categories = ['it', 'software wikis']
paging = True
base_url = 'https://wiki.gentoo.org'
# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'
xpath_content = './/div[@class="searchresult"]'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
if locale.find('-') >= 0:
locale = locale.split('-')[0]
return locale
# wikis for some languages were moved off from the main site, we need to make
# requests to correct URLs to be able to get results in those languages
lang_urls = {
'en': {'base': 'https://wiki.gentoo.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}'},
'others': {
'base': 'https://wiki.gentoo.org',
'search': '/index.php?title=Special:Search&offset={offset}&{query}\
&profile=translation&languagefilter={language}',
},
}
# get base & search URLs for selected language
def get_lang_urls(language):
if language != 'en':
return lang_urls['others']
return lang_urls['en']
# Language names to build search requests for
# those languages which are hosted on the main site.
main_langs = {
'ar': 'العربية',
'bg': 'Български',
'cs': 'Česky',
'da': 'Dansk',
'el': 'Ελληνικά',
'es': 'Español',
'he': 'עברית',
'hr': 'Hrvatski',
'hu': 'Magyar',
'it': 'Italiano',
'ko': '한국어',
'lt': 'Lietuviškai',
'nl': 'Nederlands',
'pl': 'Polski',
'pt': 'Português',
'ru': 'Русский',
'sl': 'Slovenský',
'th': 'ไทย',
'uk': 'Українська',
'zh': '简体中文',
}
# do search-request
def request(query, params):
# translate the locale (e.g. 'en-US') to language code ('en')
language = locale_to_lang_code(params['language'])
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
query += ' (' + main_langs[language] + ')'
# prepare the request parameters
query = urlencode({'search': query})
offset = (params['pageno'] - 1) * 20
# get request URLs for our language of choice
urls = get_lang_urls(language)
search_url = urls['base'] + urls['search']
params['url'] = search_url.format(query=query, offset=offset, language=language)
return params
# get response from search-request
def response(resp):
# get the base URL for the language in which request was made
language = locale_to_lang_code(resp.search_params['language'])
url = get_lang_urls(language)['base']
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath(xpath_results):
link = result.xpath(xpath_link)[0]
href = urljoin(url, link.attrib.get('href'))
title = extract_text(link)
content = extract_text(result.xpath(xpath_content))
results.append({'url': href, 'title': title, 'content': content})
return results


@ -1,7 +1,8 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine to search in collaborative software platforms based on Gitea_.
"""Engine to search in collaborative software platforms based on Gitea_ or Forgejo_.
.. _Gitea: https://about.gitea.com/
.. _Forgejo: https://forgejo.org/
Configuration
=============
@ -23,6 +24,11 @@ Optional settings are:
base_url: https://gitea.com
shortcut: gitea
- name: forgejo.com
engine: gitea
base_url: https://code.forgejo.org
shortcut: forgejo
If you would like to use additional instances, just configure new engines in the
:ref:`settings <settings engine>` and set the ``base_url``.
@ -95,13 +101,14 @@ def response(resp):
'url': item.get('html_url'),
'title': item.get('full_name'),
'content': ' / '.join(content),
'img_src': item.get('owner', {}).get('avatar_url'),
# Use Repository Avatar and fall back to Owner Avatar if not set.
'thumbnail': item.get('avatar_url') or item.get('owner', {}).get('avatar_url'),
'package_name': item.get('name'),
'maintainer': item.get('owner', {}).get('login'),
'maintainer': item.get('owner', {}).get('username'),
'publishedDate': parser.parse(item.get("updated_at") or item.get("created_at")),
'tags': item.get('topics', []),
'popularity': item.get('stargazers_count'),
'homepage': item.get('homepage'),
'popularity': item.get('stars_count'),
'homepage': item.get('website'),
'source_code_url': item.get('clone_url'),
}
)

searx/engines/gitlab.py (new file, 95 lines)

@ -0,0 +1,95 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Engine to search in collaborative software platforms based on GitLab_ with
the `GitLab REST API`_.
.. _GitLab: https://about.gitlab.com/install/
.. _GitLab REST API: https://docs.gitlab.com/ee/api/
Configuration
=============
The engine has the following mandatory setting:
- :py:obj:`base_url`
Optional settings are:
- :py:obj:`api_path`
.. code:: yaml
- name: gitlab
engine: gitlab
base_url: https://gitlab.com
shortcut: gl
about:
website: https://gitlab.com/
wikidata_id: Q16639197
- name: gnome
engine: gitlab
base_url: https://gitlab.gnome.org
shortcut: gn
about:
website: https://gitlab.gnome.org
wikidata_id: Q44316
Implementations
===============
"""
from urllib.parse import urlencode
from dateutil import parser
about = {
"website": None,
"wikidata_id": None,
"official_api_documentation": "https://docs.gitlab.com/ee/api/",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ['it', 'repos']
paging = True
base_url: str = ""
"""Base URL of the GitLab host."""
api_path: str = 'api/v4/projects'
"""The path the `project API <https://docs.gitlab.com/ee/api/projects.html>`_.
The default path should work fine usually.
"""
def request(query, params):
args = {'search': query, 'page': params['pageno']}
params['url'] = f"{base_url}/{api_path}?{urlencode(args)}"
return params
def response(resp):
results = []
for item in resp.json():
results.append(
{
'template': 'packages.html',
'url': item.get('web_url'),
'title': item.get('name'),
'content': item.get('description'),
'thumbnail': item.get('avatar_url'),
'package_name': item.get('name'),
'maintainer': item.get('namespace', {}).get('name'),
'publishedDate': parser.parse(item.get('last_activity_at') or item.get("created_at")),
'tags': item.get('tag_list', []),
'popularity': item.get('star_count'),
'homepage': item.get('readme_url'),
'source_code_url': item.get('http_url_to_repo'),
}
)
return results


@ -59,11 +59,6 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-sncf="1"]'
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
@ -334,31 +329,38 @@ def response(resp):
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
for item in answer_list:
for bubble in eval_xpath(item, './/div[@class="nnFGuf"]'):
bubble.drop_tree()
results.append(
{
'answer': item.xpath("normalize-space()"),
'answer': extract_text(item),
'url': (eval_xpath(item, '../..//a/@href') + [None])[0],
}
)
# parse results
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
# pylint: disable=too-many-nested-blocks
try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
if title_tag is None:
# this is not one of the common google results *section*
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None)
url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
content_nodes = eval_xpath(result, content_xpath)
content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
for item in content_nodes:
for script in item.xpath(".//script"):
script.getparent().remove(script)
content = extract_text(content_nodes)
if not content:
@ -439,7 +441,7 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
continue
sxng_lang = language_tag(locale)


@ -34,6 +34,7 @@ from searx.engines.google import (
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
from searx.utils import get_embeded_stream_url
if TYPE_CHECKING:
import logging
@ -125,6 +126,7 @@ def response(resp):
'content': content,
'author': pub_info,
'thumbnail': thumbnail,
'iframe_src': get_embeded_stream_url(url),
'template': 'videos.html',
}
)


@ -57,7 +57,11 @@ def request(query, params):
if params['time_range']:
search_type = 'search_by_date'
timestamp = (datetime.now() - relativedelta(**{f"{params['time_range']}s": 1})).timestamp()
timestamp = (
# pylint: disable=unexpected-keyword-arg
datetime.now()
- relativedelta(**{f"{params['time_range']}s": 1}) # type: ignore
).timestamp()
query_params["numericFilters"] = f"created_at_i>{timestamp}"
params["url"] = f"{base_url}/{search_type}?{urlencode(query_params)}"


@ -1,71 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Internet Archive scholar(science)
"""
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": "https://scholar.archive.org/",
"wikidata_id": "Q115667709",
"official_api_documentation": "https://scholar.archive.org/api/redoc",
"use_official_api": True,
"require_api_key": False,
"results": "JSON",
}
categories = ['science', 'scientific publications']
paging = True
base_url = "https://scholar.archive.org"
results_per_page = 15
def request(query, params):
args = {
"q": query,
"limit": results_per_page,
"offset": (params["pageno"] - 1) * results_per_page,
}
params["url"] = f"{base_url}/search?{urlencode(args)}"
params["headers"]["Accept"] = "application/json"
return params
def response(resp):
results = []
json = resp.json()
for result in json["results"]:
publishedDate, content, doi = None, '', None
if result['biblio'].get('release_date'):
publishedDate = datetime.strptime(result['biblio']['release_date'], "%Y-%m-%d")
if len(result['abstracts']) > 0:
content = result['abstracts'][0].get('body')
elif len(result['_highlights']) > 0:
content = result['_highlights'][0]
if len(result['releases']) > 0:
doi = result['releases'][0].get('doi')
results.append(
{
'template': 'paper.html',
'url': result['fulltext']['access_url'],
'title': result['biblio'].get('title') or result['biblio'].get('container_name'),
'content': html_to_text(content),
'publisher': result['biblio'].get('publisher'),
'doi': doi,
'journal': result['biblio'].get('container_name'),
'authors': result['biblio'].get('contrib_names'),
'tags': result['tags'],
'publishedDate': publishedDate,
'issns': result['biblio'].get('issns'),
'pdf_url': result['fulltext'].get('access_url'),
}
)
return results


@ -7,6 +7,8 @@ import random
from urllib.parse import quote_plus, urlparse
from dateutil import parser
from searx.utils import humanize_number
# about
about = {
"website": 'https://api.invidious.io/',
@ -91,7 +93,8 @@ def response(resp):
"url": url,
"title": result.get("title", ""),
"content": result.get("description", ""),
'length': length,
"length": length,
"views": humanize_number(result['viewCount']),
"template": "videos.html",
"author": result.get("author"),
"publishedDate": publishedDate,


@ -16,23 +16,17 @@ from json import loads
from urllib.parse import urlencode
from searx.utils import to_string, html_to_text
# parameters for generating a request
search_url = None
url_query = None
url_prefix = ""
content_query = None
title_query = None
content_html_to_text = False
title_html_to_text = False
paging = False
suggestion_query = ''
results_query = ''
method = 'GET'
request_body = ''
cookies = {}
headers = {}
'''Some engines might offer different result based on cookies or headers.
Possible use-case: To set safesearch cookie or header to moderate.'''
paging = False
# parameters for engines with paging support
#
# number of results on each page
@ -41,6 +35,16 @@ page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1
# parameters for parsing the response
results_query = ''
url_query = None
url_prefix = ""
title_query = None
content_query = None
suggestion_query = ''
title_html_to_text = False
content_html_to_text = False
def iterate(iterable):
if isinstance(iterable, dict):
@ -98,9 +102,8 @@ def query(data, query_string):
def request(query, params): # pylint: disable=redefined-outer-name
query = urlencode({'q': query})[2:]
fp = {'query': urlencode({'q': query})[2:]} # pylint: disable=invalid-name
fp = {'query': query} # pylint: disable=invalid-name
if paging and search_url.find('{pageno}') >= 0:
fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
@ -108,7 +111,12 @@ def request(query, params): # pylint: disable=redefined-outer-name
params['headers'].update(headers)
params['url'] = search_url.format(**fp)
params['query'] = query
params['method'] = method
if request_body:
# don't url-encode the query if it's in the request body
fp['query'] = query
params['data'] = request_body.format(**fp)
return params
@ -146,7 +154,11 @@ def response(resp):
}
)
else:
for url, title, content in zip(query(json, url_query), query(json, title_query), query(json, content_query)):
for result in json:
url = query(result, url_query)[0]
title = query(result, title_query)[0]
content = query(result, content_query)[0]
results.append(
{
'url': url_prefix + to_string(url),

View file

@ -31,6 +31,7 @@ def request(_query, params):
params['method'] = 'POST'
params['headers'] = {'Content-Type': 'application/json'}
params['req_url'] = request_url
return params
@ -40,7 +41,13 @@ def response(resp):
json_resp = resp.json()
text = json_resp.get('translatedText')
from_lang = resp.search_params["from_lang"][1]
to_lang = resp.search_params["to_lang"][1]
query = resp.search_params["query"]
req_url = resp.search_params["req_url"]
if text:
results.append({'answer': text})
results.append({"answer": text, "url": f"{req_url}/?source={from_lang}&target={to_lang}&q={query}"})
return results

View file

@ -27,7 +27,7 @@ categories = ['images']
paging = True
endpoint = 'photos'
base_url = 'https://loc.gov'
base_url = 'https://www.loc.gov'
search_string = "/{endpoint}/?sp={page}&{query}&fo=json"
@ -63,8 +63,8 @@ def response(resp):
if not url:
continue
img_src = result['item'].get('service_medium')
if not img_src or img_src == 'https://memory.loc.gov/pp/grp.gif':
img_list = result.get('image_url')
if not img_list:
continue
title = result['title']
@ -88,8 +88,8 @@ def response(resp):
'url': url,
'title': title,
'content': ' / '.join([i for i in content_items if i]),
'img_src': img_src,
'thumbnail_src': result['item'].get('thumb_gallery'),
'img_src': img_list[-1],
'thumbnail_src': img_list[0],
'author': author,
}
)

View file

@ -0,0 +1,95 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""MariaDB is a community driven fork of MySQL. Before enabling MariaDB engine,
you must the install the pip package ``mariadb`` along with the necessary
prerequities.
`See the following documentation for more details
<https://mariadb.com/docs/server/connect/programming-languages/c/install/>`_
Example
=======
This is an example configuration for querying a MariaDB server:
.. code:: yaml
- name: my_database
engine: mariadb_server
database: my_database
username: searxng
password: password
limit: 5
query_str: 'SELECT * from my_table WHERE my_column=%(query)s'
Implementations
===============
"""
from typing import TYPE_CHECKING
try:
import mariadb
except ImportError:
# import error is ignored because the admin has to install the mariadb
# package manually to use the engine
pass
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
engine_type = 'offline'
host = "127.0.0.1"
"""Hostname of the DB connector"""
port = 3306
"""Port of the DB connector"""
database = ""
"""Name of the database."""
username = ""
"""Username for the DB connection."""
password = ""
"""Password for the DB connection."""
query_str = ""
"""SQL query that returns the result items."""
limit = 10
paging = True
result_template = 'key-value.html'
_connection = None
def init(engine_settings):
global _connection # pylint: disable=global-statement
if 'query_str' not in engine_settings:
raise ValueError('query_str cannot be empty')
if not engine_settings['query_str'].lower().startswith('select '):
raise ValueError('only SELECT query is supported')
_connection = mariadb.connect(database=database, user=username, password=password, host=host, port=port)
def search(query, params):
query_params = {'query': query}
query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (params['pageno'] - 1) * limit)
logger.debug("SQL Query: %s", query_to_run)
with _connection.cursor() as cur:
cur.execute(query_to_run, query_params)
results = []
col_names = [i[0] for i in cur.description]
for res in cur:
result = dict(zip(col_names, map(str, res)))
result['template'] = result_template
results.append(result)
return results
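A sketch of how ``query_str``, ``limit`` and the page number compose the SQL that the engine finally executes; the values are illustrative, mirroring the engine variables above:

.. code:: python

   query_str = "SELECT * FROM my_table WHERE my_column=%(query)s"
   limit = 5
   pageno = 2  # second result page

   # same paging arithmetic as in search() above
   query_to_run = query_str + ' LIMIT {0} OFFSET {1}'.format(limit, (pageno - 1) * limit)
   print(query_to_run)
   # SELECT * FROM my_table WHERE my_column=%(query)s LIMIT 5 OFFSET 5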

View file

@ -100,6 +100,12 @@ base_url: str = 'https://{language}.wikipedia.org/'
ISO 639-1 language code (en, de, fr ..) of the search language.
"""
api_path: str = 'w/api.php'
"""The path the PHP api is listening on.
The default path should work fine usually.
"""
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
"""The longhand version of MediaWiki time strings."""
@ -113,12 +119,7 @@ def request(query, params):
else:
params['language'] = params['language'].split('-')[0]
if base_url.endswith('/'):
api_url = base_url + 'w/api.php?'
else:
api_url = base_url + '/w/api.php?'
api_url = api_url.format(language=params['language'])
api_url = f"{base_url.rstrip('/')}/{api_path}?".format(language=params['language'])
offset = (params['pageno'] - 1) * number_of_results
args = {

View file

@ -1,12 +1,15 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Mojeek (general, images, news)"""
from typing import TYPE_CHECKING
from datetime import datetime
from urllib.parse import urlencode
from lxml import html
from dateutil.relativedelta import relativedelta
from searx.utils import eval_xpath, eval_xpath_list, extract_text
from searx.enginelib.traits import EngineTraits
about = {
'website': 'https://mojeek.com',
@ -42,6 +45,18 @@ news_url_xpath = './/h2/a/@href'
news_title_xpath = './/h2/a'
news_content_xpath = './/p[@class="s"]'
language_param = 'lb'
region_param = 'arc'
_delta_kwargs = {'day': 'days', 'week': 'weeks', 'month': 'months', 'year': 'years'}
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
traits: EngineTraits
def init(_):
if search_type not in ('', 'images', 'news'):
@ -53,13 +68,16 @@ def request(query, params):
'q': query,
'safe': min(params['safesearch'], 1),
'fmt': search_type,
language_param: traits.get_language(params['searxng_locale'], traits.custom['language_all']),
region_param: traits.get_region(params['searxng_locale'], traits.custom['region_all']),
}
if search_type == '':
args['s'] = 10 * (params['pageno'] - 1)
if params['time_range'] and search_type != 'images':
args["since"] = (datetime.now() - relativedelta(**{f"{params['time_range']}s": 1})).strftime("%Y%m%d")
kwargs = {_delta_kwargs[params['time_range']]: 1}
args["since"] = (datetime.now() - relativedelta(**kwargs)).strftime("%Y%m%d") # type: ignore
logger.debug(args["since"])
params['url'] = f"{base_url}/search?{urlencode(args)}"
@ -94,7 +112,7 @@ def _image_results(dom):
'template': 'images.html',
'url': extract_text(eval_xpath(result, image_url_xpath)),
'title': extract_text(eval_xpath(result, image_title_xpath)),
'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)),
'img_src': base_url + extract_text(eval_xpath(result, image_img_src_xpath)), # type: ignore
'content': '',
}
)
@ -130,3 +148,31 @@ def response(resp):
return _news_results(dom)
raise ValueError(f"Invalid search type {search_type}")
def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from searx.locales import get_official_locales, region_tag
from babel import Locale, UnknownLocaleError
import contextlib
resp = network.get(base_url + "/preferences", headers={'Accept-Language': 'en-US,en;q=0.5'})
dom = html.fromstring(resp.text) # type: ignore
languages = eval_xpath_list(dom, f'//select[@name="{language_param}"]/option/@value')
engine_traits.custom['language_all'] = languages[0]
for code in languages[1:]:
with contextlib.suppress(UnknownLocaleError):
locale = Locale(code)
engine_traits.languages[locale.language] = code
regions = eval_xpath_list(dom, f'//select[@name="{region_param}"]/option/@value')
engine_traits.custom['region_all'] = regions[1]
for code in regions[2:]:
for locale in get_official_locales(code, engine_traits.languages):
engine_traits.regions[region_tag(locale)] = code
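The ``contextlib.suppress`` pattern in ``fetch_traits`` simply skips language codes Babel does not know instead of crashing; a standalone sketch, where ``xx`` is a deliberately invalid code:

.. code:: python

   import contextlib
   from babel import Locale, UnknownLocaleError

   languages = {}
   for code in ["en", "de", "xx"]:
       with contextlib.suppress(UnknownLocaleError):
           locale = Locale(code)
           languages[locale.language] = code
   print(languages)  # {'en': 'en', 'de': 'de'}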

View file

@ -20,6 +20,8 @@ Otherwise, follow instructions provided by Mullvad for enabling the VPN on Linux
update of SearXNG!
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from httpx import Response
from lxml import html
@ -37,6 +39,8 @@ traits: EngineTraits
use_cache: bool = True # non-cache use only has 100 searches per day!
leta_engine: str = 'google'
search_url = "https://leta.mullvad.net"
# about
@ -61,6 +65,11 @@ time_range_dict = {
"year": "y1",
}
available_leta_engines = [
'google', # first will be default if provided engine is invalid
'brave',
]
def is_vpn_connected(dom: html.HtmlElement) -> bool:
"""Returns true if the VPN is connected, False otherwise"""
@ -80,11 +89,22 @@ def assign_headers(headers: dict) -> dict:
def request(query: str, params: dict):
country = traits.get_region(params.get('searxng_locale', 'all'), traits.all_locale) # type: ignore
result_engine = leta_engine
if leta_engine not in available_leta_engines:
result_engine = available_leta_engines[0]
logger.warning(
'Configured engine "%s" not one of the available engines %s, defaulting to "%s"',
leta_engine,
available_leta_engines,
result_engine,
)
params['url'] = search_url
params['method'] = 'POST'
params['data'] = {
"q": query,
"gl": country if country is str else '',
'engine': result_engine,
}
# pylint: disable=undefined-variable
if use_cache:
@ -107,8 +127,15 @@ def request(query: str, params: dict):
return params
def extract_result(dom_result: html.HtmlElement):
[a_elem, h3_elem, p_elem] = eval_xpath_list(dom_result, 'div/div/*')
def extract_result(dom_result: list[html.HtmlElement]):
# Infoboxes sometimes appear at the beginning and will have a length of 0
if len(dom_result) == 3:
[a_elem, h3_elem, p_elem] = dom_result
elif len(dom_result) == 4:
[_, a_elem, h3_elem, p_elem] = dom_result
else:
return None
return {
'url': extract_text(a_elem.text),
'title': extract_text(h3_elem),
@ -116,6 +143,14 @@ def extract_result(dom_result: html.HtmlElement):
}
def extract_results(search_results: html.HtmlElement):
for search_result in search_results:
dom_result = eval_xpath_list(search_result, 'div/div/*')
result = extract_result(dom_result)
if result is not None:
yield result
def response(resp: Response):
"""Checks if connected to Mullvad VPN, then extracts the search results from
the DOM resp: requests response object"""
@ -124,7 +159,7 @@ def response(resp: Response):
if not is_vpn_connected(dom):
raise SearxEngineResponseException('Not connected to Mullvad VPN')
search_results = eval_xpath(dom.body, '//main/div[2]/div')
return [extract_result(sr) for sr in search_results]
return list(extract_results(search_results))
def fetch_traits(engine_traits: EngineTraits):

View file

@ -34,12 +34,25 @@ except ImportError:
engine_type = 'offline'
auth_plugin = 'caching_sha2_password'
host = "127.0.0.1"
"""Hostname of the DB connector"""
port = 3306
"""Port of the DB connector"""
database = ""
"""Name of the database."""
username = ""
"""Username for the DB connection."""
password = ""
"""Password for the DB connection."""
query_str = ""
"""SQL query that returns the result items."""
limit = 10
paging = True
result_template = 'key-value.html'

View file

@ -0,0 +1,71 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Open library (books)
"""
from urllib.parse import urlencode
import re
from dateutil import parser
about = {
'website': 'https://openlibrary.org',
'wikidata_id': 'Q1201876',
'require_api_key': False,
'use_official_api': False,
'official_api_documentation': 'https://openlibrary.org/developers/api',
}
paging = True
categories = []
base_url = "https://openlibrary.org"
results_per_page = 10
def request(query, params):
args = {
'q': query,
'page': params['pageno'],
'limit': results_per_page,
}
params['url'] = f"{base_url}/search.json?{urlencode(args)}"
return params
def _parse_date(date):
try:
return parser.parse(date)
except parser.ParserError:
return None
def response(resp):
results = []
for item in resp.json().get("docs", []):
cover = None
if 'lending_identifier_s' in item:
cover = f"https://archive.org/services/img/{item['lending_identifier_s']}"
published = item.get('publish_date')
if published:
published_dates = [date for date in map(_parse_date, published) if date]
if published_dates:
published = min(published_dates)
if not published:
published = parser.parse(str(item.get('first_published_year')))
result = {
'template': 'paper.html',
'url': f"{base_url}{item['key']}",
'title': item['title'],
'content': re.sub(r"\{|\}", "", item['first_sentence'][0]) if item.get('first_sentence') else '',
'isbn': item.get('isbn', [])[:5],
'authors': item.get('author_name', []),
'thumbnail': cover,
'publishedDate': published,
'tags': item.get('subject', [])[:10] + item.get('place', [])[:10],
}
results.append(result)
return results

View file

@ -14,7 +14,7 @@ import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
from searx.utils import html_to_text
from searx.utils import html_to_text, humanize_number
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
@ -124,6 +124,7 @@ def video_response(resp):
'content': html_to_text(result.get('description') or ''),
'author': result.get('account', {}).get('displayName'),
'length': minute_to_hm(result.get('duration')),
'views': humanize_number(result['views']),
'template': 'videos.html',
'publishedDate': parse(result['publishedAt']),
'iframe_src': result.get('embedUrl'),

View file

@ -53,6 +53,8 @@ from urllib.parse import urlencode
import datetime
from dateutil import parser
from searx.utils import humanize_number
# about
about = {
"website": 'https://github.com/TeamPiped/Piped/',
@ -138,6 +140,7 @@ def response(resp):
"title": result.get("title", ""),
"publishedDate": parser.parse(time.ctime(uploaded / 1000)) if uploaded != -1 else None,
"iframe_src": _frontend_url() + '/embed' + result.get("url", ""),
"views": humanize_number(result["views"]),
}
length = result.get("duration")
if length:

View file

@ -29,12 +29,25 @@ except ImportError:
pass
engine_type = 'offline'
host = "127.0.0.1"
"""Hostname of the DB connector"""
port = "5432"
"""Port of the DB connector"""
database = ""
"""Name of the database."""
username = ""
"""Username for the DB connection."""
password = ""
"""Password for the DB connection."""
query_str = ""
"""SQL query that returns the result items."""
limit = 10
paging = True
result_template = 'key-value.html'

View file

@ -49,7 +49,11 @@ from flask_babel import gettext
import babel
import lxml
from searx.exceptions import SearxEngineAPIException, SearxEngineTooManyRequestsException
from searx.exceptions import (
SearxEngineAPIException,
SearxEngineTooManyRequestsException,
SearxEngineCaptchaException,
)
from searx.network import raise_for_httperror
from searx.enginelib.traits import EngineTraits
@ -57,6 +61,7 @@ from searx.utils import (
eval_xpath,
eval_xpath_list,
extract_text,
get_embeded_stream_url,
)
traits: EngineTraits
@ -187,6 +192,8 @@ def parse_web_api(resp):
error_code = data.get('error_code')
if error_code == 24:
raise SearxEngineTooManyRequestsException()
if search_results.get("data", {}).get("error_data", {}).get("captchaUrl") is not None:
raise SearxEngineCaptchaException()
msg = ",".join(data.get('message', ['unknown']))
raise SearxEngineAPIException(f"{msg} ({error_code})")
@ -297,6 +304,7 @@ def parse_web_api(resp):
'title': title,
'url': res_url,
'content': content,
'iframe_src': get_embeded_stream_url(res_url),
'publishedDate': pub_date,
'thumbnail': thumbnail,
'template': 'videos.html',

View file

@ -165,10 +165,12 @@ def fetch_traits(engine_traits: EngineTraits):
countrycodes = set()
for region in country_list:
if region['iso_3166_1'] not in babel_reg_list:
# country_list contains duplicates that differ only in upper/lower case
_reg = region['iso_3166_1'].upper()
if _reg not in babel_reg_list:
print(f"ERROR: region tag {region['iso_3166_1']} is unknown by babel")
continue
countrycodes.add(region['iso_3166_1'])
countrycodes.add(_reg)
countrycodes = list(countrycodes)
countrycodes.sort()

View file

@ -1,98 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Słownik Języka Polskiego
Dictionary of the polish language from PWN (sjp.pwn)
"""
from lxml.html import fromstring
from searx import logger
from searx.utils import extract_text
from searx.network import raise_for_httperror
logger = logger.getChild('sjp engine')
# about
about = {
"website": 'https://sjp.pwn.pl',
"wikidata_id": 'Q55117369',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"language": 'pl',
}
categories = ['dictionaries']
paging = False
URL = 'https://sjp.pwn.pl'
SEARCH_URL = URL + '/szukaj/{query}.html'
word_xpath = '//div[@class="query"]'
dict_xpath = [
'//div[@class="wyniki sjp-so-wyniki sjp-so-anchor"]',
'//div[@class="wyniki sjp-wyniki sjp-anchor"]',
'//div[@class="wyniki sjp-doroszewski-wyniki sjp-doroszewski-anchor"]',
]
def request(query, params):
params['url'] = SEARCH_URL.format(query=query)
logger.debug(f"query_url --> {params['url']}")
return params
def response(resp):
results = []
raise_for_httperror(resp)
dom = fromstring(resp.text)
word = extract_text(dom.xpath(word_xpath))
definitions = []
for dict_src in dict_xpath:
for src in dom.xpath(dict_src):
src_text = extract_text(src.xpath('.//span[@class="entry-head-title"]/text()')).strip()
src_defs = []
for def_item in src.xpath('.//div[contains(@class, "ribbon-element")]'):
if def_item.xpath('./div[@class="znacz"]'):
sub_defs = []
for def_sub_item in def_item.xpath('./div[@class="znacz"]'):
def_sub_text = extract_text(def_sub_item).lstrip('0123456789. ')
sub_defs.append(def_sub_text)
src_defs.append((word, sub_defs))
else:
def_text = extract_text(def_item).strip()
def_link = def_item.xpath('./span/a/@href')
if 'doroszewski' in def_link[0]:
def_text = f"<a href='{def_link[0]}'>{def_text}</a>"
src_defs.append((def_text, ''))
definitions.append((src_text, src_defs))
if not definitions:
return results
infobox = ''
for src in definitions:
infobox += f"<div><small>{src[0]}</small>"
infobox += "<ul>"
for def_text, sub_def in src[1]:
infobox += f"<li>{def_text}</li>"
if sub_def:
infobox += "<ol>"
for sub_def_text in sub_def:
infobox += f"<li>{sub_def_text}</li>"
infobox += "</ol>"
infobox += "</ul></div>"
results.append(
{
'infobox': word,
'content': infobox,
}
)
return results

View file

@ -41,8 +41,13 @@ import sqlite3
import contextlib
engine_type = 'offline'
database = ""
"""Filename of the SQLite DB."""
query_str = ""
"""SQL query that returns the result items."""
limit = 10
paging = True
result_template = 'key-value.html'

View file

@ -142,7 +142,7 @@ search_url = base_url + '/sp/search'
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct childs of div[@id="results"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form

View file

@ -7,6 +7,7 @@ ends.
from json import dumps
from searx.utils import searx_useragent
from searx.enginelib.traits import EngineTraits
about = {
"website": "https://stract.com/",
@ -18,7 +19,10 @@ about = {
categories = ['general']
paging = True
search_url = "https://stract.com/beta/api/search"
base_url = "https://stract.com/beta/api"
search_url = base_url + "/search"
traits: EngineTraits
def request(query, params):
@ -29,7 +33,14 @@ def request(query, params):
'Content-Type': 'application/json',
'User-Agent': searx_useragent(),
}
params['data'] = dumps({'query': query, 'page': params['pageno'] - 1})
region = traits.get_region(params["searxng_locale"], default=traits.all_locale)
params['data'] = dumps(
{
'query': query,
'page': params['pageno'] - 1,
'selectedRegion': region,
}
)
return params
@ -47,3 +58,24 @@ def response(resp):
)
return results
def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from babel import Locale, languages
from searx.locales import region_tag
territories = Locale("en").territories
json = network.get(base_url + "/docs/openapi.json").json()
regions = json['components']['schemas']['Region']['enum']
engine_traits.all_locale = regions[0]
for region in regions[1:]:
for code, name in territories.items():
if region not in (code, name):
continue
for lang in languages.get_official_languages(code, de_facto=True):
engine_traits.regions[region_tag(Locale(lang, code))] = region
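A toy sketch of the region mapping above: match a region name from Stract's enum against Babel's English territory names, then emit a tag per official language. ``Germany`` is an assumed enum value; requires ``babel``:

.. code:: python

   from babel import Locale, languages

   territories = Locale("en").territories
   region = "Germany"  # assumed value from the openapi Region enum
   for code, name in territories.items():
       if region not in (code, name):
           continue
       for lang in languages.get_official_languages(code, de_facto=True):
           print(f"{lang}-{code} -> {region}")  # de-DE -> Germany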

View file

@ -14,10 +14,16 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from datetime import datetime
from flask_babel import gettext
if TYPE_CHECKING:
import logging
logger = logging.getLogger()
about = {
"website": 'https://tineye.com',
"wikidata_id": 'Q2382535',
@ -34,7 +40,7 @@ categories = ['general']
paging = True
safesearch = False
base_url = 'https://tineye.com'
search_string = '/result_json/?page={page}&{query}'
search_string = '/api/v1/result_json/?page={page}&{query}'
FORMAT_NOT_SUPPORTED = gettext(
"Could not read that image url. This may be due to an unsupported file"
@ -120,7 +126,7 @@ def parse_tineye_match(match_json):
crawl_date = backlink_json.get("crawl_date")
if crawl_date:
crawl_date = datetime.fromisoformat(crawl_date[:-3])
crawl_date = datetime.strptime(crawl_date, '%Y-%m-%d')
else:
crawl_date = datetime.min
@ -150,29 +156,15 @@ def parse_tineye_match(match_json):
def response(resp):
"""Parse HTTP response from TinEye."""
results = []
try:
# handle the 422 client side errors, and the possible 400 status code error
if resp.status_code in (400, 422):
json_data = resp.json()
except Exception as exc: # pylint: disable=broad-except
msg = "can't parse JSON response // %s" % exc
logger.error(msg)
json_data = {'error': msg}
# handle error codes from Tineye
if resp.is_error:
if resp.status_code in (400, 422):
message = 'HTTP status: %s' % resp.status_code
error = json_data.get('error')
s_key = json_data.get('suggestions', {}).get('key', '')
if error and s_key:
message = "%s (%s)" % (error, s_key)
elif error:
message = error
suggestions = json_data.get('suggestions', {})
message = f'HTTP Status Code: {resp.status_code}'
if resp.status_code == 422:
s_key = suggestions.get('key', '')
if s_key == "Invalid image URL":
# test https://docs.searxng.org/_static/searxng-wordmark.svg
message = FORMAT_NOT_SUPPORTED
@ -182,16 +174,23 @@ def response(resp):
elif s_key == 'Download Error':
# test https://notexists
message = DOWNLOAD_ERROR
else:
logger.warning("Unknown suggestion key encountered: %s", s_key)
else: # 400
description = suggestions.get('description')
if isinstance(description, list):
message = ','.join(description)
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
# results.append({'answer': message})
logger.error(message)
# see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
# results.append({'answer': message})
logger.error(message)
return []
return results
# Raise for all other responses
resp.raise_for_status()
resp.raise_for_status()
# append results from matches
results = []
json_data = resp.json()
for match_json in json_data['matches']:
@ -209,7 +208,7 @@ def response(resp):
'title': backlink['image_name'],
'img_src': backlink['url'],
'format': tineye_match['image_format'],
'widht': tineye_match['width'],
'width': tineye_match['width'],
'height': tineye_match['height'],
'publishedDate': backlink['crawl_date'],
}

View file

@ -32,7 +32,7 @@ void_arch = 'x86_64'
"""Default architecture to search for. For valid values see :py:obj:`ARCH_RE`"""
ARCH_RE = re.compile('aarch64-musl|armv6l-musl|armv7l-musl|x86_64-musl|aarch64|armv6l|armv7l|i686|x86_64')
"""Regular expresion that match a architecture in the query string."""
"""Regular expression that match a architecture in the query string."""
def request(query, params):

View file

@ -7,6 +7,8 @@ import datetime
from urllib.parse import urlencode
from searx.utils import html_to_text, humanize_bytes
# about
about = {
"website": 'https://commons.wikimedia.org/',
@ -74,7 +76,7 @@ def response(resp):
result = {
'url': imageinfo["descriptionurl"],
'title': title,
'content': item["snippet"],
'content': html_to_text(item["snippet"]),
}
if search_type == "images":
@ -93,7 +95,7 @@ def response(resp):
elif search_type == "files":
result['template'] = 'files.html'
result['metadata'] = imageinfo['mime']
result['size'] = imageinfo['size']
result['size'] = humanize_bytes(imageinfo['size'])
elif search_type == "audio":
result['iframe_src'] = imageinfo['url']

View file

@ -20,13 +20,9 @@ about = {
categories = ['general']
paging = False
URL = 'https://www.wordnik.com'
SEARCH_URL = URL + '/words/{query}'
def request(query, params):
params['url'] = SEARCH_URL.format(query=query)
logger.debug(f"query_url --> {params['url']}")
params['url'] = f"https://www.wordnik.com/words/{query}"
return params

View file

@ -12,6 +12,8 @@ Request:
- :py:obj:`search_url`
- :py:obj:`lang_all`
- :py:obj:`soft_max_redirects`
- :py:obj:`method`
- :py:obj:`request_body`
- :py:obj:`cookies`
- :py:obj:`headers`
@ -151,6 +153,16 @@ headers = {}
'''Some engines might offer different results based on headers. Possible
use-case: to set a header to moderate.'''
method = 'GET'
'''Some engines might require POST requests for search.'''
request_body = ''
'''The body of the request. This can only be used if a different :py:obj:`method`
is set, e.g. ``POST``. For formatting see the documentation of :py:obj:`search_url`::
search={query}&page={pageno}{time_range}{safe_search}
'''
paging = False
'''Engine supports paging [True or False].'''
@ -236,8 +248,14 @@ def request(query, params):
params['headers'].update(headers)
params['url'] = search_url.format(**fargs)
params['soft_max_redirects'] = soft_max_redirects
params['method'] = method
if request_body:
# don't url-encode the query if it's in the request body
fargs['query'] = query
params['data'] = request_body.format(**fargs)
params['soft_max_redirects'] = soft_max_redirects
params['raise_for_httperror'] = False
return params
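A sketch with made-up values of what the new ``request_body`` handling does with the placeholders; note that the query is deliberately not url-encoded when it goes into the body:

.. code:: python

   request_body = 'search={query}&page={pageno}'
   fargs = {'query': 'hello world', 'pageno': 2}
   data = request_body.format(**fargs)
   print(data)  # search=hello world&page=2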

View file

@ -118,6 +118,8 @@ def _base_url() -> str:
url = engines['yacy'].base_url # type: ignore
if isinstance(url, list):
url = random.choice(url)
if url.endswith("/"):
url = url[:-1]
return url

View file

@ -16,6 +16,7 @@ from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
html_to_text,
)
from searx.enginelib.traits import EngineTraits
@ -133,12 +134,20 @@ def response(resp):
url = parse_url(url)
title = eval_xpath_getindex(result, './/h3//a/@aria-label', 0, default='')
title = extract_text(title)
title: str = extract_text(title)
content = eval_xpath_getindex(result, './/div[contains(@class, "compText")]', 0, default='')
content = extract_text(content, allow_none=True)
content: str = extract_text(content, allow_none=True)
# append result
results.append({'url': url, 'title': title, 'content': content})
results.append(
{
'url': url,
# title sometimes contains HTML tags / see
# https://github.com/searxng/searxng/issues/3790
'title': " ".join(html_to_text(title).strip().split()),
'content': " ".join(html_to_text(content).strip().split()),
}
)
for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a'):
# append suggestion

searx/engines/yandex.py Normal file
View file

@ -0,0 +1,133 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Yandex (Web, images)"""
from json import loads
from urllib.parse import urlencode
from html import unescape
from lxml import html
from searx.exceptions import SearxEngineCaptchaException
from searx.utils import humanize_bytes, eval_xpath, eval_xpath_list, extract_text, extr
# Engine metadata
about = {
"website": 'https://yandex.com/',
"wikidata_id": 'Q5281',
"official_api_documentation": "?",
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# Engine configuration
categories = []
paging = True
search_type = ""
# Search URL
base_url_web = 'https://yandex.com/search/site/'
base_url_images = 'https://yandex.com/images/search'
results_xpath = '//li[contains(@class, "serp-item")]'
url_xpath = './/a[@class="b-serp-item__title-link"]/@href'
title_xpath = './/h3[@class="b-serp-item__title"]/a[@class="b-serp-item__title-link"]/span'
content_xpath = './/div[@class="b-serp-item__content"]//div[@class="b-serp-item__text"]'
def catch_bad_response(resp):
if resp.url.path.startswith('/showcaptcha'):
raise SearxEngineCaptchaException()
def request(query, params):
query_params_web = {
"tmpl_version": "releases",
"text": query,
"web": "1",
"frame": "1",
"searchid": "3131712",
}
query_params_images = {
"text": query,
"uinfo": "sw-1920-sh-1080-ww-1125-wh-999",
}
if params['pageno'] > 1:
query_params_web.update({"p": params["pageno"] - 1})
query_params_images.update({"p": params["pageno"] - 1})
params["cookies"] = {'cookie': "yp=1716337604.sp.family%3A0#1685406411.szm.1:1920x1080:1920x999"}
if search_type == 'web':
params['url'] = f"{base_url_web}?{urlencode(query_params_web)}"
elif search_type == 'images':
params['url'] = f"{base_url_images}?{urlencode(query_params_images)}"
return params
def response(resp):
if search_type == 'web':
catch_bad_response(resp)
dom = html.fromstring(resp.text)
results = []
for result in eval_xpath_list(dom, results_xpath):
results.append(
{
'url': extract_text(eval_xpath(result, url_xpath)),
'title': extract_text(eval_xpath(result, title_xpath)),
'content': extract_text(eval_xpath(result, content_xpath)),
}
)
return results
if search_type == 'images':
catch_bad_response(resp)
html_data = html.fromstring(resp.text)
html_sample = unescape(html.tostring(html_data, encoding='unicode'))
content_between_tags = extr(
html_sample, '{"location":"/images/search/', 'advRsyaSearchColumn":null}}', default="fail"
)
json_data = '{"location":"/images/search/' + content_between_tags + 'advRsyaSearchColumn":null}}'
if content_between_tags == "fail":
content_between_tags = extr(html_sample, '{"location":"/images/search/', 'false}}}')
json_data = '{"location":"/images/search/' + content_between_tags + 'false}}}'
json_resp = loads(json_data)
results = []
for _, item_data in json_resp['initialState']['serpList']['items']['entities'].items():
title = item_data['snippet']['title']
source = item_data['snippet']['url']
thumb = item_data['image']
fullsize_image = item_data['viewerData']['dups'][0]['url']
height = item_data['viewerData']['dups'][0]['h']
width = item_data['viewerData']['dups'][0]['w']
filesize = item_data['viewerData']['dups'][0]['fileSizeInBytes']
humanized_filesize = humanize_bytes(filesize)
results.append(
{
'title': title,
'url': source,
'img_src': fullsize_image,
'filesize': humanized_filesize,
'thumbnail_src': thumb,
'template': 'images.html',
'resolution': f'{width} x {height}',
}
)
return results
return []
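The marker-based JSON extraction above relies on ``searx.utils.extr``; an equivalent inline helper (an approximation, the real implementation lives in ``searx.utils``) with a toy input shows the idea. In the real page the braces balance; here we only show the reassembly:

.. code:: python

   def extr(text, start, end, default=""):
       # return the substring between the first occurrence of start and end
       try:
           return text.split(start, 1)[1].split(end, 1)[0]
       except IndexError:
           return default

   html_sample = 'pre {"location":"/images/search/","x":1,"advRsyaSearchColumn":null}} post'
   middle = extr(html_sample, '{"location":"/images/search/', 'advRsyaSearchColumn":null}}')
   json_data = '{"location":"/images/search/' + middle + 'advRsyaSearchColumn":null}}'
   print(json_data)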

View file

@ -67,6 +67,8 @@ def response(resp):
for result in resp.json()[1]['results']:
if search_type == "web":
if result['type'] != 'Organic':
continue
results.append(_web_result(result))
elif search_type == "images":
results.append(_images_result(result))

View file

@ -43,6 +43,7 @@ from flask_babel import gettext
from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
from searx.exceptions import SearxException
if TYPE_CHECKING:
import httpx
@ -108,13 +109,21 @@ def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]:
zlib_year_to=zlib_year_to,
zlib_ext=zlib_ext,
)
params["verify"] = False
return params
def domain_is_seized(dom):
return bool(dom.xpath('//title') and "seized" in dom.xpath('//title')[0].text.lower())
def response(resp: httpx.Response) -> List[Dict[str, Any]]:
results: List[Dict[str, Any]] = []
dom = html.fromstring(resp.text)
if domain_is_seized(dom):
raise SearxException(f"zlibrary domain is seized: {base_url}")
for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'):
results.append(_parse_result(item))
@ -168,22 +177,30 @@ def _parse_result(item) -> Dict[str, Any]:
def fetch_traits(engine_traits: EngineTraits) -> None:
"""Fetch languages and other search arguments from zlibrary's search form."""
# pylint: disable=import-outside-toplevel
# pylint: disable=import-outside-toplevel, too-many-branches
import babel
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.locales import language_tag
resp = get(base_url, verify=False)
if not resp.ok: # type: ignore
raise RuntimeError("Response from zlibrary's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
if domain_is_seized(dom):
print(f"ERROR: zlibrary domain is seized: {base_url}")
# don't change anything, re-use the existing values
engine_traits.all_locale = ENGINE_TRAITS["z-library"]["all_locale"]
engine_traits.custom = ENGINE_TRAITS["z-library"]["custom"]
engine_traits.languages = ENGINE_TRAITS["z-library"]["languages"]
return
engine_traits.all_locale = ""
engine_traits.custom["ext"] = []
engine_traits.custom["year_from"] = []
engine_traits.custom["year_to"] = []
resp = get(base_url)
if not resp.ok: # type: ignore
raise RuntimeError("Response from zlibrary's search page is not OK.")
dom = html.fromstring(resp.text) # type: ignore
for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"):
engine_traits.custom["year_from"].append(year.get("value"))

View file

@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Exception types raised by SearXNG modules.
"""
from __future__ import annotations
from typing import Optional, Union
@ -61,7 +62,7 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
"""This settings contains the default suspended time (default 86400 sec / 1
day)."""
def __init__(self, suspended_time: int = None, message: str = 'Access denied'):
def __init__(self, suspended_time: int | None = None, message: str = 'Access denied'):
"""Generic exception to raise when an engine denies access to the results.
:param suspended_time: How long the engine is going to be suspended in
@ -70,12 +71,13 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException):
:param message: Internal message. Defaults to ``Access denied``
:type message: str
"""
suspended_time = suspended_time or self._get_default_suspended_time()
if suspended_time is None:
suspended_time = self._get_default_suspended_time()
super().__init__(message + ', suspended_time=' + str(suspended_time))
self.suspended_time = suspended_time
self.message = message
def _get_default_suspended_time(self):
def _get_default_suspended_time(self) -> int:
from searx import get_setting # pylint: disable=C0415
return get_setting(self.SUSPEND_TIME_SETTING)
@ -88,7 +90,7 @@ class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
"""This settings contains the default suspended time (default 86400 sec / 1
day)."""
def __init__(self, suspended_time=None, message='CAPTCHA'):
def __init__(self, suspended_time: int | None = None, message='CAPTCHA'):
super().__init__(message=message, suspended_time=suspended_time)
@ -102,7 +104,7 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
"""This settings contains the default suspended time (default 3660 sec / 1
hour)."""
def __init__(self, suspended_time=None, message='Too many request'):
def __init__(self, suspended_time: int | None = None, message='Too many request'):
super().__init__(message=message, suspended_time=suspended_time)

View file

@ -0,0 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for providing the favicons in SearXNG"""
from __future__ import annotations
__all__ = ["init", "favicon_url", "favicon_proxy"]
import pathlib
from searx import logger
from searx import get_setting
from .proxy import favicon_url, favicon_proxy
logger = logger.getChild('favicons')
def is_active():
return bool(get_setting("search.favicon_resolver", False))
def init():
# pylint: disable=import-outside-toplevel
from . import config, cache, proxy
from .. import settings_loader
cfg_file = (settings_loader.get_user_cfg_folder() or pathlib.Path("/etc/searxng")) / "favicons.toml"
if not cfg_file.exists():
if is_active():
logger.error(f"missing favicon config: {cfg_file}")
cfg_file = config.DEFAULT_CFG_TOML_PATH
logger.debug(f"load favicon config: {cfg_file}")
cfg = config.FaviconConfig.from_toml_file(cfg_file, use_cache=True)
cache.init(cfg.cache)
proxy.init(cfg.proxy)
del cache, config, proxy, cfg, settings_loader

View file

@ -0,0 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Command line implementation"""
import typer
from . import cache
from . import init
init()
app = typer.Typer()
app.add_typer(cache.app, name="cache", help="commands related to the cache")
app()
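With this entry point, the cache commands defined in ``cache.py`` become available on the command line, e.g. ``python -m searx.favicons cache state`` or ``python -m searx.favicons cache maintenance``.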

searx/favicons/cache.py Normal file
View file

@ -0,0 +1,476 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for caching favicons.
:py:obj:`FaviconCacheConfig`:
Configuration of the favicon cache
:py:obj:`FaviconCache`:
Abstract base class for the implementation of a favicon cache.
:py:obj:`FaviconCacheSQLite`:
Favicon cache that manages the favicon BLOBs in a SQLite DB.
:py:obj:`FaviconCacheNull`:
Fallback solution if the configured cache cannot be used for system reasons.
----
"""
from __future__ import annotations
from typing import Literal
import os
import abc
import dataclasses
import hashlib
import logging
import sqlite3
import tempfile
import time
import typer
import msgspec
from searx import sqlitedb
from searx import logger
from searx.utils import humanize_bytes, humanize_number
CACHE: "FaviconCache"
FALLBACK_ICON = b"FALLBACK_ICON"
logger = logger.getChild('favicons.cache')
app = typer.Typer()
@app.command()
def state():
"""show state of the cache"""
print(CACHE.state().report())
@app.command()
def maintenance(force: bool = True, debug: bool = False):
"""perform maintenance of the cache"""
root_log = logging.getLogger()
if debug:
root_log.setLevel(logging.DEBUG)
else:
root_log.handlers = []
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(message)s"))
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
state_t0 = CACHE.state()
CACHE.maintenance(force=force)
state_t1 = CACHE.state()
state_delta = state_t0 - state_t1
print("The cache has been reduced by:")
print(state_delta.report("\n- {descr}: {val}").lstrip("\n"))
def init(cfg: "FaviconCacheConfig"):
"""Initialization of a global ``CACHE``"""
global CACHE # pylint: disable=global-statement
if cfg.db_type == "sqlite":
if sqlite3.sqlite_version_info <= (3, 35):
logger.critical(
"Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
sqlite3.sqlite_version,
)
CACHE = FaviconCacheNull(cfg)
else:
CACHE = FaviconCacheSQLite(cfg)
elif cfg.db_type == "mem":
logger.error("Favicons are cached in memory, don't use this in production!")
CACHE = FaviconCacheMEM(cfg)
else:
raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")
class FaviconCacheConfig(msgspec.Struct): # pylint: disable=too-few-public-methods
"""Configuration of the favicon cache."""
db_type: Literal["sqlite", "mem"] = "sqlite"
"""Type of the database:
``sqlite``:
:py:obj:`.cache.FaviconCacheSQLite`
``mem``:
:py:obj:`.cache.FaviconCacheMEM` (not recommended)
"""
db_url: str = tempfile.gettempdir() + os.sep + "faviconcache.db"
"""URL of the SQLite DB, the path to the database file."""
HOLD_TIME: int = 60 * 60 * 24 * 30 # 30 days
"""Hold time (default in sec.), after which a BLOB is removed from the cache."""
LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50 # 50 MB
"""Maximum of bytes (default) stored in the cache of all blobs. Note: The
limit is only reached at each maintenance interval after which the oldest
BLOBs are deleted; the limit is exceeded during the maintenance period. If
the maintenance period is *too long* or maintenance is switched off
completely, the cache grows uncontrollably."""
BLOB_MAX_BYTES: int = 1024 * 20 # 20 KB
"""The maximum BLOB size in bytes that a favicon may have so that it can be
saved in the cache. If the favicon is larger, it is not saved in the cache
and must be requested by the client via the proxy."""
MAINTENANCE_PERIOD: int = 60 * 60
"""Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
``auto``."""
MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
"""Type of maintenance mode
``auto``:
Maintenance is carried out automatically as part of the maintenance
intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.
``off``:
Maintenance is switched off and must be carried out by an external process
if required.
"""
@dataclasses.dataclass
class FaviconCacheStats:
"""Dataclass wich provides information on the status of the cache."""
favicons: int | None = None
bytes: int | None = None
domains: int | None = None
resolvers: int | None = None
field_descr = (
("favicons", "number of favicons in cache", humanize_number),
("bytes", "total size (approx. bytes) of cache", humanize_bytes),
("domains", "total number of domains in cache", humanize_number),
("resolvers", "number of resolvers", str),
)
def __sub__(self, other) -> FaviconCacheStats:
if not isinstance(other, self.__class__):
raise TypeError(f"unsupported operand type(s) for +: '{self.__class__}' and '{type(other)}'")
kwargs = {}
for field, _, _ in self.field_descr:
self_val, other_val = getattr(self, field), getattr(other, field)
if None in (self_val, other_val):
continue
if isinstance(self_val, int):
kwargs[field] = self_val - other_val
else:
kwargs[field] = self_val
return self.__class__(**kwargs)
def report(self, fmt: str = "{descr}: {val}\n"):
s = []
for field, descr, cast in self.field_descr:
val = getattr(self, field)
if val is None:
val = "--"
else:
val = cast(val)
s.append(fmt.format(descr=descr, val=val))
return "".join(s)
class FaviconCache(abc.ABC):
"""Abstract base class for the implementation of a favicon cache."""
@abc.abstractmethod
def __init__(self, cfg: FaviconCacheConfig):
"""An instance of the favicon cache is build up from the configuration."""
@abc.abstractmethod
def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
"""Returns ``None`` or the tuple of ``(data, mime)`` that has been
registered in the cache. The ``None`` indicates that there was no entry
in the cache."""
@abc.abstractmethod
def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
"""Set data and mime-type in the cache. If data is None, the
:py:obj:`FALLBACK_ICON` is registered. in the cache."""
@abc.abstractmethod
def state(self) -> FaviconCacheStats:
"""Returns a :py:obj:`FaviconCacheStats` (key/values) with information
on the state of the cache."""
@abc.abstractmethod
def maintenance(self, force=False):
"""Performs maintenance on the cache"""
class FaviconCacheNull(FaviconCache):
"""A dummy favicon cache that caches nothing / a fallback solution. The
NullCache is used when more efficient caches such as the
:py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite
library is only available in an old version and does not meet the
requirements."""
def __init__(self, cfg: FaviconCacheConfig):
return None
def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
return None
def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
return False
def state(self):
return FaviconCacheStats(favicons=0)
def maintenance(self, force=False):
pass
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
"""Favicon cache that manages the favicon BLOBs in a SQLite DB. The DB
model in the SQLite DB is implemented using the abstract class
:py:obj:`sqlitedb.SQLiteAppl`.
The following configurations are required / supported:
- :py:obj:`FaviconCacheConfig.db_url`
- :py:obj:`FaviconCacheConfig.HOLD_TIME`
- :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
- :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
- :py:obj:`MAINTENANCE_PERIOD`
- :py:obj:`MAINTENANCE_MODE`
"""
DB_SCHEMA = 1
DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
sha256 TEXT,
bytes_c INTEGER,
mime TEXT NOT NULL,
data BLOB NOT NULL,
PRIMARY KEY (sha256))"""
"""Table to store BLOB objects by their sha256 hash values."""
DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
sha256 TEXT,
resolver TEXT,
authority TEXT,
PRIMARY KEY (resolver, authority))"""
"""Table to map from (resolver, authority) to sha256 hash values."""
DDL_CREATE_TABLES = {
"blobs": DDL_BLOBS,
"blob_map": DDL_BLOB_MAP,
}
SQL_DROP_LEFTOVER_BLOBS = (
"DELETE FROM blobs WHERE sha256 IN ("
" SELECT b.sha256"
" FROM blobs b"
" LEFT JOIN blob_map bm"
" ON b.sha256 = bm.sha256"
" WHERE bm.sha256 IS NULL)"
)
"""Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""
SQL_ITER_BLOBS_SHA256_BYTES_C = (
"SELECT b.sha256, b.bytes_c FROM blobs b"
" JOIN blob_map bm "
" ON b.sha256 = bm.sha256"
" ORDER BY bm.m_time ASC"
)
SQL_INSERT_BLOBS = (
"INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
" ON CONFLICT (sha256) DO NOTHING"
) # fmt: skip
SQL_INSERT_BLOB_MAP = (
"INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
" ON CONFLICT DO UPDATE "
" SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
)
def __init__(self, cfg: FaviconCacheConfig):
"""An instance of the favicon cache is build up from the configuration.""" #
if cfg.db_url == ":memory:":
logger.critical("don't use SQLite DB in :memory: in production!!")
super().__init__(cfg.db_url)
self.cfg = cfg
def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
res = self.DB.execute(sql, (resolver, authority)).fetchone()
if res is None:
return None
data, mime = (None, None)
sha256 = res[0]
if sha256 == FALLBACK_ICON:
return data, mime
sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
res = self.DB.execute(sql, (sha256,)).fetchone()
if res is not None:
data, mime = res
return data, mime
def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
# Should automatic maintenance be moved to a new thread?
self.maintenance()
if data is not None and mime is None:
logger.error(
"favicon resolver %s tries to cache mime-type None for authority %s",
resolver,
authority,
)
return False
bytes_c = len(data or b"")
if bytes_c > self.cfg.BLOB_MAX_BYTES:
logger.info(
"favicon of resolver: %s / authority: %s to big to cache (bytes: %s) " % (resolver, authority, bytes_c)
)
return False
if data is None:
sha256 = FALLBACK_ICON
else:
sha256 = hashlib.sha256(data).hexdigest()
with self.connect() as conn:
if sha256 != FALLBACK_ICON:
conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))
return True
@property
def next_maintenance_time(self) -> int:
"""Returns (unix epoch) time of the next maintenance."""
return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")
def maintenance(self, force=False):
# Prevent parallel DB maintenance cycles from other DB connections
# (e.g. in multi thread or process environments).
if not force and int(time.time()) < self.next_maintenance_time:
logger.debug("no maintenance required yet, next maintenance interval is in the future")
return
self.properties.set("LAST_MAINTENANCE", "") # hint: this (also) sets the m_time of the property!
# do maintenance tasks
with self.connect() as conn:
# drop items not in HOLD time
res = conn.execute(
f"DELETE FROM blob_map"
f" WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - {self.cfg.HOLD_TIME}"
)
logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)
# drop old items to be in LIMIT_TOTAL_BYTES
total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:
x = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
c = 0
sha_list = []
for row in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
sha256, bytes_c = row
sha_list.append(sha256)
c += bytes_c
if c > x:
break
if sha_list:
conn.execute("DELETE FROM blobs WHERE sha256 IN ('%s')" % "','".join(sha_list))
conn.execute("DELETE FROM blob_map WHERE sha256 IN ('%s')" % "','".join(sha_list))
logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c)
def _query_val(self, sql, default=None):
val = self.DB.execute(sql).fetchone()
if val is not None:
val = val[0]
if val is None:
val = default
return val
def state(self) -> FaviconCacheStats:
return FaviconCacheStats(
favicons=self._query_val("SELECT count(*) FROM blobs", 0),
bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
)
class FaviconCacheMEM(FaviconCache):
"""Favicon cache in process' memory. Its just a POC that stores the
favicons in the memory of the process.
.. attention::
Don't use it in production, it will blow up your memory!!
"""
def __init__(self, cfg):
self.cfg = cfg
self._data = {}
self._sha_mime = {}
def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
sha, mime = self._sha_mime.get(f"{resolver}:{authority}", (None, None))
if sha is None:
return None
data = self._data.get(sha)
if data == FALLBACK_ICON:
data = None
return data, mime
def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
if data is None:
data = FALLBACK_ICON
mime = None
elif mime is None:
logger.error(
"favicon resolver %s tries to cache mime-type None for authority %s",
resolver,
authority,
)
return False
digest = hashlib.sha256(data).hexdigest()
self._data[digest] = data
self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
return True
def state(self):
return FaviconCacheStats(favicons=len(self._data.keys()))
def maintenance(self, force=False):
pass
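The byte-limit maintenance in ``FaviconCacheSQLite.maintenance`` walks BLOBs from oldest to newest and stops once enough bytes are freed; the arithmetic in isolation, with illustrative numbers:

.. code:: python

   total_bytes, limit = 120, 100
   x = total_bytes - limit  # 20 bytes over the limit
   blobs = [("a", 30), ("b", 50), ("c", 40)]  # (sha256, bytes_c), oldest first
   sha_list, c = [], 0
   for sha256, bytes_c in blobs:
       sha_list.append(sha256)
       c += bytes_c
       if c > x:
           break
   print(sha_list, c)  # ['a'] 30 -> dropping the oldest BLOB already frees enough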

searx/favicons/config.py Normal file
View file

@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from __future__ import annotations
import pathlib
import msgspec
from .cache import FaviconCacheConfig
from .proxy import FaviconProxyConfig
CONFIG_SCHEMA: int = 1
"""Version of the configuration schema."""
TOML_CACHE_CFG: dict[str, "FaviconConfig"] = {}
"""Cache config objects by TOML's filename."""
DEFAULT_CFG_TOML_PATH = pathlib.Path(__file__).parent / "favicons.toml"
class FaviconConfig(msgspec.Struct): # pylint: disable=too-few-public-methods
"""The class aggregates configurations of the favicon tools"""
cfg_schema: int
"""Config's schema version. The specification of the version of the schema
is mandatory, currently only version :py:obj:`CONFIG_SCHEMA` is supported.
By specifying a version, it is possible to ensure downward compatibility in
the event of future changes to the configuration schema"""
cache: FaviconCacheConfig = msgspec.field(default_factory=FaviconCacheConfig)
"""Setup of the :py:obj:`.cache.FaviconCacheConfig`."""
proxy: FaviconProxyConfig = msgspec.field(default_factory=FaviconProxyConfig)
"""Setup of the :py:obj:`.proxy.FaviconProxyConfig`."""
@classmethod
def from_toml_file(cls, cfg_file: pathlib.Path, use_cache: bool) -> "FaviconConfig":
"""Create a config object from a TOML file, the ``use_cache`` argument
specifies whether a cache should be used.
"""
cached = TOML_CACHE_CFG.get(str(cfg_file))
if use_cache and cached:
return cached
with cfg_file.open("rb") as f:
data = f.read()
cfg = msgspec.toml.decode(data, type=_FaviconConfig)
schema = cfg.favicons.cfg_schema
if schema != CONFIG_SCHEMA:
raise ValueError(
f"config schema version {CONFIG_SCHEMA} is needed, version {schema} is given in {cfg_file}"
)
cfg = cfg.favicons
if use_cache:
TOML_CACHE_CFG[str(cfg_file.resolve())] = cfg
return cfg
class _FaviconConfig(msgspec.Struct): # pylint: disable=too-few-public-methods
# wrapper struct for root object "favicons."
favicons: FaviconConfig

View file

@ -0,0 +1,25 @@
[favicons]
cfg_schema = 1 # config's schema version no.
[favicons.proxy]
# max_age = 5184000 # 60 days / default: 7 days (604800 sec)
# [favicons.proxy.resolver_map]
#
# The available favicon resolvers are registered here.
#
# "duckduckgo" = "searx.favicons.resolvers.duckduckgo"
# "allesedv" = "searx.favicons.resolvers.allesedv"
# "google" = "searx.favicons.resolvers.google"
# "yandex" = "searx.favicons.resolvers.yandex"
[favicons.cache]
# db_url = "/var/cache/searxng/faviconcache.db" # default: "/tmp/faviconcache.db"
# HOLD_TIME = 5184000 # 60 days / default: 30 days
# LIMIT_TOTAL_BYTES = 2147483648 # 2 GB / default: 50 MB
# BLOB_MAX_BYTES = 40960 # 40 KB / default 20 KB
# MAINTENANCE_MODE = "off" # default: "auto"
# MAINTENANCE_PERIOD = 600 # 10min / default: 1h
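A sketch of how such a TOML file is decoded by ``FaviconConfig.from_toml_file`` via msgspec; the structs are reduced to two fields for brevity:

.. code:: python

   import msgspec

   class Cache(msgspec.Struct):
       HOLD_TIME: int = 60 * 60 * 24 * 30  # default: 30 days

   class Favicons(msgspec.Struct):
       cfg_schema: int
       cache: Cache = msgspec.field(default_factory=Cache)

   class Root(msgspec.Struct):
       # wrapper for the root table "[favicons]"
       favicons: Favicons

   raw = b"[favicons]\ncfg_schema = 1\n[favicons.cache]\nHOLD_TIME = 5184000\n"
   cfg = msgspec.toml.decode(raw, type=Root)
   print(cfg.favicons.cache.HOLD_TIME)  # 5184000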

searx/favicons/proxy.py Normal file
View file

@ -0,0 +1,237 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations for a favicon proxy"""
from __future__ import annotations
from typing import Callable
import importlib
import base64
import pathlib
import urllib.parse
import flask
from httpx import HTTPError
import msgspec
from searx import get_setting
from searx.webutils import new_hmac, is_hmac_of
from searx.exceptions import SearxEngineResponseException
from .resolvers import DEFAULT_RESOLVER_MAP
from . import cache
DEFAULT_FAVICON_URL = {}
CFG: FaviconProxyConfig = None # type: ignore
def init(cfg: FaviconProxyConfig):
global CFG # pylint: disable=global-statement
CFG = cfg
def _initial_resolver_map():
d = {}
name: str = get_setting("search.favicon_resolver", None) # type: ignore
if name:
func = DEFAULT_RESOLVER_MAP.get(name)
if func:
d = {name: f"searx.favicons.resolvers.{func.__name__}"}
return d
class FaviconProxyConfig(msgspec.Struct):
"""Configuration of the favicon proxy."""
max_age: int = 60 * 60 * 24 * 7 # seven days
"""HTTP header Cache-Control_ ``max-age``
.. _Cache-Control: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control
"""
secret_key: str = get_setting("server.secret_key") # type: ignore
"""By default, the value from :ref:`server.secret_key <settings server>`
setting is used."""
resolver_timeout: int = get_setting("outgoing.request_timeout") # type: ignore
"""Timeout which the resolvers should not exceed, is usually passed to the
outgoing request of the resolver. By default, the value from
:ref:`outgoing.request_timeout <settings outgoing>` setting is used."""
resolver_map: dict[str, str] = msgspec.field(default_factory=_initial_resolver_map)
"""The resolver_map is a key / value dictionary where the key is the name of
the resolver and the value is the fully qualifying name (fqn) of resolver's
function (the callable). The resolvers from the python module
:py:obj:`searx.favicons.resolver` are available by default."""
def get_resolver(self, name: str) -> Callable | None:
"""Returns the callable object (function) of the resolver with the
``name``. If no resolver is registered for the ``name``, ``None`` is
returned.
"""
fqn = self.resolver_map.get(name)
if fqn is None:
return None
mod_name, _, func_name = fqn.rpartition('.')
mod = importlib.import_module(mod_name)
func = getattr(mod, func_name)
if func is None:
raise ValueError(f"resolver {fqn} is not implemented")
return func
favicon_path: str = get_setting("ui.static_path") + "/themes/{theme}/img/empty_favicon.svg" # type: ignore
favicon_mime_type: str = "image/svg+xml"
def favicon(self, **replacements):
"""Returns pathname and mimetype of the default favicon."""
return (
pathlib.Path(self.favicon_path.format(**replacements)),
self.favicon_mime_type,
)
def favicon_data_url(self, **replacements):
"""Returns data image URL of the default favicon."""
cache_key = ", ".join(f"{x}:{replacements[x]}" for x in sorted(list(replacements.keys()), key=str))
data_url = DEFAULT_FAVICON_URL.get(cache_key)
if data_url is not None:
return data_url
fav, mimetype = CFG.favicon(**replacements)
# hint: the utf-8 encoding limits the default favicon to SVG images
with fav.open("r", encoding="utf-8") as f:
data_url = f.read()
data_url = urllib.parse.quote(data_url)
data_url = f"data:{mimetype};utf8,{data_url}"
DEFAULT_FAVICON_URL[cache_key] = data_url
return data_url
def favicon_proxy():
"""REST API of SearXNG's favicon proxy service
::
/favicon_proxy?authority=<...>&h=<...>
``authority``:
Domain name :rfc:`3986` / see :py:obj:`favicon_url`
``h``:
HMAC :rfc:`2104`, built from the :ref:`server.secret_key <settings
server>` setting.
"""
authority = flask.request.args.get('authority')
# malformed request or RFC 3986 authority
if not authority or "/" in authority:
return '', 400
# malformed request / does not have authorisation
if not is_hmac_of(
CFG.secret_key,
authority.encode(),
flask.request.args.get('h', ''),
):
return '', 400
resolver = flask.request.preferences.get_value('favicon_resolver') # type: ignore
# if resolver is empty or not valid, just return HTTP 400.
if not resolver or resolver not in CFG.resolver_map.keys():
return "", 400
data, mime = search_favicon(resolver, authority)
if data is not None and mime is not None:
resp = flask.Response(data, mimetype=mime) # type: ignore
resp.headers['Cache-Control'] = f"max-age={CFG.max_age}"
return resp
# return default favicon from static path
theme = flask.request.preferences.get_value("theme") # type: ignore
fav, mimetype = CFG.favicon(theme=theme)
return flask.send_from_directory(fav.parent, fav.name, mimetype=mimetype)
def search_favicon(resolver: str, authority: str) -> tuple[None | bytes, None | str]:
"""Sends the request to the favicon resolver and returns a tuple for the
favicon. The tuple consists of ``(data, mime)``; if the resolver has not
determined a favicon, both values are ``None``.
``data``:
Binary data of the favicon.
``mime``:
Mime type of the favicon.
"""
data, mime = (None, None)
func = CFG.get_resolver(resolver)
if func is None:
return data, mime
# to avoid superfluous requests to the resolver, first look in the cache
data_mime = cache.CACHE(resolver, authority)
if data_mime is not None:
return data_mime
try:
data, mime = func(authority, timeout=CFG.resolver_timeout)
if data is None or mime is None:
data, mime = (None, None)
except (HTTPError, SearxEngineResponseException):
pass
cache.CACHE.set(resolver, authority, mime, data)
return data, mime
def favicon_url(authority: str) -> str:
"""Function to generate the image URL used for favicons in SearXNG's result
lists. The ``authority`` argument (aka netloc / :rfc:`3986`) is usually a
(sub-) domain name. This function is used in the HTML (jinja) templates.
.. code:: html
<div class="favicon">
<img src="{{ favicon_url(result.parsed_url.netloc) }}">
</div>
The returned URL is a route to :py:obj:`favicon_proxy` REST API.
If the favicon is already in the cache, the returned URL is a `data URL`_
(something like ``data:image/png;base64,...``). By generating a data URL from
the :py:obj:`.cache.FaviconCache`, additional HTTP round trips via the
:py:obj:`favicon_proxy` are saved. However, it must also be borne in mind
that data URLs are not cached in the client (web browser).
.. _data URL: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
"""
resolver = flask.request.preferences.get_value('favicon_resolver') # type: ignore
# if resolver is empty or not valid, just return nothing.
if not resolver or resolver not in CFG.resolver_map.keys():
return ""
data_mime = cache.CACHE(resolver, authority)
if data_mime == (None, None):
# we have already checked, the resolver does not have a favicon
theme = flask.request.preferences.get_value("theme") # type: ignore
return CFG.favicon_data_url(theme=theme)
if data_mime is not None:
data, mime = data_mime
return f"data:{mime};base64,{str(base64.b64encode(data), 'utf-8')}" # type: ignore
h = new_hmac(CFG.secret_key, authority.encode())
proxy_url = flask.url_for('favicon_proxy')
query = urllib.parse.urlencode({"authority": authority, "h": h})
return f"{proxy_url}?{query}"

100
searx/favicons/resolvers.py Normal file
View file

@ -0,0 +1,100 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations of the favicon *resolvers* that are available in the favicon
proxy by default. A *resolver* is a function that obtains the favicon from an
external source. The *resolver* function receives two arguments (``domain,
timeout``) and returns a tuple ``(data, mime)``.
"""
from __future__ import annotations
__all__ = ["DEFAULT_RESOLVER_MAP", "allesedv", "duckduckgo", "google", "yandex"]
from typing import Callable
from searx import network
from searx import logger
DEFAULT_RESOLVER_MAP: dict[str, Callable]
logger = logger.getChild('favicons.resolvers')
def _req_args(**kwargs):
# add the request arguments from the searx.network
d = {"raise_for_httperror": False}
d.update(kwargs)
return d
def allesedv(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
"""Favicon Resolver from allesedv.com / https://favicon.allesedv.com/"""
data, mime = (None, None)
url = f"https://f1.allesedv.com/32/{domain}"
logger.debug("fetch favicon from: %s", url)
# will just return a 200 regardless of the favicon existing or not
# sometimes will be correct size, sometimes not
response = network.get(url, **_req_args(timeout=timeout))
if response and response.status_code == 200:
mime = response.headers['Content-Type']
if mime != 'image/gif':
data = response.content
return data, mime
def duckduckgo(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
"""Favicon Resolver from duckduckgo.com / https://blog.jim-nielsen.com/2021/displaying-favicons-for-any-domain/"""
data, mime = (None, None)
url = f"https://icons.duckduckgo.com/ip2/{domain}.ico"
logger.debug("fetch favicon from: %s", url)
# will return a 404 if the favicon does not exist and a 200 if it does,
response = network.get(url, **_req_args(timeout=timeout))
if response and response.status_code == 200:
# api will respond with a 32x32 png image
mime = response.headers['Content-Type']
data = response.content
return data, mime
def google(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
"""Favicon Resolver from google.com"""
data, mime = (None, None)
# URL https://www.google.com/s2/favicons?sz=32&domain={domain}" will be
# redirected (HTTP 301 Moved Permanently) to t1.gstatic.com/faviconV2:
url = (
f"https://t1.gstatic.com/faviconV2?client=SOCIAL&type=FAVICON&fallback_opts=TYPE,SIZE,URL"
f"&url=https://{domain}&size=32"
)
logger.debug("fetch favicon from: %s", url)
# will return a 404 if the favicon does not exist and a 200 if it does,
response = network.get(url, **_req_args(timeout=timeout))
if response and response.status_code == 200:
# api will respond with a 32x32 png image
mime = response.headers['Content-Type']
data = response.content
return data, mime
def yandex(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
"""Favicon Resolver from yandex.com"""
data, mime = (None, None)
url = f"https://favicon.yandex.net/favicon/{domain}"
logger.debug("fetch favicon from: %s", url)
# the api will respond with a 16x16 png image; if the favicon doesn't
# exist, it will be a 1x1 png image (70 bytes)
response = network.get(url, **_req_args(timeout=timeout))
if response and response.status_code == 200 and len(response.content) > 70:
mime = response.headers['Content-Type']
data = response.content
return data, mime
DEFAULT_RESOLVER_MAP = {
"allesedv": allesedv,
"duckduckgo": duckduckgo,
"google": google,
"yandex": yandex,
}
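The resolver contract described in the module docstring is small enough to sketch. Below is a hypothetical site-local resolver under the same assumptions as the resolvers above (searx.network.get with raise_for_httperror=False); the module path in the trailing comment is illustrative:
# Sketch of a custom resolver following the (domain, timeout) -> (data, mime) contract.
from __future__ import annotations
from searx import network
def my_resolver(domain: str, timeout: int) -> tuple[None | bytes, None | str]:
    """Fetch /favicon.ico directly from the domain (no redirect handling)."""
    data, mime = (None, None)
    response = network.get(f"https://{domain}/favicon.ico", raise_for_httperror=False, timeout=timeout)
    if response and response.status_code == 200:
        mime = response.headers.get('Content-Type')
        data = response.content
    return data, mime
# Hypothetical registration in favicons.toml:
#   [favicons.proxy.resolver_map]
#   "my" = "mymodule.my_resolver"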

View file

@ -0,0 +1,87 @@
# About SearXNG
SearXNG is a [metasearch engine] that aggregates the results of other
{{link('search engines', 'preferences')}} while storing no information
about its users.
The SearXNG project is maintained by an open community.
Join us on Matrix if you have questions or just want to chat about
SearXNG: [#searxng:matrix.org].
Help us make SearXNG better.
- You can improve SearXNG's translations with the [Weblate] tool.
- Follow the development, contribute to the project or report bugs
  using the [source repository].
- For more information, see the online documentation of the
  [SearXNG project].
## Why use it?
- SearXNG won't give you results as personalized as Google's, but it
  won't build a tracking profile on you either.
- SearXNG doesn't care what you search for, never shares any
  information with third parties, and can't be used against you.
- SearXNG is free software; its source code is 100% open and everyone
  is encouraged to improve it.
If you care about privacy and digital freedom, make SearXNG your
default search engine. You can also install and run SearXNG on your
own server.
## How do I set it as my default search engine?
SearXNG supports [OpenSearch]. For more information on how to change
your default search engine, see the documentation of your browser:
- [Firefox]
- [Microsoft Edge] - this link also provides instructions for the
  Chrome and Safari browsers.
- [Chromium]-based browsers can add search sites without even
  visiting them.
When a search engine is added, its name must be unique. If you cannot
add a search engine, please:
- remove the duplicate (the default name is SearXNG), or
- contact the owner of the instance you want to use so that they
  change its name.
## How does it work?
SearXNG is a fork of the [searx] [metasearch engine] project, itself
inspired by the [Seeks project]. It provides privacy by mixing your
queries across other platforms without storing any search data.
SearXNG can be added to your browser's search bar and can even be set
as its default search engine.
The "{{link('engine stats', 'stats')}}" link shows anonymized usage
statistics of the various search engines.
## How can I make it my own?
SearXNG appreciates your concern about search traces. Feel free to use
the [source repository] and run your own search instance.
Add your instance to the [list of public
instances]({{get_setting('brand.public_instances')}}) to help other
people protect their privacy and make the Internet freer. The more
decentralized the Internet is, the more freedom we have!
[source repository]: {{GIT_URL}}
[#searxng:matrix.org]: https://matrix.to/#/#searxng:matrix.org
[SearXNG project]: {{get_setting('brand.docs_url')}}
[searx]: https://github.com/searx/searx
[metasearch engine]: https://fr.wikipedia.org/wiki/M%C3%A9tamoteur
[Weblate]: https://translate.codeberg.org/projects/searxng/
[Seeks project]: https://beniz.github.io/seeks/
[OpenSearch]: https://github.com/dewitt/opensearch/blob/master/opensearch-1-1-draft-6.md
[Firefox]: https://support.mozilla.org/en-US/kb/add-or-remove-search-engine-firefox
[Microsoft Edge]: https://support.microsoft.com/en-us/help/4028574/microsoft-edge-change-the-default-search-engine
[Chromium]: https://www.chromium.org/tab-to-search

View file

@ -0,0 +1,97 @@
# Search syntax
SearXNG lets you modify the search categories, the engines used, and
the search language through a dedicated syntax. The list of available
engines, categories, and languages can be found on the
{{link('preferences', 'preferences')}} page.
## `!` select engine or category
To restrict the search to an engine or a category, use the "!"
character. A few usage examples:
- search **paris** on Wikipedia
  - {{search('!wp paris')}}
  - {{search('!wikipedia paris')}}
- search **paris** in the **map** category
  - {{search('!map paris')}}
- image search
  - {{search('!images Wau Holland')}}
Abbreviations of engines and languages are also accepted. Engine and
category modifiers can be chained in a complex query. For example,
{{search('!map !ddg !wp paris')}} searches for **paris** in the **map**
category of DuckDuckGo and Wikipedia.
## `:` select language
Use the ":" prefix to restrict the search to a particular language.
For example:
- search the French pages of Wikipedia
  - {{search(':fr !wp Wau Holland')}}
## `!!<bang>` external bangs (!Bang)
SearXNG supports [DuckDuckGo]-style "!Bang" searches. Use the "!!"
prefix to be redirected automatically to an external search engine.
For example:
- search on the French-language Wikipedia
  - {{search('!!wfr Wau Holland')}}
Keep in mind that such searches are executed directly on the external
engine; in that case SearXNG cannot protect your privacy.
[DuckDuckGo]: https://duckduckgo.com/bang
## `!!` automatic redirect
When using "!!" followed by one or more spaces in your query, you are
automatically redirected to the first search result. This corresponds
to Google's "I'm Feeling Lucky" feature. For example:
- search and be redirected directly to the first matching link
  - {{search('!! Wau Holland')}}
Keep in mind that the first returned link cannot be verified in any
way; it could even be a dangerous site. In that case SearXNG cannot
protect your privacy, so be careful when using this feature.
## Special queries
The _special queries_ section of the {{link('preferences',
'preferences')}} page lists keywords that serve a special purpose.
For example:
- generate a random value
  - {{search('random uuid')}}
- compute an average
  - {{search('avg 123 548 2.04 24.2')}}
- show the _User-Agent_ value used by your browser (must be enabled
  manually)
  - {{search('user-agent')}}
- convert a string into hash digests (must be enabled manually)
  - {{search('md5 lorem ipsum')}}
  - {{search('sha512 lorem ipsum')}}

View file

@ -128,9 +128,6 @@ _INSTALLED = False
LIMITER_CFG_SCHEMA = Path(__file__).parent / "limiter.toml"
"""Base configuration (schema) of the botdetection."""
LIMITER_CFG = Path('/etc/searxng/limiter.toml')
"""Local Limiter configuration."""
CFG_DEPRECATED = {
# "dummy.old.foo": "config 'dummy.old.foo' exists only for tests. Don't use it in your real project config."
}
@ -138,8 +135,12 @@ CFG_DEPRECATED = {
def get_cfg() -> config.Config:
global CFG # pylint: disable=global-statement
if CFG is None:
CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, LIMITER_CFG, CFG_DEPRECATED)
from . import settings_loader # pylint: disable=import-outside-toplevel
cfg_file = (settings_loader.get_user_cfg_folder() or Path("/etc/searxng")) / "limiter.toml"
CFG = config.Config.from_toml(LIMITER_CFG_SCHEMA, cfg_file, CFG_DEPRECATED)
return CFG
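A quick sketch of which file this change resolves to (the printed path depends on SEARXNG_SETTINGS_PATH, see searx/settings_loader.py further below; paths are illustrative):
# Sketch: reproduce the limiter.toml lookup added above.
from pathlib import Path
from searx import settings_loader
cfg_file = (settings_loader.get_user_cfg_folder() or Path("/etc/searxng")) / "limiter.toml"
print(cfg_file)  # e.g. /etc/searxng/limiter.toml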

View file

@ -12,7 +12,7 @@ ipv6_prefix = 48
[botdetection.ip_limit]
# To get unlimited access in a local network, by default link-lokal addresses
# To get unlimited access in a local network, by default link-local addresses
# (networks) are not monitored by the ip_limit
filter_link_local = false

View file

@ -120,7 +120,7 @@ _TR_LOCALES: list[str] = []
def get_translation_locales() -> list[str]:
"""Returns the list of transaltion locales (*underscore*). The list is
"""Returns the list of translation locales (*underscore*). The list is
generated from the translation folders in :origin:`searx/translations`"""
global _TR_LOCALES # pylint:disable=global-statement
@ -152,7 +152,7 @@ def locales_initialize():
def region_tag(locale: babel.Locale) -> str:
"""Returns SearXNG's region tag from the locale (e.g. zh-TW , en-US)."""
if not locale.territory:
raise ValueError('%s missed a territory')
raise ValueError('babel.Locale %s: missed a territory' % locale)
return locale.language + '-' + locale.territory

View file

@ -8,6 +8,7 @@ from timeit import default_timer
from operator import itemgetter
from searx.engines import engines
from searx.openmetrics import OpenMetricsFamily
from .models import HistogramStorage, CounterStorage, VoidHistogram, VoidCounterStorage
from .error_recorder import count_error, count_exception, errors_per_engines
@ -149,7 +150,9 @@ def get_reliabilities(engline_name_list, checker_results):
checker_result = checker_results.get(engine_name, {})
checker_success = checker_result.get('success', True)
errors = engine_errors.get(engine_name) or []
if counter('engine', engine_name, 'search', 'count', 'sent') == 0:
sent_count = counter('engine', engine_name, 'search', 'count', 'sent')
if sent_count == 0:
# no request
reliability = None
elif checker_success and not errors:
@ -164,8 +167,9 @@ def get_reliabilities(engline_name_list, checker_results):
reliabilities[engine_name] = {
'reliability': reliability,
'sent_count': sent_count,
'errors': errors,
'checker': checker_results.get(engine_name, {}).get('errors', {}),
'checker': checker_result.get('errors', {}),
}
return reliabilities
@ -245,3 +249,57 @@ def get_engines_stats(engine_name_list):
'max_time': math.ceil(max_time_total or 0),
'max_result_count': math.ceil(max_result_count or 0),
}
def openmetrics(engine_stats, engine_reliabilities):
metrics = [
OpenMetricsFamily(
key="searxng_engines_response_time_total_seconds",
type_hint="gauge",
help_hint="The average total response time of the engine",
data_info=[{'engine_name': engine['name']} for engine in engine_stats['time']],
data=[engine['total'] or 0 for engine in engine_stats['time']],
),
OpenMetricsFamily(
key="searxng_engines_response_time_processing_seconds",
type_hint="gauge",
help_hint="The average processing response time of the engine",
data_info=[{'engine_name': engine['name']} for engine in engine_stats['time']],
data=[engine['processing'] or 0 for engine in engine_stats['time']],
),
OpenMetricsFamily(
key="searxng_engines_response_time_http_seconds",
type_hint="gauge",
help_hint="The average HTTP response time of the engine",
data_info=[{'engine_name': engine['name']} for engine in engine_stats['time']],
data=[engine['http'] or 0 for engine in engine_stats['time']],
),
OpenMetricsFamily(
key="searxng_engines_result_count_total",
type_hint="counter",
help_hint="The total amount of results returned by the engine",
data_info=[{'engine_name': engine['name']} for engine in engine_stats['time']],
data=[engine['result_count'] or 0 for engine in engine_stats['time']],
),
OpenMetricsFamily(
key="searxng_engines_request_count_total",
type_hint="counter",
help_hint="The total amount of user requests made to this engine",
data_info=[{'engine_name': engine['name']} for engine in engine_stats['time']],
data=[
engine_reliabilities.get(engine['name'], {}).get('sent_count', 0) or 0
for engine in engine_stats['time']
],
),
OpenMetricsFamily(
key="searxng_engines_reliability_total",
type_hint="counter",
help_hint="The overall reliability of the engine",
data_info=[{'engine_name': engine['name']} for engine in engine_stats['time']],
data=[
engine_reliabilities.get(engine['name'], {}).get('reliability', 0) or 0
for engine in engine_stats['time']
],
),
]
return "".join([str(metric) for metric in metrics])

View file

@ -11,16 +11,12 @@ from typing import Any, Dict
import httpx
from httpx_socks import AsyncProxyTransport
from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError
import uvloop
from searx import logger
# Optional uvloop (support Python 3.6)
try:
import uvloop
except ImportError:
pass
else:
uvloop.install()
uvloop.install()
logger = logger.getChild('searx.network.client')

View file

@ -233,8 +233,7 @@ class Network:
del kwargs['raise_for_httperror']
return do_raise_for_httperror
@staticmethod
def patch_response(response, do_raise_for_httperror):
def patch_response(self, response, do_raise_for_httperror):
if isinstance(response, httpx.Response):
# requests compatibility (response is not streamed)
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
@ -242,8 +241,11 @@ class Network:
# raise an exception
if do_raise_for_httperror:
raise_for_httperror(response)
try:
raise_for_httperror(response)
except:
self._logger.warning(f"HTTP Request failed: {response.request.method} {response.request.url}")
raise
return response
def is_valid_response(self, response):
@ -269,7 +271,7 @@ class Network:
else:
response = await client.request(method, url, **kwargs)
if self.is_valid_response(response) or retries <= 0:
return Network.patch_response(response, do_raise_for_httperror)
return self.patch_response(response, do_raise_for_httperror)
except httpx.RemoteProtocolError as e:
if not was_disconnected:
# the server has closed the connection:

35
searx/openmetrics.py Normal file
View file

@ -0,0 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Module providing support for displaying data in OpenMetrics format"""
class OpenMetricsFamily: # pylint: disable=too-few-public-methods
"""A family of metrics.
The key parameter is the metric name that should be used (snake case).
The type_hint parameter must be one of 'counter', 'gauge', 'histogram', 'summary'.
The help_hint parameter is a short string explaining the metric.
The data_info parameter is a dictionary of descriptive parameters for the data point (e.g. request method/path).
The data parameter is a flat list of the actual data in the shape of a primitive type.
See https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md for more information.
"""
def __init__(self, key: str, type_hint: str, help_hint: str, data_info: list, data: list):
self.key = key
self.type_hint = type_hint
self.help_hint = help_hint
self.data_info = data_info
self.data = data
def __str__(self):
text_representation = f"""# HELP {self.key} {self.help_hint}
# TYPE {self.key} {self.type_hint}
"""
for i, data_info_dict in enumerate(self.data_info):
if not data_info_dict or not self.data[i]:
continue
info_representation = ','.join([f"{key}=\"{value}\"" for (key, value) in data_info_dict.items()])
text_representation += f"{self.key}{{{info_representation}}} {self.data[i]}\n"
return text_representation
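A usage sketch for the class above (values illustrative), showing the exposition text that __str__ renders:
# Sketch: rendering one metric family in OpenMetrics text format.
from searx.openmetrics import OpenMetricsFamily
family = OpenMetricsFamily(
    key="searxng_engines_request_count_total",
    type_hint="counter",
    help_hint="The total amount of user requests made to this engine",
    data_info=[{'engine_name': 'duckduckgo'}],
    data=[42],
)
print(str(family))
# # HELP searxng_engines_request_count_total The total amount of user requests made to this engine
# # TYPE searxng_engines_request_count_total counter
# searxng_engines_request_count_total{engine_name="duckduckgo"} 42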

View file

@ -3,19 +3,27 @@
"""
import ast
import re
import operator
from multiprocessing import Process, Queue
from typing import Callable
import flask
import babel
from flask_babel import gettext
from searx import settings
from searx.plugins import logger
name = "Basic Calculator"
description = gettext("Calculate mathematical expressions via the search bar")
default_on = False
default_on = True
preference_section = 'general'
plugin_id = 'calculator'
operators = {
logger = logger.getChild(plugin_id)
operators: dict[type, Callable] = {
ast.Add: operator.add,
ast.Sub: operator.sub,
ast.Mult: operator.mul,
@ -35,11 +43,15 @@ def _eval_expr(expr):
>>> _eval_expr('1 + 2*3**(4^5) / (6 + -7)')
-5.0
"""
return _eval(ast.parse(expr, mode='eval').body)
try:
return _eval(ast.parse(expr, mode='eval').body)
except ZeroDivisionError:
# This is undefined
return ""
def _eval(node):
if isinstance(node, ast.Constant) and isinstance(node.value, int):
if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
return node.value
if isinstance(node, ast.BinOp):
@ -51,10 +63,31 @@ def _eval(node):
raise TypeError(node)
def timeout_func(timeout, func, *args, **kwargs):
def handler(q: Queue, func, args, **kwargs): # pylint:disable=invalid-name
try:
q.put(func(*args, **kwargs))
except:
q.put(None)
raise
que = Queue()
p = Process(target=handler, args=(que, func, args), kwargs=kwargs)
p.start()
p.join(timeout=timeout)
ret_val = None
if not p.is_alive():
ret_val = que.get()
else:
logger.debug("terminate function after timeout is exceeded")
p.terminate()
p.join()
p.close()
return ret_val
def post_search(_request, search):
# don't run on public instances due to possible attack surfaces
if settings['server']['public_instance']:
return True
# only show the result of the expression on the first page
if search.search_query.pageno > 1:
@ -68,21 +101,30 @@ def post_search(_request, search):
# replace commonly used math operators with their proper Python operator
query = query.replace("x", "*").replace(":", "/")
# use UI language
ui_locale = babel.Locale.parse(flask.request.preferences.get_value('locale'), sep='-')
# parse the number system in a localized way
def _decimal(match: re.Match) -> str:
val = match.string[match.start() : match.end()]
val = babel.numbers.parse_decimal(val, ui_locale, numbering_system="latn")
return str(val)
decimal = ui_locale.number_symbols["latn"]["decimal"]
group = ui_locale.number_symbols["latn"]["group"]
query = re.sub(f"[0-9]+[{decimal}|{group}][0-9]+[{decimal}|{group}]?[0-9]?", _decimal, query)
# only numbers and math operators are accepted
if any(str.isalpha(c) for c in query):
return True
# in python, powers are calculated via **
query_py_formatted = query.replace("^", "**")
try:
result = str(_eval_expr(query_py_formatted))
if result != query:
search.result_container.answers['calculate'] = {'answer': f"{query} = {result}"}
except (TypeError, SyntaxError, ArithmeticError):
pass
# Prevent the runtime from being longer than 50 ms
result = timeout_func(0.05, _eval_expr, query_py_formatted)
if result is None or result == "":
return True
result = babel.numbers.format_decimal(result, locale=ui_locale)
search.result_container.answers['calculate'] = {'answer': f"{search.search_query.query} = {result}"}
return True
def is_allowed():
return not settings['server']['public_instance']
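A hedged usage sketch of the pieces above (assumes an importable SearXNG setup; values illustrative):
# Sketch: behavior of the safe evaluator. Run under "if __name__ == '__main__':"
# on spawn-based platforms, since timeout_func starts a worker process.
from searx.plugins.calculator import _eval_expr, timeout_func
print(_eval_expr('1 + 2*3**(4^5) / (6 + -7)'))   # -5.0, as in the doctest above
print(_eval_expr('1/0'))                         # "" -- ZeroDivisionError is caught
print(timeout_func(0.05, _eval_expr, '2**12'))   # 4096, or None if the 50 ms budget is exceeded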

View file

@ -1,35 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from flask_babel import gettext
from searx.plugins import logger
name = gettext('Hostname replace')
description = "Deprecated / contact system admin to configure 'Hostnames plugin'!!"
default_on = False
preference_section = 'general'
plugin_id = 'hostname_replace'
logger = logger.getChild(plugin_id)
REPORTED = False
def deprecated_msg():
global REPORTED # pylint: disable=global-statement
if REPORTED:
return
logger.error(
"'Hostname replace' plugin is deprecated and will be dropped soon!"
" Configure 'Hostnames plugin':"
" https://docs.searxng.org/src/searx.plugins.hostnames.html"
)
REPORTED = True
def on_result(_request, _search, result):
# pylint: disable=import-outside-toplevel, cyclic-import
from searx.plugins.hostnames import on_result as hostnames_on_result
deprecated_msg()
return hostnames_on_result(_request, _search, result)

View file

@ -1,17 +1,19 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=too-many-branches
"""In addition to rewriting/replace reslut URLs, the *hoostnames* plugin offers
other features.
"""
.. attention::
The 'Hostnames plugin' from `PR-3463
<https://github.com/searxng/searxng/pull/3463>`_ is a rewrite of the
'Hostname replace' plugin. Backwards compatibility is guaranteed for a
transitional period, but this will end soon.
The **"Hostname replace"** plugin has been replace by **"Hostnames
plugin"**, see :pull:`3463` & :pull:`3552`.
**To maintainers of SearXNG instances, please modify your old plugin config
to the new.**
The **Hostnames plugin** can be enabled by adding it to the
``enabled_plugins`` **list** in the ``settings.yml`` like so.
.. code:: yaml
enabled_plugins:
- 'Hostnames plugin'
...
- ``hostnames.replace``: A **mapping** of regular expressions to hostnames to be
replaced by other hostnames.
@ -96,7 +98,7 @@ from flask_babel import gettext
from searx import settings
from searx.plugins import logger
from searx.settings_loader import get_yaml_file
from searx.settings_loader import get_yaml_cfg
name = gettext('Hostnames plugin')
description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')
@ -118,7 +120,7 @@ def _load_regular_expressions(settings_key):
# load external file with configuration
if isinstance(setting_value, str):
setting_value = get_yaml_file(setting_value)
setting_value = get_yaml_cfg(setting_value)
if isinstance(setting_value, list):
return {re.compile(r) for r in setting_value}
@ -129,29 +131,8 @@ def _load_regular_expressions(settings_key):
return {}
# compatibility fallback for old hostname replace plugin
# TODO: remove in the future once most/all instance maintainers finished migrating # pylint: disable=fixme
def _load_regular_expressions_with_fallback(settings_key):
expressions = _load_regular_expressions(settings_key)
if expressions:
return expressions
# fallback to the old `hostname_replace` settings format
# pylint: disable=import-outside-toplevel, cyclic-import
hostname_replace_config = settings.get('hostname_replace', {})
if hostname_replace_config:
from searx.plugins.hostname_replace import deprecated_msg
deprecated_msg()
if settings_key == 'replace':
return {re.compile(p): r for (p, r) in hostname_replace_config.items() if r}
return {re.compile(p) for (p, r) in hostname_replace_config.items() if not r}
replacements = _load_regular_expressions_with_fallback('replace')
removables = _load_regular_expressions_with_fallback('remove')
replacements = _load_regular_expressions('replace')
removables = _load_regular_expressions('remove')
high_priority = _load_regular_expressions('high_priority')
low_priority = _load_regular_expressions('low_priority')
@ -163,10 +144,10 @@ def _matches_parsed_url(result, pattern):
def on_result(_request, _search, result):
for pattern, replacement in replacements.items():
if _matches_parsed_url(result, pattern):
logger.debug(result['url'])
# logger.debug(result['url'])
result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
result['url'] = urlunparse(result[parsed])
logger.debug(result['url'])
# logger.debug(result['url'])
for url_field in _url_fields:
if not result.get(url_field):

View file

@ -28,5 +28,5 @@ def post_search(request, search):
search.result_container.answers['ip'] = {'answer': gettext('Your IP is: ') + ip}
elif ua_regex.match(search.search_query.query):
ua = request.user_agent
search.result_container.answers['user-agent'] = {'answer': gettext('Your user-agent is: ') + ua}
search.result_container.answers['user-agent'] = {'answer': gettext('Your user-agent is: ') + ua.string}
return True

View file

@ -234,7 +234,7 @@ def _parse_text_and_convert(search, from_query, to_query):
value = target_from_si(float(value))
if measured.group('E'):
# when incomming notation is scientific, outgoing notation is scientific
# when incoming notation is scientific, outgoing notation is scientific
result = babel.numbers.format_scientific(value, locale=_locale)
else:
result = babel.numbers.format_decimal(value, locale=_locale, format='#,##0.##########;-#')

View file

@ -13,7 +13,7 @@ from collections import OrderedDict
import flask
import babel
from searx import settings, autocomplete
from searx import settings, autocomplete, favicons
from searx.enginelib import Engine
from searx.plugins import Plugin
from searx.locales import LOCALE_NAMES
@ -325,7 +325,7 @@ class ClientPref:
# hint: searx.webapp.get_client_settings should be moved into this class
locale: babel.Locale
"""Locale prefered by the client."""
"""Locale preferred by the client."""
def __init__(self, locale: Optional[babel.Locale] = None):
self.locale = locale
@ -406,6 +406,11 @@ class Preferences:
locked=is_locked('autocomplete'),
choices=list(autocomplete.backends.keys()) + ['']
),
'favicon_resolver': EnumStringSetting(
settings['search']['favicon_resolver'],
locked=is_locked('favicon_resolver'),
choices=list(favicons.proxy.CFG.resolver_map.keys()) + ['']
),
'image_proxy': BooleanSetting(
settings['server']['image_proxy'],
locked=is_locked('image_proxy')
@ -441,7 +446,7 @@ class Preferences:
'simple_style': EnumStringSetting(
settings['ui']['theme_args']['simple_style'],
locked=is_locked('simple_style'),
choices=['', 'auto', 'light', 'dark']
choices=['', 'auto', 'light', 'dark', 'black']
),
'center_alignment': BooleanSetting(
settings['ui']['center_alignment'],
@ -474,7 +479,6 @@ class Preferences:
self.plugins = PluginsSetting('plugins', plugins=plugins)
self.tokens = SetSetting('tokens')
self.client = client or ClientPref()
self.unknown_params: Dict[str, str] = {}
def get_as_url_params(self):
"""Return preferences as URL parameters"""
@ -518,10 +522,6 @@ class Preferences:
self.plugins.parse_cookie(input_data.get('disabled_plugins', ''), input_data.get('enabled_plugins', ''))
elif user_setting_name == 'tokens':
self.tokens.parse(user_setting)
elif not any(
user_setting_name.startswith(x) for x in ['enabled_', 'disabled_', 'engine_', 'category_', 'plugin_']
):
self.unknown_params[user_setting_name] = user_setting
def parse_form(self, input_data: Dict[str, str]):
"""Parse formular (``<input>``) data from a ``flask.request.form``"""
@ -546,8 +546,7 @@ class Preferences:
disabled_plugins.append(user_setting_name)
elif user_setting_name == 'tokens':
self.tokens.parse_form(user_setting)
else:
self.unknown_params[user_setting_name] = user_setting
self.key_value_settings['categories'].parse_form(enabled_categories)
self.engines.parse_form(disabled_engines)
self.plugins.parse_form(disabled_plugins)
@ -558,8 +557,6 @@ class Preferences:
ret_val = None
if user_setting_name in self.key_value_settings:
ret_val = self.key_value_settings[user_setting_name].get_value()
if user_setting_name in self.unknown_params:
ret_val = self.unknown_params[user_setting_name]
return ret_val
def save(self, resp: flask.Response):
@ -572,8 +569,6 @@ class Preferences:
self.engines.save(resp)
self.plugins.save(resp)
self.tokens.save('tokens', resp)
for k, v in self.unknown_params.items():
resp.set_cookie(k, v, max_age=COOKIE_MAX_AGE)
return resp
def validate_token(self, engine):

View file

@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name, missing-module-docstring, missing-class-docstring
from __future__ import annotations
from abc import abstractmethod, ABC
import re
@ -258,7 +259,7 @@ class RawTextQuery:
FeelingLuckyParser, # redirect to the first link in the results list
]
def __init__(self, query, disabled_engines):
def __init__(self, query: str, disabled_engines: list):
assert isinstance(query, str)
# input parameters
self.query = query

View file

@ -9,7 +9,6 @@ from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote
from searx import logger
from searx import utils
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error
@ -366,9 +365,9 @@ class ResultContainer:
result['score'] = result_score(result, result.get('priority'))
# removing html content and whitespace duplications
if result.get('content'):
result['content'] = utils.html_to_text(result['content']).strip()
result['content'] = result['content'].strip()
if result.get('title'):
result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
result['title'] = ' '.join(result['title'].strip().split())
for result_engine in result['engines']:
counter_add(result['score'], 'engine', result_engine, 'score')

View file

@ -20,7 +20,7 @@ if (next_call_ts == false or next_call_ts == nil) then
-- 2/ the next call is a random time between start_after_from and start_after_to
local initial_delay = math.random(start_after_from, start_after_to)
redis.call('SET', redis_key, now + initial_delay)
return { false, delay }
return { false, initial_delay }
end
-- next_call_ts is defined

View file

@ -137,9 +137,6 @@ class OnlineProcessor(EngineProcessor):
self.engine.request(query, params)
# ignoring empty urls
if params['url'] is None:
return None
if not params['url']:
return None

View file

@ -23,7 +23,7 @@ def name_to_iso4217(name):
currency = CURRENCIES['names'].get(name, [name])
if isinstance(currency, str):
return currency
return currency[0]
return currency[-1]
def iso4217_to_name(iso4217, language):

View file

@ -55,6 +55,7 @@ STYLE_NAMES = {
'AUTO': 'auto',
'LIGHT': 'light',
'DARK': 'dark',
'BLACK': 'black',
}
BRAND_CUSTOM_LINKS = {

View file

@ -12,6 +12,10 @@ general:
contact_url: false
# record stats
enable_metrics: true
# expose stats in open metrics format at /metrics
# leave empty to disable (no password set)
# open_metrics: <password>
open_metrics: ''
brand:
new_issue_url: https://github.com/searxng/searxng/issues/new
@ -35,6 +39,9 @@ search:
autocomplete: ""
# minimum characters to type before autocompleter starts
autocomplete_min: 4
# backend for the favicon near URL in search results.
# Available resolvers: "allesedv", "duckduckgo", "google", "yandex" - leave blank to turn it off by default.
favicon_resolver: ""
# Default search language - leave blank to detect from browser information or
# use codes from 'languages.py'
default_lang: "auto"
@ -219,19 +226,16 @@ outgoing:
#
# enabled_plugins:
# # these plugins are enabled if nothing is configured ..
# - 'Basic Calculator'
# - 'Hash plugin'
# - 'Self Information'
# - 'Tracker URL remover'
# - 'Unit converter plugin'
# - 'Ahmia blacklist' # activation depends on outgoing.using_tor_proxy
# # these plugins are disabled if nothing is configured ..
# - 'Hostnames plugin' # see 'hostnames' configuration below
# - 'Basic Calculator'
# - 'Open Access DOI rewrite'
# - 'Tor check plugin'
# # Read the docs before activate: auto-detection of the language could be
# # detrimental to users' expectations / users can activate the plugin in the
# # preferences if they want.
# - 'Autodetect search language'
# Configuration of the "Hostnames plugin":
#
@ -325,6 +329,41 @@ engines:
shortcut: 9g
disabled: true
- name: adobe stock
engine: adobe_stock
shortcut: asi
categories: ["images"]
# https://docs.searxng.org/dev/engines/online/adobe_stock.html
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
timeout: 6
disabled: true
- name: adobe stock video
engine: adobe_stock
shortcut: asv
network: adobe stock
categories: ["videos"]
adobe_order: relevance
adobe_content_types: ["video"]
timeout: 6
disabled: true
- name: adobe stock audio
engine: adobe_stock
shortcut: asa
network: adobe stock
categories: ["music"]
adobe_order: relevance
adobe_content_types: ["audio"]
timeout: 6
disabled: true
- name: alpine linux packages
engine: alpinelinux
disabled: true
shortcut: alp
- name: annas archive
engine: annas_archive
disabled: true
@ -404,7 +443,6 @@ engines:
shortcut: wp
# add "list" to the array to get results in the results list
display_type: ["infobox"]
base_url: 'https://{language}.wikipedia.org/'
categories: [general]
- name: bilibili
@ -477,6 +515,23 @@ engines:
# to show premium or plus results too:
# skip_premium: false
- name: cloudflareai
engine: cloudflareai
shortcut: cfai
# get api token and account id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/
cf_account_id: 'your_cf_account_id'
cf_ai_api: 'your_cf_api'
# create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/
cf_ai_gateway: 'your_cf_ai_gateway_name'
# find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation
cf_ai_model: 'ai_model_name'
# custom your preferences
# cf_ai_model_display_name: 'Cloudflare AI'
# cf_ai_model_assistant: 'prompts_for_assistant_role'
# cf_ai_model_system: 'prompts_for_system_role'
timeout: 30
disabled: true
# - name: core.ac.uk
# engine: core
# categories: science
@ -506,6 +561,8 @@ engines:
url_query: link
title_query: title
content_query: snippet
title_html_to_text: true
content_html_to_text: true
disabled: true
about:
website: https://crowdview.ai/
@ -557,33 +614,6 @@ engines:
categories: general
shortcut: cc
- name: bahnhof
engine: json_engine
search_url: https://www.bahnhof.de/api/stations/search/{query}
url_prefix: https://www.bahnhof.de/
url_query: slug
title_query: name
content_query: state
shortcut: bf
disabled: true
about:
website: https://www.bahn.de
wikidata_id: Q22811603
use_official_api: false
require_api_key: false
results: JSON
language: de
tests:
bahnhof:
matrix:
query: berlin
lang: en
result_container:
- not_empty
- ['one_title_contains', 'Berlin Hauptbahnhof']
test:
- unique_results
- name: deezer
engine: deezer
shortcut: dz
@ -618,6 +648,24 @@ engines:
shortcut: dh
categories: [it, packages]
- name: encyclosearch
engine: json_engine
shortcut: es
categories: general
paging: true
search_url: https://encyclosearch.org/encyclosphere/search?q={query}&page={pageno}&resultsPerPage=15
results_query: Results
url_query: SourceURL
title_query: Title
content_query: Description
disabled: true
about:
website: https://encyclosearch.org
official_api_documentation: https://encyclosearch.org/docs/#/rest-api
use_official_api: true
require_api_key: false
results: JSON
- name: erowid
engine: xpath
paging: true
@ -792,34 +840,40 @@ engines:
timeout: 8.0
disabled: true
- name: geizhals
engine: geizhals
shortcut: geiz
disabled: true
- name: genius
engine: genius
shortcut: gen
- name: gentoo
engine: gentoo
engine: mediawiki
shortcut: ge
timeout: 10.0
categories: ["it", "software wikis"]
base_url: "https://wiki.gentoo.org/"
api_path: "api.php"
search_type: text
timeout: 10
- name: gitlab
engine: json_engine
paging: true
search_url: https://gitlab.com/api/v4/projects?search={query}&page={pageno}
url_query: web_url
title_query: name_with_namespace
content_query: description
page_size: 20
categories: [it, repos]
engine: gitlab
base_url: https://gitlab.com
shortcut: gl
timeout: 10.0
disabled: true
about:
website: https://about.gitlab.com/
website: https://gitlab.com/
wikidata_id: Q16639197
official_api_documentation: https://docs.gitlab.com/ee/api/
use_official_api: false
require_api_key: false
results: JSON
# - name: gnome
# engine: gitlab
# base_url: https://gitlab.gnome.org
# shortcut: gn
# about:
# website: https://gitlab.gnome.org
# wikidata_id: Q44316
- name: github
engine: github
@ -898,26 +952,6 @@ engines:
shortcut: mi
disabled: true
- name: gpodder
engine: json_engine
shortcut: gpod
timeout: 4.0
paging: false
search_url: https://gpodder.net/search.json?q={query}
url_query: url
title_query: title
content_query: description
page_size: 19
categories: music
disabled: true
about:
website: https://gpodder.net
wikidata_id: Q3093354
official_api_documentation: https://gpoddernet.readthedocs.io/en/latest/api/
use_official_api: false
requires_api_key: false
results: JSON
- name: habrahabr
engine: xpath
paging: true
@ -1230,6 +1264,7 @@ engines:
# read https://docs.searxng.org/dev/engines/online/mullvad_leta.html
# - name: mullvadleta
# engine: mullvad_leta
# leta_engine: google # choose one of the following: google, brave
# use_cache: true # Only 100 non-cache searches per day, suggested only for private instances
# search_url: https://leta.mullvad.net
# categories: [general, web]
@ -1280,6 +1315,12 @@ engines:
require_api_key: false
results: JSON
- name: openlibrary
engine: openlibrary
shortcut: ol
timeout: 5
disabled: true
- name: openmeteo
engine: open_meteo
shortcut: om
@ -1540,6 +1581,25 @@ engines:
engine: reddit
shortcut: re
page_size: 25
disabled: true
- name: right dao
engine: xpath
paging: true
page_size: 12
search_url: https://rightdao.com/search?q={query}&start={pageno}
results_xpath: //div[contains(@class, "description")]
url_xpath: ../div[contains(@class, "title")]/a/@href
title_xpath: ../div[contains(@class, "title")]
content_xpath: .
categories: general
shortcut: rd
disabled: true
about:
website: https://rightdao.com/
use_official_api: false
require_api_key: false
results: HTML
- name: rottentomatoes
engine: rottentomatoes
@ -1597,11 +1657,6 @@ engines:
api_site: 'askubuntu'
categories: [it, q&a]
- name: internetarchivescholar
engine: internet_archive_scholar
shortcut: ias
timeout: 15.0
- name: superuser
engine: stackexchange
shortcut: su
@ -1780,6 +1835,22 @@ engines:
engine: unsplash
shortcut: us
- name: yandex
engine: yandex
categories: general
search_type: web
shortcut: yd
disabled: true
inactive: true
- name: yandex images
engine: yandex
categories: images
search_type: images
shortcut: ydi
disabled: true
inactive: true
- name: yandex music
engine: yandex_music
shortcut: ydm
@ -1828,25 +1899,6 @@ engines:
about:
website: https://wiby.me/
- name: alexandria
engine: json_engine
shortcut: alx
categories: general
paging: true
search_url: https://api.alexandria.org/?a=1&q={query}&p={pageno}
results_query: results
title_query: title
url_query: url
content_query: snippet
timeout: 1.5
disabled: true
about:
website: https://alexandria.org/
official_api_documentation: https://github.com/alexandria-org/alexandria-api/raw/master/README.md
use_official_api: true
require_api_key: false
results: JSON
- name: wikibooks
engine: mediawiki
weight: 0.5
@ -2015,6 +2067,16 @@ engines:
# query_str: 'SELECT * from mytable WHERE fieldname=%(query)s'
# shortcut: mysql
# Required dependency: mariadb
# - name: mariadb
# engine: mariadb_server
# database: mydatabase
# username: user
# password: pass
# limit: 10
# query_str: 'SELECT * from mytable WHERE fieldname=%(query)s'
# shortcut: mdb
- name: 1337x
engine: 1337x
shortcut: 1337x
@ -2124,28 +2186,35 @@ engines:
disabled: true
- name: yacy
# https://docs.searxng.org/dev/engines/online/yacy.html
engine: yacy
categories: general
search_type: text
base_url:
- https://yacy.searchlab.eu
- https://search.lomig.me
- https://yacy.ecosys.eu
- https://search.webproject.link
# see https://github.com/searxng/searxng/pull/3631#issuecomment-2240903027
# - https://search.kyun.li
# - https://yacy.securecomcorp.eu
# - https://yacy.myserv.ca
# - https://yacy.nsupdate.info
# - https://yacy.electroncash.de
shortcut: ya
disabled: true
# required if you aren't using HTTPS for your local yacy instance
# https://docs.searxng.org/dev/engines/online/yacy.html
# enable_http: true
# timeout: 3.0
# search_mode: 'global'
# if you aren't using HTTPS for your local yacy instance disable https
# enable_http: false
search_mode: 'global'
# timeout can be reduced in 'local' search mode
timeout: 5.0
- name: yacy images
engine: yacy
network: yacy
categories: images
search_type: image
shortcut: yai
disabled: true
# timeout can be reduced in 'local' search mode
timeout: 5.0
- name: rumble
engine: rumble
@ -2165,7 +2234,6 @@ engines:
- name: wordnik
engine: wordnik
shortcut: def
base_url: https://www.wordnik.com/
categories: [dictionaries]
timeout: 5.0
@ -2211,13 +2279,6 @@ engines:
seekr_category: videos
disabled: true
- name: sjp.pwn
engine: sjp
shortcut: sjp
base_url: https://sjp.pwn.pl/
timeout: 5.0
disabled: true
- name: stract
engine: stract
shortcut: str

View file

@ -18,7 +18,7 @@ searx_dir = abspath(dirname(__file__))
logger = logging.getLogger('searx')
OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
SXNG_LOCALE_TAGS = ['all', 'auto'] + list(l[0] for l in sxng_locales)
SIMPLE_STYLE = ('auto', 'light', 'dark')
SIMPLE_STYLE = ('auto', 'light', 'dark', 'black')
CATEGORIES_AS_TABS = {
'general': {},
'images': {},
@ -143,6 +143,7 @@ SCHEMA = {
'contact_url': SettingsValue((None, False, str), None),
'donation_url': SettingsValue((bool, str), "https://docs.searxng.org/donate.html"),
'enable_metrics': SettingsValue(bool, True),
'open_metrics': SettingsValue(str, ''),
},
'brand': {
'issue_url': SettingsValue(str, 'https://github.com/searxng/searxng/issues'),
@ -156,6 +157,7 @@ SCHEMA = {
'safe_search': SettingsValue((0, 1, 2), 0),
'autocomplete': SettingsValue(str, ''),
'autocomplete_min': SettingsValue(int, 4),
'favicon_resolver': SettingsValue(str, ''),
'default_lang': SettingsValue(tuple(SXNG_LOCALE_TAGS + ['']), ''),
'languages': SettingSublistValue(SXNG_LOCALE_TAGS, SXNG_LOCALE_TAGS),
'ban_time_on_fail': SettingsValue(numbers.Real, 5),

View file

@ -1,68 +1,116 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, too-many-branches
"""Implementations for loading configurations from YAML files. This essentially
includes the configuration of the (:ref:`SearXNG appl <searxng settings.yml>`)
server. The default configuration for the application server is loaded from the
:origin:`DEFAULT_SETTINGS_FILE <searx/settings.yml>`. This default
configuration can be completely replaced or :ref:`customized individually
<use_default_settings.yml>` and the ``SEARXNG_SETTINGS_PATH`` environment
variable can be used to set the location from which the local customizations are
to be loaded. The rules used for this can be found in the
:py:obj:`get_user_cfg_folder` function.
from typing import Optional
from os import environ
from os.path import dirname, join, abspath, isfile
- By default, local configurations are expected in folder ``/etc/searxng`` from
where applications can load them with the :py:obj:`get_yaml_cfg` function.
- By default, customized :ref:`SearXNG appl <searxng settings.yml>` settings are
expected in a file named ``settings.yml``.
"""
from __future__ import annotations
import os.path
from collections.abc import Mapping
from itertools import filterfalse
from pathlib import Path
import yaml
from searx.exceptions import SearxSettingsException
searx_dir = os.path.abspath(os.path.dirname(__file__))
searx_dir = abspath(dirname(__file__))
SETTINGS_YAML = Path("settings.yml")
DEFAULT_SETTINGS_FILE = Path(searx_dir) / SETTINGS_YAML
"""The :origin:`searx/settings.yml` file with all the default settings."""
def existing_filename_or_none(file_name: str) -> Optional[str]:
if isfile(file_name):
return file_name
return None
def load_yaml(file_name):
def load_yaml(file_name: str | Path):
"""Load YAML config from a file."""
try:
with open(file_name, 'r', encoding='utf-8') as settings_yaml:
return yaml.safe_load(settings_yaml)
return yaml.safe_load(settings_yaml) or {}
except IOError as e:
raise SearxSettingsException(e, file_name) from e
raise SearxSettingsException(e, str(file_name)) from e
except yaml.YAMLError as e:
raise SearxSettingsException(e, file_name) from e
raise SearxSettingsException(e, str(file_name)) from e
def get_yaml_file(file_name):
path = existing_filename_or_none(join(searx_dir, file_name))
if path is None:
raise FileNotFoundError(f"File {file_name} does not exist!")
def get_yaml_cfg(file_name: str | Path) -> dict:
"""Shortcut to load a YAML config from a file, located in the
return load_yaml(path)
def get_default_settings_path():
return existing_filename_or_none(join(searx_dir, 'settings.yml'))
def get_user_settings_path() -> Optional[str]:
"""Get an user settings file.
By descending priority:
1. ``environ['SEARXNG_SETTINGS_PATH']``
2. ``/etc/searxng/settings.yml`` except if ``SEARXNG_DISABLE_ETC_SETTINGS`` is ``true`` or ``1``
3. ``None``
- :py:obj:`get_user_cfg_folder` or
- in the ``searx`` folder of the SearXNG installation
"""
# check the environment variable SEARXNG_SETTINGS_PATH
# if the environment variable is defined, this is the last check
if 'SEARXNG_SETTINGS_PATH' in environ:
return existing_filename_or_none(environ['SEARXNG_SETTINGS_PATH'])
folder = get_user_cfg_folder() or Path(searx_dir)
fname = folder / file_name
if not fname.is_file():
raise FileNotFoundError(f"File {fname} does not exist!")
# if SEARXNG_DISABLE_ETC_SETTINGS don't look any further
if environ.get('SEARXNG_DISABLE_ETC_SETTINGS', '').lower() in ('1', 'true'):
return None
return load_yaml(fname)
# check /etc/searxng/settings.yml
# (continue with other locations if the file is not found)
return existing_filename_or_none('/etc/searxng/settings.yml')
def get_user_cfg_folder() -> Path | None:
"""Returns folder where the local configurations are located.
1. If the ``SEARXNG_SETTINGS_PATH`` environment is set and points to a
folder (e.g. ``/etc/mysxng/``), all local configurations are expected in
this folder. The settings of the :ref:`SearXNG appl <searxng
settings.yml>` then expected in ``settings.yml``
(e.g. ``/etc/mysxng/settings.yml``).
2. If the ``SEARXNG_SETTINGS_PATH`` environment is set and points to a file
(e.g. ``/etc/mysxng/myinstance.yml``), this file contains the settings of
the :ref:`SearXNG appl <searxng settings.yml>` and the folder
(e.g. ``/etc/mysxng/``) is used for all other configurations.
This type (``SEARXNG_SETTINGS_PATH`` points to a file) is suitable for
use cases in which different profiles of the :ref:`SearXNG appl <searxng
settings.yml>` are to be managed, such as in test scenarios.
3. If folder ``/etc/searxng`` exists, it is used.
In case none of the above paths exists, ``None`` is returned. In case the
environment variable ``SEARXNG_SETTINGS_PATH`` is set but the folder (or file)
does not exist, an :py:obj:`EnvironmentError` is raised.
"""
folder = None
settings_path = os.environ.get("SEARXNG_SETTINGS_PATH")
# Disabling the default /etc/searxng is intended exclusively for internal testing purposes
# and is therefore not documented!
disable_etc = os.environ.get('SEARXNG_DISABLE_ETC_SETTINGS', '').lower() in ('1', 'true')
if settings_path:
# rule 1. and 2.
settings_path = Path(settings_path)
if settings_path.is_dir():
folder = settings_path
elif settings_path.is_file():
folder = settings_path.parent
else:
raise EnvironmentError(1, f"{settings_path} not exists!", settings_path)
if not folder and not disable_etc:
# default: rule 3.
folder = Path("/etc/searxng")
if not folder.is_dir():
folder = None
return folder
def update_dict(default_dict, user_dict):
@ -74,7 +122,9 @@ def update_dict(default_dict, user_dict):
return default_dict
def update_settings(default_settings, user_settings):
def update_settings(default_settings: dict, user_settings: dict):
# pylint: disable=too-many-branches
# merge everything except the engines
for k, v in user_settings.items():
if k not in ('use_default_settings', 'engines'):
@ -124,6 +174,7 @@ def update_settings(default_settings, user_settings):
def is_use_default_settings(user_settings):
use_default_settings = user_settings.get('use_default_settings')
if use_default_settings is True:
return True
@ -134,25 +185,37 @@ def is_use_default_settings(user_settings):
raise ValueError('Invalid value for use_default_settings')
def load_settings(load_user_settings=True):
default_settings_path = get_default_settings_path()
user_settings_path = get_user_settings_path()
if user_settings_path is None or not load_user_settings:
# no user settings
return (load_yaml(default_settings_path), 'load the default settings from {}'.format(default_settings_path))
def load_settings(load_user_settings=True) -> tuple[dict, str]:
"""Function for loading the settings of the SearXNG application
(:ref:`settings.yml <searxng settings.yml>`)."""
# user settings
user_settings = load_yaml(user_settings_path)
if is_use_default_settings(user_settings):
msg = f"load the default settings from {DEFAULT_SETTINGS_FILE}"
cfg = load_yaml(DEFAULT_SETTINGS_FILE)
cfg_folder = get_user_cfg_folder()
if not load_user_settings or not cfg_folder:
return cfg, msg
settings_yml = os.environ.get("SEARXNG_SETTINGS_PATH")
if settings_yml and Path(settings_yml).is_file():
# see get_user_cfg_folder() --> SEARXNG_SETTINGS_PATH points to a file
settings_yml = Path(settings_yml).name
else:
# see get_user_cfg_folder() --> SEARXNG_SETTINGS_PATH points to a folder
settings_yml = SETTINGS_YAML
cfg_file = cfg_folder / settings_yml
if not cfg_file.exists():
return cfg, msg
msg = f"load the user settings from {cfg_file}"
user_cfg = load_yaml(cfg_file)
if is_use_default_settings(user_cfg):
# the user settings are merged with the default configuration
default_settings = load_yaml(default_settings_path)
update_settings(default_settings, user_settings)
return (
default_settings,
'merge the default settings ( {} ) and the user settings ( {} )'.format(
default_settings_path, user_settings_path
),
)
msg = f"merge the default settings ( {DEFAULT_SETTINGS_FILE} ) and the user settings ( {cfg_file} )"
update_settings(cfg, user_cfg)
else:
cfg = user_cfg
# the user settings, fully replace the default configuration
return (user_settings, 'load the user settings from {}'.format(user_settings_path))
return cfg, msg
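A sketch of the lookup rules implemented above (paths illustrative, taken from the get_user_cfg_folder docstring):
# Sketch: how SEARXNG_SETTINGS_PATH drives the settings lookup.
import os
from searx.settings_loader import load_settings
# 1. points to a folder -> <folder>/settings.yml is used
os.environ["SEARXNG_SETTINGS_PATH"] = "/etc/mysxng/"
# 2. points to a file -> that file is used; its parent folder holds other configs
os.environ["SEARXNG_SETTINGS_PATH"] = "/etc/mysxng/myinstance.yml"
# 3. unset -> /etc/searxng is used, if that folder exists
cfg, msg = load_settings()
print(msg)  # e.g. "load the user settings from /etc/mysxng/myinstance.yml"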

323
searx/sqlitedb.py Normal file
View file

@ -0,0 +1,323 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Implementations to make access to SQLite databases a little more convenient.
:py:obj:`SQLiteAppl`
Abstract class with which DB applications can be implemented.
:py:obj:`SQLiteProperties`:
Class to manage properties stored in a database.
----
"""
from __future__ import annotations
import sys
import re
import sqlite3
import threading
import abc
from searx import logger
logger = logger.getChild('sqlitedb')
class SQLiteAppl(abc.ABC):
"""Abstract base class for implementing convenient DB access in SQLite
applications. In the constructor, a :py:obj:`SQLiteProperties` instance is
already aggregated under ``self.properties``."""
DDL_CREATE_TABLES: dict[str, str] = {}
DB_SCHEMA: int = 1
"""As soon as changes are made to the DB schema, the version number must be
increased. Changes to the version number require the DB to be recreated (or
migrated, if a migration path exists and is implemented)."""
SQLITE_THREADING_MODE = {
0: "single-thread",
1: "multi-thread",
3: "serialized"}[sqlite3.threadsafety] # fmt:skip
"""Threading mode of the SQLite library. Depends on the options used at
compile time and is different for different distributions and architectures.
Possible values are 0:``single-thread``, 1:``multi-thread``,
3:``serialized`` (see :py:obj:`sqlite3.threadsafety`). Before Python 3.11
this value was hard-coded to 1.
Depending on this value, optimizations are made, e.g. in serialized mode
it is not necessary to create a separate DB connector for each thread.
"""
SQLITE_JOURNAL_MODE = "WAL"
SQLITE_CONNECT_ARGS = {
# "timeout": 5.0,
# "detect_types": 0,
"check_same_thread": bool(SQLITE_THREADING_MODE != "serialized"),
"cached_statements": 0, # https://github.com/python/cpython/issues/118172
# "uri": False,
"autocommit": False,
} # fmt:skip
"""Connection arguments (:py:obj:`sqlite3.connect`)
``check_same_thread``:
Is disabled by default when :py:obj:`SQLITE_THREADING_MODE` is
``serialized``. The check is more of a hindrance in this case because it
would prevent a DB connector from being used in multiple threads.
``autocommit``:
Is disabled by default. Note: autocommit option has been added in Python
3.12.
``cached_statements``:
Is set to ``0`` by default. Note: in Python 3.12+ fetch results are not
consistent in multi-threaded applications, causing an API misuse error.
Multithreaded use of SQLiteAppl is intended and supported if
threadsafety is set to 3 (aka "serialized"). CPython supports serialized
from version 3.12 on, but unfortunately only with errors:
- https://github.com/python/cpython/issues/118172
- https://github.com/python/cpython/issues/123873
The workaround for the SQLite3 multithreading cache inconsistency is to set
option ``cached_statements`` to ``0`` by default.
"""
def __init__(self, db_url):
self.db_url = db_url
self.properties = SQLiteProperties(db_url)
self.thread_local = threading.local()
self._init_done = False
self._compatibility()
def _compatibility(self):
if self.SQLITE_THREADING_MODE == "serialized":
self._DB = None
else:
msg = (
f"SQLite library is compiled with {self.SQLITE_THREADING_MODE} mode,"
" read https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety"
)
if threading.active_count() > 1:
logger.error(msg)
else:
logger.warning(msg)
if sqlite3.sqlite_version_info <= (3, 35):
# See "Generalize UPSERT:" in https://sqlite.org/releaselog/3_35_0.html
logger.critical(
"SQLite runtime library version %s is not supported (require >= 3.35)", sqlite3.sqlite_version
)
def connect(self) -> sqlite3.Connection:
"""Creates a new DB connection (:py:obj:`SQLITE_CONNECT_ARGS`). If not
already done, the DB schema is set up.
"""
if sys.version_info < (3, 12):
# Prior to Python 3.12 there is no "autocommit" option
self.SQLITE_CONNECT_ARGS.pop("autocommit", None)
self.init()
logger.debug("%s: connect to DB: %s // %s", self.__class__.__name__, self.db_url, self.SQLITE_CONNECT_ARGS)
conn = sqlite3.Connection(self.db_url, **self.SQLITE_CONNECT_ARGS) # type: ignore
conn.execute(f"PRAGMA journal_mode={self.SQLITE_JOURNAL_MODE}")
self.register_functions(conn)
return conn
def register_functions(self, conn):
"""Create user-defined_ SQL functions.
``REGEXP(<pattern>, <field>)`` : 0 | 1
`re.search`_ returns (int) 1 for a match and 0 for no match of
``<pattern>`` in ``<field>``.
.. code:: sql
SELECT '12' AS field WHERE REGEXP('^[0-9][0-9]$', field)
-- 12
SELECT REGEXP('[0-9][0-9]', 'X12Y')
-- 1
SELECT REGEXP('[0-9][0-9]', 'X1Y')
-- 0
.. _user-defined: https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.create_function
.. _deterministic: https://sqlite.org/deterministic.html
.. _re.search: https://docs.python.org/3/library/re.html#re.search
"""
conn.create_function('regexp', 2, lambda x, y: 1 if re.search(x, y) else 0, deterministic=True)
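# From Python, the registered function is then available in any SQL on this
# connection, including via the infix operator, e.g.:
#   conn.execute("SELECT name FROM properties WHERE name REGEXP ?", (pattern,))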
@property
def DB(self) -> sqlite3.Connection:
"""Provides a DB connection. The connection is a *singleton* and
therefore well suited for read access. If
:py:obj:`SQLITE_THREADING_MODE` is ``serialized`` only one DB connection
is created for all threads.
.. note::
For dedicated `transaction control`_, it is recommended to create a
new connection (:py:obj:`SQLiteAppl.connect`).
.. _transaction control:
https://docs.python.org/3/library/sqlite3.html#sqlite3-controlling-transactions
"""
if getattr(self.thread_local, 'DB', None) is None:
self.thread_local.DB = self.connect()
# Theoretically it is possible to reuse the DB cursor across threads as
# of Python 3.12, in practice the threading of the cursor seems to me to
# be so faulty that I prefer to establish one connection per thread
self.thread_local.DB.commit()
return self.thread_local.DB
# In "serialized" mode, SQLite can be safely used by multiple threads
# with no restriction.
#
# if self.SQLITE_THREADING_MODE != "serialized":
# if getattr(self.thread_local, 'DB', None) is None:
# self.thread_local.DB = self.connect()
# return self.thread_local.DB
#
# if self._DB is None:
# self._DB = self.connect() # pylint: disable=attribute-defined-outside-init
# return self._DB
def init(self):
"""Initializes the DB schema and properties, is only executed once even
if called several times."""
if self._init_done:
return
self._init_done = True
logger.debug("init DB: %s", self.db_url)
self.properties.init()
ver = self.properties("DB_SCHEMA")
if ver is None:
with self.properties.DB:
self.create_schema(self.properties.DB)
else:
ver = int(ver)
if ver != self.DB_SCHEMA:
raise sqlite3.DatabaseError("Expected DB schema v%s, DB schema is v%s" % (self.DB_SCHEMA, ver))
logger.debug("DB_SCHEMA = %s", ver)
def create_schema(self, conn):
logger.debug("create schema ..")
with conn:
for table_name, sql in self.DDL_CREATE_TABLES.items():
conn.execute(sql)
self.properties.set(f"Table {table_name} created", table_name)
self.properties.set("DB_SCHEMA", self.DB_SCHEMA)
self.properties.set("LAST_MAINTENANCE", "")
class SQLiteProperties(SQLiteAppl):
"""Simple class to manage properties of a DB application in the DB. The
object has its own DB connection and transaction area.
.. code:: sql
CREATE TABLE IF NOT EXISTS properties (
name TEXT,
value TEXT,
m_time INTEGER DEFAULT (strftime('%s', 'now')),
PRIMARY KEY (name))
"""
SQLITE_JOURNAL_MODE = "WAL"
DDL_PROPERTIES = """\
CREATE TABLE IF NOT EXISTS properties (
name TEXT,
value TEXT,
m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
PRIMARY KEY (name))"""
"""Table to store properties of the DB application"""
SQL_GET = "SELECT value FROM properties WHERE name = ?"
SQL_M_TIME = "SELECT m_time FROM properties WHERE name = ?"
SQL_SET = (
"INSERT INTO properties (name, value) VALUES (?, ?)"
" ON CONFLICT(name) DO UPDATE"
" SET value=excluded.value, m_time=strftime('%s', 'now')"
)
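# SQL_SET is an "upsert": a second set() for the same name updates value and
# m_time in place instead of inserting a duplicate row (needs a modern
# SQLite; _compatibility() requires >= 3.35 anyway).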
SQL_TABLE_EXISTS = (
"SELECT name FROM sqlite_master"
" WHERE type='table' AND name='properties'"
) # fmt:skip
SQLITE_CONNECT_ARGS = dict(SQLiteAppl.SQLITE_CONNECT_ARGS)
SQLITE_CONNECT_ARGS["autocommit"] = True # This option has no effect before Python 3.12
def __init__(self, db_url: str): # pylint: disable=super-init-not-called
self.db_url = db_url
self.thread_local = threading.local()
self._init_done = False
self._compatibility()
def init(self):
"""Initializes DB schema of the properties in the DB."""
if self._init_done:
return
self._init_done = True
logger.debug("init properties of DB: %s", self.db_url)
with self.DB as conn:
res = conn.execute(self.SQL_TABLE_EXISTS)
if res.fetchone() is None:  # DB schema needs to be created
self.create_schema(conn)
def __call__(self, name, default=None):
"""Returns the value of the property ``name`` or ``default`` if property
not exists in DB."""
res = self.DB.execute(self.SQL_GET, (name,)).fetchone()
if res is None:
return default
return res[0]
def set(self, name, value):
"""Set ``value`` of property ``name`` in DB. If property already
exists, update the ``m_time`` (and the value)."""
self.DB.execute(self.SQL_SET, (name, value))
if sys.version_info < (3, 12):
# Prior to Python 3.12 there is no "autocommit" option, so commit
# explicitly.
self.DB.commit()
def row(self, name, default=None):
"""Returns the DB row of property ``name`` or ``default`` if property
not exists in DB."""
cur = self.DB.cursor()
cur.execute("SELECT * FROM properties WHERE name = ?", (name,))
res = cur.fetchone()
if res is None:
return default
col_names = [column[0] for column in cur.description]
return dict(zip(col_names, res))
def m_time(self, name, default: int = 0) -> int:
"""Last modification time of this property."""
res = self.DB.execute(self.SQL_M_TIME, (name,)).fetchone()
if res is None:
return default
return int(res[0])
def create_schema(self, conn):
with conn:
conn.execute(self.DDL_PROPERTIES)
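A minimal usage sketch of the new module (all names below are assumptions for illustration, not part of this commit): a DB application subclasses ``SQLiteAppl``, declares its tables in ``DDL_CREATE_TABLES``, and gets connection handling, schema versioning and the property store for free:

# Hypothetical example application built on SQLiteAppl.
from searx.sqlitedb import SQLiteAppl

class ExampleDB(SQLiteAppl):
    DB_SCHEMA = 1
    DDL_CREATE_TABLES = {
        "bangs": "CREATE TABLE IF NOT EXISTS bangs (bang TEXT PRIMARY KEY, url TEXT)",
    }

db = ExampleDB("/tmp/example.db")
with db.connect() as conn:  # dedicated connection for the write transaction
    conn.execute("INSERT OR REPLACE INTO bangs VALUES (?, ?)", ("!sx", "https://example.org"))
url = db.DB.execute("SELECT url FROM bangs WHERE bang = ?", ("!sx",)).fetchone()
db.properties.set("LAST_MAINTENANCE", "manual run")  # property store comes for free

Reads can go through the shared ``db.DB`` singleton; writes are better done on a dedicated ``db.connect()`` connection, as the docstrings above recommend.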

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff.