data: engine descriptions: use SQLite instead of JSON

To reduce memory usage, use a SQLite database to store the engine descriptions.
A dump of the database is kept in Git to make maintenance easier,
in particular for the pull requests that are created automatically every month.

Related to
* https://github.com/searxng/searxng/discussions/2633
* https://github.com/searxng/searxng/pull/3443
This commit is contained in:
Alexandre Flament 2024-05-04 08:04:02 +00:00
parent ec41b53587
commit 83d4a2ebb0
8 changed files with 4126 additions and 7987 deletions

View file

@ -13,15 +13,20 @@ __all__ = [
'WIKIDATA_UNITS',
'EXTERNAL_BANGS',
'OSM_KEYS_TAGS',
'ENGINE_DESCRIPTIONS',
'LOCALES',
'ahmia_blacklist_loader',
'fetch_engine_descriptions',
]
import json
import sqlite3
from contextlib import contextmanager
from typing import Dict, Generator, List
from pathlib import Path
# Directory that contains this module; data file names are resolved relative to it.
data_dir = Path(__file__).parent
# Cache of opened SQLite connections, keyed by data file name (filled by sql_connection).
data_connection_local: Dict[str, sqlite3.Connection] = {}
def _load(filename):
@ -29,6 +34,41 @@ def _load(filename):
return json.load(f)
@contextmanager
def sql_connection(filename: str) -> Generator[sqlite3.Connection, None, None]:
    """Yield a read-only SQLite connection to ``filename``.

    ``filename`` is relative to ``searx/data``.

    The connection is opened on first use and stored in ``data_connection_local``
    under ``filename``; every later call with the same filename always yields
    that same connection.  The connection is *not* closed when the ``with``
    block ends, so that it can be reused.
    """
    connection = data_connection_local.get(filename)
    if connection is None:
        # Build the URI with Path.as_uri() so that characters which are special
        # in a URI ('?', '#', '%', spaces, ...) and may appear in the installation
        # path are percent-encoded instead of corrupting the "?mode=ro" query.
        data_uri = (data_dir / filename).absolute().as_uri()
        # open database in read only mode and allow to share between threads
        # https://www.sqlite.org/faq.html#q6
        # see https://ricardoanderegg.com/posts/python-sqlite-thread-safety/
        # and https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety
        # sqlite3.threadsafety is hard coded to 1
        # the only reliable way to check if multithreading is supported is to run this query
        # SELECT * FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'
        # but THREADSAFE=1 on Linux anyway
        connection = sqlite3.connect(f'{data_uri}?mode=ro', uri=True, check_same_thread=False)
        # 512KB of cache instead of 2MB (512KB / 4KB = 128, 4KB is the default page size)
        # https://www.sqlite.org/pragma.html#pragma_cache_size
        connection.execute("PRAGMA cache_size = 128;")
        data_connection_local[filename] = connection
    # Yield the same variable that was looked up / created above, so that a
    # cache hit returns the cached connection instead of an unbound name.
    yield connection
def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
    """Look up the stored engine descriptions for ``language``.

    Queries the ``engine_descriptions`` table of ``engine_descriptions.db`` and
    returns a dict that maps each engine name to a two-item list
    ``[description, source]``.  The dict is empty when no row matches.
    """
    descriptions = {}
    with sql_connection("engine_descriptions.db") as conn:
        cursor = conn.execute("SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,))
        for engine, description, source in cursor.fetchall():
            descriptions[engine] = [description, source]
    return descriptions
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
@ -48,6 +88,5 @@ EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
LOCALES = _load('locales.json')

View file

@ -0,0 +1,3 @@
Dumps of the SQLite files in ``searx.data``.
These files are not used by SearXNG; they are kept here for reference only.

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

View file

@ -58,7 +58,7 @@ from searx import infopage
from searx import limiter
from searx.botdetection import link_token
from searx.data import ENGINE_DESCRIPTIONS
from searx.data import fetch_engine_descriptions
from searx.results import Timing
from searx.settings_defaults import OUTPUT_FORMATS
from searx.settings_loader import get_default_settings_path
@ -1102,17 +1102,10 @@ def image_proxy():
@app.route('/engine_descriptions.json', methods=['GET'])
def engine_descriptions():
locale = get_locale().split('_')[0]
result = ENGINE_DESCRIPTIONS['en'].copy()
result = fetch_engine_descriptions('en')
if locale != 'en':
for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
for engine, description in fetch_engine_descriptions(locale).items():
result[engine] = description
for engine, description in result.items():
if len(description) == 2 and description[1] == 'ref':
ref_engine, ref_lang = description[0].split(':')
description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
if isinstance(description, str):
description = [description, 'wikipedia']
result[engine] = description
# overwrite by about:description (from settings)
for engine_name, engine_mod in engines.items():