data: engine descriptions: use SQLite instead of JSON

To reduce memory usage, use a SQLite database to store the engine descriptions.
A dump of the database is stored in Git to facilitate maintenance,
especially the pull requests made automatically every month.

Related to
* https://github.com/searxng/searxng/discussions/2633
* https://github.com/searxng/searxng/pull/3443
This commit is contained in:
Alexandre Flament 2024-05-04 08:04:02 +00:00
parent ec41b53587
commit 83d4a2ebb0
8 changed files with 4126 additions and 7987 deletions

View file

@ -13,15 +13,20 @@ __all__ = [
'WIKIDATA_UNITS',
'EXTERNAL_BANGS',
'OSM_KEYS_TAGS',
'ENGINE_DESCRIPTIONS',
'LOCALES',
'ahmia_blacklist_loader',
'fetch_engine_descriptions',
]
import json
import sqlite3
from contextlib import contextmanager
from typing import Dict, Generator, List
from pathlib import Path
data_dir = Path(__file__).parent
data_connection_local = {}
def _load(filename):
@ -29,6 +34,41 @@ def _load(filename):
return json.load(f)
@contextmanager
def sql_connection(filename: str) -> Generator[sqlite3.Connection, None, None]:
"""Return a read only SQLite connection to filename.
The filename is relative to searx/data
Multiple calls to this function in the same thread,
already return the same connection.
"""
dict_id = filename
connection = data_connection_local.get(dict_id)
if connection is None:
data_filename = str(data_dir / filename)
# open database in read only mode and allow to share between threads
# https://www.sqlite.org/faq.html#q6
# see https://ricardoanderegg.com/posts/python-sqlite-thread-safety/
# and https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety
# sqlite3.threadsafety is hard coded to 1
# the only reliable way to check if multithreading is supported is to run this query
# SELECT * FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'
# but THREADSAFE=1 on Linux anyway
data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True, check_same_thread=False)
# 512KB of cache instead of 2MB (512KB / 4KB = 128, 4KB is the default page size)
# https://www.sqlite.org/pragma.html#pragma_cache_size
data_connection.execute("PRAGMA cache_size = 128;")
data_connection_local[dict_id] = data_connection
yield data_connection
def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
"""Return engine description and source for each engine name."""
with sql_connection("engine_descriptions.db") as conn:
res = conn.execute("SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,))
return {result[0]: [result[1], result[2]] for result in res.fetchall()}
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
@ -48,6 +88,5 @@ EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
LOCALES = _load('locales.json')

View file

@ -0,0 +1,3 @@
Dumps of the SQLite files in ``searx.data``.
These files are not used by SearXNG, they are here for reference.

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long