data: engine descriptions: use SQLite instead of JSON

To reduce memory usage, use a SQLite database to store the engine descriptions. A dump of the database is stored in Git to facilitate maintenance, especially the pull requests made automatically every month.

Related to:

* https://github.com/searxng/searxng/discussions/2633
* https://github.com/searxng/searxng/pull/3443
commit 83d4a2ebb0
parent ec41b53587
8 changed files with 4126 additions and 7987 deletions
searx/data/__init__.py

@@ -13,15 +13,20 @@ __all__ = [
     'WIKIDATA_UNITS',
     'EXTERNAL_BANGS',
     'OSM_KEYS_TAGS',
-    'ENGINE_DESCRIPTIONS',
     'LOCALES',
     'ahmia_blacklist_loader',
+    'fetch_engine_descriptions',
 ]

 import json
+import sqlite3
+from contextlib import contextmanager
+from typing import Dict, Generator, List
 from pathlib import Path


 data_dir = Path(__file__).parent
+data_connection_local = {}


 def _load(filename):
@@ -29,6 +34,41 @@ def _load(filename):
         return json.load(f)


+@contextmanager
+def sql_connection(filename: str) -> Generator[sqlite3.Connection, None, None]:
+    """Return a read-only SQLite connection to ``filename``.
+    The filename is relative to searx/data.
+
+    Multiple calls to this function with the same filename
+    return the same shared connection.
+    """
+    dict_id = filename
+    connection = data_connection_local.get(dict_id)
+    if connection is None:
+        data_filename = str(data_dir / filename)
+        # Open the database in read-only mode and allow the connection to be
+        # shared between threads: https://www.sqlite.org/faq.html#q6
+        # See https://ricardoanderegg.com/posts/python-sqlite-thread-safety/
+        # and https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety
+        # sqlite3.threadsafety is hard coded to 1.
+        # The only reliable way to check whether multithreading is supported
+        # is to run this query:
+        #   SELECT * FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'
+        # but THREADSAFE=1 on Linux anyway.
+        connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True, check_same_thread=False)
+        # 512KB of cache instead of 2MB (512KB / 4KB = 128 pages, 4KB is the default page size)
+        # https://www.sqlite.org/pragma.html#pragma_cache_size
+        connection.execute("PRAGMA cache_size = 128;")
+        data_connection_local[dict_id] = connection
+    yield connection
+
+
+def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
+    """Return engine description and source for each engine name."""
+    with sql_connection("engine_descriptions.db") as conn:
+        res = conn.execute("SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,))
+        return {result[0]: [result[1], result[2]] for result in res.fetchall()}
+
+
 def ahmia_blacklist_loader():
     """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
     names. The MD5 values are fetched by::
@@ -48,6 +88,5 @@ EXTERNAL_URLS = _load('external_urls.json')
 WIKIDATA_UNITS = _load('wikidata_units.json')
 EXTERNAL_BANGS = _load('external_bangs.json')
 OSM_KEYS_TAGS = _load('osm_keys_tags.json')
-ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
 ENGINE_TRAITS = _load('engine_traits.json')
 LOCALES = _load('locales.json')
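A minimal usage sketch of the new API (not part of the commit; it assumes a SearXNG checkout with searx/data/engine_descriptions.db in place):

    from searx.data import fetch_engine_descriptions, sql_connection

    # one [description, source] pair per engine, for a single language
    descriptions = fetch_engine_descriptions('en')
    for engine, (description, source) in list(descriptions.items())[:3]:
        print(engine, source, description[:60])

    # the module-level cache hands back the same read-only connection per file
    with sql_connection('engine_descriptions.db') as a:
        with sql_connection('engine_descriptions.db') as b:
            assert a is b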
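The comments in sql_connection point at the pragma_compile_options query as the only reliable thread-safety check; a hedged sketch of running that check (not part of the commit):

    import sqlite3

    con = sqlite3.connect(':memory:')
    row = con.execute(
        "SELECT compile_options FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'"
    ).fetchone()
    print(row)  # ('THREADSAFE=1',) on typical Linux builds
    con.close()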
searx/data/dumps/README.rst (new file, 3 lines)

@@ -0,0 +1,3 @@
+Dumps of the SQLite files in ``searx.data``.
+
+These files are not used by SearXNG; they are kept for reference.
searx/data/dumps/engine_descriptions.csv (new file, 4049 lines)

File diff suppressed because one or more lines are too long
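Since the CSV dump mirrors the database row for row, it can be used to rebuild engine_descriptions.db by hand. A hedged sketch under that assumption (paths relative to the repository root; not part of the commit):

    import csv
    import sqlite3

    with open('searx/data/dumps/engine_descriptions.csv', encoding='utf8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the "language","engine","description","source" header
        rows = list(reader)

    con = sqlite3.connect('searx/data/engine_descriptions.db')
    con.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
    con.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", rows)
    con.execute('CREATE INDEX index_engine_descriptions ON engine_descriptions("language")')
    con.commit()
    con.close()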
searx/data/engine_descriptions.db (new binary file)

Binary file not shown.
searx/data/engine_descriptions.json (deleted)

File diff suppressed because one or more lines are too long
searx/webapp.py

@@ -58,7 +58,7 @@ from searx import infopage
 from searx import limiter
 from searx.botdetection import link_token

-from searx.data import ENGINE_DESCRIPTIONS
+from searx.data import fetch_engine_descriptions
 from searx.results import Timing
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
@@ -1102,17 +1102,10 @@ def image_proxy():
 @app.route('/engine_descriptions.json', methods=['GET'])
 def engine_descriptions():
     locale = get_locale().split('_')[0]
-    result = ENGINE_DESCRIPTIONS['en'].copy()
+    result = fetch_engine_descriptions('en')
     if locale != 'en':
-        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
+        for engine, description in fetch_engine_descriptions(locale).items():
             result[engine] = description
-    for engine, description in result.items():
-        if len(description) == 2 and description[1] == 'ref':
-            ref_engine, ref_lang = description[0].split(':')
-            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
-        if isinstance(description, str):
-            description = [description, 'wikipedia']
-        result[engine] = description

     # overwrite by about:description (from settings)
     for engine_name, engine_mod in engines.items():
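The route shrinks to a plain "English base, locale override" merge: the database stores a full [description, source] row per language, so the old 'ref' indirections and bare-string Wikipedia shorthand no longer need resolving. A hedged standalone sketch of that merge (the helper name is mine, not the commit's):

    from searx.data import fetch_engine_descriptions

    def merged_descriptions(locale: str) -> dict:
        # English descriptions as the base ...
        result = fetch_engine_descriptions('en')
        if locale != 'en':
            # ... overridden wherever the requested locale has a row
            result.update(fetch_engine_descriptions(locale))
        return result

    # merged_descriptions('fr')['wikipedia'] -> [French description, source],
    # falling back to the English row for engines without a French entry.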
searxng_extra/update/update_engine_descriptions.py

@@ -9,22 +9,24 @@ Output file: :origin:`searx/data/engine_descriptions.json`.

 # pylint: disable=invalid-name, global-statement

+import csv
 import json
+import sqlite3
 from urllib.parse import urlparse
-from os.path import join
 from pathlib import Path

 from lxml.html import fromstring

 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, searx_useragent
 from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
-from searx import searx_dir
 from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.network
 from searx.data import data_dir

-DATA_FILE = data_dir / 'engine_descriptions.json'
+DATABASE_FILE = data_dir / 'engine_descriptions.db'
+CSV_FILE = data_dir / 'dumps' / 'engine_descriptions.csv'

 set_loggers(wikidata, 'wikidata')
 locales_initialize()
@@ -323,37 +325,32 @@ def fetch_website_descriptions():
         fetch_website_description(engine_name, website)


-def get_engine_descriptions_filename():
-    return join(join(searx_dir, "data"), "engine_descriptions.json")
-
-
-def get_output():
+def write_db():
     """
-    From descriptions[engine][language] = [description, source]
-    To
-
-    * output[language][engine] = description_and_source
-    * description_and_source can be:
-      * [description, source]
-      * description (if source = "wikipedia")
-      * [f"engine:lang", "ref"] (reference to another existing description)
+    Erase and rewrite the SQLite database searx/data/engine_descriptions.db:
+
+    * create one table, engine_descriptions
+    * write all the values into it
+
+    Also write a CSV dump of the values to searx/data/dumps/engine_descriptions.csv.
     """
-    output = {locale: {} for locale in LOCALE_NAMES}
-
-    seen_descriptions = {}
-
-    for engine_name, lang_descriptions in descriptions.items():
-        for language, description in lang_descriptions.items():
-            if description[0] in seen_descriptions:
-                ref = seen_descriptions[description[0]]
-                description = [f'{ref[0]}:{ref[1]}', 'ref']
-            else:
-                seen_descriptions[description[0]] = (engine_name, language)
-            if description[1] == 'wikipedia':
-                description = description[0]
-            output.setdefault(language, {}).setdefault(engine_name, description)
-
-    return output
+    data = [
+        (language, engine_name, description[0], description[1])
+        for engine_name, lang_descriptions in descriptions.items()
+        for language, description in lang_descriptions.items()
+    ]
+    data.sort(key=lambda item: (item[0], item[1]))
+    Path(DATABASE_FILE).unlink(missing_ok=True)
+    with sqlite3.connect(DATABASE_FILE) as con:
+        cur = con.cursor()
+        cur.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
+        cur.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", data)
+        cur.execute('CREATE INDEX index_engine_descriptions ON engine_descriptions("language")')
+        con.commit()
+    with CSV_FILE.open('w', encoding="utf8") as f:
+        w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
+        w.writerow(["language", "engine", "description", "source"])
+        for row in data:
+            w.writerow(row)


 def main():
@@ -361,10 +358,7 @@ def main():
     fetch_wikidata_descriptions()
     fetch_wikipedia_descriptions()
     fetch_website_descriptions()

-    output = get_output()
-    with DATA_FILE.open('w', encoding='utf8') as f:
-        f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
+    write_db()


 if __name__ == "__main__":
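A hedged sketch (not part of the commit) for sanity-checking the database write_db() produces, including whether the language index is picked up for lookups:

    import sqlite3

    con = sqlite3.connect('file:searx/data/engine_descriptions.db?mode=ro', uri=True)
    total, languages = con.execute(
        "SELECT count(*), count(DISTINCT language) FROM engine_descriptions"
    ).fetchone()
    print(total, 'rows across', languages, 'languages')
    plan = con.execute(
        "EXPLAIN QUERY PLAN SELECT engine FROM engine_descriptions WHERE language = 'en'"
    ).fetchall()
    print(plan)  # should mention index_engine_descriptions
    con.close()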
setup.py (1 line changed)

@@ -61,6 +61,7 @@ setup(
             'data/*.json',
             'data/*.txt',
             'data/*.ftz',
+            'data/*.db',
             'infopage/*/*',
             'static/themes/simple/css/*',
             'static/themes/simple/css/*/*',