data: engine descriptions: use SQLite instead of JSON

To reduce memory usage, use a SQLite database to store the engine descriptions. A dump of the database is stored in Git to facilitate maintenance, especially the pull requests made automatically every month.

Related to:

* https://github.com/searxng/searxng/discussions/2633
* https://github.com/searxng/searxng/pull/3443
commit 83d4a2ebb0
parent ec41b53587
8 changed files with 4126 additions and 7987 deletions
searx/data/__init__.py

@@ -13,15 +13,20 @@ __all__ = [
     'WIKIDATA_UNITS',
     'EXTERNAL_BANGS',
     'OSM_KEYS_TAGS',
-    'ENGINE_DESCRIPTIONS',
     'LOCALES',
     'ahmia_blacklist_loader',
+    'fetch_engine_descriptions',
 ]

 import json
+import sqlite3
+from contextlib import contextmanager
+from typing import Dict, Generator, List
 from pathlib import Path


 data_dir = Path(__file__).parent
+data_connection_local = {}


 def _load(filename):
@@ -29,6 +34,41 @@ def _load(filename):
         return json.load(f)


+@contextmanager
+def sql_connection(filename: str) -> Generator[sqlite3.Connection, None, None]:
+    """Return a read-only SQLite connection to ``filename``.
+    The filename is relative to searx/data.
+
+    Multiple calls to this function with the same filename
+    return the same shared connection.
+    """
+    dict_id = filename
+    connection = data_connection_local.get(dict_id)
+    if connection is None:
+        data_filename = str(data_dir / filename)
+        # Open the database in read-only mode and allow the connection to be
+        # shared between threads: https://www.sqlite.org/faq.html#q6
+        # See https://ricardoanderegg.com/posts/python-sqlite-thread-safety/
+        # and https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety
+        # sqlite3.threadsafety is hard coded to 1.
+        # The only reliable way to check whether multithreading is supported
+        # is to run this query:
+        #   SELECT * FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'
+        # but THREADSAFE=1 on Linux anyway.
+        connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True, check_same_thread=False)
+        # 512KB of cache instead of 2MB (512KB / 4KB = 128 pages, 4KB is the default page size)
+        # https://www.sqlite.org/pragma.html#pragma_cache_size
+        connection.execute("PRAGMA cache_size = 128;")
+        data_connection_local[dict_id] = connection
+    yield connection
+
+
+def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
+    """Return engine description and source for each engine name."""
+    with sql_connection("engine_descriptions.db") as conn:
+        res = conn.execute("SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,))
+        return {result[0]: [result[1], result[2]] for result in res.fetchall()}
+
+
 def ahmia_blacklist_loader():
     """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
     names. The MD5 values are fetched by::
@@ -48,6 +88,5 @@ EXTERNAL_URLS = _load('external_urls.json')
 WIKIDATA_UNITS = _load('wikidata_units.json')
 EXTERNAL_BANGS = _load('external_bangs.json')
 OSM_KEYS_TAGS = _load('osm_keys_tags.json')
-ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
 ENGINE_TRAITS = _load('engine_traits.json')
 LOCALES = _load('locales.json')
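A minimal usage sketch of the new API (not part of the commit; it assumes a SearXNG checkout with searx/data/engine_descriptions.db in place):

    from searx.data import fetch_engine_descriptions, sql_connection

    # one [description, source] pair per engine, for a single language
    descriptions = fetch_engine_descriptions('en')
    for engine, (description, source) in list(descriptions.items())[:3]:
        print(engine, source, description[:60])

    # the module-level cache hands back the same read-only connection per file
    with sql_connection('engine_descriptions.db') as a:
        with sql_connection('engine_descriptions.db') as b:
            assert a is b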
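The comments in sql_connection point at the pragma_compile_options query as the only reliable thread-safety check; a hedged sketch of running that check (not part of the commit):

    import sqlite3

    con = sqlite3.connect(':memory:')
    row = con.execute(
        "SELECT compile_options FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'"
    ).fetchone()
    print(row)  # ('THREADSAFE=1',) on typical Linux builds
    con.close()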
searx/data/dumps/README.rst (new file, 3 lines)

@@ -0,0 +1,3 @@
+Dumps of the SQLite files in ``searx.data``.
+
+These files are not used by SearXNG; they are kept for reference.
searx/data/dumps/engine_descriptions.csv (new file, 4049 lines)

File diff suppressed because one or more lines are too long
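Since the CSV dump mirrors the database row for row, it can be used to rebuild engine_descriptions.db by hand. A hedged sketch under that assumption (paths relative to the repository root; not part of the commit):

    import csv
    import sqlite3

    with open('searx/data/dumps/engine_descriptions.csv', encoding='utf8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the "language","engine","description","source" header
        rows = list(reader)

    con = sqlite3.connect('searx/data/engine_descriptions.db')
    con.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
    con.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", rows)
    con.execute('CREATE INDEX index_engine_descriptions ON engine_descriptions("language")')
    con.commit()
    con.close()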
searx/data/engine_descriptions.db (new binary file)

Binary file not shown.
searx/data/engine_descriptions.json (deleted)

File diff suppressed because one or more lines are too long
searx/webapp.py

@@ -58,7 +58,7 @@ from searx import infopage
 from searx import limiter
 from searx.botdetection import link_token

-from searx.data import ENGINE_DESCRIPTIONS
+from searx.data import fetch_engine_descriptions
 from searx.results import Timing
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
@@ -1102,17 +1102,10 @@ def image_proxy():
 @app.route('/engine_descriptions.json', methods=['GET'])
 def engine_descriptions():
     locale = get_locale().split('_')[0]
-    result = ENGINE_DESCRIPTIONS['en'].copy()
+    result = fetch_engine_descriptions('en')
     if locale != 'en':
-        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
+        for engine, description in fetch_engine_descriptions(locale).items():
             result[engine] = description
-    for engine, description in result.items():
-        if len(description) == 2 and description[1] == 'ref':
-            ref_engine, ref_lang = description[0].split(':')
-            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
-        if isinstance(description, str):
-            description = [description, 'wikipedia']
-        result[engine] = description

     # overwrite by about:description (from settings)
     for engine_name, engine_mod in engines.items():
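The route shrinks to a plain "English base, locale override" merge: the database stores a full [description, source] row per language, so the old 'ref' indirections and bare-string Wikipedia shorthand no longer need resolving. A hedged standalone sketch of that merge (the helper name is mine, not the commit's):

    from searx.data import fetch_engine_descriptions

    def merged_descriptions(locale: str) -> dict:
        # English descriptions as the base ...
        result = fetch_engine_descriptions('en')
        if locale != 'en':
            # ... overridden wherever the requested locale has a row
            result.update(fetch_engine_descriptions(locale))
        return result

    # merged_descriptions('fr')['wikipedia'] -> [French description, source],
    # falling back to the English row for engines without a French entry.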
searxng_extra/update/update_engine_descriptions.py

@@ -9,22 +9,24 @@ Output file: :origin:`searx/data/engine_descriptions.json`.

 # pylint: disable=invalid-name, global-statement

+import csv
 import json
+import sqlite3
 from urllib.parse import urlparse
-from os.path import join
 from pathlib import Path

 from lxml.html import fromstring

 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, searx_useragent
 from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
-from searx import searx_dir
 from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.network
 from searx.data import data_dir

-DATA_FILE = data_dir / 'engine_descriptions.json'
+DATABASE_FILE = data_dir / 'engine_descriptions.db'
+CSV_FILE = data_dir / 'dumps' / 'engine_descriptions.csv'

 set_loggers(wikidata, 'wikidata')
 locales_initialize()
@@ -323,37 +325,32 @@ def fetch_website_descriptions():
         fetch_website_description(engine_name, website)


-def get_engine_descriptions_filename():
-    return join(join(searx_dir, "data"), "engine_descriptions.json")
-
-
-def get_output():
+def write_db():
     """
-    From descriptions[engine][language] = [description, source]
-    To
-
-    * output[language][engine] = description_and_source
-    * description_and_source can be:
-      * [description, source]
-      * description (if source = "wikipedia")
-      * [f"engine:lang", "ref"] (reference to another existing description)
+    Erase and rewrite the SQLite database searx/data/engine_descriptions.db:
+
+    * create one table, engine_descriptions
+    * write all the values into it
+
+    Also write a CSV dump of the values to searx/data/dumps/engine_descriptions.csv.
     """
-    output = {locale: {} for locale in LOCALE_NAMES}
-
-    seen_descriptions = {}
-
-    for engine_name, lang_descriptions in descriptions.items():
-        for language, description in lang_descriptions.items():
-            if description[0] in seen_descriptions:
-                ref = seen_descriptions[description[0]]
-                description = [f'{ref[0]}:{ref[1]}', 'ref']
-            else:
-                seen_descriptions[description[0]] = (engine_name, language)
-            if description[1] == 'wikipedia':
-                description = description[0]
-            output.setdefault(language, {}).setdefault(engine_name, description)
-
-    return output
+    data = [
+        (language, engine_name, description[0], description[1])
+        for engine_name, lang_descriptions in descriptions.items()
+        for language, description in lang_descriptions.items()
+    ]
+    data.sort(key=lambda item: (item[0], item[1]))
+    Path(DATABASE_FILE).unlink(missing_ok=True)
+    with sqlite3.connect(DATABASE_FILE) as con:
+        cur = con.cursor()
+        cur.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
+        cur.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", data)
+        cur.execute('CREATE INDEX index_engine_descriptions ON engine_descriptions("language")')
+        con.commit()
+    with CSV_FILE.open('w', encoding="utf8") as f:
+        w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
+        w.writerow(["language", "engine", "description", "source"])
+        for row in data:
+            w.writerow(row)


 def main():
@@ -361,10 +358,7 @@ def main():
     fetch_wikidata_descriptions()
     fetch_wikipedia_descriptions()
     fetch_website_descriptions()

-    output = get_output()
-    with DATA_FILE.open('w', encoding='utf8') as f:
-        f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
+    write_db()


 if __name__ == "__main__":
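A hedged sketch (not part of the commit) for sanity-checking the database write_db() produces, including whether the language index is picked up for lookups:

    import sqlite3

    con = sqlite3.connect('file:searx/data/engine_descriptions.db?mode=ro', uri=True)
    total, languages = con.execute(
        "SELECT count(*), count(DISTINCT language) FROM engine_descriptions"
    ).fetchone()
    print(total, 'rows across', languages, 'languages')
    plan = con.execute(
        "EXPLAIN QUERY PLAN SELECT engine FROM engine_descriptions WHERE language = 'en'"
    ).fetchall()
    print(plan)  # should mention index_engine_descriptions
    con.close()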
setup.py (1 line changed)

@@ -61,6 +61,7 @@ setup(
             'data/*.json',
             'data/*.txt',
             'data/*.ftz',
+            'data/*.db',
             'infopage/*/*',
             'static/themes/simple/css/*',
             'static/themes/simple/css/*/*',