data: currencies: use SQLite instead of JSON

This commit is contained in:
Alexandre Flament 2024-05-04 10:41:36 +00:00
parent 83d4a2ebb0
commit 71f1789be0
7 changed files with 10155 additions and 15018 deletions

View file

@ -7,7 +7,6 @@
__all__ = [ __all__ = [
'ENGINE_TRAITS', 'ENGINE_TRAITS',
'CURRENCIES',
'USER_AGENTS', 'USER_AGENTS',
'EXTERNAL_URLS', 'EXTERNAL_URLS',
'WIKIDATA_UNITS', 'WIKIDATA_UNITS',
@ -16,12 +15,17 @@ __all__ = [
'LOCALES', 'LOCALES',
'ahmia_blacklist_loader', 'ahmia_blacklist_loader',
'fetch_engine_descriptions', 'fetch_engine_descriptions',
'fetch_iso4217_from_user',
'fetch_name_from_iso4217',
] ]
import re
import unicodedata
import json import json
import sqlite3 import sqlite3
from contextlib import contextmanager from contextlib import contextmanager
from typing import Dict, Generator, List from typing import Dict, Generator, List, Optional
from functools import lru_cache
from pathlib import Path from pathlib import Path
@ -69,6 +73,42 @@ def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
return {result[0]: [result[1], result[2]] for result in res.fetchall()} return {result[0]: [result[1], result[2]] for result in res.fetchall()}
def _normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
@lru_cache(10)
def fetch_iso4217_from_user(name: str) -> Optional[str]:
    """Resolve user input to an ISO 4217 code using ``currencies.db``.

    Two lookups are attempted, in order:

    1. *name* is compared case-insensitively with the ``iso4217`` column;
       the first hit is returned.
    2. *name* is normalized with :py:func:`_normalize_name` and compared with
       the ``name`` column.  The code is returned only when all matching rows
       agree on a single ISO 4217 code.

    The last 10 distinct inputs (including those that resolved to ``None``)
    are memoized by ``lru_cache``.

    :param name: ISO 4217 code or currency name typed by the user
    :return: the ISO 4217 code, or ``None`` when nothing matches or when the
        name maps to more than one currency
    """
    # NOTE(review): sql_connection is defined elsewhere in this module;
    # presumably it yields a sqlite3 connection to the named data file.
    with sql_connection("currencies.db") as connection:
        # 1. the input may already be an ISO 4217 code
        code_row = connection.execute(
            "SELECT iso4217 FROM currencies WHERE lower(iso4217)=? LIMIT 1", (name.lower(),)
        ).fetchone()
        if code_row:
            return code_row[0]

        # 2. otherwise treat the input as a currency name
        normalized = _normalize_name(name)
        cursor = connection.execute("SELECT iso4217 FROM currencies WHERE name=?", (normalized,))
        candidates = {row[0] for row in cursor.fetchall()}
        if len(candidates) != 1:
            # no match at all, or the name is ambiguous --> return nothing
            return None
        return next(iter(candidates))
@lru_cache(10)
def fetch_name_from_iso4217(iso4217: str, language: str) -> Optional[str]:
    """Look up the name of a currency in a given language in ``currencies.db``.

    Both arguments are matched exactly against the ``iso4217`` and
    ``language`` columns.  The last 10 distinct argument pairs are memoized
    by ``lru_cache``.

    :param iso4217: ISO 4217 code of the currency
    :param language: language tag of the wanted name
    :return: the name when exactly one row matches, otherwise ``None``
    """
    with sql_connection("currencies.db") as connection:
        cursor = connection.execute(
            "SELECT name FROM currencies WHERE iso4217=? AND language=?", (iso4217, language)
        )
        rows = cursor.fetchall()
        # NOTE(review): several rows for the same code and language also end
        # up here and yield None -- confirm this is intended, the table has no
        # uniqueness constraint on (language, iso4217).
        if len(rows) != 1:
            return None
        return rows[0][0]
def ahmia_blacklist_loader(): def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by:: names. The MD5 values are fetched by::
@ -82,7 +122,6 @@ def ahmia_blacklist_loader():
return f.read().split() return f.read().split()
CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json') USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json') EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json') WIKIDATA_UNITS = _load('wikidata_units.json')

BIN
searx/data/currencies.db Normal file

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -10,7 +10,7 @@ from functools import partial
from flask_babel import gettext from flask_babel import gettext
from searx.data import OSM_KEYS_TAGS, CURRENCIES from searx.data import OSM_KEYS_TAGS, fetch_name_from_iso4217
from searx.utils import searx_useragent from searx.utils import searx_useragent
from searx.external_urls import get_external_url from searx.external_urls import get_external_url
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@ -449,9 +449,9 @@ def get_key_label(key_name, lang):
# https://taginfo.openstreetmap.org/keys/currency#values # https://taginfo.openstreetmap.org/keys/currency#values
currency = key_name.split(':') currency = key_name.split(':')
if len(currency) > 1: if len(currency) > 1:
o = CURRENCIES['iso4217'].get(currency[1]) label = fetch_name_from_iso4217(currency[1], lang)
if o: if label:
return get_label(o, lang).lower() return label
return currency[1] return currency[1]
labels = OSM_KEYS_TAGS['keys'] labels = OSM_KEYS_TAGS['keys']

View file

@ -3,33 +3,14 @@
""" """
import unicodedata
import re import re
from searx.data import CURRENCIES from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217
from .online import OnlineProcessor from .online import OnlineProcessor
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
def normalize_name(name):
    """Return the canonical lookup form of a currency name.

    The name is lower-cased, ``-`` is replaced by a space, all trailing ``s``
    characters are stripped, runs of spaces are collapsed, and the result is
    NFKD-decomposed and lower-cased again.

    :param name: currency name as typed by the user
    :return: the normalized name
    """
    base = name.lower().replace('-', ' ')
    base = base.rstrip('s')
    squeezed = re.sub(' +', ' ', base)
    decomposed = unicodedata.normalize('NFKD', squeezed)
    return decomposed.lower()
def name_to_iso4217(name):
    """Map a currency name to an ISO 4217 code via ``CURRENCIES['names']``.

    The name is normalized with :py:func:`normalize_name` before the lookup.
    An entry is either a single code (``str``) or a list of codes; for a list
    the first element is returned.  When the name is unknown, the normalized
    name itself is returned.

    :param name: currency name or code typed by the user
    :return: the ISO 4217 code, or the normalized input when there is no entry
    """
    key = normalize_name(name)
    entry = CURRENCIES['names'].get(key, [key])
    return entry if isinstance(entry, str) else entry[0]
def iso4217_to_name(iso4217, language):
    """Return the label of a currency in *language* from ``CURRENCIES['iso4217']``.

    :param iso4217: ISO 4217 code of the currency
    :param language: language key of the wanted label
    :return: the label, or *iso4217* itself when the code or the language is
        not present
    """
    labels = CURRENCIES['iso4217'].get(iso4217, {})
    return labels.get(language, iso4217)
class OnlineCurrencyProcessor(OnlineProcessor): class OnlineCurrencyProcessor(OnlineProcessor):
"""Processor class used by ``online_currency`` engines.""" """Processor class used by ``online_currency`` engines."""
@ -52,14 +33,17 @@ class OnlineCurrencyProcessor(OnlineProcessor):
amount = float(amount_str) amount = float(amount_str)
except ValueError: except ValueError:
return None return None
from_currency = name_to_iso4217(from_currency.strip()) from_currency = fetch_iso4217_from_user(from_currency.strip())
to_currency = name_to_iso4217(to_currency.strip()) to_currency = fetch_iso4217_from_user(to_currency.strip())
if from_currency is None or to_currency is None:
return None
params['amount'] = amount params['amount'] = amount
params['from'] = from_currency params['from'] = from_currency
params['to'] = to_currency params['to'] = to_currency
params['from_name'] = iso4217_to_name(from_currency, 'en') params['from_name'] = fetch_name_from_iso4217(from_currency, 'en')
params['to_name'] = iso4217_to_name(to_currency, 'en') params['to_name'] = fetch_name_from_iso4217(to_currency, 'en')
return params return params
def get_default_tests(self): def get_default_tests(self):

View file

@ -9,15 +9,20 @@ Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
# pylint: disable=invalid-name # pylint: disable=invalid-name
import csv
import re import re
import unicodedata import unicodedata
import json import sqlite3
from pathlib import Path
from searx.network import set_timeout_for_thread
from searx.locales import LOCALE_NAMES, locales_initialize from searx.locales import LOCALE_NAMES, locales_initialize
from searx.engines import wikidata, set_loggers from searx.engines import wikidata, set_loggers
from searx.data import data_dir from searx.data import data_dir
DATA_FILE = data_dir / 'currencies.json' DATABASE_FILE = data_dir / 'currencies.db'
CSV_FILE = data_dir / 'dumps' / 'currencies.csv'
set_loggers(wikidata, 'wikidata') set_loggers(wikidata, 'wikidata')
locales_initialize() locales_initialize()
@ -75,57 +80,45 @@ def _normalize_name(name):
return name return name
def add_currency_name(db, name, iso4217, normalize_name=True): def add_entry(db, language, iso4217, name, normalize_name=True):
db_names = db['names']
if normalize_name: if normalize_name:
name = _normalize_name(name) name = _normalize_name(name)
iso4217_set = db_names.setdefault(name, []) entry = (language, iso4217, name)
if iso4217 not in iso4217_set: db.add(entry)
iso4217_set.insert(0, iso4217)
def add_currency_label(db, label, iso4217, language):
labels = db['iso4217'].setdefault(iso4217, {})
labels[language] = label
def wikidata_request_result_iterator(request): def wikidata_request_result_iterator(request):
set_timeout_for_thread(60)
result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
if result is not None: if result is not None:
yield from result['results']['bindings'] yield from result['results']['bindings']
def fetch_db(): def fetch_db():
db = { db = set()
'names': {},
'iso4217': {},
}
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST): for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
iso4217 = r['iso4217']['value'] iso4217 = r['iso4217']['value']
article_name = r['article_name']['value'] article_name = r['article_name']['value']
article_lang = r['article_name']['xml:lang'] article_lang = r['article_name']['xml:lang']
add_currency_name(db, article_name, iso4217) add_entry(db, article_lang, iso4217, article_name)
add_currency_label(db, article_name, iso4217, article_lang)
for r in wikidata_request_result_iterator(SARQL_REQUEST): for r in wikidata_request_result_iterator(SARQL_REQUEST):
iso4217 = r['iso4217']['value'] iso4217 = r['iso4217']['value']
if 'label' in r: if 'label' in r:
label = r['label']['value'] label = r['label']['value']
label_lang = r['label']['xml:lang'] label_lang = r['label']['xml:lang']
add_currency_name(db, label, iso4217) add_entry(db, label_lang, iso4217, label)
add_currency_label(db, label, iso4217, label_lang)
if 'alias' in r: if 'alias' in r:
add_currency_name(db, r['alias']['value'], iso4217) add_entry(db, "", iso4217, r['alias']['value'])
if 'unicode' in r: if 'unicode' in r:
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False) add_entry(db, "", iso4217, r['unicode']['value'], normalize_name=False)
if 'unit' in r: if 'unit' in r:
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False) add_entry(db, "", iso4217, r['unit']['value'], normalize_name=False)
return db return db
@ -135,22 +128,33 @@ def main():
db = fetch_db() db = fetch_db()
# static # static
add_currency_name(db, "euro", 'EUR') add_entry(db, "", 'EUR', "euro")
add_currency_name(db, "euros", 'EUR') add_entry(db, "", 'EUR', "euros")
add_currency_name(db, "dollar", 'USD') add_entry(db, "", 'USD', "dollar")
add_currency_name(db, "dollars", 'USD') add_entry(db, "", 'USD', "dollars")
add_currency_name(db, "peso", 'MXN') add_entry(
add_currency_name(db, "pesos", 'MXN') db,
"",
'MXN',
"peso",
)
add_entry(db, "", 'MXN', "pesos")
# reduce memory usage: db = list(db)
# replace lists with one item by the item. see db.sort(key=lambda entry: (entry[0], entry[1], entry[2]))
# searx.search.processors.online_currency.name_to_iso4217 Path(DATABASE_FILE).unlink(missing_ok=True)
for name in db['names']: with sqlite3.connect(DATABASE_FILE) as con:
if len(db['names'][name]) == 1: cur = con.cursor()
db['names'][name] = db['names'][name][0] cur.execute("CREATE TABLE currencies(language, iso4217, name)")
cur.executemany("INSERT INTO currencies VALUES(?, ?, ?)", db)
with DATA_FILE.open('w', encoding='utf8') as f: cur.execute("CREATE INDEX index_currencies_iso4217 ON currencies('iso4217')")
json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False) cur.execute("CREATE INDEX index_currencies_name ON currencies('name')")
con.commit()
with CSV_FILE.open('w', encoding='utf8') as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["language", "iso4217", "name"])
for row in db:
w.writerow(row)
if __name__ == '__main__': if __name__ == '__main__':