commit e24df67b98
Alexandre Flament 2024-05-28 22:26:15 +02:00, committed by GitHub
20 changed files with 86700 additions and 79765 deletions


@@ -7,21 +7,30 @@
 __all__ = [
     'ENGINE_TRAITS',
-    'CURRENCIES',
     'USER_AGENTS',
     'EXTERNAL_URLS',
     'WIKIDATA_UNITS',
     'EXTERNAL_BANGS',
-    'OSM_KEYS_TAGS',
-    'ENGINE_DESCRIPTIONS',
     'LOCALES',
     'ahmia_blacklist_loader',
+    'fetch_engine_descriptions',
+    'fetch_iso4217_from_user',
+    'fetch_name_from_iso4217',
+    'fetch_osm_key_label',
 ]

+import re
+import unicodedata
 import json
+import sqlite3
+from contextlib import contextmanager
+from typing import Dict, Generator, List, Optional
+from functools import lru_cache
 from pathlib import Path

 data_dir = Path(__file__).parent
+data_connection_local = {}


 def _load(filename):
@@ -29,6 +38,116 @@ def _load(filename):
         return json.load(f)


+@contextmanager
+def sql_connection(filename: str) -> Generator[sqlite3.Connection, None, None]:
+    """Return a read-only SQLite connection to ``filename``.
+
+    The filename is relative to searx/data.
+    Multiple calls to this function return the same connection.
+    """
+    dict_id = filename
+    data_connection = data_connection_local.get(dict_id)
+    if data_connection is None:
+        data_filename = str(data_dir / filename)
+        # open the database in read-only mode and allow it to be shared between threads
+        # https://www.sqlite.org/faq.html#q6
+        # see https://ricardoanderegg.com/posts/python-sqlite-thread-safety/
+        # and https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety
+        # sqlite3.threadsafety is hard coded to 1.
+        # The only reliable way to check whether multithreading is supported is to run
+        #   SELECT * FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'
+        # but THREADSAFE=1 on Linux anyway.
+        data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True, check_same_thread=False)
+        # 512KB of cache instead of 2MB (512KB / 4KB = 128, 4KB is the default page size)
+        # https://www.sqlite.org/pragma.html#pragma_cache_size
+        data_connection.execute("PRAGMA cache_size = 128;")
+        data_connection_local[dict_id] = data_connection
+    yield data_connection
+
+
+def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
+    """Return engine description and source for each engine name."""
+    with sql_connection("engine_descriptions.db") as conn:
+        res = conn.execute("SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,))
+        return {result[0]: [result[1], result[2]] for result in res.fetchall()}
+
+
+def _normalize_name(name):
+    name = name.lower().replace('-', ' ').rstrip('s')
+    name = re.sub(' +', ' ', name)
+    return unicodedata.normalize('NFKD', name).lower()
+
+
+@lru_cache(10)
+def fetch_iso4217_from_user(name: str) -> Optional[str]:
+    with sql_connection("currencies.db") as connection:
+        # try the iso4217 code first
+        res = connection.execute("SELECT iso4217 FROM currencies WHERE lower(iso4217)=? LIMIT 1", (name.lower(),))
+        result = res.fetchone()
+        if result:
+            return result[0]
+        # then try the currency names
+        name = _normalize_name(name)
+        res = connection.execute("SELECT iso4217 FROM currencies WHERE name=?", (name,))
+        result = list(set(result[0] for result in res.fetchall()))
+        if len(result) == 1:
+            return result[0]
+    # ambiguity --> return nothing
+    return None
+
+
+@lru_cache(10)
+def fetch_name_from_iso4217(iso4217: str, language: str) -> Optional[str]:
+    with sql_connection("currencies.db") as connection:
+        res = connection.execute("SELECT name FROM currencies WHERE iso4217=? AND language=?", (iso4217, language))
+        result = [result[0] for result in res.fetchall()]
+        if len(result) == 1:
+            return result[0]
+    return None
+
+
+@lru_cache(100)
+def fetch_osm_key_label(key_name: str, language: str) -> Optional[str]:
+    if key_name.startswith('currency:'):
+        # currency:EUR --> get the name from the currencies database
+        # see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
+        # and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
+        # but there is also currency=EUR (currently not handled)
+        # https://taginfo.openstreetmap.org/keys/currency#values
+        currency = key_name.split(':')
+        if len(currency) > 1:
+            label = fetch_name_from_iso4217(currency[1], language)
+            if label:
+                return label
+            return currency[1]
+
+    language = language.lower()
+    language_short = language.split('-')[0]
+    with sql_connection("osm_keys_tags.db") as conn:
+        res = conn.execute(
+            "SELECT language, label FROM osm_keys WHERE name=? AND language in (?, ?, 'en')",
+            (key_name, language, language_short),
+        )
+        result = {result[0]: result[1] for result in res.fetchall()}
+        return result.get(language) or result.get(language_short) or result.get('en')
+
+
+@lru_cache(100)
+def fetch_osm_tag_label(tag_key: str, tag_value: str, language: str) -> Optional[str]:
+    language = language.lower()
+    language_short = language.split('-')[0]
+    with sql_connection("osm_keys_tags.db") as conn:
+        res = conn.execute(
+            "SELECT language, label FROM osm_tags WHERE tag_key=? AND tag_value=? AND language in (?, ?, 'en')",
+            (tag_key, tag_value, language, language_short),
+        )
+        result = {result[0]: result[1] for result in res.fetchall()}
+        return result.get(language) or result.get(language_short) or result.get('en')
 def ahmia_blacklist_loader():
     """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
     names.  The MD5 values are fetched by::
@@ -42,12 +161,9 @@ def ahmia_blacklist_loader():
         return f.read().split()


-CURRENCIES = _load('currencies.json')
 USER_AGENTS = _load('useragents.json')
 EXTERNAL_URLS = _load('external_urls.json')
 WIKIDATA_UNITS = _load('wikidata_units.json')
 EXTERNAL_BANGS = _load('external_bangs.json')
-OSM_KEYS_TAGS = _load('osm_keys_tags.json')
-ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
 ENGINE_TRAITS = _load('engine_traits.json')
 LOCALES = _load('locales.json')
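For orientation, a minimal usage sketch of the accessors added above (the argument values and the returned strings are illustrative assumptions, not output of the test suite):

    from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217, fetch_osm_key_label

    # resolve a user supplied currency code or name to ISO 4217
    fetch_iso4217_from_user("usd")         # 'USD', matched on the iso4217 column
    fetch_iso4217_from_user("euros")       # 'EUR', matched on the normalized name column

    # translate a code back to a localized name, and an OSM key to a label
    fetch_name_from_iso4217("EUR", "fr")   # a French label, or None if missing/ambiguous
    fetch_osm_key_label("wikidata", "de")  # label from osm_keys_tags.db, falling back to 'en'

Each call opens (or reuses) a read-only connection through sql_connection(), so the JSON files that were previously loaded whole into memory at startup are no longer needed.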

searx/data/currencies.db (new binary file, not shown)



@@ -0,0 +1,3 @@
+Dumps of the SQLite files in ``searx.data``.
+
+These files are not used by SearXNG, they are here for reference.


searx/data/dumps/osm_tags.csv (new file, 70384 lines added; diff not shown)




searx/data/osm_keys_tags.db (new binary file, not shown)



@@ -6,7 +6,7 @@ from time import time
 from urllib.parse import urlencode

 from searx.network import get as http_get
-from searx.engines.openstreetmap import get_key_label
+from searx.data import fetch_osm_key_label

 about = {
     "website": 'https://www.apple.com/maps/',
@@ -72,7 +72,7 @@ def response(resp):
         telephone = result['telephone']
         links.append(
             {
-                'label': get_key_label('phone', user_language),
+                'label': fetch_osm_key_label('phone', user_language),
                 'url': 'tel:' + telephone,
                 'url_label': telephone,
             }
@@ -81,7 +81,7 @@ def response(resp):
         url = result['urls'][0]
         links.append(
             {
-                'label': get_key_label('website', user_language),
+                'label': fetch_osm_key_label('website', user_language),
                 'url': url,
                 'url_label': url,
             }


@@ -10,7 +10,7 @@ from functools import partial

 from flask_babel import gettext

-from searx.data import OSM_KEYS_TAGS, CURRENCIES
+from searx.data import fetch_osm_tag_label, fetch_osm_key_label
 from searx.utils import searx_useragent
 from searx.external_urls import get_external_url
 from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@@ -187,14 +187,14 @@ def response(resp):
                 'template': 'map.html',
                 'title': title,
                 'address': address,
-                'address_label': get_key_label('addr', user_language),
+                'address_label': fetch_osm_key_label('addr', user_language),
                 'url': url,
                 'osm': osm,
                 'geojson': geojson,
                 'thumbnail': thumbnail,
                 'links': links,
                 'data': data,
-                'type': get_tag_label(result.get('category'), result.get('type', ''), user_language),
+                'type': fetch_osm_tag_label(result.get('category'), result.get('type', ''), user_language),
                 'type_icon': result.get('icon'),
                 'content': '',
                 'longitude': result['lon'],
@@ -367,7 +367,7 @@ def get_links(result, user_language):
         url_label = result.get('wikidata', {}).get('itemLabel') or url_label
         links.append(
             {
-                'label': get_key_label(k, user_language),
+                'label': fetch_osm_key_label(k, user_language),
                 'url': url,
                 'url_label': url_label,
             }
@@ -389,7 +389,7 @@ def get_data(result, user_language, ignore_keys):
             continue
         if get_key_rank(k) is None:
             continue
-        k_label = get_key_label(k, user_language)
+        k_label = fetch_osm_key_label(k, user_language)
         if k_label:
             data.append(
                 {
@@ -412,51 +412,3 @@ def get_key_rank(k):
     # "payment:*" in KEY_ORDER matches "payment:cash", "payment:debit card", etc...
     key_rank = KEY_RANKS.get(k.split(':')[0] + ':*')
     return key_rank
-
-
-def get_label(labels, lang):
-    """Get label from labels in OSM_KEYS_TAGS
-
-    in OSM_KEYS_TAGS, labels have key == '*'
-    """
-    tag_label = labels.get(lang.lower())
-    if tag_label is None:
-        # example: if 'zh-hk' is not found, check 'zh'
-        tag_label = labels.get(lang.split('-')[0])
-    if tag_label is None and lang != 'en':
-        # example: if 'zh' is not found, check 'en'
-        tag_label = labels.get('en')
-    if tag_label is None and len(labels.values()) > 0:
-        # example: if still not found, use the first entry
-        tag_label = labels.values()[0]
-    return tag_label
-
-
-def get_tag_label(tag_category, tag_name, lang):
-    """Get tag label from OSM_KEYS_TAGS"""
-    tag_name = '' if tag_name is None else tag_name
-    tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
-    return get_label(tag_labels, lang)
-
-
-def get_key_label(key_name, lang):
-    """Get key label from OSM_KEYS_TAGS"""
-    if key_name.startswith('currency:'):
-        # currency:EUR --> get the name from the CURRENCIES variable
-        # see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
-        # and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
-        # but there is also currency=EUR (currently not handled)
-        # https://taginfo.openstreetmap.org/keys/currency#values
-        currency = key_name.split(':')
-        if len(currency) > 1:
-            o = CURRENCIES['iso4217'].get(currency[1])
-            if o:
-                return get_label(o, lang).lower()
-            return currency[1]
-
-    labels = OSM_KEYS_TAGS['keys']
-    for k in key_name.split(':') + ['*']:
-        labels = labels.get(k)
-        if labels is None:
-            return None
-    return get_label(labels, lang)
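The label lookup that previously walked the nested OSM_KEYS_TAGS dictionary is now a single indexed query; a small before/after sketch (the 'Wi-Fi' label comes from the PRESET_KEYS entry in the update script, everything else is illustrative):

    # before: nested dict lookup, loaded from osm_keys_tags.json at import time
    # OSM_KEYS_TAGS['keys']['internet_access']['ssid']['*']['en']   -> 'Wi-Fi'

    # after: one query against osm_keys_tags.db, cached by lru_cache
    from searx.data import fetch_osm_key_label
    fetch_osm_key_label('internet_access:ssid', 'en')               # -> 'Wi-Fi'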


@@ -3,33 +3,14 @@
 """

-import unicodedata
 import re

-from searx.data import CURRENCIES
+from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217
 from .online import OnlineProcessor

 parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)

-
-def normalize_name(name):
-    name = name.lower().replace('-', ' ').rstrip('s')
-    name = re.sub(' +', ' ', name)
-    return unicodedata.normalize('NFKD', name).lower()
-
-
-def name_to_iso4217(name):
-    name = normalize_name(name)
-    currency = CURRENCIES['names'].get(name, [name])
-    if isinstance(currency, str):
-        return currency
-    return currency[0]
-
-
-def iso4217_to_name(iso4217, language):
-    return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
-

 class OnlineCurrencyProcessor(OnlineProcessor):
     """Processor class used by ``online_currency`` engines."""

@@ -52,14 +33,17 @@ class OnlineCurrencyProcessor(OnlineProcessor):
             amount = float(amount_str)
         except ValueError:
             return None

-        from_currency = name_to_iso4217(from_currency.strip())
-        to_currency = name_to_iso4217(to_currency.strip())
+        from_currency = fetch_iso4217_from_user(from_currency.strip())
+        to_currency = fetch_iso4217_from_user(to_currency.strip())
+        if from_currency is None or to_currency is None:
+            return None

         params['amount'] = amount
         params['from'] = from_currency
         params['to'] = to_currency
-        params['from_name'] = iso4217_to_name(from_currency, 'en')
-        params['to_name'] = iso4217_to_name(to_currency, 'en')
+        params['from_name'] = fetch_name_from_iso4217(from_currency, 'en')
+        params['to_name'] = fetch_name_from_iso4217(to_currency, 'en')
         return params

     def get_default_tests(self):
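To make the behaviour change concrete, a rough sketch of how a currency query now flows through this processor (the query string and the resolved codes are assumptions for illustration):

    import re

    parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)

    m = parser_re.match("15 dollars in euros")
    amount, from_name, to_name = m.groups()       # '15', 'dollars', 'euros'

    # both names are resolved against currencies.db:
    #   fetch_iso4217_from_user('dollars') -> 'USD'
    #   fetch_iso4217_from_user('euros')   -> 'EUR'
    # unlike the old name_to_iso4217(), an unknown or ambiguous name now yields
    # None and the whole query is rejected instead of being passed through verbatim.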


@@ -58,7 +58,7 @@ from searx import infopage
 from searx import limiter
 from searx.botdetection import link_token

-from searx.data import ENGINE_DESCRIPTIONS
+from searx.data import fetch_engine_descriptions
 from searx.results import Timing
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
@@ -1102,16 +1102,9 @@ def image_proxy():
 @app.route('/engine_descriptions.json', methods=['GET'])
 def engine_descriptions():
     locale = get_locale().split('_')[0]
-    result = ENGINE_DESCRIPTIONS['en'].copy()
+    result = fetch_engine_descriptions('en')
     if locale != 'en':
-        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
-            result[engine] = description
-    for engine, description in result.items():
-        if len(description) == 2 and description[1] == 'ref':
-            ref_engine, ref_lang = description[0].split(':')
-            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
-        if isinstance(description, str):
-            description = [description, 'wikipedia']
-        result[engine] = description
+        for engine, description in fetch_engine_descriptions(locale).items():
+            result[engine] = description

     # overwrite by about:description (from settings)
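Because every row in engine_descriptions.db now carries its full [description, source] pair, the 'ref' and bare-string compaction the old endpoint had to expand is gone, and the JSON payload keeps the same shape. A sketch of that shape (the engine names are real engine names, the texts are placeholders):

    # GET /engine_descriptions.json  ->  {engine: [description, source]}
    # {
    #   "duckduckgo": ["DuckDuckGo is a search engine that ...", "wikipedia"],
    #   "openstreetmap": ["OpenStreetMap is a collaborative ...", "wikipedia"]
    # }
    result = fetch_engine_descriptions('en')           # English baseline
    result.update(fetch_engine_descriptions('fr'))     # localized rows override it, as in the loop above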


@@ -9,15 +9,20 @@ Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...

 # pylint: disable=invalid-name

+import csv
 import re
 import unicodedata
-import json
+import sqlite3
+from pathlib import Path

+from searx.network import set_timeout_for_thread
 from searx.locales import LOCALE_NAMES, locales_initialize
 from searx.engines import wikidata, set_loggers
 from searx.data import data_dir

-DATA_FILE = data_dir / 'currencies.json'
+DATABASE_FILE = data_dir / 'currencies.db'
+CSV_FILE = data_dir / 'dumps' / 'currencies.csv'

 set_loggers(wikidata, 'wikidata')
 locales_initialize()
@@ -75,57 +80,45 @@ def _normalize_name(name):
     return name


-def add_currency_name(db, name, iso4217, normalize_name=True):
-    db_names = db['names']
-
+def add_entry(db, language, iso4217, name, normalize_name=True):
     if normalize_name:
         name = _normalize_name(name)

-    iso4217_set = db_names.setdefault(name, [])
-    if iso4217 not in iso4217_set:
-        iso4217_set.insert(0, iso4217)
-
-
-def add_currency_label(db, label, iso4217, language):
-    labels = db['iso4217'].setdefault(iso4217, {})
-    labels[language] = label
+    entry = (language, iso4217, name)
+    db.add(entry)


 def wikidata_request_result_iterator(request):
+    set_timeout_for_thread(60)
     result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
     if result is not None:
         yield from result['results']['bindings']


 def fetch_db():
-    db = {
-        'names': {},
-        'iso4217': {},
-    }
+    db = set()

     for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
         iso4217 = r['iso4217']['value']
         article_name = r['article_name']['value']
         article_lang = r['article_name']['xml:lang']
-        add_currency_name(db, article_name, iso4217)
-        add_currency_label(db, article_name, iso4217, article_lang)
+        add_entry(db, article_lang, iso4217, article_name)

     for r in wikidata_request_result_iterator(SARQL_REQUEST):
         iso4217 = r['iso4217']['value']
         if 'label' in r:
             label = r['label']['value']
             label_lang = r['label']['xml:lang']
-            add_currency_name(db, label, iso4217)
-            add_currency_label(db, label, iso4217, label_lang)
+            add_entry(db, label_lang, iso4217, label)
         if 'alias' in r:
-            add_currency_name(db, r['alias']['value'], iso4217)
+            add_entry(db, "", iso4217, r['alias']['value'])
         if 'unicode' in r:
-            add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False)
+            add_entry(db, "", iso4217, r['unicode']['value'], normalize_name=False)
         if 'unit' in r:
-            add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False)
+            add_entry(db, "", iso4217, r['unit']['value'], normalize_name=False)

     return db

@@ -135,22 +128,33 @@ def main():
     db = fetch_db()

     # static
-    add_currency_name(db, "euro", 'EUR')
-    add_currency_name(db, "euros", 'EUR')
-    add_currency_name(db, "dollar", 'USD')
-    add_currency_name(db, "dollars", 'USD')
-    add_currency_name(db, "peso", 'MXN')
-    add_currency_name(db, "pesos", 'MXN')
+    add_entry(db, "", 'EUR', "euro")
+    add_entry(db, "", 'EUR', "euros")
+    add_entry(db, "", 'USD', "dollar")
+    add_entry(db, "", 'USD', "dollars")
+    add_entry(db, "", 'MXN', "peso")
+    add_entry(db, "", 'MXN', "pesos")

-    # reduce memory usage:
-    # replace lists with one item by the item. see
-    # searx.search.processors.online_currency.name_to_iso4217
-    for name in db['names']:
-        if len(db['names'][name]) == 1:
-            db['names'][name] = db['names'][name][0]
-
-    with DATA_FILE.open('w', encoding='utf8') as f:
-        json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False)
+    db = list(db)
+    db.sort(key=lambda entry: (entry[0], entry[1], entry[2]))
+
+    Path(DATABASE_FILE).unlink(missing_ok=True)
+    with sqlite3.connect(DATABASE_FILE) as con:
+        cur = con.cursor()
+        cur.execute("CREATE TABLE currencies(language, iso4217, name)")
+        cur.executemany("INSERT INTO currencies VALUES(?, ?, ?)", db)
+        cur.execute("CREATE INDEX index_currencies_iso4217 ON currencies('iso4217')")
+        cur.execute("CREATE INDEX index_currencies_name ON currencies('name')")
+        con.commit()
+
+    with CSV_FILE.open('w', encoding='utf8') as f:
+        w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
+        w.writerow(["language", "iso4217", "name"])
+        for row in db:
+            w.writerow(row)


 if __name__ == '__main__':
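A quick way to sanity-check the generated file, using only the table, columns and indexes created above (the printed rows depend on whatever Wikidata returned on the last update):

    import sqlite3

    con = sqlite3.connect("searx/data/currencies.db")
    print(con.execute("SELECT count(*) FROM currencies").fetchone())
    for row in con.execute("SELECT iso4217, name FROM currencies WHERE language='en' LIMIT 5"):
        print(row)
    con.close()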


@@ -9,22 +9,24 @@ Output file: :origin:`searx/data/engine_descriptions.json`.

 # pylint: disable=invalid-name, global-statement

+import csv
 import json
+import sqlite3
 from urllib.parse import urlparse
-from os.path import join
+from pathlib import Path

 from lxml.html import fromstring

 from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, searx_useragent
 from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
-from searx import searx_dir
 from searx.utils import gen_useragent, detect_language
 import searx.search
 import searx.network

 from searx.data import data_dir

-DATA_FILE = data_dir / 'engine_descriptions.json'
+DATABASE_FILE = data_dir / 'engine_descriptions.db'
+CSV_FILE = data_dir / 'dumps' / 'engine_descriptions.csv'

 set_loggers(wikidata, 'wikidata')
 locales_initialize()
@@ -323,37 +325,32 @@ def fetch_website_descriptions():
         fetch_website_description(engine_name, website)


-def get_engine_descriptions_filename():
-    return join(join(searx_dir, "data"), "engine_descriptions.json")
-
-
-def get_output():
+def write_db():
     """
-    From descriptions[engine][language] = [description, source]
-    To
-
-    * output[language][engine] = description_and_source
-    * description_and_source can be:
-       * [description, source]
-       * description (if source = "wikipedia")
-       * [f"engine:lang", "ref"] (reference to another existing description)
+    Erase and rewrite the SQLite database searx/data/engine_descriptions.db:
+
+    * create one table engine_descriptions
+    * write all the values into it
+
+    Also dump the values into dumps/engine_descriptions.csv for reference.
     """
-    output = {locale: {} for locale in LOCALE_NAMES}
-
-    seen_descriptions = {}
-
-    for engine_name, lang_descriptions in descriptions.items():
-        for language, description in lang_descriptions.items():
-            if description[0] in seen_descriptions:
-                ref = seen_descriptions[description[0]]
-                description = [f'{ref[0]}:{ref[1]}', 'ref']
-            else:
-                seen_descriptions[description[0]] = (engine_name, language)
-                if description[1] == 'wikipedia':
-                    description = description[0]
-            output.setdefault(language, {}).setdefault(engine_name, description)
-
-    return output
+    data = [
+        (language, engine_name, description[0], description[1])
+        for engine_name, lang_descriptions in descriptions.items()
+        for language, description in lang_descriptions.items()
+    ]
+    data.sort(key=lambda item: (item[0], item[1]))
+    Path(DATABASE_FILE).unlink(missing_ok=True)
+    with sqlite3.connect(DATABASE_FILE) as con:
+        cur = con.cursor()
+        cur.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
+        cur.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", data)
+        cur.execute("CREATE INDEX index_engine_descriptions ON engine_descriptions('language')")
+        con.commit()
+    with CSV_FILE.open('w', encoding="utf8") as f:
+        w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
+        w.writerow(["language", "engine", "description", "source"])
+        for row in data:
+            w.writerow(row)


 def main():
@@ -361,10 +358,7 @@ def main():
     fetch_wikidata_descriptions()
     fetch_wikipedia_descriptions()
     fetch_website_descriptions()
-
-    output = get_output()
-    with DATA_FILE.open('w', encoding='utf8') as f:
-        f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
+    write_db()


 if __name__ == "__main__":
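The same kind of check works for the descriptions database; the table and index are the ones created in write_db() above, while the engine name in the query is only an example:

    import sqlite3

    con = sqlite3.connect("searx/data/engine_descriptions.db")
    print(con.execute("SELECT count(DISTINCT language) FROM engine_descriptions").fetchone())
    print(con.execute(
        "SELECT description, source FROM engine_descriptions WHERE language='en' AND engine='wikipedia'"
    ).fetchone())
    con.close()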


@@ -42,8 +42,9 @@ Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...

 """

-import json
-import collections
+import csv
+import sqlite3
+from pathlib import Path

 from searx.network import set_timeout_for_thread
 from searx.engines import wikidata, set_loggers
@@ -51,7 +52,9 @@ from searx.sxng_locales import sxng_locales
 from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
 from searx.data import data_dir

-DATA_FILE = data_dir / 'osm_keys_tags.json'
+DATABASE_FILE = data_dir / 'osm_keys_tags.db'
+CSV_KEYS_FILE = data_dir / 'dumps' / 'osm_keys.csv'
+CSV_TAGS_FILE = data_dir / 'dumps' / 'osm_tags.csv'

 set_loggers(wikidata, 'wikidata')

@@ -78,42 +81,39 @@ ORDER BY ?key ?item ?itemLabel

 LANGUAGES = [l[0].lower() for l in sxng_locales]

-PRESET_KEYS = {
-    ('wikidata',): {'en': 'Wikidata'},
-    ('wikipedia',): {'en': 'Wikipedia'},
-    ('email',): {'en': 'Email'},
-    ('facebook',): {'en': 'Facebook'},
-    ('fax',): {'en': 'Fax'},
-    ('internet_access', 'ssid'): {'en': 'Wi-Fi'},
-}
+PRESET_KEYS = [
+    ["wikidata", "en", "Wikidata"],
+    ["wikipedia", "en", "Wikipedia"],
+    ["email", "en", "email"],
+    ["facebook", "en", "facebook"],
+    ["fax", "en", "Fax"],
+    ["internet_access:ssid", "en", "Wi-Fi"],
+]

 INCLUDED_KEYS = {('addr',)}


-def get_preset_keys():
-    results = collections.OrderedDict()
-    for keys, value in PRESET_KEYS.items():
-        r = results
-        for k in keys:
-            r = r.setdefault(k, {})
-        r.setdefault('*', value)
-    return results
-
-
 def get_keys():
-    results = get_preset_keys()
+    result_keys = set()
+    results = PRESET_KEYS.copy()
     response = wikidata.send_wikidata_query(SPARQL_KEYS_REQUEST)

     for key in response['results']['bindings']:
         keys = key['key']['value'].split(':')[1:]
+        label = key['itemLabel']['value'].lower()
+        lang = key['itemLabel']['xml:lang']
+        if lang not in LANGUAGES:
+            continue
         if keys[0] == 'currency' and len(keys) > 1:
             # special case in openstreetmap.py
             continue
         if keys[0] == 'contact' and len(keys) > 1:
-            # label for the key "contact.email" is "Email"
-            # whatever the language
-            r = results.setdefault('contact', {})
-            r[keys[1]] = {'*': {'en': keys[1]}}
+            if lang == "en":
+                # label for the key "contact.email" is "Email"
+                # whatever the language
+                results.append((":".join(keys), "en", keys[1]))
             continue
         if tuple(keys) in PRESET_KEYS:
             # skip presets (already set above)
@@ -125,40 +125,46 @@ def get_keys():
         ):
             # keep only keys that will be displayed by openstreetmap.py
             continue
-        label = key['itemLabel']['value'].lower()
-        lang = key['itemLabel']['xml:lang']
-        r = results
-        for k in keys:
-            r = r.setdefault(k, {})
-            r = r.setdefault('*', {})
-        if lang in LANGUAGES:
-            r.setdefault(lang, label)
+
+        entry = (":".join(keys), lang, label)
+        entry_key = (entry[0], entry[1])
+        if entry_key not in result_keys:
+            results.append(entry)
+            result_keys.add(entry_key)

     # special cases
-    results['delivery']['covid19']['*'].clear()
-    for k, v in results['delivery']['*'].items():
-        results['delivery']['covid19']['*'][k] = v + ' (COVID19)'
-
-    results['opening_hours']['covid19']['*'].clear()
-    for k, v in results['opening_hours']['*'].items():
-        results['opening_hours']['covid19']['*'][k] = v + ' (COVID19)'
+    results = [entry for entry in results if entry[0] != 'delivery:covid19']
+    results.extend(
+        [['delivery:covid19', entry[1], entry[2] + ' (COVID19)'] for entry in results if entry[0] == 'delivery']
+    )
+
+    results = [entry for entry in results if entry[0] != 'opening_hours:covid19']
+    results.extend(
+        [
+            ['opening_hours:covid19', entry[1], entry[2] + ' (COVID19)']
+            for entry in results
+            if entry[0] == 'opening_hours'
+        ]
+    )

     return results


 def get_tags():
-    results = collections.OrderedDict()
+    results = []
     response = wikidata.send_wikidata_query(SPARQL_TAGS_REQUEST)
     for tag in response['results']['bindings']:
-        tag_names = tag['tag']['value'].split(':')[1].split('=')
-        if len(tag_names) == 2:
-            tag_category, tag_type = tag_names
-        else:
-            tag_category, tag_type = tag_names[0], ''
+        try:
+            tag_key, tag_value = tag['tag']['value'].split('=')
+            if tag_key.startswith("Tag:"):
+                tag_key = tag_key[4:]
+        except ValueError:
+            print("ignore tag", tag['tag']['value'])
+            continue
         label = tag['itemLabel']['value'].lower()
         lang = tag['itemLabel']['xml:lang']
         if lang in LANGUAGES:
-            results.setdefault(tag_category, {}).setdefault(tag_type, {}).setdefault(lang, label)
+            results.append((tag_key, tag_value, lang, label))
     return results


@@ -206,9 +212,30 @@ def optimize_keys(data):

 if __name__ == '__main__':
     set_timeout_for_thread(60)
-    result = {
-        'keys': optimize_keys(get_keys()),
-        'tags': optimize_tags(get_tags()),
-    }
-    with DATA_FILE.open('w', encoding="utf8") as f:
-        json.dump(result, f, indent=4, sort_keys=True, ensure_ascii=False)
+    osm_keys = get_keys()
+    osm_tags = get_tags()
+
+    osm_keys.sort(key=lambda item: (item[0], item[1]))
+    osm_tags.sort(key=lambda item: (item[0], item[1]))
+
+    Path(DATABASE_FILE).unlink(missing_ok=True)
+    with sqlite3.connect(DATABASE_FILE) as con:
+        cur = con.cursor()
+        cur.execute("CREATE TABLE osm_keys(name, language, label)")
+        cur.executemany("INSERT INTO osm_keys VALUES(?, ?, ?)", osm_keys)
+        cur.execute("CREATE INDEX index_osm_keys ON osm_keys('name', 'language')")
+        cur.execute("CREATE TABLE osm_tags(tag_key, tag_value, language, label)")
+        cur.executemany("INSERT INTO osm_tags VALUES(?, ?, ?, ?)", osm_tags)
+        cur.execute("CREATE INDEX index_osm_tags ON osm_tags('tag_key', 'tag_value', 'language')")
+        con.commit()
+
+    with CSV_KEYS_FILE.open('w', encoding="utf8") as f:
+        w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
+        w.writerow(["name", "language", "label"])
+        for row in osm_keys:
+            w.writerow(row)
+
+    with CSV_TAGS_FILE.open('w', encoding="utf8") as f:
+        w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
+        w.writerow(["tag_key", "tag_value", "language", "label"])
+        for row in osm_tags:
+            w.writerow(row)
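A brief check of the resulting database, using only the two tables created above (the osm_keys row for 'wikidata' comes from PRESET_KEYS; the osm_tags row is merely a tag that is usually present in the Wikidata export):

    import sqlite3

    con = sqlite3.connect("searx/data/osm_keys_tags.db")
    print(con.execute(
        "SELECT label FROM osm_keys WHERE name='wikidata' AND language='en'"
    ).fetchone())
    print(con.execute(
        "SELECT label FROM osm_tags WHERE tag_key='amenity' AND tag_value='restaurant' AND language='en'"
    ).fetchone())
    con.close()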


@@ -61,6 +61,7 @@ setup(
             'data/*.json',
             'data/*.txt',
             'data/*.ftz',
+            'data/*.db',
             'infopage/*/*',
             'static/themes/simple/css/*',
             'static/themes/simple/css/*/*',