Alexandre Flament 2024-05-28 22:26:15 +02:00 committed by GitHub
commit e24df67b98
20 changed files with 86700 additions and 79765 deletions


@@ -7,21 +7,30 @@
__all__ = [
    'ENGINE_TRAITS',
    'CURRENCIES',
    'USER_AGENTS',
    'EXTERNAL_URLS',
    'WIKIDATA_UNITS',
    'EXTERNAL_BANGS',
    'OSM_KEYS_TAGS',
    'ENGINE_DESCRIPTIONS',
    'LOCALES',
    'ahmia_blacklist_loader',
    'fetch_engine_descriptions',
    'fetch_iso4217_from_user',
    'fetch_name_from_iso4217',
    'fetch_osm_key_label',
]

import re
import unicodedata
import json
import sqlite3
from contextlib import contextmanager
from typing import Dict, Generator, List, Optional
from functools import lru_cache
from pathlib import Path

data_dir = Path(__file__).parent
data_connection_local = {}


def _load(filename):
@@ -29,6 +38,116 @@ def _load(filename):
        return json.load(f)


@contextmanager
def sql_connection(filename: str) -> Generator[sqlite3.Connection, None, None]:
    """Return a read-only SQLite connection to *filename*.

    The filename is relative to ``searx/data``.

    Multiple calls to this function return the same connection.
    """
    dict_id = filename
    connection = data_connection_local.get(dict_id)
    if connection is None:
        data_filename = str(data_dir / filename)
        # open the database in read-only mode and allow it to be shared between threads
        # https://www.sqlite.org/faq.html#q6
        # see https://ricardoanderegg.com/posts/python-sqlite-thread-safety/
        # and https://docs.python.org/3/library/sqlite3.html#sqlite3.threadsafety
        # sqlite3.threadsafety is hard-coded to 1;
        # the only reliable way to check whether multithreading is supported is to run
        # SELECT * FROM pragma_compile_options WHERE compile_options LIKE 'THREADSAFE=%'
        # but THREADSAFE=1 on Linux anyway
        data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True, check_same_thread=False)
        # 512KB of cache instead of 2MB (512KB / 4KB = 128, 4KB is the default page size)
        # https://www.sqlite.org/pragma.html#pragma_cache_size
        data_connection.execute("PRAGMA cache_size = 128;")
        data_connection_local[dict_id] = data_connection
        connection = data_connection
    # reuse the cached connection on subsequent calls
    yield connection
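
For illustration only (not part of this commit): because the open handle is cached in
``data_connection_local``, repeated calls for the same file reuse one read-only
``sqlite3.Connection``. A minimal sketch, assuming ``sql_connection`` stays importable
from ``searx.data`` as shown above:

    from searx.data import sql_connection

    with sql_connection("currencies.db") as first:
        pass
    with sql_connection("currencies.db") as second:
        pass
    assert first is second  # the cached connection is reused; nothing is closed on exit
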

def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
    """Return engine description and source for each engine name."""
    with sql_connection("engine_descriptions.db") as conn:
        res = conn.execute("SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,))
        return {result[0]: [result[1], result[2]] for result in res.fetchall()}
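
For illustration only (not part of this commit): the return value maps each engine name
to a ``[description, source]`` pair; the example values below are hypothetical:

    descriptions = fetch_engine_descriptions('en')
    # e.g. descriptions['duckduckgo'] == ['DuckDuckGo is a search engine ...', 'wikipedia']
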

def _normalize_name(name):
    name = name.lower().replace('-', ' ').rstrip('s')
    name = re.sub(' +', ' ', name)
    return unicodedata.normalize('NFKD', name).lower()
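
A few worked examples of the normalization above (illustrative, not part of this commit):

    _normalize_name('US Dollars')         # -> 'us dollar'        (lower-cased, trailing 's' stripped)
    _normalize_name('Hong-Kong  dollar')  # -> 'hong kong dollar' ('-' replaced, spaces collapsed, NFKD applied)
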

@lru_cache(10)
def fetch_iso4217_from_user(name: str) -> Optional[str]:
    with sql_connection("currencies.db") as connection:
        # try the iso4217 code first
        res = connection.execute("SELECT iso4217 FROM currencies WHERE lower(iso4217)=? LIMIT 1", (name.lower(),))
        result = res.fetchone()
        if result:
            return result[0]
        # then try the currency names
        name = _normalize_name(name)
        res = connection.execute("SELECT iso4217 FROM currencies WHERE name=?", (name,))
        result = list(set(result[0] for result in res.fetchall()))
        if len(result) == 1:
            return result[0]
    # ambiguity --> return nothing
    return None
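
For illustration only (not part of this commit); the results depend on what is stored in
``currencies.db``, so the return values below are assumptions:

    fetch_iso4217_from_user('EUR')     # matches the iso4217 column directly -> 'EUR'
    fetch_iso4217_from_user('euros')   # normalized to 'euro', matched against the name column -> 'EUR'
    fetch_iso4217_from_user('franc')   # several currencies share this name -> None (ambiguous)
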

@lru_cache(10)
def fetch_name_from_iso4217(iso4217: str, language: str) -> Optional[str]:
    with sql_connection("currencies.db") as connection:
        res = connection.execute("SELECT name FROM currencies WHERE iso4217=? AND language=?", (iso4217, language))
        result = [result[0] for result in res.fetchall()]
        if len(result) == 1:
            return result[0]
    return None

@lru_cache(100)
def fetch_osm_key_label(key_name: str, language: str) -> Optional[str]:
    if key_name.startswith('currency:'):
        # currency:EUR --> get the name from the currencies database
        # see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
        # and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
        # but there is also currency=EUR (currently not handled)
        # https://taginfo.openstreetmap.org/keys/currency#values
        currency = key_name.split(':')
        if len(currency) > 1:
            label = fetch_name_from_iso4217(currency[1], language)
            if label:
                return label
            return currency[1]

    language = language.lower()
    language_short = language.split('-')[0]
    with sql_connection("osm_keys_tags.db") as conn:
        res = conn.execute(
            "SELECT language, label FROM osm_keys WHERE name=? AND language in (?, ?, 'en')",
            (key_name, language, language_short),
        )
        result = {result[0]: result[1] for result in res.fetchall()}
    return result.get(language) or result.get(language_short) or result.get('en')
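
Illustrative calls (not part of this commit); the labels depend on the contents of
``osm_keys_tags.db``, so the return values are assumptions:

    fetch_osm_key_label('website', 'de-CH')    # looks up 'de-ch', then 'de', then 'en'
    fetch_osm_key_label('currency:EUR', 'en')  # resolved via fetch_name_from_iso4217, e.g. 'Euro'
    # None is returned when no label exists for any of the three candidate languages
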

@lru_cache(100)
def fetch_osm_tag_label(tag_key: str, tag_value: str, language: str) -> Optional[str]:
    language = language.lower()
    language_short = language.split('-')[0]
    with sql_connection("osm_keys_tags.db") as conn:
        res = conn.execute(
            "SELECT language, label FROM osm_tags WHERE tag_key=? AND tag_value=? AND language in (?, ?, 'en')",
            (tag_key, tag_value, language, language_short),
        )
        result = {result[0]: result[1] for result in res.fetchall()}
    return result.get(language) or result.get(language_short) or result.get('en')

def ahmia_blacklist_loader():
    """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
    names. The MD5 values are fetched by::
@@ -42,12 +161,9 @@ def ahmia_blacklist_loader():
        return f.read().split()


CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
LOCALES = _load('locales.json')

searx/data/currencies.db  (new binary file, not shown)

File diff suppressed because it is too large.

@@ -0,0 +1,3 @@
Dumps of the SQLite files in ``searx.data``.
These files are not used by SearXNG, they are here for reference.

File diff suppressed because it is too large.

File diff suppressed because one or more lines are too long.

File diff suppressed because it is too large.

searx/data/dumps/osm_tags.csv  (new file, 70384 lines)

File diff suppressed because it is too large.

Binary file not shown.

File diff suppressed because one or more lines are too long.

searx/data/osm_keys_tags.db  (new binary file, not shown)

File diff suppressed because it is too large.


@@ -6,7 +6,7 @@ from time import time
from urllib.parse import urlencode

from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
from searx.data import fetch_osm_key_label

about = {
    "website": 'https://www.apple.com/maps/',
@@ -72,7 +72,7 @@ def response(resp):
            telephone = result['telephone']
            links.append(
                {
                    'label': get_key_label('phone', user_language),
                    'label': fetch_osm_key_label('phone', user_language),
                    'url': 'tel:' + telephone,
                    'url_label': telephone,
                }
@@ -81,7 +81,7 @@ def response(resp):
            url = result['urls'][0]
            links.append(
                {
                    'label': get_key_label('website', user_language),
                    'label': fetch_osm_key_label('website', user_language),
                    'url': url,
                    'url_label': url,
                }


@@ -10,7 +10,7 @@ from functools import partial
from flask_babel import gettext

from searx.data import OSM_KEYS_TAGS, CURRENCIES
from searx.data import fetch_osm_tag_label, fetch_osm_key_label
from searx.utils import searx_useragent
from searx.external_urls import get_external_url
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@@ -187,14 +187,14 @@ def response(resp):
                'template': 'map.html',
                'title': title,
                'address': address,
                'address_label': get_key_label('addr', user_language),
                'address_label': fetch_osm_key_label('addr', user_language),
                'url': url,
                'osm': osm,
                'geojson': geojson,
                'thumbnail': thumbnail,
                'links': links,
                'data': data,
                'type': get_tag_label(result.get('category'), result.get('type', ''), user_language),
                'type': fetch_osm_tag_label(result.get('category'), result.get('type', ''), user_language),
                'type_icon': result.get('icon'),
                'content': '',
                'longitude': result['lon'],
@@ -367,7 +367,7 @@ def get_links(result, user_language):
            url_label = result.get('wikidata', {}).get('itemLabel') or url_label
        links.append(
            {
                'label': get_key_label(k, user_language),
                'label': fetch_osm_key_label(k, user_language),
                'url': url,
                'url_label': url_label,
            }
@@ -389,7 +389,7 @@ def get_data(result, user_language, ignore_keys):
            continue
        if get_key_rank(k) is None:
            continue
        k_label = get_key_label(k, user_language)
        k_label = fetch_osm_key_label(k, user_language)
        if k_label:
            data.append(
                {
@@ -412,51 +412,3 @@ def get_key_rank(k):
        # "payment:*" in KEY_ORDER matches "payment:cash", "payment:debit card", etc...
        key_rank = KEY_RANKS.get(k.split(':')[0] + ':*')
    return key_rank


def get_label(labels, lang):
    """Get label from labels in OSM_KEYS_TAGS

    in OSM_KEYS_TAGS, labels have key == '*'
    """
    tag_label = labels.get(lang.lower())
    if tag_label is None:
        # example: if 'zh-hk' is not found, check 'zh'
        tag_label = labels.get(lang.split('-')[0])
    if tag_label is None and lang != 'en':
        # example: if 'zh' is not found, check 'en'
        tag_label = labels.get('en')
    if tag_label is None and len(labels.values()) > 0:
        # example: if still not found, use the first entry
        tag_label = labels.values()[0]
    return tag_label


def get_tag_label(tag_category, tag_name, lang):
    """Get tag label from OSM_KEYS_TAGS"""
    tag_name = '' if tag_name is None else tag_name
    tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
    return get_label(tag_labels, lang)


def get_key_label(key_name, lang):
    """Get key label from OSM_KEYS_TAGS"""
    if key_name.startswith('currency:'):
        # currency:EUR --> get the name from the CURRENCIES variable
        # see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
        # and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
        # but there is also currency=EUR (currently not handled)
        # https://taginfo.openstreetmap.org/keys/currency#values
        currency = key_name.split(':')
        if len(currency) > 1:
            o = CURRENCIES['iso4217'].get(currency[1])
            if o:
                return get_label(o, lang).lower()
            return currency[1]

    labels = OSM_KEYS_TAGS['keys']
    for k in key_name.split(':') + ['*']:
        labels = labels.get(k)
        if labels is None:
            return None
    return get_label(labels, lang)


@@ -3,33 +3,14 @@
"""

import unicodedata
import re

from searx.data import CURRENCIES
from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217
from .online import OnlineProcessor

parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)


def normalize_name(name):
    name = name.lower().replace('-', ' ').rstrip('s')
    name = re.sub(' +', ' ', name)
    return unicodedata.normalize('NFKD', name).lower()


def name_to_iso4217(name):
    name = normalize_name(name)
    currency = CURRENCIES['names'].get(name, [name])
    if isinstance(currency, str):
        return currency
    return currency[0]


def iso4217_to_name(iso4217, language):
    return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)


class OnlineCurrencyProcessor(OnlineProcessor):
    """Processor class used by ``online_currency`` engines."""
@@ -52,14 +33,17 @@ class OnlineCurrencyProcessor(OnlineProcessor):
            amount = float(amount_str)
        except ValueError:
            return None

        from_currency = name_to_iso4217(from_currency.strip())
        to_currency = name_to_iso4217(to_currency.strip())
        from_currency = fetch_iso4217_from_user(from_currency.strip())
        to_currency = fetch_iso4217_from_user(to_currency.strip())
        if from_currency is None or to_currency is None:
            return None

        params['amount'] = amount
        params['from'] = from_currency
        params['to'] = to_currency
        params['from_name'] = iso4217_to_name(from_currency, 'en')
        params['to_name'] = iso4217_to_name(to_currency, 'en')
        params['from_name'] = fetch_name_from_iso4217(from_currency, 'en')
        params['to_name'] = fetch_name_from_iso4217(to_currency, 'en')
        return params

    def get_default_tests(self):
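
For illustration only (not part of this commit): how a query such as "100 usd in eur" flows
through the regex and the new ``searx.data`` helpers; the returned labels are assumptions
about what ``currencies.db`` contains:

    m = parser_re.match('100 usd in eur')
    amount_str, from_currency, to_currency = m.groups()  # ('100', 'usd', 'eur')
    fetch_iso4217_from_user(from_currency.strip())       # -> 'USD'
    fetch_name_from_iso4217('USD', 'en')                  # -> e.g. 'United States dollar'
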


@@ -58,7 +58,7 @@ from searx import infopage
from searx import limiter
from searx.botdetection import link_token

from searx.data import ENGINE_DESCRIPTIONS
from searx.data import fetch_engine_descriptions
from searx.results import Timing
from searx.settings_defaults import OUTPUT_FORMATS
from searx.settings_loader import get_default_settings_path
@@ -1102,17 +1102,10 @@ def image_proxy():
@app.route('/engine_descriptions.json', methods=['GET'])
def engine_descriptions():
    locale = get_locale().split('_')[0]
    result = ENGINE_DESCRIPTIONS['en'].copy()
    result = fetch_engine_descriptions('en')
    if locale != 'en':
        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
        for engine, description in fetch_engine_descriptions(locale).items():
            result[engine] = description
    for engine, description in result.items():
        if len(description) == 2 and description[1] == 'ref':
            ref_engine, ref_lang = description[0].split(':')
            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
            if isinstance(description, str):
                description = [description, 'wikipedia']
            result[engine] = description
    # overwrite by about:description (from settings)
    for engine_name, engine_mod in engines.items():