Merge pull request #333 from dalf/enh-engine-descriptions

RFC: /preferences: display engine descriptions
Alexandre Flament 2021-09-25 11:29:25 +02:00 committed by GitHub
commit b046322c7b
31 changed files with 4111 additions and 57 deletions

@@ -14,6 +14,7 @@ __all__ = [
     'WIKIDATA_UNITS',
     'EXTERNAL_BANGS',
     'OSM_KEYS_TAGS',
+    'ENGINE_DESCRIPTIONS',
     'ahmia_blacklist_loader',
 ]

@@ -45,3 +46,4 @@ EXTERNAL_URLS = _load('external_urls.json')
 WIKIDATA_UNITS = _load('wikidata_units.json')
 EXTERNAL_BANGS = _load('external_bangs.json')
 OSM_KEYS_TAGS = _load('osm_keys_tags.json')
+ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
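For orientation, the generated engine_descriptions.json added by this commit maps language → engine → description. A minimal sketch of its shape, written as a Python literal; the engine names and texts are made up (see get_output() in the update script at the end of this commit for the real producer):

{
 'en': {
  'exampleengine': 'An example search engine',          # plain string: source is 'wikipedia'
  'otherengine': ['An example geocoder', 'wikidata'],   # [description, source]
  'thirdengine': ['exampleengine:en', 'ref'],           # reference to another entry
 },
 'fr': {
  'exampleengine': ['Un moteur de recherche', 'wikidata'],
 },
}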


@@ -12,6 +12,16 @@ from dateutil.parser import parse as dateparse
 from lxml import html
 from searx.utils import extract_text

+# about
+about = {
+    "website": 'https://bandcamp.com/',
+    "wikidata_id": 'Q545966',
+    "official_api_documentation": 'https://bandcamp.com/developer',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
 categories = ['music']
 paging = True

@@ -9,9 +9,9 @@ from searx.utils import searx_useragent

 # about
 about = {
-    "website": 'https://photon.komoot.de',
+    "website": 'https://photon.komoot.io',
     "wikidata_id": None,
-    "official_api_documentation": 'https://photon.komoot.de/',
+    "official_api_documentation": 'https://photon.komoot.io/',
     "use_official_api": True,
     "require_api_key": False,
     "results": 'JSON',

@@ -1618,7 +1618,7 @@ engines:
     categories: general
     about:
       website: https://brave.com/search/
-      wikidata_id: Q22906900
+      wikidata_id: Q107355971
       use_official_api: false
       require_api_key: false
       results: HTML

@@ -1309,6 +1309,7 @@ input.cursor-text {
   font-size: 14px;
   font-weight: normal;
   z-index: 1000000;
+  max-width: 40rem;
 }
 td:hover .engine-tooltip,
 th:hover .engine-tooltip,


@@ -1336,6 +1336,7 @@ input.cursor-text {
   font-size: 14px;
   font-weight: normal;
   z-index: 1000000;
+  max-width: 40rem;
 }
 td:hover .engine-tooltip,
 th:hover .engine-tooltip,


@@ -299,6 +299,29 @@ $(document).ready(function(){
         $( this ).off( event );
     });
 });
+;$(document).ready(function(){
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+        if (engine_descriptions == null) {
+            $.ajax("engine_descriptions.json", dataType="json").done(function(data) {
+                engine_descriptions = data;
+                for (const [engine_name, description] of Object.entries(data)) {
+                    let elements = $('[data-engine-name="' + engine_name + '"] .description');
+                    for(const element of elements) {
+                        let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+                        element.innerHTML = description[0] + source;
+                    }
+                }
+            });
+        }
+    }
+    if (document.querySelector('body[class="preferences_endpoint"]')) {
+        $('[data-engine-name]').hover(function() {
+            load_engine_descriptions();
+        });
+    }
+});
 ;$(document).ready(function(){
     $("#allow-all-engines").click(function() {
         $(".onoffswitch-checkbox").each(function() { this.checked = false;});


@@ -0,0 +1,23 @@
+$(document).ready(function(){
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+        if (engine_descriptions == null) {
+            $.ajax("engine_descriptions.json", dataType="json").done(function(data) {
+                engine_descriptions = data;
+                for (const [engine_name, description] of Object.entries(data)) {
+                    let elements = $('[data-engine-name="' + engine_name + '"] .description');
+                    for(const element of elements) {
+                        let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+                        element.innerHTML = description[0] + source;
+                    }
+                }
+            });
+        }
+    }
+    if (document.querySelector('body[class="preferences_endpoint"]')) {
+        $('[data-engine-name]').hover(function() {
+            load_engine_descriptions();
+        });
+    }
+});
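This handler lazy-loads the descriptions: the JSON is fetched once, on the first hover over a row carrying data-engine-name, cached in engine_descriptions, and written into each tooltip's .description paragraph. A sketch of the string it assembles, under a hypothetical payload (Python used here for illustration only):

# hypothetical /engine_descriptions.json entry
data = {'exampleengine': ['An example search engine', 'wikidata']}
description = data['exampleengine']
# mirrors: element.innerHTML = description[0] + source
html = description[0] + ' (<i>' + 'Source' + ':&nbsp;' + description[1] + '</i>)'
assert html == 'An example search engine (<i>Source:&nbsp;wikidata</i>)'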

@@ -25,6 +25,7 @@ input.cursor-text {
   font-size: 14px;
   font-weight: normal;
   z-index: 1000000;
+  max-width: 40rem;
 }
 td:hover .engine-tooltip, th:hover .engine-tooltip, .engine-tooltip:hover {

@@ -1929,6 +1929,14 @@ td:hover .engine-tooltip,
 #main_preferences div.selectable_url pre {
   width: 100%;
 }
+#main_preferences th.name .engine-tooltip {
+  margin-top: 1.8rem;
+  left: 20rem;
+  max-width: 40rem;
+}
+#main_preferences th.name .engine-tooltip .engine-description {
+  margin-top: 0.5rem;
+}
 @media screen and (max-width: 75em) {
   .preferences_back {
     clear: both;


@@ -1929,6 +1929,14 @@ td:hover .engine-tooltip,
 #main_preferences div.selectable_url pre {
   width: 100%;
 }
+#main_preferences th.name .engine-tooltip {
+  margin-top: 1.8rem;
+  left: 20rem;
+  max-width: 40rem;
+}
+#main_preferences th.name .engine-tooltip .engine-description {
+  margin-top: 0.5rem;
+}
 @media screen and (max-width: 75em) {
   .preferences_back {
     clear: both;


@@ -621,6 +621,33 @@ searx.ready(function() {
     });
   });
 })(window, document, window.searx);
+;(function (w, d, searx) {
+  'use strict';
+  searx.ready(function() {
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+      if (engine_descriptions == null) {
+        searx.http("GET", "engine_descriptions.json").then(function(content) {
+          engine_descriptions = JSON.parse(content);
+          for (const [engine_name, description] of Object.entries(engine_descriptions)) {
+            let elements = d.querySelectorAll('[data-engine-name="' + engine_name + '"] .engine-description');
+            for(const element of elements) {
+              let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+              element.innerHTML = description[0] + source;
+            }
+          }
+        });
+      }
+    }
+    if (d.querySelector('body[class="preferences_endpoint"]')) {
+      for(const el of d.querySelectorAll('[data-engine-name]')) {
+        searx.on(el, 'mouseenter', load_engine_descriptions);
+      }
+    }
+  });
+})(window, document, window.searx);
 ;/**
  * searx is free software: you can redistribute it and/or modify
  * it under the terms of the GNU Affero General Public License as published by


@@ -0,0 +1,27 @@
+(function (w, d, searx) {
+  'use strict';
+  searx.ready(function() {
+    let engine_descriptions = null;
+    function load_engine_descriptions() {
+      if (engine_descriptions == null) {
+        searx.http("GET", "engine_descriptions.json").then(function(content) {
+          engine_descriptions = JSON.parse(content);
+          for (const [engine_name, description] of Object.entries(engine_descriptions)) {
+            let elements = d.querySelectorAll('[data-engine-name="' + engine_name + '"] .engine-description');
+            for(const element of elements) {
+              let source = ' (<i>' + searx.translations['Source'] + ':&nbsp;' + description[1] + '</i>)';
+              element.innerHTML = description[0] + source;
+            }
+          }
+        });
+      }
+    }
+    if (d.querySelector('body[class="preferences_endpoint"]')) {
+      for(const el of d.querySelectorAll('[data-engine-name]')) {
+        searx.on(el, 'mouseenter', load_engine_descriptions);
+      }
+    }
+  });
+})(window, document, window.searx);

@@ -108,6 +108,18 @@
       width: 100%;
     }
   }
+  th.name {
+    .engine-tooltip {
+      margin-top: 1.8rem;
+      left: 20rem;
+      max-width: 40rem;
+      .engine-description {
+        margin-top: 0.5rem;
+      }
+    }
+  }
 }

 @media screen and (max-width: 75em) {

@@ -4,6 +4,7 @@
 {% if search_engine.about is defined or stats[search_engine.name]['result_count'] > 0 %}
 {% set about = search_engine.about %}
 <div class="engine-tooltip" role="tooltip" id="{{ id }}">{{- "" -}}
+    <p class="description"></p>
     {% if search_engine.about is defined %}
     <h5><a href="{{about.website}}" rel="noreferrer">{{about.website}}</a></h5>
     {%- if about.wikidata_id -%}<p><a href="https://www.wikidata.org/wiki/{{about.wikidata_id}}" rel="noreferrer">wikidata.org/wiki/{{about.wikidata_id}}</a></p>{%- endif -%}
@@ -343,7 +344,7 @@
 <td class="onoff-checkbox">
 {{- checkbox_toggle('engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_'), (search_engine.name, categ) in disabled_engines) -}}
 </td>
-<th scope="row"><span aria-labelledby="{{ 'tooltip_' + categ + '_' + search_engine.name }}">
+<th scope="row" data-engine-name="{{ search_engine.name }}"><span aria-labelledby="{{ 'tooltip_' + categ + '_' + search_engine.name }}">
 {%- if search_engine.enable_http %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif -%}
 {{- search_engine.name -}}</span>
 {{- engine_about(search_engine, 'tooltip_' + categ + '_' + search_engine.name) -}}
@@ -363,7 +364,7 @@
 <td>{{ support_toggle(supports[search_engine.name]['safesearch']) }}</td>
 <td>{{ support_toggle(supports[search_engine.name]['supports_selected_language']) }}</td>
 <td>{{ shortcuts[search_engine.name] }}</td>
-<th scope="row"><span>{% if search_engine.enable_http %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif %}{{ search_engine.name }}</span>{{ engine_about(search_engine) }}</th>
+<th scope="row" data-engine-name="{{ search_engine.name }}"><span>{% if search_engine.enable_http %}{{ icon('exclamation-sign', 'No HTTPS') }}{% endif %}{{ search_engine.name }}</span>{{ engine_about(search_engine) }}</th>
 <td class="onoff-checkbox">
 {{ checkbox_toggle('engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_'), (search_engine.name, categ) in disabled_engines) }}
 </td>

@@ -22,6 +22,7 @@
 {% if search_engine.about is defined %}
 {% set about = search_engine.about %}
 <div class="engine-tooltip" role="tooltip">{{- "" -}}
+    <p class="engine-description"></p>
     <p><a href="{{about.website}}" rel="noreferrer">{{about.website}}</a></p>
     {%- if about.wikidata_id -%}<p><a href="https://www.wikidata.org/wiki/{{about.wikidata_id}}" rel="noreferrer">wikidata.org/wiki/{{about.wikidata_id}}</a></p>{%- endif -%}
     {%- if search_engine.enable_http %}<p>{{ icon('exclamation-sign', 'No HTTPS') }}{{ _('No HTTPS')}}</p>{% endif -%}
@@ -262,7 +263,7 @@
 {% set engine_id = 'engine_' + search_engine.name|replace(' ', '_') + '__' + categ|replace(' ', '_') %}
 <tr>
 <td class="engine_checkbox">{{ checkbox_onoff(engine_id, (search_engine.name, categ) in disabled_engines) }}</td>
-<th class="name">{% if search_engine.enable_http %}{{ icon('warning', 'No HTTPS') }}{% endif %} {{ search_engine.name }} {{ engine_about(search_engine) }}</th>
+<th class="name" data-engine-name="{{ search_engine.name }}">{% if search_engine.enable_http %}{{ icon('warning', 'No HTTPS') }}{% endif %} {{ search_engine.name }} {{ engine_about(search_engine) }}</th>
 <td class="shortcut">{{ shortcuts[search_engine.name] }}</td>
 <td>{{ checkbox(engine_id + '_supported_languages', supports[search_engine.name]['supports_selected_language'], true, true) }}</td>
 <td>{{ checkbox(engine_id + '_safesearch', supports[search_engine.name]['safesearch'], true, true) }}</td>

@@ -54,6 +54,7 @@ from searx import (
     settings,
     searx_debug,
 )
+from searx.data import ENGINE_DESCRIPTIONS
 from searx.settings_defaults import OUTPUT_FORMATS
 from searx.settings_loader import get_default_settings_path
 from searx.exceptions import SearxParameterException

@@ -393,7 +394,9 @@ def image_proxify(url):
 def get_translations():
     return {
         # when there is autocompletion
-        'no_item_found': gettext('No item found')
+        'no_item_found': gettext('No item found'),
+        # /preferences: the source of the engine description (wikipedia, wikidata, website)
+        'Source': gettext('Source'),
     }

@@ -1140,6 +1143,23 @@ def image_proxy():
         return '', 400


+@app.route('/engine_descriptions.json', methods=['GET'])
+def engine_descriptions():
+    locale = get_locale().split('_')[0]
+    result = ENGINE_DESCRIPTIONS['en'].copy()
+    if locale != 'en':
+        for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
+            result[engine] = description
+    for engine, description in result.items():
+        if len(description) == 2 and description[1] == 'ref':
+            ref_engine, ref_lang = description[0].split(':')
+            description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
+        if isinstance(description, str):
+            description = [description, 'wikipedia']
+        result[engine] = description
+    return jsonify(result)
+
+
 @app.route('/stats', methods=['GET'])
 def stats():
     """Render engine statistics page."""

@@ -1,15 +1,19 @@
 #!/usr/bin/env python
+import sys
 import json
-from urllib.parse import quote, urlparse
-import detect_language
+from urllib.parse import urlparse
+from os.path import join
 from lxml.html import fromstring
+from langdetect import detect_langs
+from langdetect.lang_detect_exception import LangDetectException
 from searx.engines import wikidata, set_loggers
-from searx.utils import extract_text
+from searx.utils import extract_text, match_language
 from searx.locales import LOCALE_NAMES
-import searx
+from searx import searx_dir
+from searx.utils import gen_useragent
 import searx.search
 import searx.network
@@ -18,6 +22,7 @@ set_loggers(wikidata, 'wikidata')
 SPARQL_WIKIPEDIA_ARTICLE = """
 SELECT DISTINCT ?item ?name
 WHERE {
+  hint:Query hint:optimizer "None".
   VALUES ?item { %IDS% }
   ?article schema:about ?item ;
            schema:inLanguage ?lang ;
@@ -38,8 +43,23 @@ WHERE {
 ORDER BY ?itemLang
 """

+NOT_A_DESCRIPTION = [
+    'web site',
+    'site web',
+    'komputa serĉilo',
+    'interreta serĉilo',
+    'bilaketa motor',
+    'web search engine',
+    'wikimedia täpsustuslehekülg',
+]
+
+SKIP_ENGINE_SOURCE = [
+    ('gitlab', 'wikidata')  # descriptions are about wikipedia disambiguation pages
+]
+
 LANGUAGES = LOCALE_NAMES.keys()
-LANGUAGES_SPARQL = ', '.join(set(map(lambda l: repr(l.split('_')[0]), LANGUAGES)))
+WIKIPEDIA_LANGUAGES = {'language': 'wikipedia_language'}
+LANGUAGES_SPARQL = ''
 IDS = None

 descriptions = {}
@@ -54,15 +74,30 @@ def normalize_description(description):
 def update_description(engine_name, lang, description, source, replace=True):
+    if not isinstance(description, str):
+        return
+    description = normalize_description(description)
+    if description.lower() == engine_name.lower():
+        return
+    if description.lower() in NOT_A_DESCRIPTION:
+        return
+    if (engine_name, source) in SKIP_ENGINE_SOURCE:
+        return
+    if ' ' not in description:
+        # skip unique word description (like "website")
+        return
     if replace or lang not in descriptions[engine_name]:
-        descriptions[engine_name][lang] = [normalize_description(description), source]
+        descriptions[engine_name][lang] = [description, source]


-def get_wikipedia_summary(language, pageid):
-    search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
-    url = search_url.format(title=quote(pageid), language=language)
+def get_wikipedia_summary(lang, pageid):
+    params = {
+        'language': lang.replace('_','-'),
+        'headers': {}
+    }
+    searx.engines.engines['wikipedia'].request(pageid, params)
     try:
-        response = searx.network.get(url)
+        response = searx.network.get(params['url'], headers=params['headers'], timeout=10)
         response.raise_for_status()
         api_result = json.loads(response.text)
         return api_result.get('extract')
@@ -71,15 +106,19 @@ def get_wikipedia_summary(language, pageid):
 def detect_language(text):
-    r = cld3.get_language(str(text))  # pylint: disable=E1101
-    if r is not None and r.probability >= 0.98 and r.is_reliable:
-        return r.language
+    try:
+        r = detect_langs(str(text))  # pylint: disable=E1101
+    except LangDetectException:
+        return None
+    if len(r) > 0 and r[0].prob > 0.95:
+        return r[0].lang
     return None


 def get_website_description(url, lang1, lang2=None):
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
+        'User-Agent': gen_useragent(),
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'DNT': '1',
         'Upgrade-Insecure-Requests': '1',
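For reference, langdetect (which replaces cld3 here) returns a ranked list of guesses; a small usage sketch, with approximate output values:

from langdetect import detect_langs

r = detect_langs('Photon is an open source geocoder built for OpenStreetMap data')
# r is a list like [en:0.9999]; the script accepts r[0].lang only when r[0].prob > 0.95
print(r[0].lang, r[0].prob)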
@@ -117,8 +156,15 @@ def get_website_description(url, lang1, lang2=None):
 def initialize():
-    global descriptions, wd_to_engine_name, IDS
+    global IDS, WIKIPEDIA_LANGUAGES, LANGUAGES_SPARQL
     searx.search.initialize()
+    wikipedia_engine = searx.engines.engines['wikipedia']
+    WIKIPEDIA_LANGUAGES = {
+        language: wikipedia_engine.url_lang(language.replace('_', '-'))
+        for language in LANGUAGES
+    }
+    WIKIPEDIA_LANGUAGES['nb_NO'] = 'no'
+    LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
     for engine_name, engine in searx.engines.engines.items():
         descriptions[engine_name] = {}
         wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
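To make the runtime values concrete: assuming the wikipedia engine maps 'en-US' to 'en' and Norwegian Bokmål is pinned to 'no' as above, initialize() ends up with something like the following (locale keys are illustrative):

WIKIPEDIA_LANGUAGES = {'en': 'en', 'en_US': 'en', 'nb_NO': 'no', 'zh': 'zh'}  # hypothetical
LANGUAGES_SPARQL = ', '.join(f"'{l}'" for l in set(WIKIPEDIA_LANGUAGES.values()))
# LANGUAGES_SPARQL is then e.g. "'zh', 'no', 'en'" (set order is arbitrary),
# ready to be interpolated into the language filters of the SPARQL queries above.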
@@ -129,7 +175,7 @@ def initialize():
 def fetch_wikidata_descriptions():
-    global IDS
+    searx.network.set_timeout_for_thread(60)
     result = wikidata.send_wikidata_query(
         SPARQL_DESCRIPTION
         .replace('%IDS%', IDS)
@@ -138,15 +184,15 @@ def fetch_wikidata_descriptions():
     if result is not None:
         for binding in result['results']['bindings']:
             wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
-            lang = binding['itemDescription']['xml:lang']
+            wikidata_lang = binding['itemDescription']['xml:lang']
             description = binding['itemDescription']['value']
-            if ' ' in description:  # skip unique word description (like "website")
-                for engine_name in wd_to_engine_name[wikidata_id]:
-                    update_description(engine_name, lang, description, 'wikidata')
+            for engine_name in wd_to_engine_name[wikidata_id]:
+                for lang in LANGUAGES:
+                    if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang:
+                        update_description(engine_name, lang, description, 'wikidata')


 def fetch_wikipedia_descriptions():
-    global IDS
     result = wikidata.send_wikidata_query(
         SPARQL_WIKIPEDIA_ARTICLE
         .replace('%IDS%', IDS)
@@ -155,12 +201,13 @@ def fetch_wikipedia_descriptions():
     if result is not None:
         for binding in result['results']['bindings']:
             wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
-            lang = binding['name']['xml:lang']
+            wikidata_lang = binding['name']['xml:lang']
             pageid = binding['name']['value']
-            description = get_wikipedia_summary(lang, pageid)
-            if description is not None and ' ' in description:
-                for engine_name in wd_to_engine_name[wikidata_id]:
-                    update_description(engine_name, lang, description, 'wikipedia')
+            for engine_name in wd_to_engine_name[wikidata_id]:
+                for lang in LANGUAGES:
+                    if WIKIPEDIA_LANGUAGES[lang] == wikidata_lang:
+                        description = get_wikipedia_summary(lang, pageid)
+                        update_description(engine_name, lang, description, 'wikipedia')


 def normalize_url(url):
@@ -173,36 +220,96 @@ def normalize_url(url):
 def fetch_website_description(engine_name, website):
     default_lang, default_description = get_website_description(website, None, None)
     if default_lang is None or default_description is None:
+        # the front page can't be fetched: skip this engine
         return
-    if default_lang not in descriptions[engine_name]:
-        descriptions[engine_name][default_lang] = [normalize_description(default_description), website]
-    for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
-        if request_lang.split('-')[0] not in descriptions[engine_name]:
-            lang, desc = get_website_description(website, request_lang, request_lang.split('-')[0])
-            if desc is not None and desc != default_description:
-                update_description(engine_name, lang, desc, website, replace=False)
-            else:
-                break
+
+    wikipedia_languages_r = { V: K for K, V in WIKIPEDIA_LANGUAGES.items() }
+    languages = ['en', 'es', 'pt', 'ru', 'tr', 'fr']
+    languages = languages + [ l for l in LANGUAGES if l not in languages]
+
+    previous_matched_lang = None
+    previous_count = 0
+    for lang in languages:
+        if lang not in descriptions[engine_name]:
+            fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
+            if fetched_lang is None or desc is None:
+                continue
+            matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None)
+            if matched_lang is None:
+                fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
+                matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
+            if matched_lang is not None:
+                update_description(engine_name, matched_lang, desc, website, replace=False)
+            # check if desc changed with the different lang values
+            if matched_lang == previous_matched_lang:
+                previous_count += 1
+                if previous_count == 6:
+                    # the website has returned the same description for 6 different languages in Accept-Language header
+                    # stop now
+                    break
+            else:
+                previous_matched_lang = matched_lang
+                previous_count = 0


 def fetch_website_descriptions():
     for engine_name, engine in searx.engines.engines.items():
         website = getattr(engine, "about", {}).get('website')
-        if website is None:
+        if website is None and hasattr(engine, "search_url"):
             website = normalize_url(getattr(engine, "search_url"))
-        if website is None:
+        if website is None and hasattr(engine, "base_url"):
             website = normalize_url(getattr(engine, "base_url"))
         if website is not None:
             fetch_website_description(engine_name, website)
+
+
+def get_engine_descriptions_filename():
+    return join(join(searx_dir, "data"), "engine_descriptions.json")
+
+
+def get_output():
+    """
+    From descriptions[engine][language] = [description, source]
+    To
+
+    * output[language][engine] = description_and_source
+    * description_and_source can be:
+       * [description, source]
+       * description (if source = "wikipedia")
+       * [f"engine:lang", "ref"] (reference to another existing description)
+    """
+    output = {
+        locale: {} for locale in LOCALE_NAMES
+    }
+
+    seen_descriptions = {}
+
+    for engine_name, lang_descriptions in descriptions.items():
+        for language, description in lang_descriptions.items():
+            if description[0] in seen_descriptions:
+                ref = seen_descriptions[description[0]]
+                description = [f'{ref[0]}:{ref[1]}', 'ref']
+            else:
+                seen_descriptions[description[0]] = (engine_name, language)
+            if description[1] == 'wikipedia':
+                description = description[0]
+            output.setdefault(language, {}).setdefault(engine_name, description)
+
+    return output
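A small illustration of the deduplication step, with two made-up engines sharing the same wikipedia text:

descriptions = {
    'foo': {'en': ['A privacy-friendly search engine', 'wikipedia']},
    'foo-light': {'en': ['A privacy-friendly search engine', 'wikipedia']},
}
# get_output() keeps the first occurrence, shortens wikipedia entries to a plain
# string, and turns the duplicate into a reference:
# {'en': {'foo': 'A privacy-friendly search engine',
#         'foo-light': ['foo:en', 'ref']},
#  ...}   # every other locale from LOCALE_NAMES maps to {}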
 def main():
     initialize()
+    print('Fetching wikidata descriptions')
     fetch_wikidata_descriptions()
+    print('Fetching wikipedia descriptions')
     fetch_wikipedia_descriptions()
+    print('Fetching website descriptions')
     fetch_website_descriptions()
-    sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False))
+
+    output = get_output()
+    with open(get_engine_descriptions_filename(), 'w', encoding='utf8') as f:
+        f.write(json.dumps(output, indent=1, separators=(',', ':'), ensure_ascii=False))


 if __name__ == "__main__":