From a4dcfa025c690dc4c824b2261242748a331a97e8 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Wed, 13 Jan 2021 11:31:25 +0100 Subject: [PATCH] [enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information --- searx/engines/1337x.py | 14 ++ searx/engines/acgsou.py | 18 +- searx/engines/ahmia.py | 19 ++- searx/engines/apkmirror.py | 17 +- searx/engines/archlinux.py | 20 ++- searx/engines/arxiv.py | 19 ++- searx/engines/base.py | 21 ++- searx/engines/bing.py | 22 +-- searx/engines/bing_images.py | 21 +-- searx/engines/bing_news.py | 20 ++- searx/engines/bing_videos.py | 19 ++- searx/engines/btdigg.py | 22 ++- searx/engines/command.py | 19 +-- searx/engines/currency_convert.py | 14 ++ searx/engines/dailymotion.py | 21 +-- searx/engines/deezer.py | 19 ++- searx/engines/deviantart.py | 21 +-- searx/engines/dictzone.py | 17 +- searx/engines/digbt.py | 18 +- searx/engines/digg.py | 19 ++- searx/engines/doku.py | 24 +-- searx/engines/duckduckgo.py | 22 +-- searx/engines/duckduckgo_definitions.py | 20 ++- searx/engines/duckduckgo_images.py | 25 +-- searx/engines/duden.py | 17 +- searx/engines/dummy-offline.py | 14 +- searx/engines/dummy.py | 14 +- searx/engines/ebay.py | 23 ++- searx/engines/elasticsearch.py | 5 + searx/engines/etools.py | 18 +- searx/engines/fdroid.py | 18 +- searx/engines/flickr.py | 20 ++- searx/engines/flickr_noapi.py | 23 +-- searx/engines/framalibre.py | 19 ++- searx/engines/frinkiac.py | 23 ++- searx/engines/genius.py | 21 ++- searx/engines/gentoo.py | 20 ++- searx/engines/gigablast.py | 18 +- searx/engines/github.py | 21 ++- searx/engines/google.py | 26 +-- searx/engines/google_images.py | 22 ++- searx/engines/google_news.py | 19 ++- searx/engines/google_videos.py | 19 ++- searx/engines/ina.py | 25 +-- searx/engines/invidious.py | 23 ++- searx/engines/json_engine.py | 2 + searx/engines/kickass.py | 19 ++- searx/engines/mediawiki.py | 23 +-- 
searx/engines/microsoft_academic.py | 20 ++- searx/engines/mixcloud.py | 19 ++- searx/engines/not_evil.py | 19 ++- searx/engines/nyaa.py | 18 +- searx/engines/opensemantic.py | 22 ++- searx/engines/openstreetmap.py | 19 ++- searx/engines/pdbe.py | 19 ++- searx/engines/peertube.py | 21 +-- searx/engines/photon.py | 19 ++- searx/engines/piratebay.py | 23 ++- searx/engines/pubmed.py | 22 ++- searx/engines/qwant.py | 18 +- searx/engines/recoll.py | 17 +- searx/engines/reddit.py | 19 ++- searx/engines/scanr_structures.py | 19 ++- searx/engines/searchcode_code.py | 20 ++- searx/engines/searx_engine.py | 18 +- searx/engines/sepiasearch.py | 22 ++- searx/engines/soundcloud.py | 18 +- searx/engines/spotify.py | 19 ++- searx/engines/stackoverflow.py | 21 ++- searx/engines/startpage.py | 25 +-- searx/engines/tokyotoshokan.py | 19 ++- searx/engines/torrentz.py | 20 ++- searx/engines/translated.py | 18 +- searx/engines/unsplash.py | 19 ++- searx/engines/vimeo.py | 27 +-- searx/engines/wikidata.py | 20 ++- searx/engines/wikipedia.py | 19 ++- searx/engines/wolframalpha_api.py | 23 ++- searx/engines/wolframalpha_noapi.py | 23 ++- searx/engines/www1x.py | 19 ++- searx/engines/xpath.py | 2 + searx/engines/yacy.py | 27 +-- searx/engines/yahoo.py | 20 ++- searx/engines/yahoo_news.py | 24 +-- searx/engines/yandex.py | 18 +- searx/engines/yggtorrent.py | 23 ++- searx/engines/youtube_api.py | 23 ++- searx/engines/youtube_noapi.py | 23 ++- searx/settings.yml | 213 ++++++++++++++++++++++++ utils/fetch_engine_descriptions.py | 206 +++++++++++++++++++++++ 90 files changed, 1421 insertions(+), 725 deletions(-) create mode 100644 utils/fetch_engine_descriptions.py diff --git a/searx/engines/1337x.py b/searx/engines/1337x.py index 18478876a..9cc7c1b79 100644 --- a/searx/engines/1337x.py +++ b/searx/engines/1337x.py @@ -1,7 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + 1337x +""" + from urllib.parse import quote, urljoin from lxml import html from searx.utils import extract_text, 
get_torrent_size, eval_xpath, eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://1337x.to/', + "wikidata_id": 'Q28134166', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} url = 'https://1337x.to/' search_url = url + 'search/{search_term}/{pageno}/' diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py index 637443edc..ea9793f10 100644 --- a/searx/engines/acgsou.py +++ b/searx/engines/acgsou.py @@ -1,18 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Acgsou (Japanese Animation/Music/Comics Bittorrent tracker) - - @website https://www.acgsou.com/ - @provide-api no - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content, seed, leech, torrentfile """ from urllib.parse import urlencode from lxml import html from searx.utils import extract_text, get_torrent_size, eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://www.acgsou.com/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['files', 'images', 'videos', 'music'] paging = True diff --git a/searx/engines/ahmia.py b/searx/engines/ahmia.py index 7a2ae0075..6c502bb40 100644 --- a/searx/engines/ahmia.py +++ b/searx/engines/ahmia.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Ahmia (Onions) - - @website http://msydqstlz2kzerdg.onion - @provides-api no - - @using-api no - @results HTML - @stable no - @parse url, title, content """ from urllib.parse import urlencode, urlparse, parse_qs from lxml.html import fromstring from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath +# about +about = { + "website": 'http://msydqstlz2kzerdg.onion', + "wikidata_id": 'Q18693938', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, 
+ "results": 'HTML', +} + # engine config categories = ['onions'] paging = True diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py index 3a948dcb4..a4c66e891 100644 --- a/searx/engines/apkmirror.py +++ b/searx/engines/apkmirror.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ APK Mirror - - @website https://www.apkmirror.com - - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, thumbnail_src """ from urllib.parse import urlencode from lxml import html from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://www.apkmirror.com', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} # engine dependent config categories = ['it'] diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index 04117c07d..d29d65ba3 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -1,20 +1,24 @@ -# -*- coding: utf-8 -*- - +# SPDX-License-Identifier: AGPL-3.0-or-later """ Arch Linux Wiki - @website https://wiki.archlinux.org - @provide-api no (Mediawiki provides API, but Arch Wiki blocks access to it - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title + API: Mediawiki provides API, but Arch Wiki blocks access to it """ from urllib.parse import urlencode, urljoin from lxml import html from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://wiki.archlinux.org/', + "wikidata_id": 'Q101445877', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['it'] language_support = True diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index 1190de363..09ea07ea5 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -1,20 +1,21 @@ 
-#!/usr/bin/env python - +# SPDX-License-Identifier: AGPL-3.0-or-later """ ArXiV (Scientific preprints) - @website https://arxiv.org - @provide-api yes (export.arxiv.org/api/query) - @using-api yes - @results XML-RSS - @stable yes - @parse url, title, publishedDate, content - More info on api: https://arxiv.org/help/api/user-manual """ from lxml import html from datetime import datetime from searx.utils import eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://arxiv.org', + "wikidata_id": 'Q118398', + "official_api_documentation": 'https://arxiv.org/help/api', + "use_official_api": True, + "require_api_key": False, + "results": 'XML-RSS', +} categories = ['science'] paging = True diff --git a/searx/engines/base.py b/searx/engines/base.py index 3648d7ed0..463274681 100755 --- a/searx/engines/base.py +++ b/searx/engines/base.py @@ -1,16 +1,6 @@ -#!/usr/bin/env python - +# SPDX-License-Identifier: AGPL-3.0-or-later """ BASE (Scholar publications) - - @website https://base-search.net - @provide-api yes with authorization (https://api.base-search.net/) - - @using-api yes - @results XML - @stable ? - @parse url, title, publishedDate, content - More info on api: http://base-search.net/about/download/base_interface.pdf """ from urllib.parse import urlencode @@ -19,6 +9,15 @@ from datetime import datetime import re from searx.utils import searx_useragent +# about +about = { + "website": 'https://base-search.net', + "wikidata_id": 'Q448335', + "official_api_documentation": 'https://api.base-search.net/', + "use_official_api": True, + "require_api_key": False, + "results": 'XML', +} categories = ['science'] diff --git a/searx/engines/bing.py b/searx/engines/bing.py index f0882fcc9..edf6baef9 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -1,16 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Bing (Web) - - @website https://www.bing.com - @provide-api yes (http://datamarket.azure.com/dataset/bing/search), - max. 
5000 query/month - - @using-api no (because of query limit) - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content - - @todo publishedDate """ import re @@ -21,6 +11,16 @@ from searx.utils import eval_xpath, extract_text, match_language logger = logger.getChild('bing engine') +# about +about = { + "website": 'https://www.bing.com', + "wikidata_id": 'Q182496', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] paging = True diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 2bcf82b84..b4ca57f4b 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -1,15 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Bing (Images) - - @website https://www.bing.com/images - @provide-api yes (http://datamarket.azure.com/dataset/bing/search), - max. 
5000 query/month - - @using-api no (because of query limit) - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, img_src - """ from urllib.parse import urlencode @@ -20,6 +11,16 @@ from searx.utils import match_language from searx.engines.bing import language_aliases from searx.engines.bing import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import +# about +about = { + "website": 'https://www.bing.com/images', + "wikidata_id": 'Q182496', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['images'] paging = True diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index b95def48b..2e4b78278 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -1,14 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Bing (News) - - @website https://www.bing.com/news - @provide-api yes (http://datamarket.azure.com/dataset/bing/search), - max. 
5000 query/month - - @using-api no (because of query limit) - @results RSS (using search portal) - @stable yes (except perhaps for the images) - @parse url, title, content, publishedDate, thumbnail """ from datetime import datetime @@ -20,6 +12,16 @@ from searx.utils import match_language, eval_xpath_getindex from searx.engines.bing import language_aliases from searx.engines.bing import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import +# about +about = { + "website": 'https://www.bing.com/news', + "wikidata_id": 'Q2878637', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'RSS', +} + # engine dependent config categories = ['news'] paging = True diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index 143c71a3e..b4584bb37 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Bing (Videos) - - @website https://www.bing.com/videos - @provide-api yes (http://datamarket.azure.com/dataset/bing/search) - - @using-api no - @results HTML - @stable no - @parse url, title, content, thumbnail """ from json import loads @@ -18,6 +11,16 @@ from searx.utils import match_language from searx.engines.bing import language_aliases from searx.engines.bing import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import +# about +about = { + "website": 'https://www.bing.com/videos', + "wikidata_id": 'Q4914152', + "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + categories = ['videos'] paging = True safesearch = True diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py index 72bda8d20..863396f6e 100644 --- a/searx/engines/btdigg.py +++ 
b/searx/engines/btdigg.py @@ -1,19 +1,25 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ BTDigg (Videos, Music, Files) - - @website https://btdig.com - @provide-api yes (on demand) - - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content, seed, leech, magnetlink """ from lxml import html from urllib.parse import quote, urljoin from searx.utils import extract_text, get_torrent_size +# about +about = { + "website": 'https://btdig.com', + "wikidata_id": 'Q4836698', + "official_api_documentation": { + 'url': 'https://btdig.com/contacts', + 'comment': 'on demand' + }, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['videos', 'music', 'files'] paging = True diff --git a/searx/engines/command.py b/searx/engines/command.py index 6321e0004..33270d245 100644 --- a/searx/engines/command.py +++ b/searx/engines/command.py @@ -1,18 +1,7 @@ -''' -searx is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -searx is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with searx. If not, see < http://www.gnu.org/licenses/ >. 
-''' - +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Command (offline) +""" import re from os.path import expanduser, isabs, realpath, commonprefix diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index 7098dd3c7..d4c3b5f81 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -1,5 +1,19 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + currency convert (DuckDuckGo) +""" + import json +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": False, + "require_api_key": False, + "results": 'JSONP', +} engine_type = 'online_currency' categories = [] diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 1e24e41da..874e0f42a 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -1,15 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Dailymotion (Videos) - - @website https://www.dailymotion.com - @provide-api yes (http://www.dailymotion.com/developer) - - @using-api yes - @results JSON - @stable yes - @parse url, title, thumbnail, publishedDate, embedded - - @todo set content-parameter with correct data """ from json import loads @@ -17,6 +8,16 @@ from datetime import datetime from urllib.parse import urlencode from searx.utils import match_language, html_to_text +# about +about = { + "website": 'https://www.dailymotion.com', + "wikidata_id": 'Q769222', + "official_api_documentation": 'https://www.dailymotion.com/developer', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['videos'] paging = True diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py index 48c0429a7..946bd3ebe 100644 --- a/searx/engines/deezer.py +++ b/searx/engines/deezer.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Deezer (Music) - - @website 
https://deezer.com - @provide-api yes (http://developers.deezer.com/api/) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, embedded """ from json import loads from urllib.parse import urlencode +# about +about = { + "website": 'https://deezer.com', + "wikidata_id": 'Q602243', + "official_api_documentation": 'https://developers.deezer.com/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['music'] paging = True diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index 0378929b2..7840495e1 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -1,21 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Deviantart (Images) - - @website https://www.deviantart.com/ - @provide-api yes (https://www.deviantart.com/developers/) (RSS) - - @using-api no (TODO, rewrite to api) - @results HTML - @stable no (HTML can change) - @parse url, title, img_src - - @todo rewrite to api """ # pylint: disable=missing-function-docstring from urllib.parse import urlencode from lxml import html +# about +about = { + "website": 'https://www.deviantart.com/', + "wikidata_id": 'Q46523', + "official_api_documentation": 'https://www.deviantart.com/developers/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['images'] paging = True diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 5e6f688a1..2483c0805 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Dictzone - - @website https://dictzone.com/ - @provide-api no - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content """ from urllib.parse import urljoin from lxml import html from searx.utils import eval_xpath +# about +about = { + "website": 
'https://dictzone.com/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} engine_type = 'online_dictionnary' categories = ['general'] diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py index b1a90fb2f..109662a49 100644 --- a/searx/engines/digbt.py +++ b/searx/engines/digbt.py @@ -1,19 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ DigBT (Videos, Music, Files) - - @website https://digbt.org - @provide-api no - - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content, magnetlink """ from urllib.parse import urljoin from lxml import html from searx.utils import extract_text, get_torrent_size +# about +about = { + "website": 'https://digbt.org', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} categories = ['videos', 'music', 'files'] paging = True diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 85f727f0d..defcacd20 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Digg (News, Social media) - - @website https://digg.com - @provide-api no - - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content, publishedDate, thumbnail """ # pylint: disable=missing-function-docstring @@ -17,6 +10,16 @@ from datetime import datetime from lxml import html +# about +about = { + "website": 'https://digg.com', + "wikidata_id": 'Q270478', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['news', 'social media'] paging = True diff --git a/searx/engines/doku.py b/searx/engines/doku.py index e1b10d664..ed1eab388 100644 --- a/searx/engines/doku.py +++ 
b/searx/engines/doku.py @@ -1,18 +1,22 @@ -# Doku Wiki -# -# @website https://www.dokuwiki.org/ -# @provide-api yes -# (https://www.dokuwiki.org/devel:xmlrpc) -# -# @using-api no -# @results HTML -# @stable yes -# @parse (general) url, title, content +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Doku Wiki +""" from urllib.parse import urlencode from lxml.html import fromstring from searx.utils import extract_text, eval_xpath +# about +about = { + "website": 'https://www.dokuwiki.org/', + "wikidata_id": 'Q851864', + "official_api_documentation": 'https://www.dokuwiki.org/devel:xmlrpc', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] # TODO , 'images', 'music', 'videos', 'files' paging = False diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index c1c984623..fc20de239 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,22 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ DuckDuckGo (Web) - - @website https://duckduckgo.com/ - @provide-api yes (https://duckduckgo.com/api), - but not all results from search-site - - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content - - @todo rewrite to api """ from lxml.html import fromstring from json import loads from searx.utils import extract_text, match_language, eval_xpath +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] paging = False diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 1d1c84b4b..0473b0a95 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,12 +1,6 @@ +# 
SPDX-License-Identifier: AGPL-3.0-or-later """ -DuckDuckGo (definitions) - -- `Instant Answer API`_ -- `DuckDuckGo query`_ - -.. _Instant Answer API: https://duckduckgo.com/api -.. _DuckDuckGo query: https://api.duckduckgo.com/?q=DuckDuckGo&format=json&pretty=1 - + DuckDuckGo (Instant Answer API) """ import json @@ -22,6 +16,16 @@ from searx.external_urls import get_external_url, get_earth_coordinates_url, are logger = logger.getChild('duckduckgo_definitions') +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": 'https://duckduckgo.com/api', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + URL = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 009f81cca..b5c2d4506 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -1,16 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ DuckDuckGo (Images) - - @website https://duckduckgo.com/ - @provide-api yes (https://duckduckgo.com/api), - but images are not supported - - @using-api no - @results JSON (site requires js to get images) - @stable no (JSON can change) - @parse url, title, img_src - - @todo avoid extra request """ from json import loads @@ -20,6 +10,19 @@ from searx.engines.duckduckgo import get_region_code from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import from searx.poolrequests import get +# about +about = { + "website": 'https://duckduckgo.com/', + "wikidata_id": 'Q12805', + "official_api_documentation": { + 'url': 'https://duckduckgo.com/api', + 'comment': 'but images are not supported', + }, + "use_official_api": False, + "require_api_key": False, + "results": 'JSON (site requires js to get images)', +} + # engine dependent config categories = ['images'] paging = True 
diff --git a/searx/engines/duden.py b/searx/engines/duden.py index 1475fb846..f1c9efd3f 100644 --- a/searx/engines/duden.py +++ b/searx/engines/duden.py @@ -1,11 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Duden - @website https://www.duden.de - @provide-api no - @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content """ import re @@ -13,6 +8,16 @@ from urllib.parse import quote, urljoin from lxml import html from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://www.duden.de', + "wikidata_id": 'Q73624591', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + categories = ['general'] paging = True language_support = False diff --git a/searx/engines/dummy-offline.py b/searx/engines/dummy-offline.py index 13a9ecc01..cf2f75312 100644 --- a/searx/engines/dummy-offline.py +++ b/searx/engines/dummy-offline.py @@ -1,11 +1,19 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Dummy Offline - - @results one result - @stable yes """ +# about +about = { + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + + def search(query, request_params): return [{ 'result': 'this is what you get', diff --git a/searx/engines/dummy.py b/searx/engines/dummy.py index 50b56ef78..1a1b57d8c 100644 --- a/searx/engines/dummy.py +++ b/searx/engines/dummy.py @@ -1,10 +1,18 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Dummy - - @results empty array - @stable yes """ +# about +about = { + "website": None, + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'empty array', +} + # do search-request def request(query, params): diff --git a/searx/engines/ebay.py b/searx/engines/ebay.py index e2e5ded6a..45c633b42 100644 
--- a/searx/engines/ebay.py +++ b/searx/engines/ebay.py @@ -1,17 +1,22 @@ -# Ebay (Videos, Music, Files) -# -# @website https://www.ebay.com -# @provide-api no (nothing found) -# -# @using-api no -# @results HTML (using search portal) -# @stable yes (HTML can change) -# @parse url, title, content, price, shipping, source +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Ebay (Videos, Music, Files) +""" from lxml import html from searx.engines.xpath import extract_text from urllib.parse import quote +# about +about = { + "website": 'https://www.ebay.com', + "wikidata_id": 'Q58024', + "official_api_documentation": 'https://developer.ebay.com/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + categories = ['shopping'] paging = True diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py index 0e2d35756..da7f98074 100644 --- a/searx/engines/elasticsearch.py +++ b/searx/engines/elasticsearch.py @@ -1,3 +1,8 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Elasticsearch +""" + from json import loads, dumps from requests.auth import HTTPBasicAuth from searx.exceptions import SearxEngineAPIException diff --git a/searx/engines/etools.py b/searx/engines/etools.py index a0762d1c7..77d7e71c6 100644 --- a/searx/engines/etools.py +++ b/searx/engines/etools.py @@ -1,18 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ eTools (Web) - - @website https://www.etools.ch - @provide-api no - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content """ from lxml import html from urllib.parse import quote from searx.utils import extract_text, eval_xpath +# about +about = { + "website": 'https://www.etools.ch', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + categories = ['general'] paging = False language_support = False diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index 
3d37db44e..8fff2e384 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -1,18 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ F-Droid (a repository of FOSS applications for Android) - - @website https://f-droid.org/ - @provide-api no - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content """ from urllib.parse import urlencode from lxml import html from searx.utils import extract_text +# about +about = { + "website": 'https://f-droid.org/', + "wikidata_id": 'Q1386210', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['files'] paging = True diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py index b23c447b8..b0ddf6224 100644 --- a/searx/engines/flickr.py +++ b/searx/engines/flickr.py @@ -1,21 +1,23 @@ -#!/usr/bin/env python - +# SPDX-License-Identifier: AGPL-3.0-or-later """ Flickr (Images) - @website https://www.flickr.com - @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html) - - @using-api yes - @results JSON - @stable yes - @parse url, title, thumbnail, img_src More info on api-key : https://www.flickr.com/services/apps/create/ """ from json import loads from urllib.parse import urlencode +# about +about = { + "website": 'https://www.flickr.com', + "wikidata_id": 'Q103204', + "official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html', + "use_official_api": True, + "require_api_key": True, + "results": 'JSON', +} + categories = ['images'] nb_per_page = 15 diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 4bcf837cb..a07aad51e 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -1,15 +1,6 @@ -#!/usr/bin/env python - +# SPDX-License-Identifier: AGPL-3.0-or-later """ - Flickr (Images) - - @website https://www.flickr.com - @provide-api yes 
(https://secure.flickr.com/services/api/flickr.photos.search.html) - - @using-api no - @results HTML - @stable no - @parse url, title, thumbnail, img_src + Flickr (Images) """ from json import loads @@ -21,6 +12,16 @@ from searx.utils import ecma_unescape, html_to_text logger = logger.getChild('flickr-noapi') +# about +about = { + "website": 'https://www.flickr.com', + "wikidata_id": 'Q103204', + "official_api_documentation": 'https://secure.flickr.com/services/api/flickr.photos.search.html', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + categories = ['images'] url = 'https://www.flickr.com/' diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py index e3d056425..42c08cf95 100644 --- a/searx/engines/framalibre.py +++ b/searx/engines/framalibre.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ FramaLibre (It) - - @website https://framalibre.org/ - @provide-api no - - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content, thumbnail, img_src """ from html import escape @@ -15,6 +8,16 @@ from urllib.parse import urljoin, urlencode from lxml import html from searx.utils import extract_text +# about +about = { + "website": 'https://framalibre.org/', + "wikidata_id": 'Q30213882', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['it'] paging = True diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py index 5b174a687..f43bb6e20 100644 --- a/searx/engines/frinkiac.py +++ b/searx/engines/frinkiac.py @@ -1,17 +1,24 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ -Frinkiac (Images) - -@website https://www.frinkiac.com -@provide-api no -@using-api no -@results JSON -@stable no -@parse url, title, img_src + Frinkiac (Images) """ from json import loads from urllib.parse import urlencode +# about +about = { + "website": 'https://frinkiac.com', + 
"wikidata_id": 'Q24882614', + "official_api_documentation": { + 'url': None, + 'comment': 'see https://github.com/MitchellAW/CompuGlobal' + }, + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + categories = ['images'] BASE = 'https://frinkiac.com/' diff --git a/searx/engines/genius.py b/searx/engines/genius.py index 2bfbfddf5..1667d529d 100644 --- a/searx/engines/genius.py +++ b/searx/engines/genius.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ -Genius - - @website https://www.genius.com/ - @provide-api yes (https://docs.genius.com/) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, thumbnail, publishedDate + Genius """ from json import loads from urllib.parse import urlencode from datetime import datetime +# about +about = { + "website": 'https://genius.com/', + "wikidata_id": 'Q3419343', + "official_api_documentation": 'https://docs.genius.com/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['music'] paging = True diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py index 16b3e692d..55f15576e 100644 --- a/searx/engines/gentoo.py +++ b/searx/engines/gentoo.py @@ -1,20 +1,22 @@ -# -*- coding: utf-8 -*- - +# SPDX-License-Identifier: AGPL-3.0-or-later """ Gentoo Wiki - - @website https://wiki.gentoo.org - @provide-api yes - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title """ from urllib.parse import urlencode, urljoin from lxml import html from searx.utils import extract_text +# about +about = { + "website": 'https://wiki.gentoo.org/', + "wikidata_id": 'Q1050637', + "official_api_documentation": 'https://wiki.gentoo.org/api.php', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['it'] language_support = True diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 
1d71b18e9..f5f89a736 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -1,14 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """ Gigablast (Web) - - @website https://gigablast.com - @provide-api yes (https://gigablast.com/api.html) - - @using-api yes - @results XML - @stable yes - @parse url, title, content """ # pylint: disable=missing-function-docstring, invalid-name @@ -18,6 +10,16 @@ from urllib.parse import urlencode # from searx import logger from searx.poolrequests import get +# about +about = { + "website": 'https://www.gigablast.com', + "wikidata_id": 'Q3105449', + "official_api_documentation": 'https://gigablast.com/api.html', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['general'] # gigablast's pagination is totally damaged, don't use it diff --git a/searx/engines/github.py b/searx/engines/github.py index 80b50ceda..b68caa350 100644 --- a/searx/engines/github.py +++ b/searx/engines/github.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ - Github (It) - - @website https://github.com/ - @provide-api yes (https://developer.github.com/v3/) - - @using-api yes - @results JSON - @stable yes (using api) - @parse url, title, content + Github (IT) """ from json import loads from urllib.parse import urlencode +# about +about = { + "website": 'https://github.com/', + "wikidata_id": 'Q364', + "official_api_documentation": 'https://developer.github.com/v3/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['it'] diff --git a/searx/engines/google.py b/searx/engines/google.py index 17ab21f6a..4198de640 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -1,19 +1,11 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """Google (Web) -:website: https://www.google.com -:provide-api: yes (https://developers.google.com/custom-search/) -:using-api: not the 
offical, since it needs registration to another service -:results: HTML -:stable: no -:parse: url, title, content, number_of_results, answer, suggestion, correction - -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. - -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions + For detailed description of the *REST-full* API see: `Query Parameter + Definitions`_. + .. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions """ # pylint: disable=invalid-name, missing-function-docstring @@ -27,6 +19,16 @@ from searx.exceptions import SearxEngineCaptchaException logger = logger.getChild('google engine') +# about +about = { + "website": 'https://www.google.com', + "wikidata_id": 'Q9366', + "official_api_documentation": 'https://developers.google.com/custom-search/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] paging = True diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 9ef1be753..8c2cb9d2a 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,14 +1,6 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """Google (Images) -:website: https://images.google.com (redirected to subdomain www.) -:provide-api: yes (https://developers.google.com/custom-search/) -:using-api: not the offical, since it needs registration to another service -:results: HTML -:stable: no -:template: images.html -:parse: url, title, content, source, thumbnail_src, img_src - For detailed description of the *REST-full* API see: `Query Parameter Definitions`_. @@ -18,10 +10,6 @@ Definitions`_. ``data:` scheme).:: Header set Content-Security-Policy "img-src 'self' data: ;" - -.. 
_Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions - """ from urllib.parse import urlencode, urlparse, unquote @@ -39,6 +27,16 @@ from searx.engines.google import ( logger = logger.getChild('google images') +# about +about = { + "website": 'https://images.google.com/', + "wikidata_id": 'Q521550', + "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions', # NOQA + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['images'] diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index f1b7cfa79..63fef6696 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Google (News) - - @website https://news.google.com - @provide-api no - - @using-api no - @results HTML - @stable no - @parse url, title, content, publishedDate """ from urllib.parse import urlencode @@ -15,6 +8,16 @@ from lxml import html from searx.utils import match_language from searx.engines.google import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import +# about +about = { + "website": 'https://news.google.com', + "wikidata_id": 'Q12020', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # search-url categories = ['news'] paging = True diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index eedefbf45..61e01ca7b 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Google (Videos) - - @website https://www.google.com - @provide-api yes (https://developers.google.com/custom-search/) - - @using-api no - @results HTML - @stable no - @parse url, title, 
content, thumbnail """ from datetime import date, timedelta @@ -16,6 +9,16 @@ from lxml import html from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex import re +# about +about = { + "website": 'https://www.google.com', + "wikidata_id": 'Q219885', + "official_api_documentation": 'https://developers.google.com/custom-search/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['videos'] paging = True diff --git a/searx/engines/ina.py b/searx/engines/ina.py index ce241d409..1a47ca51e 100644 --- a/searx/engines/ina.py +++ b/searx/engines/ina.py @@ -1,15 +1,7 @@ -# INA (Videos) -# -# @website https://www.ina.fr/ -# @provide-api no -# -# @using-api no -# @results HTML (using search portal) -# @stable no (HTML can change) -# @parse url, title, content, publishedDate, thumbnail -# -# @todo set content-parameter with correct data -# @todo embedded (needs some md5 from video page) +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + INA (Videos) +""" from json import loads from html import unescape @@ -18,6 +10,15 @@ from lxml import html from dateutil import parser from searx.utils import extract_text +# about +about = { + "website": 'https://www.ina.fr/', + "wikidata_id": 'Q1665109', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} # engine dependent config categories = ['videos'] diff --git a/searx/engines/invidious.py b/searx/engines/invidious.py index 6ea942699..1d6d69f64 100644 --- a/searx/engines/invidious.py +++ b/searx/engines/invidious.py @@ -1,17 +1,22 @@ -# Invidious (Videos) -# -# @website https://invidio.us/ -# @provide-api yes (https://github.com/omarroth/invidious/wiki/API) -# -# @using-api yes -# @results JSON -# @stable yes -# @parse url, title, content, publishedDate, thumbnail, embedded, author, length +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Invidious (Videos) +""" 
from urllib.parse import quote_plus from dateutil import parser import time +# about +about = { + "website": 'https://instances.invidio.us/', + "wikidata_id": 'Q79343316', + "official_api_documentation": 'https://github.com/omarroth/invidious/wiki/API', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ["videos", "music"] paging = True diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index e2aa436cc..f4a5ff6d2 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from collections.abc import Iterable from json import loads from urllib.parse import urlencode diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py index 90bd33063..6a44e2fd7 100644 --- a/searx/engines/kickass.py +++ b/searx/engines/kickass.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Kickass Torrent (Videos, Music, Files) - - @website https://kickass.so - @provide-api no (nothing found) - - @using-api no - @results HTML (using search portal) - @stable yes (HTML can change) - @parse url, title, content, seed, leech, magnetlink """ from lxml import html @@ -15,6 +8,16 @@ from operator import itemgetter from urllib.parse import quote, urljoin from searx.utils import extract_text, get_torrent_size, convert_str_to_int +# about +about = { + "website": 'https://kickass.so', + "wikidata_id": 'Q17062285', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['videos', 'music', 'files'] paging = True diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 50ba74efc..21abff86e 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -1,21 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ - general mediawiki-engine (Web) - - @website websites built on 
mediawiki (https://www.mediawiki.org) - @provide-api yes (http://www.mediawiki.org/wiki/API:Search) - - @using-api yes - @results JSON - @stable yes - @parse url, title - - @todo content + General mediawiki-engine (Web) """ from json import loads from string import Formatter from urllib.parse import urlencode, quote +# about +about = { + "website": None, + "wikidata_id": None, + "official_api_documentation": 'http://www.mediawiki.org/wiki/API:Search', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['general'] language_support = True diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py index 7426eef7e..14de4ac9a 100644 --- a/searx/engines/microsoft_academic.py +++ b/searx/engines/microsoft_academic.py @@ -1,12 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ -Microsoft Academic (Science) - -@website https://academic.microsoft.com -@provide-api yes -@using-api no -@results JSON -@stable no -@parse url, title, content + Microsoft Academic (Science) """ from datetime import datetime @@ -15,6 +9,16 @@ from uuid import uuid4 from urllib.parse import urlencode from searx.utils import html_to_text +# about +about = { + "website": 'https://academic.microsoft.com', + "wikidata_id": 'Q28136779', + "official_api_documentation": 'http://ma-graph.org/', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + categories = ['images'] paging = True result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}' diff --git a/searx/engines/mixcloud.py b/searx/engines/mixcloud.py index 0606350a9..a6fd1c0a1 100644 --- a/searx/engines/mixcloud.py +++ b/searx/engines/mixcloud.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Mixcloud (Music) - - @website https://http://www.mixcloud.com/ - @provide-api yes (http://www.mixcloud.com/developers/ - - @using-api yes - @results JSON - @stable yes - @parse url, 
title, content, embedded, publishedDate """ from json import loads from dateutil import parser from urllib.parse import urlencode +# about +about = { + "website": 'https://www.mixcloud.com/', + "wikidata_id": 'Q6883832', + "official_api_documentation": 'http://www.mixcloud.com/developers/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['music'] paging = True diff --git a/searx/engines/not_evil.py b/searx/engines/not_evil.py index e84f153bd..df41c0941 100644 --- a/searx/engines/not_evil.py +++ b/searx/engines/not_evil.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ not Evil (Onions) - - @website http://hss3uro2hsxfogfq.onion - @provide-api yes (http://hss3uro2hsxfogfq.onion/api.htm) - - @using-api no - @results HTML - @stable no - @parse url, title, content """ from urllib.parse import urlencode from lxml import html from searx.engines.xpath import extract_text +# about +about = { + "website": 'http://hss3uro2hsxfogfq.onion', + "wikidata_id": None, + "official_api_documentation": 'http://hss3uro2hsxfogfq.onion/api.htm', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['onions'] paging = True diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index e0a91494f..f8178d637 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -1,18 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Nyaa.si (Anime Bittorrent tracker) - - @website https://nyaa.si/ - @provide-api no - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content, seed, leech, torrentfile """ from lxml import html from urllib.parse import urlencode from searx.utils import extract_text, get_torrent_size, int_or_zero +# about +about = { + "website": 'https://nyaa.si/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, 
+ "results": 'HTML', +} + # engine dependent config categories = ['files', 'images', 'videos', 'music'] paging = True diff --git a/searx/engines/opensemantic.py b/searx/engines/opensemantic.py index 9364bab41..64bc321f1 100644 --- a/searx/engines/opensemantic.py +++ b/searx/engines/opensemantic.py @@ -1,18 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ -Open Semantic Search - - @website https://www.opensemanticsearch.org/ - @provide-api yes (https://www.opensemanticsearch.org/dev) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, publishedDate + Open Semantic Search """ + from dateutil import parser from json import loads from urllib.parse import quote +# about +about = { + "website": 'https://www.opensemanticsearch.org/', + "wikidata_id": None, + "official_api_documentation": 'https://www.opensemanticsearch.org/dev', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + base_url = 'http://localhost:8983/solr/opensemanticsearch/' search_string = 'query?q={query}' diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 5475c7a6d..f11aa5f8c 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ OpenStreetMap (Map) - - @website https://openstreetmap.org/ - @provide-api yes (http://wiki.openstreetmap.org/wiki/Nominatim) - - @using-api yes - @results JSON - @stable yes - @parse url, title """ import re from json import loads from flask_babel import gettext +# about +about = { + "website": 'https://www.openstreetmap.org/', + "wikidata_id": 'Q936', + "official_api_documentation": 'http://wiki.openstreetmap.org/wiki/Nominatim', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['map'] paging = False diff --git a/searx/engines/pdbe.py b/searx/engines/pdbe.py index 2db92868a..b9bbfaf1b 100644 --- 
a/searx/engines/pdbe.py +++ b/searx/engines/pdbe.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ PDBe (Protein Data Bank in Europe) - - @website https://www.ebi.ac.uk/pdbe - @provide-api yes (https://www.ebi.ac.uk/pdbe/api/doc/search.html), - unlimited - @using-api yes - @results python dictionary (from json) - @stable yes - @parse url, title, content, img_src """ from json import loads from flask_babel import gettext +# about +about = { + "website": 'https://www.ebi.ac.uk/pdbe', + "wikidata_id": 'Q55823905', + "official_api_documentation": 'https://www.ebi.ac.uk/pdbe/api/doc/search.html', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + categories = ['science'] hide_obsolete = False diff --git a/searx/engines/peertube.py b/searx/engines/peertube.py index e43b2a6b7..549141079 100644 --- a/searx/engines/peertube.py +++ b/searx/engines/peertube.py @@ -1,15 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ peertube (Videos) - - @website https://www.peertube.live - @provide-api yes (https://docs.joinpeertube.org/api-rest-reference.html) - - @using-api yes - @results JSON - @stable yes - @parse url, title, thumbnail, publishedDate, embedded - - @todo implement time range support """ from json import loads @@ -17,6 +8,16 @@ from datetime import datetime from urllib.parse import urlencode from searx.utils import html_to_text +# about +about = { + "website": 'https://joinpeertube.org', + "wikidata_id": 'Q50938515', + "official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ["videos"] paging = True diff --git a/searx/engines/photon.py b/searx/engines/photon.py index 7a6fc8321..f12bcd22a 100644 --- a/searx/engines/photon.py +++ b/searx/engines/photon.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Photon (Map) - - @website 
https://photon.komoot.de - @provide-api yes (https://photon.komoot.de/) - - @using-api yes - @results JSON - @stable yes - @parse url, title """ from json import loads from urllib.parse import urlencode from searx.utils import searx_useragent +# about +about = { + "website": 'https://photon.komoot.de', + "wikidata_id": None, + "official_api_documentation": 'https://photon.komoot.de/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['map'] paging = False diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index 828241ece..98a2dd9f2 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -1,12 +1,7 @@ -# Piratebay (Videos, Music, Files) -# -# @website https://thepiratebay.org -# @provide-api yes (https://apibay.org/) -# -# @using-api yes -# @results JSON -# @stable no (the API is not documented nor versioned) -# @parse url, title, seed, leech, magnetlink, filesize, publishedDate +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Piratebay (Videos, Music, Files) +""" from json import loads from datetime import datetime @@ -15,6 +10,16 @@ from operator import itemgetter from urllib.parse import quote from searx.utils import get_torrent_size +# about +about = { + "website": 'https://thepiratebay.org', + "wikidata_id": 'Q22663', + "official_api_documentation": 'https://apibay.org/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ["videos", "music", "files"] diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py index 07c45709e..da02f91ca 100644 --- a/searx/engines/pubmed.py +++ b/searx/engines/pubmed.py @@ -1,14 +1,6 @@ -#!/usr/bin/env python - +# SPDX-License-Identifier: AGPL-3.0-or-later """ PubMed (Scholar publications) - @website https://www.ncbi.nlm.nih.gov/pubmed/ - @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/) - @using-api yes - @results XML - @stable 
yes - @parse url, title, publishedDate, content - More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/ """ from flask_babel import gettext @@ -17,6 +9,18 @@ from datetime import datetime from urllib.parse import urlencode from searx.poolrequests import get +# about +about = { + "website": 'https://www.ncbi.nlm.nih.gov/pubmed/', + "wikidata_id": 'Q1540899', + "official_api_documentation": { + 'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/', + 'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/' + }, + "use_official_api": True, + "require_api_key": False, + "results": 'XML', +} categories = ['science'] diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index b785719d9..87499c8ad 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Qwant (Web, Images, News, Social) - - @website https://qwant.com/ - @provide-api not officially (https://api.qwant.com/api/search/) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content """ from datetime import datetime @@ -17,6 +10,15 @@ from searx.utils import html_to_text, match_language from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException from searx.raise_for_httperror import raise_for_httperror +# about +about = { + "website": 'https://www.qwant.com/', + "wikidata_id": 'Q14657870', + "official_api_documentation": None, + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} # engine dependent config categories = [] diff --git a/searx/engines/recoll.py b/searx/engines/recoll.py index 5a956b8bf..d90005a95 100644 --- a/searx/engines/recoll.py +++ b/searx/engines/recoll.py @@ -1,17 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Recoll (local search engine) - - @using-api yes - @results JSON - @stable yes - @parse url, content, size, abstract, author, mtype, subtype, time, \ - filename, label, type, embedded """ from datetime 
import date, timedelta from json import loads from urllib.parse import urlencode, quote +# about +about = { + "website": None, + "wikidata_id": 'Q15735774', + "official_api_documentation": 'https://www.lesbonscomptes.com/recoll/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config time_range_support = True diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index e732875cb..ee734ace2 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Reddit - - @website https://www.reddit.com/ - @provide-api yes (https://www.reddit.com/dev/api) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, thumbnail, publishedDate """ import json from datetime import datetime from urllib.parse import urlencode, urljoin, urlparse +# about +about = { + "website": 'https://www.reddit.com/', + "wikidata_id": 'Q1136', + "official_api_documentation": 'https://www.reddit.com/dev/api', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['general', 'images', 'news', 'social media'] page_size = 25 diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py index 72fd2b3c9..51c925247 100644 --- a/searx/engines/scanr_structures.py +++ b/searx/engines/scanr_structures.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ ScanR Structures (Science) - - @website https://scanr.enseignementsup-recherche.gouv.fr - @provide-api yes (https://scanr.enseignementsup-recherche.gouv.fr/api/swagger-ui.html) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, img_src """ from json import loads, dumps from searx.utils import html_to_text +# about +about = { + "website": 'https://scanr.enseignementsup-recherche.gouv.fr', + "wikidata_id": 'Q44105684', + "official_api_documentation": 
'https://scanr.enseignementsup-recherche.gouv.fr/opendata', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['science'] paging = True diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index 706285814..8c1330d98 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -1,18 +1,20 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ - Searchcode (It) - - @website https://searchcode.com/ - @provide-api yes (https://searchcode.com/api/) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content + Searchcode (IT) """ from json import loads from urllib.parse import urlencode +# about +about = { + "website": 'https://searchcode.com/', + "wikidata_id": None, + "official_api_documentation": 'https://searchcode.com/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} # engine dependent config categories = ['it'] diff --git a/searx/engines/searx_engine.py b/searx/engines/searx_engine.py index 87e5e05c2..c4f016adc 100644 --- a/searx/engines/searx_engine.py +++ b/searx/engines/searx_engine.py @@ -1,18 +1,20 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Searx (all) - - @website https://github.com/searx/searx - @provide-api yes (https://searx.github.io/searx/dev/search_api.html) - - @using-api yes - @results JSON - @stable yes (using api) - @parse url, title, content """ from json import loads from searx.engines import categories as searx_categories +# about +about = { + "website": 'https://github.com/searx/searx', + "wikidata_id": 'Q17639196', + "official_api_documentation": 'https://searx.github.io/searx/dev/search_api.html', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} categories = searx_categories.keys() diff --git a/searx/engines/sepiasearch.py b/searx/engines/sepiasearch.py index 0b7c1ba6e..3433c897a 100644 --- a/searx/engines/sepiasearch.py +++ 
b/searx/engines/sepiasearch.py @@ -1,17 +1,23 @@ -# SepiaSearch (Videos) -# -# @website https://sepiasearch.org -# @provide-api https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api -# @using-api yes -# @results JSON -# @stable yes -# @parse url, title, content, publishedDate, thumbnail +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + SepiaSearch (Videos) +""" from json import loads from dateutil import parser, relativedelta from urllib.parse import urlencode from datetime import datetime +# about +about = { + "website": 'https://sepiasearch.org', + "wikidata_id": None, + "official_api_documentation": "https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api", # NOQA + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + categories = ['videos'] paging = True language_support = True diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index 84ff21a88..9e414746f 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Soundcloud (Music) - - @website https://soundcloud.com - @provide-api yes (https://developers.soundcloud.com/) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, publishedDate, embedded """ import re @@ -18,6 +11,15 @@ from urllib.parse import quote_plus, urlencode from searx import logger from searx.poolrequests import get as http_get +# about +about = { + "website": 'https://soundcloud.com', + "wikidata_id": 'Q568769', + "official_api_documentation": 'https://developers.soundcloud.com/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} # engine dependent config categories = ['music'] diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index 74942326e..0ad8bfe32 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: 
AGPL-3.0-or-later """ Spotify (Music) - - @website https://spotify.com - @provide-api yes (https://developer.spotify.com/web-api/search-item/) - - @using-api yes - @results JSON - @stable yes - @parse url, title, content, embedded """ from json import loads @@ -15,6 +8,16 @@ from urllib.parse import urlencode import requests import base64 +# about +about = { + "website": 'https://www.spotify.com', + "wikidata_id": 'Q689141', + "official_api_documentation": 'https://developer.spotify.com/web-api/search-item/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['music'] paging = True diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index f730264e2..91eaa68e9 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ - Stackoverflow (It) - - @website https://stackoverflow.com/ - @provide-api not clear (https://api.stackexchange.com/docs/advanced-search) - - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, content + Stackoverflow (IT) """ from urllib.parse import urlencode, urljoin, urlparse @@ -15,6 +8,16 @@ from lxml import html from searx.utils import extract_text from searx.exceptions import SearxEngineCaptchaException +# about +about = { + "website": 'https://stackoverflow.com/', + "wikidata_id": 'Q549037', + "official_api_documentation": 'https://api.stackexchange.com/docs', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['it'] paging = True diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index cd8b132f9..68157971d 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -1,14 +1,7 @@ -# Startpage (Web) -# -# @website https://startpage.com -# @provide-api no (nothing found) -# -# @using-api no -# @results HTML -# @stable no (HTML can 
change) -# @parse url, title, content -# -# @todo paging +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Startpage (Web) +""" from lxml import html from dateutil import parser @@ -19,6 +12,16 @@ from babel import Locale from babel.localedata import locale_identifiers from searx.utils import extract_text, eval_xpath, match_language +# about +about = { + "website": 'https://startpage.com', + "wikidata_id": 'Q2333295', + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] # there is a mechanism to block "bot" search diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py index 9fffba8a6..91d1f01d5 100644 --- a/searx/engines/tokyotoshokan.py +++ b/searx/engines/tokyotoshokan.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Tokyo Toshokan (A BitTorrent Library for Japanese Media) - - @website https://www.tokyotosho.info/ - @provide-api no - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, publishedDate, seed, leech, - filesize, magnetlink, content """ import re @@ -16,6 +9,16 @@ from lxml import html from datetime import datetime from searx.utils import extract_text, get_torrent_size, int_or_zero +# about +about = { + "website": 'https://www.tokyotosho.info/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['files', 'videos', 'music'] paging = True diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py index 4d3e6fdd7..94a7a5343 100644 --- a/searx/engines/torrentz.py +++ b/searx/engines/torrentz.py @@ -1,14 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Torrentz2.is (BitTorrent meta-search engine) - - @website https://torrentz2.is/ - @provide-api no - - @using-api no - @results HTML - @stable no (HTML can change, 
although unlikely, - see https://torrentz.is/torrentz.btsearch) - @parse url, title, publishedDate, seed, leech, filesize, magnetlink """ import re @@ -17,6 +9,16 @@ from lxml import html from datetime import datetime from searx.utils import extract_text, get_torrent_size +# about +about = { + "website": 'https://torrentz2.is/', + "wikidata_id": 'Q1156687', + "official_api_documentation": 'https://torrentz.is/torrentz.btsearch', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['files', 'videos', 'music'] paging = True diff --git a/searx/engines/translated.py b/searx/engines/translated.py index 2706e3617..9c53d70ad 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -1,14 +1,18 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ MyMemory Translated - - @website https://mymemory.translated.net/ - @provide-api yes (https://mymemory.translated.net/doc/spec.php) - @using-api yes - @results JSON - @stable yes - @parse url, title, content """ +# about +about = { + "website": 'https://mymemory.translated.net/', + "wikidata_id": None, + "official_api_documentation": 'https://mymemory.translated.net/doc/spec.php', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + engine_type = 'online_dictionnary' categories = ['general'] url = 'https://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' diff --git a/searx/engines/unsplash.py b/searx/engines/unsplash.py index 45c6b30da..3bbdf630d 100644 --- a/searx/engines/unsplash.py +++ b/searx/engines/unsplash.py @@ -1,18 +1,21 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Unsplash - - @website https://unsplash.com - @provide-api yes (https://unsplash.com/developers) - - @using-api no - @results JSON (using search portal's infiniscroll API) - @stable no (JSON format could change any time) - @parse url, title, img_src, thumbnail_src """ from urllib.parse import urlencode, 
urlparse, urlunparse, parse_qsl from json import loads +# about +about = { + "website": 'https://unsplash.com', + "wikidata_id": 'Q28233552', + "official_api_documentation": 'https://unsplash.com/developers', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + url = 'https://unsplash.com/' search_url = url + 'napi/search/photos?' categories = ['images'] diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index fd3abc858..824579256 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -1,21 +1,22 @@ -# Vimeo (Videos) -# -# @website https://vimeo.com/ -# @provide-api yes (http://developer.vimeo.com/api), -# they have a maximum count of queries/hour -# -# @using-api no (TODO, rewrite to api) -# @results HTML (using search portal) -# @stable no (HTML can change) -# @parse url, title, publishedDate, thumbnail, embedded -# -# @todo rewrite to api -# @todo set content-parameter with correct data +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Vimeo (Videos) +""" from urllib.parse import urlencode from json import loads from dateutil import parser +# about +about = { + "website": 'https://vimeo.com/', + "wikidata_id": 'Q156376', + "official_api_documentation": 'http://developer.vimeo.com/api', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['videos'] paging = True diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 8d787caac..c8e4cfae6 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -1,14 +1,6 @@ -# -*- coding: utf-8 -*- +# SPDX-License-Identifier: AGPL-3.0-or-later """ Wikidata - - @website https://wikidata.org - @provide-api yes (https://query.wikidata.org/) - - @using-api yes - @results JSON - @stable yes - @parse url, infobox """ @@ -27,6 +19,16 @@ from searx.engines.wikidata import _fetch_supported_languages, supported_langua logger = logger.getChild('wikidata') +# about +about = { +
"website": 'https://wikidata.org/', + "wikidata_id": 'Q2013', + "official_api_documentation": 'https://query.wikidata.org/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # SPARQL SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql' SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain' diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 54d75108e..eff301145 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -1,13 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Wikipedia (Web) - - @website https://en.wikipedia.org/api/rest_v1/ - @provide-api yes - - @using-api yes - @results JSON - @stable yes - @parse url, infobox """ from urllib.parse import quote @@ -16,6 +9,16 @@ from lxml.html import fromstring from searx.utils import match_language, searx_useragent from searx.raise_for_httperror import raise_for_httperror +# about +about = { + "website": 'https://www.wikipedia.org/', + "wikidata_id": 'Q52', + "official_api_documentation": 'https://en.wikipedia.org/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # search-url search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index 520eaa209..9c84e2809 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -1,16 +1,21 @@ -# Wolfram Alpha (Science) -# -# @website https://www.wolframalpha.com -# @provide-api yes (https://api.wolframalpha.com/v2/) -# -# @using-api yes -# @results XML -# @stable yes -# @parse url, infobox +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Wolfram|Alpha (Science) +""" from lxml import etree from urllib.parse import urlencode +# about +about = { + "website": 'https://www.wolframalpha.com', + "wikidata_id": 
'Q207006', + "official_api_documentation": 'https://products.wolframalpha.com/api/', + "use_official_api": True, + "require_api_key": False, + "results": 'XML', +} + # search-url search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}' site_url = 'https://www.wolframalpha.com/input/?{query}' diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 943d4f3fb..8e427d575 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -1,12 +1,7 @@ -# Wolfram|Alpha (Science) -# -# @website https://www.wolframalpha.com/ -# @provide-api yes (https://api.wolframalpha.com/v2/) -# -# @using-api no -# @results JSON -# @stable no -# @parse url, infobox +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Wolfram|Alpha (Science) +""" from json import loads from time import time @@ -14,6 +9,16 @@ from urllib.parse import urlencode from searx.poolrequests import get as http_get +# about +about = { + "website": 'https://www.wolframalpha.com/', + "wikidata_id": 'Q207006', + "official_api_documentation": 'https://products.wolframalpha.com/api/', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + # search-url url = 'https://www.wolframalpha.com/' diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index b8f111a50..96b8d680c 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -1,19 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ 1x (Images) - - @website http://1x.com/ - @provide-api no - - @using-api no - @results HTML - @stable no (HTML can change) - @parse url, title, thumbnail """ from lxml import html, etree from urllib.parse import urlencode, urljoin from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +# about +about = { + "website": 'https://1x.com/', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine 
dependent config categories = ['images'] paging = False diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 1507176ec..612f69abd 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later + from lxml import html from urllib.parse import urlencode from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index 6f7ab759f..afd59cd49 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -1,16 +1,7 @@ -# Yacy (Web, Images, Videos, Music, Files) -# -# @website http://yacy.net -# @provide-api yes -# (http://www.yacy-websuche.de/wiki/index.php/Dev:APIyacysearch) -# -# @using-api yes -# @results JSON -# @stable yes -# @parse (general) url, title, content, publishedDate -# @parse (images) url, title, img_src -# -# @todo parse video, audio and file results +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Yacy (Web, Images, Videos, Music, Files) +""" from json import loads from dateutil import parser @@ -20,6 +11,16 @@ from requests.auth import HTTPDigestAuth from searx.utils import html_to_text +# about +about = { + "website": 'https://yacy.net/', + "wikidata_id": 'Q1759675', + "official_api_documentation": 'https://wiki.yacy.net/index.php/Dev:API', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['general', 'images'] # TODO , 'music', 'videos', 'files' paging = True diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 3420aa6d5..eb07a45fc 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -1,20 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Yahoo (Web) - - @website https://search.yahoo.com/web - @provide-api yes (https://developer.yahoo.com/boss/search/), - $0.80/1000 queries - - @using-api no (because pricing) - @results HTML (using search portal) - @stable no (HTML can change) - 
@parse url, title, content, suggestion """ from urllib.parse import unquote, urlencode from lxml import html from searx.utils import extract_text, extract_url, match_language, eval_xpath +# about +about = { + "website": 'https://search.yahoo.com/', + "wikidata_id": None, + "official_api_documentation": 'https://developer.yahoo.com/api/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] paging = True diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 793d1104a..b324ecdf3 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -1,13 +1,7 @@ -# Yahoo (News) -# -# @website https://news.yahoo.com -# @provide-api yes (https://developer.yahoo.com/boss/search/) -# $0.80/1000 queries -# -# @using-api no (because pricing) -# @results HTML (using search portal) -# @stable no (HTML can change) -# @parse url, title, content, publishedDate +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Yahoo (News) +""" import re from datetime import datetime, timedelta @@ -18,6 +12,16 @@ from searx.engines.yahoo import _fetch_supported_languages, supported_languages_ from dateutil import parser from searx.utils import extract_text, extract_url, match_language +# about +about = { + "website": 'https://news.yahoo.com', + "wikidata_id": 'Q3044717', + "official_api_documentation": 'https://developer.yahoo.com/api/', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['news'] paging = True diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index b4a6a54cf..57a2f4b79 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -1,12 +1,6 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Yahoo (Web) - - @website https://yandex.ru/ - @provide-api ? 
- @using-api no - @results HTML (using search portal) - @stable no (HTML can change) - @parse url, title, content """ from urllib.parse import urlencode, urlparse @@ -16,6 +10,16 @@ from searx.exceptions import SearxEngineCaptchaException logger = logger.getChild('yandex engine') +# about +about = { + "website": 'https://yandex.ru/', + "wikidata_id": 'Q5281', + "official_api_documentation": "?", + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['general'] paging = True diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py index ec84d2c6b..cad2de52b 100644 --- a/searx/engines/yggtorrent.py +++ b/searx/engines/yggtorrent.py @@ -1,12 +1,7 @@ -# Yggtorrent (Videos, Music, Files) -# -# @website https://www2.yggtorrent.si -# @provide-api no (nothing found) -# -# @using-api no -# @results HTML (using search portal) -# @stable no (HTML can change) -# @parse url, title, seed, leech, publishedDate, filesize +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Yggtorrent (Videos, Music, Files) +""" from lxml import html from operator import itemgetter @@ -15,6 +10,16 @@ from urllib.parse import quote from searx.utils import extract_text, get_torrent_size from searx.poolrequests import get as http_get +# about +about = { + "website": 'https://www2.yggtorrent.si', + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['videos', 'music', 'files'] paging = True diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py index 8c12ac4d2..b3dcb4907 100644 --- a/searx/engines/youtube_api.py +++ b/searx/engines/youtube_api.py @@ -1,18 +1,23 @@ -# Youtube (Videos) -# -# @website https://www.youtube.com/ -# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list) -# -# @using-api yes -# @results JSON -# @stable yes -# 
@parse url, title, content, publishedDate, thumbnail, embedded +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Youtube (Videos) +""" from json import loads from dateutil import parser from urllib.parse import urlencode from searx.exceptions import SearxEngineAPIException +# about +about = { + "website": 'https://www.youtube.com/', + "wikidata_id": 'Q866', + "official_api_documentation": 'https://developers.google.com/youtube/v3/docs/search/list?apix=true', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + # engine dependent config categories = ['videos', 'music'] paging = False diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 36fc72e36..4a6df57c4 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -1,17 +1,22 @@ -# Youtube (Videos) -# -# @website https://www.youtube.com/ -# @provide-api yes (https://developers.google.com/apis-explorer/#p/youtube/v3/youtube.search.list) -# -# @using-api no -# @results HTML -# @stable no -# @parse url, title, content, publishedDate, thumbnail, embedded +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Youtube (Videos) +""" from functools import reduce from json import loads from urllib.parse import quote_plus +# about +about = { + "website": 'https://www.youtube.com/', + "wikidata_id": 'Q866', + "official_api_documentation": 'https://developers.google.com/youtube/v3/docs/search/list?apix=true', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + # engine dependent config categories = ['videos', 'music'] paging = True diff --git a/searx/settings.yml b/searx/settings.yml index 55c9849c1..e3259220b 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -157,6 +157,13 @@ engines: timeout : 7.0 disabled : True shortcut : ai + about: + website: https://archive.is/ + wikidata_id: Q13515725 + official_api_documentation: http://mementoweb.org/depot/native/archiveis/ + use_official_api: false + 
require_api_key: false + results: HTML - name : arxiv engine : arxiv @@ -201,6 +208,13 @@ engines: timeout : 4.0 disabled : True shortcut : bb + about: + website: https://bitbucket.org/ + wikidata_id: Q2493781 + official_api_documentation: https://developer.atlassian.com/bitbucket + use_official_api: false + require_api_key: false + results: HTML - name : btdigg engine : btdigg @@ -216,6 +230,13 @@ engines: categories : videos disabled : True shortcut : c3tv + about: + website: https://media.ccc.de/ + wikidata_id: Q80729951 + official_api_documentation: https://github.com/voc/voctoweb + use_official_api: false + require_api_key: false + results: HTML - name : crossref engine : json_engine @@ -226,6 +247,13 @@ engines: content_query : fullCitation categories : science shortcut : cr + about: + website: https://www.crossref.org/ + wikidata_id: Q5188229 + official_api_documentation: https://github.com/CrossRef/rest-api-doc + use_official_api: false + require_api_key: false + results: JSON - name : currency engine : currency_convert @@ -271,6 +299,13 @@ engines: categories : general shortcut : ew disabled : True + about: + website: https://www.erowid.org/ + wikidata_id: Q1430691 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML # - name : elasticsearch # shortcut : es @@ -321,6 +356,13 @@ engines: first_page_num : 1 shortcut : et disabled : True + about: + website: https://www.etymonline.com/ + wikidata_id: Q1188617 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML # - name : ebay # engine : ebay @@ -360,6 +402,9 @@ engines: search_type : title timeout : 5.0 disabled : True + about: + website: https://directory.fsf.org/ + wikidata_id: Q2470288 - name : frinkiac engine : frinkiac @@ -394,6 +439,13 @@ engines: shortcut : gl timeout : 10.0 disabled : True + about: + website: https://about.gitlab.com/ + wikidata_id: Q16639197 + official_api_documentation: 
https://docs.gitlab.com/ee/api/ + use_official_api: false + require_api_key: false + results: JSON - name : github engine : github @@ -411,6 +463,13 @@ engines: categories : it shortcut : cb disabled : True + about: + website: https://codeberg.org/ + wikidata_id: + official_api_documentation: https://try.gitea.io/api/swagger + use_official_api: false + require_api_key: false + results: JSON - name : google engine : google @@ -441,6 +500,13 @@ engines: first_page_num : 0 categories : science shortcut : gos + about: + website: https://scholar.google.com/ + wikidata_id: Q494817 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : google play apps engine : xpath @@ -453,6 +519,13 @@ engines: categories : files shortcut : gpa disabled : True + about: + website: https://play.google.com/ + wikidata_id: Q79576 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : google play movies engine : xpath @@ -465,6 +538,13 @@ engines: categories : videos shortcut : gpm disabled : True + about: + website: https://play.google.com/ + wikidata_id: Q79576 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : google play music engine : xpath @@ -477,6 +557,13 @@ engines: categories : music shortcut : gps disabled : True + about: + website: https://play.google.com/ + wikidata_id: Q79576 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : geektimes engine : xpath @@ -489,6 +576,13 @@ engines: timeout : 4.0 disabled : True shortcut : gt + about: + website: https://geektimes.ru/ + wikidata_id: Q50572423 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : habrahabr engine : xpath @@ -501,6 +595,13 @@ engines: timeout : 4.0 disabled : True shortcut : habr + about: + website: https://habr.com/ + wikidata_id: Q4494434 + 
official_api_documentation: https://habr.com/en/docs/help/api/ + use_official_api: false + require_api_key: false + results: HTML - name : hoogle engine : json_engine @@ -513,6 +614,13 @@ engines: page_size : 20 categories : it shortcut : ho + about: + website: https://www.haskell.org/ + wikidata_id: Q34010 + official_api_documentation: https://hackage.haskell.org/api + use_official_api: false + require_api_key: false + results: JSON - name : ina engine : ina @@ -543,6 +651,13 @@ engines: timeout : 7.0 disabled : True shortcut : lg + about: + website: http://libgen.rs/ + wikidata_id: Q22017206 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : lobste.rs engine : xpath @@ -555,6 +670,13 @@ engines: shortcut : lo timeout : 3.0 disabled: True + about: + website: https://lobste.rs/ + wikidata_id: Q60762874 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : metager engine : xpath @@ -566,6 +688,13 @@ engines: categories : general shortcut : mg disabled : True + about: + website: https://metager.org/ + wikidata_id: Q1924645 + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : microsoft academic engine : microsoft_academic @@ -589,6 +718,13 @@ engines: disabled: True timeout: 5.0 shortcut : npm + about: + website: https://npms.io/ + wikidata_id: Q7067518 + official_api_documentation: https://api-docs.npms.io/ + use_official_api: false + require_api_key: false + results: JSON # Requires Tor - name : not evil @@ -617,6 +753,13 @@ engines: categories : science shortcut : oad timeout: 5.0 + about: + website: https://www.openaire.eu/ + wikidata_id: Q25106053 + official_api_documentation: https://api.openaire.eu/ + use_official_api: false + require_api_key: false + results: JSON - name : openairepublications engine : json_engine @@ -629,6 +772,13 @@ engines: categories : science shortcut : oap timeout: 5.0 + about: + 
website: https://www.openaire.eu/ + wikidata_id: Q25106053 + official_api_documentation: https://api.openaire.eu/ + use_official_api: false + require_api_key: false + results: JSON # - name : opensemanticsearch # engine : opensemantic @@ -650,6 +800,13 @@ engines: timeout : 4.0 disabled : True shortcut : or + about: + website: https://openrepos.net/ + wikidata_id: + official_api_documentation: + use_official_api: false + require_api_key: false + results: HTML - name : pdbe engine : pdbe @@ -768,6 +925,13 @@ engines: content_xpath : .//div[@class="search-result-abstract"] shortcut : se categories : science + about: + website: https://www.semanticscholar.org/ + wikidata_id: Q22908627 + official_api_documentation: https://api.semanticscholar.org/ + use_official_api: false + require_api_key: false + results: HTML # Spotify needs API credentials # - name : spotify @@ -876,6 +1040,9 @@ engines: number_of_results : 5 search_type : text disabled : True + about: + website: https://www.wikibooks.org/ + wikidata_id: Q367 - name : wikinews engine : mediawiki @@ -885,6 +1052,9 @@ engines: number_of_results : 5 search_type : text disabled : True + about: + website: https://www.wikinews.org/ + wikidata_id: Q964 - name : wikiquote engine : mediawiki @@ -896,6 +1066,9 @@ engines: disabled : True additional_tests: rosebud: *test_rosebud + about: + website: https://www.wikiquote.org/ + wikidata_id: Q369 - name : wikisource engine : mediawiki @@ -905,6 +1078,9 @@ engines: number_of_results : 5 search_type : text disabled : True + about: + website: https://www.wikisource.org/ + wikidata_id: Q263 - name : wiktionary engine : mediawiki @@ -914,6 +1090,9 @@ engines: number_of_results : 5 search_type : text disabled : True + about: + website: https://www.wiktionary.org/ + wikidata_id: Q151 - name : wikiversity engine : mediawiki @@ -923,6 +1102,9 @@ engines: number_of_results : 5 search_type : text disabled : True + about: + website: https://www.wikiversity.org/ + wikidata_id: Q370 - name 
: wikivoyage engine : mediawiki @@ -932,6 +1114,9 @@ engines: number_of_results : 5 search_type : text disabled : True + about: + website: https://www.wikivoyage.org/ + wikidata_id: Q373 - name : wolframalpha shortcut : wa @@ -979,6 +1164,13 @@ engines: first_page_num : 0 page_size : 10 disabled : True + about: + website: https://www.seznam.cz/ + wikidata_id: Q3490485 + official_api_documentation: https://api.sklik.cz/ + use_official_api: false + require_api_key: false + results: HTML - name : mojeek shortcut: mjk @@ -993,6 +1185,13 @@ engines: first_page_num : 0 page_size : 10 disabled : True + about: + website: https://www.mojeek.com/ + wikidata_id: Q60747299 + official_api_documentation: https://www.mojeek.com/services/api.html/ + use_official_api: false + require_api_key: false + results: HTML - name : naver shortcut: nvr @@ -1007,6 +1206,13 @@ engines: first_page_num : 1 page_size : 10 disabled : True + about: + website: https://www.naver.com/ + wikidata_id: Q485639 + official_api_documentation: https://developers.naver.com/docs/nmt/examples/ + use_official_api: false + require_api_key: false + results: HTML - name : rubygems shortcut: rbg @@ -1021,6 +1227,13 @@ engines: first_page_num : 1 categories: it disabled : True + about: + website: https://rubygems.org/ + wikidata_id: Q1853420 + official_api_documentation: https://guides.rubygems.org/rubygems-org-api/ + use_official_api: false + require_api_key: false + results: HTML - name : peertube engine: peertube diff --git a/utils/fetch_engine_descriptions.py b/utils/fetch_engine_descriptions.py new file mode 100644 index 000000000..9ca001d45 --- /dev/null +++ b/utils/fetch_engine_descriptions.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python + +import sys +import json +from urllib.parse import quote, urlparse +from os.path import realpath, dirname +import cld3 +from lxml.html import fromstring + +# set path +sys.path.append(realpath(dirname(realpath(__file__)) + '/../')) + +from searx.engines.wikidata import 
from searx.engines.wikidata import send_wikidata_query
from searx.utils import extract_text
import searx
import searx.search
import searx.poolrequests

# Wikipedia article names (one per requested language) for a set of Wikidata
# items. %IDS% and %LANGUAGES_SPARQL% are substituted before the query is sent.
SPARQL_WIKIPEDIA_ARTICLE = """
SELECT DISTINCT ?item ?name
WHERE {
  VALUES ?item { %IDS% }
  ?article schema:about ?item ;
              schema:inLanguage ?lang ;
              schema:name ?name ;
              schema:isPartOf [ wikibase:wikiGroup "wikipedia" ] .
  FILTER(?lang in (%LANGUAGES_SPARQL%)) .
  FILTER (!CONTAINS(?name, ':')) .
}
"""

# Wikidata descriptions (one per requested language) for a set of items.
# NOTE(review): `?itemLang` is never bound in this query, so the ORDER BY
# clause looks like a no-op -- confirm before relying on result ordering.
SPARQL_DESCRIPTION = """
SELECT DISTINCT ?item ?itemDescription
WHERE {
  VALUES ?item { %IDS% }
  ?item schema:description ?itemDescription .
  FILTER (lang(?itemDescription) in (%LANGUAGES_SPARQL%))
}
ORDER BY ?itemLang
"""

LANGUAGES = searx.settings['locales'].keys()
# Quoted base language codes ('en', 'fr', ...), deduplicated; sorted so the
# generated SPARQL text is the same from one run to the next.
LANGUAGES_SPARQL = ', '.join(sorted({repr(locale.split('_')[0]) for locale in LANGUAGES}))
# Space separated "wd:Qxxx" identifiers, filled in by initialize().
IDS = None

# engine name -> {language: [description, source]}
descriptions = {}
# wikidata id -> set of engine names sharing that id
wd_to_engine_name = {}

# Translation table mapping the control characters chr(0)..chr(30) to a space.
_CONTROL_TO_SPACE = {code: ' ' for code in range(0, 31)}


def normalize_description(description):
    """Return *description* with control characters and whitespace runs collapsed.

    Control characters are replaced by spaces, then every run of whitespace
    is reduced to a single space and the ends are trimmed.
    """
    return ' '.join(description.translate(_CONTROL_TO_SPACE).split())


def update_description(engine_name, lang, description, source, replace=True):
    """Store ``[normalized description, source]`` for *engine_name* / *lang*.

    When *replace* is false an already stored description for that language
    is left untouched.
    """
    if replace or lang not in descriptions[engine_name]:
        descriptions[engine_name][lang] = [normalize_description(description), source]


def get_wikipedia_summary(language, pageid):
    """Return the ``extract`` field of a Wikipedia REST page summary.

    Best effort: any failure (network, HTTP status, JSON decoding) yields
    ``None`` instead of an exception.
    """
    search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
    url = search_url.format(title=quote(pageid), language=language)
    try:
        response = searx.poolrequests.get(url)
        response.raise_for_status()
        api_result = json.loads(response.text)
        return api_result.get('extract')
    except Exception:  # pylint: disable=broad-except
        # Deliberately broad, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still stop the script.
        return None


def detect_language(text):
    """Return the language cld3 detects for *text*, or ``None`` if unsure.

    A detection is only accepted when cld3 flags it as reliable with a
    probability of at least 0.98.
    """
    r = cld3.get_language(str(text))  # pylint: disable=E1101
    if r is not None and r.probability >= 0.98 and r.is_reliable:
        return r.language
    return None


def get_website_description(url, lang1, lang2=None):
    """Fetch *url* and return ``(language, description)`` taken from its HTML.

    *lang1* / *lang2* (both optional) are sent as the ``Accept-Language``
    header. The description is the first non-empty value among the
    ``description`` meta tag, the ``og:description`` meta tag and the page
    title. Returns ``(None, None)`` when the page cannot be fetched.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'Sec-GPC': '1',
        'Cache-Control': 'max-age=0',
    }
    if lang1 is not None:
        lang_list = [lang1]
        if lang2 is not None:
            lang_list.append(lang2)
        headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8'
    try:
        response = searx.poolrequests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception:  # pylint: disable=broad-except
        return (None, None)

    try:
        html = fromstring(response.text)
    except ValueError:
        # Fall back to the raw bytes when lxml refuses the decoded text.
        html = fromstring(response.content)

    description = extract_text(html.xpath('/html/head/meta[@name="description"]/@content'))
    if not description:
        description = extract_text(html.xpath('/html/head/meta[@property="og:description"]/@content'))
    if not description:
        description = extract_text(html.xpath('/html/head/title'))
    lang = extract_text(html.xpath('/html/@lang'))
    # lang1 is None for the initial, language-neutral request: test its
    # truthiness instead of calling len() on it.
    if lang is None and lang1:
        lang = lang1
    lang = detect_language(description) or lang or 'en'
    # Keep only the base language code ("en_US" / "en-US" -> "en").
    lang = lang.split('_')[0]
    lang = lang.split('-')[0]
    return (lang, description)


def initialize():
    """Load the engines and build the Wikidata id lookup tables.

    Creates an empty ``descriptions`` entry for every engine, groups engine
    names by the ``wikidata_id`` of their ``about`` mapping and sets ``IDS``
    to the matching ``wd:`` identifiers.
    """
    global IDS  # pylint: disable=global-statement
    searx.search.initialize()
    for engine_name, engine in searx.engines.engines.items():
        descriptions[engine_name] = {}
        wikidata_id = getattr(engine, "about", {}).get('wikidata_id')
        if wikidata_id is not None:
            wd_to_engine_name.setdefault(wikidata_id, set()).add(engine_name)

    IDS = ' '.join('wd:' + wd_id for wd_id in wd_to_engine_name)


def fetch_wikidata_descriptions():
    """Record the Wikidata description of every engine with a Wikidata id."""
    result = send_wikidata_query(SPARQL_DESCRIPTION
                                 .replace('%IDS%', IDS)
                                 .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
    if result is None:
        return
    for binding in result['results']['bindings']:
        wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
        lang = binding['itemDescription']['xml:lang']
        description = binding['itemDescription']['value']
        if ' ' in description:  # skip unique word description (like "website")
            for engine_name in wd_to_engine_name[wikidata_id]:
                update_description(engine_name, lang, description, 'wikidata')


def fetch_wikipedia_descriptions():
    """Record the Wikipedia summary of every engine with a Wikidata id.

    Summaries replace descriptions already stored for the same language.
    """
    result = send_wikidata_query(SPARQL_WIKIPEDIA_ARTICLE
                                 .replace('%IDS%', IDS)
                                 .replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
    if result is None:
        return
    for binding in result['results']['bindings']:
        wikidata_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
        lang = binding['name']['xml:lang']
        pageid = binding['name']['value']
        description = get_wikipedia_summary(lang, pageid)
        if description is not None and ' ' in description:
            for engine_name in wd_to_engine_name[wikidata_id]:
                update_description(engine_name, lang, description, 'wikipedia')


def normalize_url(url):
    """Reduce an engine URL template to the root URL of its website.

    ``{language}`` is replaced by ``en``, path / params / query / fragment
    are dropped and a leading ``https://api.`` becomes ``https://``.
    """
    url = url.replace('{language}', 'en')
    url = urlparse(url)._replace(path='/', params='', query='', fragment='').geturl()
    url = url.replace('https://api.', 'https://')
    return url


def fetch_website_description(engine_name, website):
    """Record descriptions scraped from *website* for *engine_name*.

    The site is first requested without a language preference, then once per
    language still missing. Probing stops at the first request that yields
    nothing new, i.e. no description or the same text as the default one.
    """
    default_lang, default_description = get_website_description(website, None, None)
    if default_lang is None or default_description is None:
        return
    if default_lang not in descriptions[engine_name]:
        descriptions[engine_name][default_lang] = [normalize_description(default_description), website]
    for request_lang in ('en-US', 'es-US', 'fr-FR', 'zh', 'ja', 'ru', 'ar', 'ko'):
        base_lang = request_lang.split('-')[0]
        if base_lang not in descriptions[engine_name]:
            lang, desc = get_website_description(website, request_lang, base_lang)
            if desc is not None and desc != default_description:
                update_description(engine_name, lang, desc, website, replace=False)
            else:
                break


def fetch_website_descriptions():
    """Scrape a description from the website of every engine.

    The website is ``about['website']`` when set; otherwise it is derived
    from the engine's ``search_url`` or, failing that, its ``base_url``.
    Engines providing none of these are skipped.
    """
    for engine_name, engine in searx.engines.engines.items():
        website = getattr(engine, "about", {}).get('website')
        if website is None:
            # Not every engine defines these attributes: look them up with a
            # default instead of letting getattr() raise AttributeError.
            for attr_name in ('search_url', 'base_url'):
                candidate = getattr(engine, attr_name, None)
                if isinstance(candidate, str) and candidate:
                    website = normalize_url(candidate)
                    break
        if website is not None:
            fetch_website_description(engine_name, website)


def main():
    """Collect every description and write them to stdout as JSON."""
    initialize()
    fetch_wikidata_descriptions()
    fetch_wikipedia_descriptions()
    fetch_website_descriptions()

    sys.stdout.write(json.dumps(descriptions, indent=1, separators=(',', ':'), ensure_ascii=False))


if __name__ == "__main__":
    main()