From ffea5d8ef5540bc4be08b2b26e1819d5401f854d Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 3 Jan 2022 12:40:06 +0100 Subject: [PATCH 1/2] [docs] add documentation for the scripts in searxng_extra/update Signed-off-by: Markus Heiser --- docs/dev/searxng_extra/index.rst | 9 +- docs/dev/searxng_extra/update.rst | 88 +++++++++++++++++++ .../update/update_ahmia_blacklist.py | 17 ++-- searxng_extra/update/update_currencies.py | 6 ++ .../update/update_engine_descriptions.py | 7 ++ searxng_extra/update/update_external_bangs.py | 13 +-- .../update/update_firefox_version.py | 14 ++- searxng_extra/update/update_languages.py | 10 ++- searxng_extra/update/update_osm_keys_tags.py | 5 +- searxng_extra/update/update_wikidata_units.py | 12 ++- 10 files changed, 157 insertions(+), 24 deletions(-) create mode 100644 docs/dev/searxng_extra/update.rst diff --git a/docs/dev/searxng_extra/index.rst b/docs/dev/searxng_extra/index.rst index f38bb3154..c2b5c312b 100644 --- a/docs/dev/searxng_extra/index.rst +++ b/docs/dev/searxng_extra/index.rst @@ -1,14 +1,15 @@ .. _searxng_extra: -====================================================== -Tooling box ``searxng_extra`` for developers and users -====================================================== +============================= +Tooling box ``searxng_extra`` +============================= -In the folder :origin:`searxng_extra/` we maintain some tools useful for +In the folder :origin:`searxng_extra/` we maintain some tools useful for CI and developers. .. toctree:: :maxdepth: 2 :caption: Contents + update standalone_searx.py diff --git a/docs/dev/searxng_extra/update.rst b/docs/dev/searxng_extra/update.rst new file mode 100644 index 000000000..d05c81409 --- /dev/null +++ b/docs/dev/searxng_extra/update.rst @@ -0,0 +1,88 @@ +========================= +``searxng_extra/update/`` +========================= + +:origin:`[source] ` + +Scripts to update static data in :origin:`searx/data/` + +.. 
_update_ahmia_blacklist.py: + +``update_ahmia_blacklist.py`` +============================= + +:origin:`[source] <searxng_extra/update/update_ahmia_blacklist.py>` + +.. automodule:: searxng_extra.update.update_ahmia_blacklist + :members: + + +``update_currencies.py`` +======================== + +:origin:`[source] <searxng_extra/update/update_currencies.py>` + +.. automodule:: searxng_extra.update.update_currencies + :members: + +``update_engine_descriptions.py`` +================================= + +:origin:`[source] <searxng_extra/update/update_engine_descriptions.py>` + +.. automodule:: searxng_extra.update.update_engine_descriptions + :members: + + +``update_external_bangs.py`` +============================ + +:origin:`[source] <searxng_extra/update/update_external_bangs.py>` + +.. automodule:: searxng_extra.update.update_external_bangs + :members: + + +``update_firefox_version.py`` +============================= + +:origin:`[source] <searxng_extra/update/update_firefox_version.py>` + +.. automodule:: searxng_extra.update.update_firefox_version + :members: + + +``update_languages.py`` +======================= + +:origin:`[source] <searxng_extra/update/update_languages.py>` + +.. automodule:: searxng_extra.update.update_languages + :members: + + +``update_osm_keys_tags.py`` +=========================== + +:origin:`[source] <searxng_extra/update/update_osm_keys_tags.py>` + +.. automodule:: searxng_extra.update.update_osm_keys_tags + :members: + + +``update_pygments.py`` +====================== + +:origin:`[source] <searxng_extra/update/update_pygments.py>` + +.. automodule:: searxng_extra.update.update_pygments + :members: + + +``update_wikidata_units.py`` +============================ + +:origin:`[source] <searxng_extra/update/update_wikidata_units.py>` + +.. automodule:: searxng_extra.update.update_wikidata_units + :members: diff --git a/searxng_extra/update/update_ahmia_blacklist.py b/searxng_extra/update/update_ahmia_blacklist.py index f7695deae..57fb78b34 100755 --- a/searxng_extra/update/update_ahmia_blacklist.py +++ b/searxng_extra/update/update_ahmia_blacklist.py @@ -1,10 +1,14 @@ #!/usr/bin/env python # SPDX-License-Identifier: AGPL-3.0-or-later +"""This script saves `Ahmia's blacklist`_ for onion sites. -# This script saves Ahmia's blacklist for onion sites. 
-# More info in https://ahmia.fi/blacklist/ +Output file: :origin:`searx/data/ahmia_blacklist.txt` (:origin:`CI Update data +... <.github/workflows/data-update.yml>`). + +.. _Ahmia's blacklist: https://ahmia.fi/blacklist/ + +""" -# set path from os.path import join import requests @@ -26,6 +30,7 @@ def get_ahmia_blacklist_filename(): return join(join(searx_dir, "data"), "ahmia_blacklist.txt") -blacklist = fetch_ahmia_blacklist() -with open(get_ahmia_blacklist_filename(), "w") as f: - f.write('\n'.join(blacklist)) +if __name__ == '__main__': + blacklist = fetch_ahmia_blacklist() + with open(get_ahmia_blacklist_filename(), "w") as f: + f.write('\n'.join(blacklist)) diff --git a/searxng_extra/update/update_currencies.py b/searxng_extra/update/update_currencies.py index 3373e2455..cdff4cbc9 100755 --- a/searxng_extra/update/update_currencies.py +++ b/searxng_extra/update/update_currencies.py @@ -1,6 +1,12 @@ #!/usr/bin/env python # SPDX-License-Identifier: AGPL-3.0-or-later +"""Fetch currencies from :origin:`searx/engines/wikidata.py` engine. + +Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ... +<.github/workflows/data-update.yml>`). + +""" import re import unicodedata import json diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index 51cfc7cc2..bab1a0349 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ b/searxng_extra/update/update_engine_descriptions.py @@ -1,6 +1,13 @@ #!/usr/bin/env python # SPDX-License-Identifier: AGPL-3.0-or-later +"""Fetch website description from websites and from +:origin:`searx/engines/wikidata.py` engine. + +Output file: :origin:`searx/data/engine_descriptions.json`. 
+ +""" + import json from urllib.parse import urlparse from os.path import join diff --git a/searxng_extra/update/update_external_bangs.py b/searxng_extra/update/update_external_bangs.py index d5c6b585a..be3aade0f 100755 --- a/searxng_extra/update/update_external_bangs.py +++ b/searxng_extra/update/update_external_bangs.py @@ -1,17 +1,20 @@ #!/usr/bin/env python # lint: pylint # SPDX-License-Identifier: AGPL-3.0-or-later -""" -Update searx/data/external_bangs.json using the duckduckgo bangs. +"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs +(:origin:`CI Update data ... <.github/workflows/data-update.yml>`). + +https://duckduckgo.com/newbang loads: -https://duckduckgo.com/newbang loads * a javascript which provides the bang version ( https://duckduckgo.com/bv1.js ) * a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example ) This script loads the javascript, then the bangs. -The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ), -but most probably it will requires to update RE_BANG_VERSION +The javascript URL may change in the future ( for example +https://duckduckgo.com/bv2.js ), but most probably it will require updating +RE_BANG_VERSION + """ # pylint: disable=C0116 diff --git a/searxng_extra/update/update_firefox_version.py b/searxng_extra/update/update_firefox_version.py index 750e955fd..163982b16 100755 --- a/searxng_extra/update/update_firefox_version.py +++ b/searxng_extra/update/update_firefox_version.py @@ -1,6 +1,13 @@ #!/usr/bin/env python # SPDX-License-Identifier: AGPL-3.0-or-later +"""Fetch firefox useragent signatures + +Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ... +<.github/workflows/data-update.yml>`). 
+ +""" + import json import requests import re @@ -66,6 +73,7 @@ def get_useragents_filename(): return join(join(searx_dir, "data"), "useragents.json") -useragents["versions"] = fetch_firefox_last_versions() -with open(get_useragents_filename(), "w") as f: - json.dump(useragents, f, indent=4, ensure_ascii=False) +if __name__ == '__main__': + useragents["versions"] = fetch_firefox_last_versions() + with open(get_useragents_filename(), "w", encoding='utf-8') as f: + json.dump(useragents, f, indent=4, ensure_ascii=False) diff --git a/searxng_extra/update/update_languages.py b/searxng_extra/update/update_languages.py index f37345808..9a71566a9 100755 --- a/searxng_extra/update/update_languages.py +++ b/searxng_extra/update/update_languages.py @@ -1,9 +1,13 @@ #!/usr/bin/env python # SPDX-License-Identifier: AGPL-3.0-or-later +"""This script generates languages.py from intersecting each engine's supported +languages. -# This script generates languages.py from intersecting each engine's supported languages. -# -# Output files: searx/data/engines_languages.json and searx/languages.py +Output files: :origin:`searx/data/engines_languages.json` and +:origin:`searx/languages.py` (:origin:`CI Update data ... +<.github/workflows/data-update.yml>`). + +""" import json from pathlib import Path diff --git a/searxng_extra/update/update_osm_keys_tags.py b/searxng_extra/update/update_osm_keys_tags.py index 2916cbff1..1d691c194 100755 --- a/searxng_extra/update/update_osm_keys_tags.py +++ b/searxng_extra/update/update_osm_keys_tags.py @@ -5,7 +5,10 @@ To get the i18n names, the scripts uses `Wikidata Query Service`_ instead of for example `OSM tags API`_ (sidenote: the actual change log from -map.atownsend.org.uk_ might be useful to normalize OSM tags) +map.atownsend.org.uk_ might be useful to normalize OSM tags). + +Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ... +<.github/workflows/data-update.yml>`). .. 
_Wikidata Query Service: https://query.wikidata.org/ .. _OSM tags API: https://taginfo.openstreetmap.org/taginfo/apidoc diff --git a/searxng_extra/update/update_wikidata_units.py b/searxng_extra/update/update_wikidata_units.py index 43a872b1b..e999b6cfd 100755 --- a/searxng_extra/update/update_wikidata_units.py +++ b/searxng_extra/update/update_wikidata_units.py @@ -3,6 +3,13 @@ # lint: pylint # pylint: disable=missing-module-docstring +"""Fetch units from :origin:`searx/engines/wikidata.py` engine. + +Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data +... <.github/workflows/data-update.yml>`). + +""" + import json import collections @@ -54,5 +61,6 @@ def get_wikidata_units_filename(): return join(join(searx_dir, "data"), "wikidata_units.json") -with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f: - json.dump(get_data(), f, indent=4, ensure_ascii=False) +if __name__ == '__main__': + with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f: + json.dump(get_data(), f, indent=4, ensure_ascii=False) From 295876abaa93b8dea44dc0beaf8eb2596da69aed Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 3 Jan 2022 12:58:48 +0100 Subject: [PATCH 2/2] [pylint] add scripts from searxng_extra/update to pylint Signed-off-by: Markus Heiser --- .../update/update_ahmia_blacklist.py | 7 ++-- searxng_extra/update/update_currencies.py | 7 +++- .../update/update_engine_descriptions.py | 7 +++- .../update/update_firefox_version.py | 37 ++++++++++--------- searxng_extra/update/update_languages.py | 29 +++++++++------ 5 files changed, 49 insertions(+), 38 deletions(-) diff --git a/searxng_extra/update/update_ahmia_blacklist.py b/searxng_extra/update/update_ahmia_blacklist.py index 57fb78b34..26c485195 100755 --- a/searxng_extra/update/update_ahmia_blacklist.py +++ b/searxng_extra/update/update_ahmia_blacklist.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# lint: pylint # SPDX-License-Identifier: AGPL-3.0-or-later """This script saves 
`Ahmia's blacklist`_ for onion sites. @@ -21,9 +22,7 @@ def fetch_ahmia_blacklist(): resp = requests.get(URL, timeout=3.0) if resp.status_code != 200: raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code) - else: - blacklist = resp.text.split() - return blacklist + return resp.text.split() def get_ahmia_blacklist_filename(): @@ -32,5 +31,5 @@ def get_ahmia_blacklist_filename(): if __name__ == '__main__': blacklist = fetch_ahmia_blacklist() - with open(get_ahmia_blacklist_filename(), "w") as f: + with open(get_ahmia_blacklist_filename(), "w", encoding='utf-8') as f: f.write('\n'.join(blacklist)) diff --git a/searxng_extra/update/update_currencies.py b/searxng_extra/update/update_currencies.py index cdff4cbc9..e51692e72 100755 --- a/searxng_extra/update/update_currencies.py +++ b/searxng_extra/update/update_currencies.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# lint: pylint # SPDX-License-Identifier: AGPL-3.0-or-later """Fetch currencies from :origin:`searx/engines/wikidata.py` engine. @@ -7,13 +8,15 @@ Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ... <.github/workflows/data-update.yml>`). """ + +# pylint: disable=invalid-name + import re import unicodedata import json # set path -from sys import path -from os.path import realpath, dirname, join +from os.path import join from searx import searx_dir from searx.locales import LOCALE_NAMES diff --git a/searxng_extra/update/update_engine_descriptions.py b/searxng_extra/update/update_engine_descriptions.py index bab1a0349..5b73fd396 100755 --- a/searxng_extra/update/update_engine_descriptions.py +++ b/searxng_extra/update/update_engine_descriptions.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# lint: pylint # SPDX-License-Identifier: AGPL-3.0-or-later """Fetch website description from websites and from @@ -8,6 +9,8 @@ Output file: :origin:`searx/data/engine_descriptions.json`. 
""" +# pylint: disable=invalid-name, global-statement + import json from urllib.parse import urlparse from os.path import join @@ -109,7 +112,7 @@ def get_wikipedia_summary(lang, pageid): response.raise_for_status() api_result = json.loads(response.text) return api_result.get('extract') - except: + except Exception: # pylint: disable=broad-except return None @@ -141,7 +144,7 @@ def get_website_description(url, lang1, lang2=None): try: response = searx.network.get(url, headers=headers, timeout=10) response.raise_for_status() - except Exception: + except Exception: # pylint: disable=broad-except return (None, None) try: diff --git a/searxng_extra/update/update_firefox_version.py b/searxng_extra/update/update_firefox_version.py index 163982b16..a447f9fd5 100755 --- a/searxng_extra/update/update_firefox_version.py +++ b/searxng_extra/update/update_firefox_version.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# lint: pylint # SPDX-License-Identifier: AGPL-3.0-or-later """Fetch firefox useragent signatures @@ -9,20 +10,21 @@ Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ... 
""" import json -import requests import re -from os.path import dirname, join +from os.path import join from urllib.parse import urlparse, urljoin -from distutils.version import LooseVersion, StrictVersion +from distutils.version import LooseVersion + +import requests from lxml import html from searx import searx_dir URL = 'https://ftp.mozilla.org/pub/firefox/releases/' RELEASE_PATH = '/pub/firefox/releases/' -NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$') -# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$') -# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$') +NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$') +# BETA_REGEX = re.compile(r'.*[0-9]b([0-9\-a-z]+)$') +# ESR_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?esr$') # useragents = { @@ -39,20 +41,19 @@ def fetch_firefox_versions(): resp = requests.get(URL, timeout=2.0) if resp.status_code != 200: raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code) - else: - dom = html.fromstring(resp.text) - versions = [] + dom = html.fromstring(resp.text) + versions = [] - for link in dom.xpath('//a/@href'): - url = urlparse(urljoin(URL, link)) - path = url.path - if path.startswith(RELEASE_PATH): - version = path[len(RELEASE_PATH) : -1] - if NORMAL_REGEX.match(version): - versions.append(LooseVersion(version)) + for link in dom.xpath('//a/@href'): + url = urlparse(urljoin(URL, link)) + path = url.path + if path.startswith(RELEASE_PATH): + version = path[len(RELEASE_PATH) : -1] + if NORMAL_REGEX.match(version): + versions.append(LooseVersion(version)) - list.sort(versions, reverse=True) - return versions + list.sort(versions, reverse=True) + return versions def fetch_firefox_last_versions(): diff --git a/searxng_extra/update/update_languages.py b/searxng_extra/update/update_languages.py index 9a71566a9..754180c47 100755 --- a/searxng_extra/update/update_languages.py +++ b/searxng_extra/update/update_languages.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# lint: pylint + # 
SPDX-License-Identifier: AGPL-3.0-or-later """This script generates languages.py from intersecting each engine's supported languages. @@ -9,6 +11,8 @@ Output files: :origin:`searx/data/engines_languages.json` and """ +# pylint: disable=invalid-name + import json from pathlib import Path from pprint import pformat @@ -28,7 +32,7 @@ languages_file = Path(searx_dir) / 'languages.py' def fetch_supported_languages(): set_timeout_for_thread(10.0) - engines_languages = dict() + engines_languages = {} names = list(engines) names.sort() @@ -36,7 +40,7 @@ def fetch_supported_languages(): if hasattr(engines[engine_name], 'fetch_supported_languages'): engines_languages[engine_name] = engines[engine_name].fetch_supported_languages() print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name)) - if type(engines_languages[engine_name]) == list: + if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck engines_languages[engine_name] = sorted(engines_languages[engine_name]) print("fetched languages from %s engines" % len(engines_languages)) @@ -59,7 +63,7 @@ def get_locale(lang_code): # Join all language lists. 
def join_language_lists(engines_languages): - language_list = dict() + language_list = {} for engine_name in engines_languages: for lang_code in engines_languages[engine_name]: @@ -95,7 +99,7 @@ def join_language_lists(engines_languages): 'name': language_name, 'english_name': english_name, 'counter': set(), - 'countries': dict(), + 'countries': {}, } # add language with country if not in list @@ -123,6 +127,7 @@ def join_language_lists(engines_languages): def filter_language_list(all_languages): min_engines_per_lang = 13 min_engines_per_country = 7 + # pylint: disable=consider-using-dict-items, consider-iterating-dictionary main_engines = [ engine_name for engine_name in engines.keys() @@ -142,7 +147,7 @@ def filter_language_list(all_languages): } def _copy_lang_data(lang, country_name=None): - new_dict = dict() + new_dict = {} new_dict['name'] = all_languages[lang]['name'] new_dict['english_name'] = all_languages[lang]['english_name'] if country_name: @@ -150,10 +155,10 @@ def filter_language_list(all_languages): return new_dict # for each language get country codes supported by most engines or at least one country code - filtered_languages_with_countries = dict() + filtered_languages_with_countries = {} for lang, lang_data in filtered_languages.items(): countries = lang_data['countries'] - filtered_countries = dict() + filtered_countries = {} # get language's country codes with enough supported engines for lang_country, country_data in countries.items(): @@ -215,7 +220,7 @@ def write_languages_file(languages): language_codes = tuple(language_codes) - with open(languages_file, 'w') as new_file: + with open(languages_file, 'w', encoding='utf-8') as new_file: file_content = "{file_headers} {language_codes},\n)\n".format( # fmt: off file_headers = '\n'.join(file_headers), @@ -228,7 +233,7 @@ def write_languages_file(languages): if __name__ == "__main__": load_engines(settings['engines']) - engines_languages = fetch_supported_languages() - all_languages = 
join_language_lists(engines_languages) - filtered_languages = filter_language_list(all_languages) - write_languages_file(filtered_languages) + _engines_languages = fetch_supported_languages() + _all_languages = join_language_lists(_engines_languages) + _filtered_languages = filter_language_list(_all_languages) + write_languages_file(_filtered_languages)