mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00

Function 'write_external_bang_doc(..)' generates documentation of the external bangs in folder 'docs/admin/external_bang'. Documentation will be updated when external bangs are updated:: ./manage pyenv.cmd searx_extra/update/update_external_bangs.py Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
305 lines
8.9 KiB
Python
Executable file
305 lines
8.9 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# lint: pylint
|
|
"""
|
|
Update searx/data/external_bangs.json using the duckduckgo bangs.
|
|
|
|
https://duckduckgo.com/newbang loads:
|
|
|
|
* a javascript which provides the bang version ( https://duckduckgo.com/bv1.js )
|
|
* a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example )
|
|
|
|
This script loads the javascript, then the bangs.
|
|
|
|
The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ),
|
|
but most probably that will require updating RE_BANG_VERSION
|
|
"""
|
|
# pylint: disable=C0116
|
|
|
|
import sys
|
|
import json
|
|
import re
|
|
|
|
import httpx
|
|
from fspath import FSPath
|
|
|
|
from searx import searx_dir # pylint: disable=E0401 C0413
|
|
from searx.external_bang import (
|
|
get_bang_definition_and_ac,
|
|
resolve_bang_definition,
|
|
)
|
|
|
|
# from https://duckduckgo.com/newbang
# Script downloaded by get_bang_url(); its text is searched for the version
# of the bang file.
URL_BV1 = 'https://duckduckgo.com/bv1.js'
# Captures the digits of the version in a "/bang.v<digits>.js" path.
RE_BANG_VERSION = re.compile(r'\/bang\.v([0-9]+)\.js')
# Scheme prefixes (scheme name plus colon, without the "//") that
# parse_ddg_bangs() uses to detect and strip URL schemes.
HTTPS_COLON = 'https:'
HTTP_COLON = 'http:'
|
|
|
|
|
|
def get_bang_url():
    """Return the URL and the version of the current DuckDuckGo bang file.

    Downloads ``URL_BV1`` and searches its text with ``RE_BANG_VERSION``; the
    first match provides the version.

    :returns: tuple ``(bang_url, version)`` where ``version`` is the matched
        digits as a string and ``bang_url`` is
        ``https://duckduckgo.com/bang.v<version>.js``.
    :raises ValueError: if the downloaded script contains no bang version.
        HTTP error statuses are reported by ``response.raise_for_status()``.
    """
    response = httpx.get(URL_BV1)
    response.raise_for_status()

    match = RE_BANG_VERSION.search(response.text)
    if match is None:
        # Fail with an explicit message instead of a bare IndexError: the
        # module docstring warns that the script layout may change.
        raise ValueError(
            f'no bang version found in {URL_BV1} '
            f'(pattern {RE_BANG_VERSION.pattern!r}), RE_BANG_VERSION may need an update'
        )

    version = match.group(1)
    return f'https://duckduckgo.com/bang.v{version}.js', version
|
|
|
|
|
|
def fetch_ddg_bangs(url):
    """Download ``url`` and return its body parsed as JSON.

    The response bytes are decoded with the default codec of
    ``bytes.decode()`` before being handed to ``json.loads``.  HTTP error
    statuses are reported by ``raise_for_status()``.
    """
    reply = httpx.get(url)
    reply.raise_for_status()
    body_text = reply.content.decode()
    return json.loads(body_text)
|
|
|
|
|
|
def merge_when_no_leaf(node):
    """Collapse, in place, chains of trie nodes that carry no ``'*'`` entry.

    For every key of ``node`` other than ``'*'``:

    * if the child dict has no ``'*'`` entry, each grandchild is attached
      directly to ``node`` under the concatenated key (child key + grandchild
      key), processed recursively, and the child itself is removed;
    * otherwise the child is processed recursively.

    The pass over ``node`` is repeated until a pass merges nothing.  Values
    that are not dicts are left untouched when passed as ``node``.

    Example::

        {'d': {'*': 1, 'd': {'g': {'*': 2}}, 'i': {'g': {'*': 3}}}}

    becomes::

        {'d': {'*': 1, 'dg': {'*': 2}, 'ig': {'*': 3}}}
    """
    if not isinstance(node, dict):
        return

    while True:
        merged_any = False

        # iterate over a snapshot of the keys: node is modified in the loop
        for prefix in list(node.keys()):
            if prefix == '*':
                continue

            child = node[prefix]
            suffixes = list(child.keys())

            if '*' in suffixes:
                # the child is a leaf holder: keep it, only optimize below it
                merge_when_no_leaf(child)
                continue

            # no leaf on the child: pull each grandchild up one level
            for suffix in suffixes:
                grandchild = child[suffix]
                node[prefix + suffix] = grandchild
                merge_when_no_leaf(grandchild)
            del node[prefix]
            merged_any = True

        if not merged_any:
            return
|
|
|
|
|
|
def optimize_leaf(parent, parent_key, node):
    """Replace, in place, dicts that only hold a ``'*'`` entry by that entry.

    When ``node`` is a dict whose single key is ``'*'`` and a ``parent`` is
    given, ``parent[parent_key]`` is set to ``node['*']``.  Any other dict is
    walked recursively, each value being visited with ``node`` as its parent.
    Values that are not dicts are ignored.
    """
    if not isinstance(node, dict):
        return

    is_lone_leaf = parent is not None and len(node) == 1 and '*' in node
    if is_lone_leaf:
        parent[parent_key] = node['*']
        return

    # only existing keys are re-assigned below, so iterating items() is safe
    for child_key, child in node.items():
        optimize_leaf(node, child_key, child)
|
|
|
|
|
|
def parse_ddg_bangs(ddg_bangs):
    """Build the bang trie and the documentation mapping from ``ddg_bangs``.

    Each item of ``ddg_bangs`` is read through its keys ``'u'`` (URL), ``'t'``
    (bang name) and ``'r'`` (rank).  Items whose URL does not contain
    ``{{{s}}}`` are skipped.

    :returns: tuple ``(bang_trie, bang_doc)``.  ``bang_trie`` maps the
        characters of the bang names to the encoded definition
        (``<url> chr(1) <rank>``, with ``{{{s}}}`` replaced by ``chr(2)``),
        after ``merge_when_no_leaf`` and ``optimize_leaf`` have been applied.
        ``bang_doc`` maps each bang name to ``(bang_url, bang_definition)``.
    """
    placeholder = '{{{s}}}'
    https_prefix = HTTPS_COLON + '//'
    http_prefix = HTTP_COLON + '//'

    trie_root = {}
    definition_by_url = {}
    doc_by_bang = {}

    for bang_definition in ddg_bangs:
        url = bang_definition['u']
        if placeholder not in url:
            # ignore invalid bang
            continue
        url = url.replace(placeholder, chr(2))

        # only for the https protocol: "https://example.com" becomes "//example.com"
        if url.startswith(https_prefix):
            url = url[len(HTTPS_COLON):]

        without_http = url[len(HTTP_COLON):]
        if url.startswith(http_prefix) and without_http in definition_by_url:
            # the same URL is already known through https:// (stored as
            # "//example.com"): reuse that definition for the http:// variant
            encoded = definition_by_url[without_http]
        else:
            # new http:// URL, or https:// URL (already stripped of "https:")
            rank = str(bang_definition['r'])
            encoded = definition_by_url.setdefault(url, url + chr(1) + rank)
        definition_by_url[url] = encoded

        name = bang_definition['t']
        doc_by_bang[name] = (url, bang_definition)

        # walk / create one trie level per character, then store the leaf
        cursor = trie_root
        for letter in name:
            cursor = cursor.setdefault(letter, {})
        cursor.setdefault('*', encoded)

    # optimize the trie
    merge_when_no_leaf(trie_root)
    optimize_leaf(None, None, trie_root)

    return trie_root, doc_by_bang
|
|
|
|
|
|
def sort_by_category(bang_doc):
    """Group the bangs of ``bang_doc`` by category and sub-category.

    ``bang_doc`` maps a bang name to ``(bang_url, bang_definition)``.  The
    category is read from ``bang_definition['c']`` (default ``'no category'``)
    and the sub-category from ``bang_definition['sc']`` (default
    ``'no sub-category'``).

    :returns: ``{category: {sub_category: {bang_name: (bang_url, bang_definition)}}}``
    """
    grouped = {}
    for bang_name, entry in bang_doc.items():
        bang_url, bang_definition = entry
        category = bang_definition.get('c', 'no category')
        sub_category = bang_definition.get('sc', 'no sub-category')

        # create the category / sub-category levels on first use
        bucket = grouped.setdefault(category, {}).setdefault(sub_category, {})
        bucket[bang_name] = (bang_url, bang_definition)

    return grouped
|
|
|
|
# reST templates written by write_external_bang_doc().
#
# NOTE(review): the leading whitespace inside these templates is reconstructed;
# directive options and list items must be indented below their directive to
# be parsed as part of it -- confirm the exact widths against the repository.

# Comment block placed at the top of every generated file.
FILE_INTRO = """\
.. Do not edit this file, this file was created by:
..
..   ./manage pyenv.cmd searx_extra/update/update_external_bangs.py
"""

# toctree header of a category index; the entries are appended by the caller.
CATEGORY_INDEX = """
.. toctree::
   :maxdepth: 2
   :caption: Contents
   :glob:
"""

# Table header of a sub-category file.
# Fields: {categ_name}, {sub_categ_name}
TABLE_INTRO = """
.. flat-table:: Bangs in *{categ_name} / {sub_categ_name}*
   :header-rows: 1
   :stub-columns: 1
   :widths: 2 1 4 6

   * - Bang
     - Rank
     - Description
     - URL
"""

# One table row per bang.
# Fields: {bang_name}, {url}, {rank}, {bang_description}
TABLE_ROW = """
   * - `!!{bang_name} <{url}>`__
     - {rank}
     - {bang_description}
     - ``{url}``
"""
|
|
|
|
def write_external_bang_doc(external_bangs_db, bang_doc):
    """Generate documentation of the external bangs in
    ``docs/admin/external_bang``.

    The target folder is deleted and re-created.  For each category a folder
    with an ``index.rst`` (toctree of its sub-categories) is written, and for
    each sub-category one reST file with a table of its bangs.  Categories,
    sub-categories and bangs are ordered case-insensitively.

    :param external_bangs_db: bang database handed to
        ``get_bang_definition_and_ac`` to look up each bang.
    :param bang_doc: mapping ``bang name -> (bang_url, bang_definition)`` as
        accepted by ``sort_by_category``.
    """
    # pylint: disable=too-many-locals

    def rst_title(name, tag="=", topline=True):
        """Return ``name`` as reST title, underlined (and overlined if
        ``topline``) with ``tag``."""
        line = len(name) * tag
        title = f"\n{name}\n{line}\n"
        if topline:
            title = f"\n{line}{title}"
        return title

    def get_valid_filename(fname):
        """Replace blanks by underscores and drop every character that is not
        a word character, ``-`` or ``.``."""
        fname = str(fname).strip().replace(' ', '_')
        return re.sub(r'(?u)[^-\w.]', '', fname)

    folder = FSPath(searx_dir + "/../docs/admin/external_bang")
    print(f're-create external bang documentation in: {folder}')
    folder.delete()
    folder.makedirs()

    bang_doc = sort_by_category(bang_doc)

    for categ_name in sorted(bang_doc, key=str.lower):
        categ = bang_doc[categ_name]

        # for each category create one folder with an index.rst in it
        categ_folder = folder / get_valid_filename(categ_name)
        categ_folder.makedirs()

        sub_categ_list = sorted(categ, key=str.lower)

        fname = categ_folder / 'index.rst'
        print(f"create file: {fname}")
        # explicit encoding: names and descriptions may contain non-ASCII
        # characters and the result must not depend on the locale
        with open(fname, 'w', encoding='utf-8') as out:
            out.write(FILE_INTRO)
            out.write(rst_title(categ_name, topline=True))
            out.write(CATEGORY_INDEX)
            for sub_categ_name in sub_categ_list:
                # NOTE(review): toctree entries must be indented like the
                # options of the toctree directive in CATEGORY_INDEX; three
                # spaces is the usual reST indent, but SOURCE as seen shows a
                # single space here -- confirm against the repository.
                out.write(f"\n   {get_valid_filename(sub_categ_name)}")

        for sub_categ_name in sub_categ_list:
            sub_categ = categ[sub_categ_name]

            # for each sub-category create one reST file
            fname = categ_folder / get_valid_filename(sub_categ_name + ".rst")
            print(f"create file: {fname}")
            with open(fname, 'w', encoding='utf-8') as out:
                out.write(FILE_INTRO)
                out.write(rst_title(sub_categ_name))
                out.write(TABLE_INTRO.format(
                    categ_name=categ_name,
                    sub_categ_name=sub_categ_name,
                ))

                # for each bang create one table row
                for bang_name in sorted(sub_categ, key=str.lower):
                    _bang_url, bang_definition = sub_categ[bang_name]
                    bang_description = bang_definition.get('s', '...')
                    bang_node, _bang_ac_list = get_bang_definition_and_ac(
                        external_bangs_db, bang_name
                    )

                    try:
                        url, rank = resolve_bang_definition(bang_node, '')
                        url = url.strip()
                    except AttributeError:
                        # There is one defect entry in external_bangs_db where
                        # the bang_node is a dict and not a string ... why?
                        sys.stderr.write(f"ignore error with bang:: '{bang_node}'\n")
                        continue

                    out.write(TABLE_ROW.format(
                        bang_name=bang_name,
                        url=url,
                        rank=rank,
                        bang_description=bang_description,
                    ))
|
|
|
|
|
|
if __name__ == '__main__':

    # 1. find out which bang file is current and download / parse it
    _bangs_url, _bangs_version = get_bang_url()

    print(f'fetch & parse bangs from {_bangs_url}')
    _bang_trie, _bang_doc = parse_ddg_bangs(fetch_ddg_bangs(_bangs_url))

    # 2. generate JSON file for: searx.data.EXTERNAL_BANGS

    _fname = FSPath(searx_dir + "/data/external_bangs.json")
    print(f'update file: {_fname}')
    _external_bangs_db = {
        'version': _bangs_version,
        'trie': _bang_trie,
    }
    # ensure_ascii=False lets json.dump emit non-ASCII characters unescaped,
    # so the file encoding is pinned instead of depending on the locale
    with open(_fname, 'w', encoding='utf-8') as fp:
        json.dump(_external_bangs_db, fp, ensure_ascii=False, indent=4)

    # 3. generate documentation
    write_external_bang_doc(_external_bangs_db, _bang_doc)
|