diff --git a/searx/engines/command.py b/searx/engines/command.py
new file mode 100644
index 000000000..b9e672ffa
--- /dev/null
+++ b/searx/engines/command.py
@@ -0,0 +1,214 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+'''
+
+
+import re
+from os.path import commonpath, commonprefix, expanduser, isabs, realpath
+from re import MULTILINE, search as re_search
+from shlex import split as shlex_split
+from subprocess import Popen, PIPE
+from time import time
+from threading import Thread
+
+from searx import logger
+
+
+offline = True
+paging = True
+command = []
+delimiter = {}
+parse_regex = {}
+query_type = ''
+query_enum = []
+environment_variables = {}
+working_dir = realpath('.')
+result_separator = '\n'
+result_template = 'key-value.html'
+timeout = 4.0
+
+_command_logger = logger.getChild('command')
+_compiled_parse_regex = {}
+
+
+def init(engine_settings):
+    """ Validates the engine settings and stores them in the module level options """
+    check_parsing_options(engine_settings)
+
+    if 'command' not in engine_settings:
+        raise ValueError('engine command : missing configuration key: command')
+
+    global command, working_dir, result_template, delimiter, parse_regex, timeout, environment_variables
+
+    command = engine_settings['command']
+
+    if 'working_dir' in engine_settings:
+        # always resolve the directory: the path check compares it to resolved paths
+        working_dir = realpath(engine_settings['working_dir'])
+
+    if 'parse_regex' in engine_settings:
+        parse_regex = engine_settings['parse_regex']
+        for result_key, regex in parse_regex.items():
+            _get_compiled_regex(result_key, regex)
+    if 'delimiter' in engine_settings:
+        delimiter = engine_settings['delimiter']
+
+    if 'environment_variables' in engine_settings:
+        environment_variables = engine_settings['environment_variables']
+
+
+def search(query, params):
+    """ Runs the configured command with the query and returns the results of the requested page
+
+    The output is read in a separate thread which is only waited for until the timeout
+    expires, so the list can be incomplete if the command is slow.
+    """
+    cmd = _get_command_to_run(query)
+    if not cmd:
+        return []
+
+    results = []
+    reader_thread = Thread(target=_get_results_from_process, args=(results, cmd, params['pageno']))
+    reader_thread.start()
+    reader_thread.join(timeout=timeout)
+
+    return results
+
+
+def _get_command_to_run(query):
+    """ Builds the argument list by substituting {{QUERY}} with the checked parameters of the query """
+    params = shlex_split(query.decode('utf-8'))
+    __check_query_params(params)
+
+    cmd = []
+    for c in command:
+        if c == '{{QUERY}}':
+            cmd.extend(params)
+        else:
+            cmd.append(c)
+
+    return cmd
+
+
+def _get_compiled_regex(result_key, regex):
+    """ Returns the compiled pattern of result_key
+
+    Patterns are compiled by init. As parse_regex can also be set directly on the module
+    (e.g. in tests), missing or outdated patterns are compiled on demand.
+    """
+    compiled = _compiled_parse_regex.get(result_key)
+    if compiled is None or compiled.pattern != regex:
+        compiled = re.compile(regex, flags=MULTILINE)
+        _compiled_parse_regex[result_key] = compiled
+    return compiled
+
+
+def _get_results_from_process(results, cmd, pageno):
+    """ Runs cmd and appends the parsed results of page pageno to results """
+    leftover = ''
+    count = 0
+    start, end = __get_results_limits(pageno)
+    with Popen(cmd, stdout=PIPE, stderr=PIPE, env=environment_variables) as process:
+        line = process.stdout.readline()
+        while line:
+            buf = leftover + line.decode('utf-8')
+            raw_results = buf.split(result_separator)
+            # the last chunk is not closed by a separator yet. It must always replace
+            # the previous leftover, otherwise stale output is prepended to the next line
+            leftover = raw_results.pop()
+
+            for raw_result in raw_results:
+                result = __parse_single_result(raw_result)
+                if not result:
+                    _command_logger.debug('skipped result: %s', raw_result)
+                    continue
+
+                if start <= count <= end:
+                    result['template'] = result_template
+                    results.append(result)
+
+                count += 1
+                if end < count:
+                    return results
+
+            line = process.stdout.readline()
+
+        return_code = process.wait(timeout=timeout)
+        if return_code != 0:
+            raise RuntimeError('non-zero return code when running command', cmd, return_code)
+
+
+def __get_results_limits(pageno):
+    """ Returns the indices of the first and the last result of the page """
+    start = (pageno - 1) * 10
+    end = start + 9
+    return start, end
+
+
+def __check_query_params(params):
+    """ Raises ValueError if the parameters are not allowed by the configured query_type """
+    if not query_type:
+        return
+
+    if query_type == 'path':
+        if not params:
+            return
+        query_path = realpath(expanduser(params[-1]))
+        allowed_dir = realpath(working_dir)
+        # commonprefix compares character by character, so it would accept /srv/data-private
+        # for the working directory /srv/data
+        if commonpath([query_path, allowed_dir]) != allowed_dir:
+            raise ValueError('requested path is outside of configured working directory')
+    elif query_type == 'enum' and len(query_enum) > 0:
+        for param in params:
+            if param not in query_enum:
+                raise ValueError('submitted query params is not allowed', param, 'allowed params:', query_enum)
+
+
+def check_parsing_options(engine_settings):
+    """ Checks if delimiter based parsing or regex parsing is configured correctly """
+
+    if 'delimiter' not in engine_settings and 'parse_regex' not in engine_settings:
+        raise ValueError('failed to init settings for parsing lines: missing delimiter or parse_regex')
+    if 'delimiter' in engine_settings and 'parse_regex' in engine_settings:
+        raise ValueError('failed to init settings for parsing lines: too many settings')
+
+    if 'delimiter' in engine_settings:
+        if 'chars' not in engine_settings['delimiter'] or 'keys' not in engine_settings['delimiter']:
+            raise ValueError('failed to init settings for parsing lines: delimiter requires chars and keys')
+
+
+def __parse_single_result(raw_result):
+    """ Parses command line output based on configuration
+
+    Returns None if the output does not match the configured format.
+    """
+
+    result = {}
+
+    if delimiter:
+        keys = delimiter['keys']
+        elements = raw_result.split(delimiter['chars'], maxsplit=len(keys) - 1)
+        if len(elements) != len(keys):
+            return None
+        result.update(zip(keys, elements))
+
+    if parse_regex:
+        for result_key, regex in parse_regex.items():
+            found = _get_compiled_regex(result_key, regex).search(raw_result)
+            if not found:
+                return None
+            result[result_key] = found.group(0)
+
+    return result
diff --git a/searx/settings.yml b/searx/settings.yml
index d6ea53177..9140522c4 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -920,6 +920,77 @@ engines:
 #    shortcut : uw
 #    base_url : 'http://doc.ubuntu-fr.org'
 
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: git grep
+#    engine: command
+#    command: ['git', 'grep', '{{QUERY}}']
+#    shortcut: gg
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ':'
+#        keys: ['filepath', 'code']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: locate
+#    engine: command
+#    command: ['locate', '{{QUERY}}']
+#    shortcut: loc
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: find
+#    engine: command
+#    command: ['find', '.', '-name', '{{QUERY}}']
+#    query_type: path
+#    shortcut: fnd
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: pattern search in files
+#    engine: command
+#    command: ['fgrep', '{{QUERY}}']
+#    shortcut: fgr
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
+# Be careful when enabling this engine if you are
+# running a public instance. Do not expose any sensitive
+# information. You can restrict access by configuring a list
+# of access tokens under tokens.
+#  - name: regex search in files
+#    engine: command
+#    command: ['grep', '{{QUERY}}']
+#    shortcut: gr
+#    tokens: []
+#    disabled: True
+#    delimiter:
+#        chars: ' '
+#        keys: ['line']
+
 locales:
     en : English
     ar : العَرَبِيَّة (Arabic)
diff --git a/searx/templates/oscar/result_templates/key-value.html b/searx/templates/oscar/result_templates/key-value.html
index 67c748e7f..d5c56a189 100644
--- a/searx/templates/oscar/result_templates/key-value.html
+++ b/searx/templates/oscar/result_templates/key-value.html
@@ -6,7 +6,7 @@
         {% continue %}
     {% endif %}
-        {{ key|upper }}: {{ value }}
+        {{ key|upper }}: {{ value|truncate }}
     {% endfor %}
diff --git a/tests/unit/engines/test_command.py b/tests/unit/engines/test_command.py
new file mode 100644
index 000000000..0aa1c6201
--- /dev/null
+++ b/tests/unit/engines/test_command.py
@@ -0,0 +1,254 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+'''
+
+from sys import version_info
+
+from searx.engines import command as command_engine
+from searx.testing import SearxTestCase
+
+
+class TestCommandEngine(SearxTestCase):
+    def setUp(self):
+        super(TestCommandEngine, self).setUp()
+        # the engine is configured by module level options: reset the ones the tests
+        # change to keep the tests independent of each other and of their order
+        command_engine.command = []
+        command_engine.delimiter = {}
+        command_engine.parse_regex = {}
+        command_engine.query_type = ''
+        command_engine.query_enum = []
+        command_engine.result_separator = '\n'
+
+    def test_basic_seq_command_engine(self):
+        ls_engine = command_engine
+        ls_engine.command = ['seq', '{{QUERY}}']
+        ls_engine.delimiter = {'chars': ' ', 'keys': ['number']}
+        expected_results = [
+            {'number': '1', 'template': 'key-value.html'},
+            {'number': '2', 'template': 'key-value.html'},
+            {'number': '3', 'template': 'key-value.html'},
+            {'number': '4', 'template': 'key-value.html'},
+            {'number': '5', 'template': 'key-value.html'},
+        ]
+        results = ls_engine.search('5'.encode('utf-8'), {'pageno': 1})
+        self.assertEqual(results, expected_results)
+
+    def test_delimiter_parsing_command_engine(self):
+        searx_logs = '''DEBUG:searx.webapp:static directory is /home/n/p/searx/searx/static
+DEBUG:searx.webapp:templates directory is /home/n/p/searx/searx/templates
+DEBUG:searx.engines:soundcloud engine: Starting background initialization
+DEBUG:searx.engines:wolframalpha engine: Starting background initialization
+DEBUG:searx.engines:locate engine: Starting background initialization
+DEBUG:searx.engines:regex search in files engine: Starting background initialization
+DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.wolframalpha.com
+DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): soundcloud.com
+DEBUG:searx.engines:find engine: Starting background initialization
+DEBUG:searx.engines:pattern search in files engine: Starting background initialization
+DEBUG:searx.webapp:starting webserver on 127.0.0.1:8888
+WARNING:werkzeug: * Debugger is active!
+INFO:werkzeug: * Debugger PIN: 299-578-362'''
+        echo_engine = command_engine
+        echo_engine.command = ['echo', searx_logs]
+        echo_engine.delimiter = {'chars': ':', 'keys': ['level', 'component', 'message']}
+
+        expected_results_by_page = [
+            [
+                {
+                    'component': 'searx.webapp',
+                    'message': 'static directory is /home/n/p/searx/searx/static',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.webapp',
+                    'message': 'templates directory is /home/n/p/searx/searx/templates',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'soundcloud engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'wolframalpha engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'locate engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'regex search in files engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'urllib3.connectionpool',
+                    'message': 'Starting new HTTPS connection (1): www.wolframalpha.com',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'urllib3.connectionpool',
+                    'message': 'Starting new HTTPS connection (1): soundcloud.com',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'find engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'searx.engines',
+                    'message': 'pattern search in files engine: Starting background initialization',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+
+            ],
+            [
+                {
+                    'component': 'searx.webapp',
+                    'message': 'starting webserver on 127.0.0.1:8888',
+                    'template': 'key-value.html',
+                    'level': 'DEBUG',
+                },
+                {
+                    'component': 'werkzeug',
+                    'message': ' * Debugger is active!',
+                    'template': 'key-value.html',
+                    'level': 'WARNING',
+                },
+                {
+                    'component': 'werkzeug',
+                    'message': ' * Debugger PIN: 299-578-362',
+                    'template': 'key-value.html',
+                    'level': 'INFO',
+                },
+            ],
+
+        ]
+
+        for i in [0, 1]:
+            results = echo_engine.search(''.encode('utf-8'), {'pageno': i + 1})
+            self.assertEqual(results, expected_results_by_page[i])
+
+    def test_regex_parsing_command_engine(self):
+        # the author regex only matches "name <address>", so every Author line carries a placeholder address
+        txt = '''commit 35f9a8c81d162a361b826bbcd4a1081a4fbe76a7
+Author: Noémi Ványi <noemi@example.org>
+Date: Tue Oct 15 11:31:33 2019 +0200
+
+first interesting message
+
+commit 6c3c206316153ccc422755512bceaa9ab0b14faa
+Author: Noémi Ványi <noemi@example.org>
+Date: Mon Oct 14 17:10:08 2019 +0200
+
+second interesting message
+
+commit d8594d2689b4d5e0d2f80250223886c3a1805ef5
+Author: Noémi Ványi <noemi@example.org>
+Date: Mon Oct 14 14:45:05 2019 +0200
+
+third interesting message
+
+commit '''
+        git_log_engine = command_engine
+        git_log_engine.command = ['echo', txt]
+        git_log_engine.result_separator = '\n\ncommit '
+        git_log_engine.delimiter = {}
+        git_log_engine.parse_regex = {
+            'commit': r'\w{40}',
+            'author': r'[\w* ]* <\w*@?\w*\.?\w*>',
+            'date': r'Date: .*',
+            'message': r'\n\n.*$'
+        }
+        expected_results = [
+            {
+                'commit': '35f9a8c81d162a361b826bbcd4a1081a4fbe76a7',
+                'author': ' Noémi Ványi <noemi@example.org>',
+                'date': 'Date: Tue Oct 15 11:31:33 2019 +0200',
+                'message': '\n\nfirst interesting message',
+                'template': 'key-value.html',
+            },
+            {
+                'commit': '6c3c206316153ccc422755512bceaa9ab0b14faa',
+                'author': ' Noémi Ványi <noemi@example.org>',
+                'date': 'Date: Mon Oct 14 17:10:08 2019 +0200',
+                'message': '\n\nsecond interesting message',
+                'template': 'key-value.html',
+            },
+            {
+                'commit': 'd8594d2689b4d5e0d2f80250223886c3a1805ef5',
+                'author': ' Noémi Ványi <noemi@example.org>',
+                'date': 'Date: Mon Oct 14 14:45:05 2019 +0200',
+                'message': '\n\nthird interesting message',
+                'template': 'key-value.html',
+            },
+
+        ]
+
+        results = git_log_engine.search(''.encode('utf-8'), {'pageno': 1})
+        self.assertEqual(results, expected_results)
+
+    def test_working_dir_path_query(self):
+        ls_engine = command_engine
+        ls_engine.command = ['ls', '{{QUERY}}']
+        ls_engine.result_separator = '\n'
+        ls_engine.delimiter = {'chars': ' ', 'keys': ['file']}
+        ls_engine.query_type = 'path'
+
+        results = ls_engine.search('.'.encode(), {'pageno': 1})
+        self.assertTrue(len(results) != 0)
+
+        forbidden_paths = [
+            '..',
+            '../..',
+            './..',
+            '~',
+            '/var',
+        ]
+        for forbidden_path in forbidden_paths:
+            self.assertRaises(ValueError, ls_engine.search, forbidden_path.encode(), {'pageno': 1})
+
+    def test_enum_queries(self):
+        echo_engine = command_engine
+        echo_engine.command = ['echo', '{{QUERY}}']
+        echo_engine.delimiter = {'chars': ' ', 'keys': ['line']}
+        echo_engine.query_type = 'enum'
+        echo_engine.query_enum = ['i-am-allowed-to-say-this', 'and-that']
+
+        for allowed in echo_engine.query_enum:
+            results = echo_engine.search(allowed.encode(), {'pageno': 1})
+            self.assertTrue(len(results) != 0)
+
+        forbidden_queries = [
+            'forbidden',
+            'banned',
+            'prohibited',
+        ]
+        for forbidden in forbidden_queries:
+            self.assertRaises(ValueError, echo_engine.search, forbidden.encode(), {'pageno': 1})