From c03e4c86bc49d6ef4664c038066d9f1c16e7dafc Mon Sep 17 00:00:00 2001 From: rachmadani haryono Date: Wed, 4 Nov 2020 20:38:54 +0800 Subject: [PATCH] Feature/standalone searx update (#1591) * chg: dev: update standalone_searx parent d8a5df721b33dd8a7cc9e21dba4060f21d629f69 author rachmadaniHaryono 1603896594 +0800 committer rachmadaniHaryono 1603896619 +0800 chg: dev: debug engine_shortcuts chg: dev: only initialize if engine is given chg: dev: split main chg: dev: standalone_searx chg: dev: update standalone_searx chg: doc: remove unnecessary log chg: test: differentiate travis chg: test: disable shortcut chg: test: use default engine settings fix: dev: category choices fix: dev: duplicate engine shortcut fix: dev: travis python3 fix: test: use empty string as shortcut fix: test: apkm fix: test: engine shortcut fix: test: mypy fix: test: parameter fix: test: pep8 fix: test: py2 compatibilities fix: test: searx settings fix: test: travis engines new: dev: deduplicate engine new: dev: main receive engines parameter new: dev: parse_argument accept engines parameter new: dev: split search query from get_result func new: test: basic result case Suggestions: use RawTextQuery to make the suggestions URLs. Update all themes accordingly. 
* new: doc: searx import and init * chg: dev: parse_argument - doc - run on __main__ - simple parse_args * chg: doc: module * chg: dev: import section - remove unused python path modification - new required package * chg: dev: script run - parse_argument func return directly parsed results - main func return dict instead json text - dump directly on sys.stdout.write * chg: dev: get_search_query and get_search_query func * chg: dev: main func - move inner function outside - return dict instead of json text * new: dev: add utils to doc sys path * new: doc: standalone_searx * fix: doc: run script * chg: dev: mypy type hint * chg: dev: SearchQuery don't have attr engines * chg: dev: reset engines __init__ * chg: test: unit test update * chg: dev: pylint and flake8 * new: test: standalone_searx * chg: dev: main func and doc * chg: dev: import and type hint * new: dev: main func - remove get_result func - single func which just translate dict * chg: test: put mypy on dev requirement * chg: doc: update * new: doc: add standalone_searx module member * chg: doc: shell command line * chg: dev: remove mypy * chg: doc: module docstring --- docs/conf.py | 1 + docs/utils/index.rst | 1 + docs/utils/standalone_searx.py.rst | 11 ++ tests/unit/test_standalone_searx.py | 118 +++++++++++++ utils/standalone_searx.py | 251 ++++++++++++++++++++-------- 5 files changed, 313 insertions(+), 69 deletions(-) create mode 100644 docs/utils/standalone_searx.py.rst create mode 100644 tests/unit/test_standalone_searx.py diff --git a/docs/conf.py b/docs/conf.py index 66c20594d..4b348ae0e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -87,6 +87,7 @@ issues_github_path = "searx/searx" # HTML ----------------------------------------------------------------- sys.path.append(os.path.abspath('_themes')) +sys.path.insert(0, os.path.abspath("../utils/")) html_theme_path = ['_themes'] html_theme = "searx" diff --git a/docs/utils/index.rst b/docs/utils/index.rst index 13914af28..3c7387875 100644 --- 
a/docs/utils/index.rst +++ b/docs/utils/index.rst @@ -16,6 +16,7 @@ developers. filtron.sh morty.sh lxc.sh + standalone_searx.py .. _toolboxing common: diff --git a/docs/utils/standalone_searx.py.rst b/docs/utils/standalone_searx.py.rst new file mode 100644 index 000000000..557c4b75b --- /dev/null +++ b/docs/utils/standalone_searx.py.rst @@ -0,0 +1,11 @@ + +.. _standalone_searx.py: + +============================= +``utils/standalone_searx.py`` +============================= + +.. automodule:: standalone_searx + :members: + + diff --git a/tests/unit/test_standalone_searx.py b/tests/unit/test_standalone_searx.py new file mode 100644 index 000000000..cd1a14f46 --- /dev/null +++ b/tests/unit/test_standalone_searx.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +"""Test utils/standalone_searx.py""" +import datetime +import importlib.util +import sys + +from mock import Mock, patch +from nose2.tools import params + +from searx.testing import SearxTestCase + + +def get_standalone_searx_module(): + """Get standalone_searx module.""" + module_name = 'utils.standalone_searx' + filename = 'utils/standalone_searx.py' + spec = importlib.util.spec_from_file_location(module_name, filename) + sas = importlib.util.module_from_spec(spec) + spec.loader.exec_module(sas) + return sas + + +class StandaloneSearx(SearxTestCase): + """Unit test for standalone_searx.""" + + def test_parse_argument_no_args(self): + """Test parse argument without args.""" + sas = get_standalone_searx_module() + with patch.object(sys, 'argv', ['standalone_searx']), \ + self.assertRaises(SystemExit): + sas.parse_argument() + + def test_parse_argument_basic_args(self): + """Test parse argument with basic args.""" + sas = get_standalone_searx_module() + query = 'red box' + exp_dict = { + 'query': query, 'category': 'general', 'lang': 'all', 'pageno': 1, + 'safesearch': '0', 'timerange': None} + args = ['standalone_searx', query] + with patch.object(sys, 'argv', args): + res = sas.parse_argument() + 
self.assertEqual(exp_dict, vars(res)) + res2 = sas.parse_argument(args[1:]) + self.assertEqual(exp_dict, vars(res2)) + + def test_to_dict(self): + """test to_dict.""" + sas = get_standalone_searx_module() + self.assertEqual( + sas.to_dict( + sas.get_search_query(sas.parse_argument(['red box']))), + { + 'search': { + 'q': 'red box', 'pageno': 1, 'lang': 'all', + 'safesearch': 0, 'timerange': None + }, + 'results': [], 'infoboxes': [], 'suggestions': [], + 'answers': [], 'paging': False, 'results_number': 0 + } + ) + + def test_to_dict_with_mock(self): + """test to dict.""" + sas = get_standalone_searx_module() + with patch.object(sas.searx.search, 'Search') as mock_s: + m_search = mock_s().search() + m_sq = Mock() + self.assertEqual( + sas.to_dict(m_sq), + { + 'answers': [], + 'infoboxes': m_search.infoboxes, + 'paging': m_search.paging, + 'results': m_search.get_ordered_results(), + 'results_number': m_search.results_number(), + 'search': { + 'lang': m_sq.lang, + 'pageno': m_sq.pageno, + 'q': m_sq.query, + 'safesearch': m_sq.safesearch, + 'timerange': m_sq.time_range, + }, + 'suggestions': [] + } + ) + + def test_get_search_query(self): + """test get_search_query.""" + sas = get_standalone_searx_module() + args = sas.parse_argument(['rain', ]) + search_q = sas.get_search_query(args) + self.assertTrue(search_q) + self.assertEqual(str(search_q), 'rain;[]') + + def test_no_parsed_url(self): + """test no_parsed_url func""" + sas = get_standalone_searx_module() + self.assertEqual( + sas.no_parsed_url([{'parsed_url': 'http://example.com'}]), + [{}] + ) + + @params( + (datetime.datetime(2020, 1, 1), '2020-01-01T00:00:00'), + ('a'.encode('utf8'), 'a'), + (set([1]), [1]) + ) + def test_json_serial(self, arg, exp_res): + """test json_serial func""" + sas = get_standalone_searx_module() + self.assertEqual(sas.json_serial(arg), exp_res) + + def test_json_serial_error(self): + """test error on json_serial.""" + sas = get_standalone_searx_module() + with 
self.assertRaises(TypeError): + sas.json_serial('a') diff --git a/utils/standalone_searx.py b/utils/standalone_searx.py index 3aab7a6cc..0a35cc4a2 100755 --- a/utils/standalone_searx.py +++ b/utils/standalone_searx.py @@ -1,5 +1,63 @@ #!/usr/bin/env python +"""Script to run searx from terminal. +Getting categories without initializing the engines will only return `['general']` + +>>> import searx.engines +... list(searx.engines.categories.keys()) +['general'] +>>> import searx +... searx.engines.initialize_engines(searx.settings['engines']) +... list(searx.engines.categories.keys()) +['general', 'it', 'science', 'images', 'news', 'videos', 'music', 'files', 'social media', 'map'] + +Example of using this script: + +.. code:: bash + + $ SEARX_DEBUG=1 python3 utils/standalone_searx.py rain + +Example of running it from Python: + +>>> import importlib +... import json +... import sys +... import searx +... import searx.engines +... search_query = 'rain' +... # initialize engines +... searx.engines.initialize_engines(searx.settings['engines']) +... # load engine categories once instead of each time the function is called +... engine_cs = list(searx.engines.categories.keys()) +... # load module +... spec = importlib.util.spec_from_file_location( +... 'utils.standalone_searx', 'utils/standalone_searx.py') +... sas = importlib.util.module_from_spec(spec) +... spec.loader.exec_module(sas) +... # use function from module +... prog_args = sas.parse_argument([search_query], category_choices=engine_cs) +... search_q = sas.get_search_query(prog_args, engine_categories=engine_cs) +... res_dict = sas.to_dict(search_q) +... sys.stdout.write(json.dumps( +... res_dict, sort_keys=True, indent=4, ensure_ascii=False, +... default=sas.json_serial)) +{ + "answers": [], + "infoboxes": [ {...} ], + "paging": true, + "results": [... ], + "results_number": 820000000.0, + "search": { + "lang": "all", + "pageno": 1, + "q": "rain", + "safesearch": 0, + "timerange": null + }, + "suggestions": [...] 
+} +""" # noqa: E501 +# pylint: disable=pointless-string-statement ''' searx is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by @@ -16,90 +74,145 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. (C) 2016- by Alexandre Flament, ''' - -# set path -from sys import path -from os.path import realpath, dirname -path.append(realpath(dirname(realpath(__file__)) + '/../')) - -# initialization -from json import dumps -from searx import settings +# pylint: disable=wrong-import-position +import argparse import sys -import codecs +from datetime import datetime +from json import dumps +from typing import Any, Dict, List, Optional + +import searx +import searx.engines +import searx.preferences import searx.query import searx.search -import searx.engines -import searx.webapdater -import searx.preferences import searx.webadapter -import argparse -searx.engines.initialize_engines(settings['engines']) +EngineCategoriesVar = Optional[List[str]] -# command line parsing -parser = argparse.ArgumentParser(description='Standalone searx.') -parser.add_argument('query', type=str, - help='Text query') -parser.add_argument('--category', type=str, nargs='?', - choices=searx.engines.categories.keys(), - default='general', - help='Search category') -parser.add_argument('--lang', type=str, nargs='?',default='all', - help='Search language') -parser.add_argument('--pageno', type=int, nargs='?', default=1, - help='Page number starting from 1') -parser.add_argument('--safesearch', type=str, nargs='?', choices=['0', '1', '2'], default='0', - help='Safe content filter from none to strict') -parser.add_argument('--timerange', type=str, nargs='?', choices=['day', 'week', 'month', 'year'], - help='Filter by time range') -args = parser.parse_args() -# search results for the query -form = { - "q":args.query, - "categories":args.category.decode(), - "pageno":str(args.pageno), - "language":args.lang, - 
"time_range":args.timerange -} -preferences = searx.preferences.Preferences(['oscar'], searx.engines.categories.keys(), searx.engines.engines, []) -preferences.key_value_settings['safesearch'].parse(args.safesearch) +def get_search_query( + args: argparse.Namespace, engine_categories: EngineCategoriesVar = None +) -> searx.search.SearchQuery: + """Get search results for the query""" + if engine_categories is None: + engine_categories = list(searx.engines.categories.keys()) + try: + category = args.category.decode('utf-8') + except AttributeError: + category = args.category + form = { + "q": args.query, + "categories": category, + "pageno": str(args.pageno), + "language": args.lang, + "time_range": args.timerange + } + preferences = searx.preferences.Preferences( + ['oscar'], engine_categories, searx.engines.engines, []) + preferences.key_value_settings['safesearch'].parse(args.safesearch) -search_query, raw_text_query, _, _ = searx.webadapter.get_search_query_from_webapp(preferences, form) -search = searx.search.Search(search_query) -result_container = search.search() + search_query = searx.webadapter.get_search_query_from_webapp( + preferences, form)[0] + return search_query -# output -from datetime import datetime -def no_parsed_url(results): +def no_parsed_url(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Remove parsed url from dict.""" for result in results: del result['parsed_url'] return results -def json_serial(obj): - """JSON serializer for objects not serializable by default json code""" + +def json_serial(obj: Any) -> Any: + """JSON serializer for objects not serializable by default json code. 
+ + :raise TypeError: raised when **obj** is not serializable + """ if isinstance(obj, datetime): serial = obj.isoformat() return serial - raise TypeError ("Type not serializable") + if isinstance(obj, bytes): + return obj.decode('utf8') + if isinstance(obj, set): + return list(obj) + raise TypeError("Type ({}) not serializable".format(type(obj))) -result_container_json = { - "search": { - "q": search_query.query, - "pageno": search_query.pageno, - "lang": search_query.lang, - "safesearch": search_query.safesearch, - "timerange": search_query.time_range, - "engines": search_query.engines - }, - "results": no_parsed_url(result_container.get_ordered_results()), - "infoboxes": result_container.infoboxes, - "suggestions": list(result_container.suggestions), - "answers": list(result_container.answers), - "paging": result_container.paging, - "results_number": result_container.results_number() -} -sys.stdout = codecs.getwriter("UTF-8")(sys.stdout) -sys.stdout.write(dumps(result_container_json, sort_keys=True, indent=4, ensure_ascii=False, encoding="utf-8", default=json_serial)) + +def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]: + """Get result from parsed arguments.""" + result_container = searx.search.Search(search_query).search() + result_container_json = { + "search": { + "q": search_query.query, + "pageno": search_query.pageno, + "lang": search_query.lang, + "safesearch": search_query.safesearch, + "timerange": search_query.time_range, + }, + "results": no_parsed_url(result_container.get_ordered_results()), + "infoboxes": result_container.infoboxes, + "suggestions": list(result_container.suggestions), + "answers": list(result_container.answers), + "paging": result_container.paging, + "results_number": result_container.results_number() + } + return result_container_json + + +def parse_argument( + args: Optional[List[str]]=None, + category_choices: EngineCategoriesVar=None +) -> argparse.Namespace: + """Parse command line. 
+ + :raise SystemExit: Query argument required on `args` + + Examples: + + >>> import importlib + ... # load module + ... spec = importlib.util.spec_from_file_location( + ... 'utils.standalone_searx', 'utils/standalone_searx.py') + ... sas = importlib.util.module_from_spec(spec) + ... spec.loader.exec_module(sas) + ... sas.parse_argument() + usage: ptipython [-h] [--category [{general}]] [--lang [LANG]] [--pageno [PAGENO]] [--safesearch [{0,1,2}]] [--timerange [{day,week,month,year}]] + query + SystemExit: 2 + >>> sas.parse_argument(['rain']) + Namespace(category='general', lang='all', pageno=1, query='rain', safesearch='0', timerange=None) + """ # noqa: E501 + if not category_choices: + category_choices = list(searx.engines.categories.keys()) + parser = argparse.ArgumentParser(description='Standalone searx.') + parser.add_argument('query', type=str, + help='Text query') + parser.add_argument('--category', type=str, nargs='?', + choices=category_choices, + default='general', + help='Search category') + parser.add_argument('--lang', type=str, nargs='?', default='all', + help='Search language') + parser.add_argument('--pageno', type=int, nargs='?', default=1, + help='Page number starting from 1') + parser.add_argument( + '--safesearch', type=str, nargs='?', + choices=['0', '1', '2'], default='0', + help='Safe content filter from none to strict') + parser.add_argument( + '--timerange', type=str, + nargs='?', choices=['day', 'week', 'month', 'year'], + help='Filter by time range') + return parser.parse_args(args) + + +if __name__ == '__main__': + searx.engines.initialize_engines(searx.settings['engines']) + engine_cs = list(searx.engines.categories.keys()) + prog_args = parse_argument(category_choices=engine_cs) + search_q = get_search_query(prog_args, engine_categories=engine_cs) + res_dict = to_dict(search_q) + sys.stdout.write(dumps( + res_dict, sort_keys=True, indent=4, ensure_ascii=False, + default=json_serial))