From 510aba5e6699f76e5b9dc32db18b0f19db6e5da4 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Wed, 1 Oct 2014 17:18:18 +0200 Subject: [PATCH 1/2] implement query parser and use it inside autocompletion --- searx/query.py | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ searx/webapp.py | 30 ++++++++++-- 2 files changed, 152 insertions(+), 3 deletions(-) create mode 100644 searx/query.py diff --git a/searx/query.py b/searx/query.py new file mode 100644 index 000000000..59a1e347b --- /dev/null +++ b/searx/query.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python + +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. 
+ +(C) 2014 by Thomas Pointhuber, +''' + +from searx.languages import language_codes +from searx.engines import ( + categories, engines, engine_shortcuts +) +import string +import re + + +class Query(object): + """parse query""" + + def __init__(self, query, blocked_engines): + self.query = query + self.blocked_engines = [] + + if blocked_engines: + self.blocked_engines = blocked_engines + + self.query_parts = [] + self.engines = [] + self.languages = [] + + def parse_query(self): + self.query_parts = [] + + # split query, including whitespaces + raw_query_parts = re.split(r'(\s+)', self.query) + + parse_next = True + + for query_part in raw_query_parts: + if not parse_next: + self.query_parts[-1] += query_part + continue + + parse_next = False + + # part does only contain spaces, skip + if query_part.isspace(): + parse_next = True + self.query_parts.append(query_part) + continue + + # this forces a language + if query_part[0] == ':': + lang = query_part[1:].lower() + + # check if any language-code is equal with declared language-codes + for lc in language_codes: + lang_id, lang_name, country = map(str.lower, lc) + + # if correct language-code is found, set it as new search-language + if lang == lang_id\ + or lang_id.startswith(lang)\ + or lang == lang_name\ + or lang == country: + parse_next = True + self.languages.append(lang) + break + + # this forces an engine or category + if query_part[0] == '!': + prefix = query_part[1:].replace('_', ' ') + + # check if prefix is equal with engine shortcut + if prefix in engine_shortcuts\ + and not engine_shortcuts[prefix] in self.blocked_engines: + parse_next = True + self.engines.append({'category': 'none', + 'name': engine_shortcuts[prefix]}) + + # check if prefix is equal with engine name + elif prefix in engines\ + and not prefix in self.blocked_engines: + parse_next = True + self.engines.append({'category': 'none', + 'name': prefix}) + + # check if prefix is equal with category name + elif prefix in categories: + # using
all engines for that search, which are declared under that category name + parse_next = True + self.engines.extend({'category': prefix, + 'name': engine.name} + for engine in categories[prefix] + if not engine in self.blocked_engines) + + # append query part to query_part list + self.query_parts.append(query_part) + + def changeSearchQuery(self, search_query): + if len(self.query_parts): + self.query_parts[-1] = search_query + else: + self.query_parts.append(search_query) + + def getSearchQuery(self): + if len(self.query_parts): + return self.query_parts[-1] + else: + return '' + + def getFullQuery(self): + # get full query including whitespaces + return string.join(self.query_parts, '') + diff --git a/searx/webapp.py b/searx/webapp.py index 42cb42678..f66466b35 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -47,6 +47,7 @@ from searx.utils import ( from searx.https_rewrite import https_rules from searx.languages import language_codes from searx.search import Search +from searx.query import Query from searx.autocomplete import backends as autocomplete_backends @@ -308,23 +309,46 @@ def autocompleter(): """Return autocompleter results""" request_data = {} + # select request method if request.method == 'POST': request_data = request.form else: request_data = request.args - query = request_data.get('q', '').encode('utf-8') + # set blocked engines + if request.cookies.get('blocked_engines'): + blocked_engines = request.cookies['blocked_engines'].split(',') # noqa + else: + blocked_engines = [] - if not query: + # parse query + query = Query(request_data.get('q', '').encode('utf-8'), blocked_engines) + query.parse_query() + + # check if search query is set + if not query.getSearchQuery(): return + # run autocompleter completer = autocomplete_backends.get(request.cookies.get('autocomplete')) + # check if valid autocompleter is selected if not completer: return - results = completer(query) + # run autocompletion + raw_results = completer(query.getSearchQuery()) +
# parse results (write :language and !engine back to result string) + results = [] + for result in raw_results: + result_query = query + result_query.changeSearchQuery(result) + + # add parsed result + results.append(result_query.getFullQuery()) + + # return autocompleter results if request_data.get('format') == 'x-suggestions': return Response(json.dumps([query, results]), mimetype='application/json') From 62d1a70c84367403222c15e25f597a8d6b336151 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Wed, 1 Oct 2014 17:57:53 +0200 Subject: [PATCH 2/2] using Query class for parsing of search query --- searx/query.py | 4 ++- searx/search.py | 65 ++++++++----------------------------------------- 2 files changed, 13 insertions(+), 56 deletions(-) diff --git a/searx/query.py b/searx/query.py index 59a1e347b..612d46f4b 100644 --- a/searx/query.py +++ b/searx/query.py @@ -39,6 +39,7 @@ class Query(object): self.engines = [] self.languages = [] + # parse query, if tags are set, which change the search engine or search-language def parse_query(self): self.query_parts = [] @@ -55,7 +56,8 @@ class Query(object): parse_next = False # part does only contain spaces, skip - if query_part.isspace(): + if query_part.isspace()\ + or query_part == '': parse_next = True self.query_parts.append(query_part) continue diff --git a/searx/search.py b/searx/search.py index c861a795a..17556dc4e 100644 --- a/searx/search.py +++ b/searx/search.py @@ -25,6 +25,7 @@ from searx.engines import ( ) from searx.languages import language_codes from searx.utils import gen_useragent +from searx.query import Query number_of_searches = 0 @@ -235,7 +236,15 @@ class Search(object): self.pageno = int(pageno_param) # parse query, if tags are set, which change the serch engine or search-language - self.parse_query() + query_obj = Query(self.query, self.blocked_engines) + query_obj.parse_query() + + # get last selected language in query, if possible + # TODO support search with multiple languages + if
len(query_obj.languages): + self.lang = query_obj.languages[-1] + + self.engines = query_obj.engines self.categories = [] @@ -276,60 +285,6 @@ class Search(object): for x in categories[categ] if not x.name in self.blocked_engines) - # parse query, if tags are set, which change the serch engine or search-language - def parse_query(self): - query_parts = self.query.split() - modified = False - - # check if language-prefix is set - if query_parts[0].startswith(':'): - lang = query_parts[0][1:].lower() - - # check if any language-code is equal with declared language-codes - for lc in language_codes: - lang_id, lang_name, country = map(str.lower, lc) - - # if correct language-code is found, set it as new search-language - if lang == lang_id\ - or lang_id.startswith(lang)\ - or lang == lang_name\ - or lang == country: - self.lang = lang - modified = True - break - - # check if category/engine prefix is set - elif query_parts[0].startswith('!'): - prefix = query_parts[0][1:].replace('_', ' ') - - # check if prefix is equal with engine shortcut - if prefix in engine_shortcuts\ - and not engine_shortcuts[prefix] in self.blocked_engines: - modified = True - self.engines.append({'category': 'none', - 'name': engine_shortcuts[prefix]}) - - # check if prefix is equal with engine name - elif prefix in engines\ - and not prefix in self.blocked_engines: - modified = True - self.engines.append({'category': 'none', - 'name': prefix}) - - # check if prefix is equal with categorie name - elif prefix in categories: - modified = True - # using all engines for that search, which are declared under that categorie name - self.engines.extend({'category': prefix, - 'name': engine.name} - for engine in categories[prefix] - if not engine in self.blocked_engines) - - # if language, category or engine were specificed in this query, search for more tags which does the same - if modified: - self.query = self.query.replace(query_parts[0], '', 1).strip() - self.parse_query() - # do search-request 
def search(self, request): global number_of_searches