[mod] move some code from webapp module to webutils module (no functional change)

Over the years the webapp module became more and more of a mess.  To improve the
modularization a little, this patch moves some implementations from the webapp
module to the webutils module.

HINT: this patch brings no functional change

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-06-18 16:43:48 +02:00 committed by Markus Heiser
parent 71b6ff07ca
commit fa1ef9a07b
7 changed files with 164 additions and 131 deletions

View File

@ -6,6 +6,7 @@ from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
from searx import logger from searx import logger
from searx import utils
from searx.engines import engines from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error from searx.metrics import histogram_observe, counter_add, count_error
@ -353,6 +354,10 @@ class ResultContainer:
for result in self._merged_results: for result in self._merged_results:
score = result_score(result) score = result_score(result)
result['score'] = score result['score'] = score
if result.get('content'):
result['content'] = utils.html_to_text(result['content']).strip()
# removing html content and whitespace duplications
result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
for result_engine in result['engines']: for result_engine in result['engines']:
counter_add(score, 'engine', result_engine, 'score') counter_add(score, 'engine', result_engine, 'score')
@ -415,11 +420,19 @@ class ResultContainer:
def results_length(self): def results_length(self):
return len(self._merged_results) return len(self._merged_results)
def results_number(self): @property
def number_of_results(self) -> int:
"""Returns the average of results number, returns zero if the average
result number is smaller than the actual result count."""
resultnum_sum = sum(self._number_of_results) resultnum_sum = sum(self._number_of_results)
if not resultnum_sum or not self._number_of_results: if not resultnum_sum or not self._number_of_results:
return 0 return 0
return resultnum_sum / len(self._number_of_results)
average = int(resultnum_sum / len(self._number_of_results))
if average < self.results_length():
average = 0
return average
def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False): def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
if engines[engine_name].display_error_messages: if engines[engine_name].display_error_messages:

View File

@ -58,7 +58,7 @@ from searx import (
from searx import infopage from searx import infopage
from searx.data import ENGINE_DESCRIPTIONS from searx.data import ENGINE_DESCRIPTIONS
from searx.results import Timing, UnresponsiveEngine from searx.results import Timing
from searx.settings_defaults import OUTPUT_FORMATS from searx.settings_defaults import OUTPUT_FORMATS
from searx.settings_loader import get_default_settings_path from searx.settings_loader import get_default_settings_path
from searx.exceptions import SearxParameterException from searx.exceptions import SearxParameterException
@ -68,18 +68,18 @@ from searx.engines import (
engines, engines,
engine_shortcuts, engine_shortcuts,
) )
from searx import webutils
from searx.webutils import ( from searx.webutils import (
UnicodeWriter,
highlight_content, highlight_content,
get_static_files, get_static_files,
get_result_templates, get_result_templates,
get_themes, get_themes,
prettify_url, exception_classname_to_text,
new_hmac, new_hmac,
is_hmac_of, is_hmac_of,
is_flask_run_cmdline, is_flask_run_cmdline,
group_engines_in_tab, group_engines_in_tab,
searxng_l10n_timespan,
) )
from searx.webadapter import ( from searx.webadapter import (
get_search_query_from_webapp, get_search_query_from_webapp,
@ -87,7 +87,6 @@ from searx.webadapter import (
parse_lang, parse_lang,
) )
from searx.utils import ( from searx.utils import (
html_to_text,
gen_useragent, gen_useragent,
dict_subset, dict_subset,
) )
@ -165,39 +164,6 @@ app.jinja_env.add_extension('jinja2.ext.loopcontrols') # pylint: disable=no-mem
app.jinja_env.filters['group_engines_in_tab'] = group_engines_in_tab # pylint: disable=no-member app.jinja_env.filters['group_engines_in_tab'] = group_engines_in_tab # pylint: disable=no-member
app.secret_key = settings['server']['secret_key'] app.secret_key = settings['server']['secret_key']
timeout_text = gettext('timeout')
parsing_error_text = gettext('parsing error')
http_protocol_error_text = gettext('HTTP protocol error')
network_error_text = gettext('network error')
ssl_cert_error_text = gettext("SSL error: certificate validation has failed")
exception_classname_to_text = {
None: gettext('unexpected crash'),
'timeout': timeout_text,
'asyncio.TimeoutError': timeout_text,
'httpx.TimeoutException': timeout_text,
'httpx.ConnectTimeout': timeout_text,
'httpx.ReadTimeout': timeout_text,
'httpx.WriteTimeout': timeout_text,
'httpx.HTTPStatusError': gettext('HTTP error'),
'httpx.ConnectError': gettext("HTTP connection error"),
'httpx.RemoteProtocolError': http_protocol_error_text,
'httpx.LocalProtocolError': http_protocol_error_text,
'httpx.ProtocolError': http_protocol_error_text,
'httpx.ReadError': network_error_text,
'httpx.WriteError': network_error_text,
'httpx.ProxyError': gettext("proxy error"),
'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
'searx.exceptions.SearxEngineXPathException': parsing_error_text,
'KeyError': parsing_error_text,
'json.decoder.JSONDecodeError': parsing_error_text,
'lxml.etree.ParserError': parsing_error_text,
'ssl.SSLCertVerificationError': ssl_cert_error_text, # for Python > 3.7
'ssl.CertificateError': ssl_cert_error_text, # for Python 3.7
}
class ExtendedRequest(flask.Request): class ExtendedRequest(flask.Request):
"""This class is never initialized and only used for type checking.""" """This class is never initialized and only used for type checking."""
@ -686,9 +652,7 @@ def search():
search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp( search_query, raw_text_query, _, _, selected_locale = get_search_query_from_webapp(
request.preferences, request.form request.preferences, request.form
) )
# search = Search(search_query) # without plugins
search = SearchWithPlugins(search_query, request.user_plugins, request) # pylint: disable=redefined-outer-name search = SearchWithPlugins(search_query, request.user_plugins, request) # pylint: disable=redefined-outer-name
result_container = search.search() result_container = search.search()
except SearxParameterException as e: except SearxParameterException as e:
@ -698,45 +662,54 @@ def search():
logger.exception(e, exc_info=True) logger.exception(e, exc_info=True)
return index_error(output_format, gettext('search error')), 500 return index_error(output_format, gettext('search error')), 500
# results # 1. check if the result is a redirect for an external bang
results = result_container.get_ordered_results()
number_of_results = result_container.results_number()
if number_of_results < result_container.results_length():
number_of_results = 0
# checkin for a external bang
if result_container.redirect_url: if result_container.redirect_url:
return redirect(result_container.redirect_url) return redirect(result_container.redirect_url)
# Server-Timing header # 2. add Server-Timing header for measuring performance characteristics of
# web applications
request.timings = result_container.get_timings() # pylint: disable=assigning-non-slot request.timings = result_container.get_timings() # pylint: disable=assigning-non-slot
# 3. formats without a template
if output_format == 'json':
response = webutils.get_json_response(search_query, result_container)
return Response(response, mimetype='application/json')
if output_format == 'csv':
csv = webutils.CSVWriter(StringIO())
webutils.write_csv_response(csv, result_container)
csv.stream.seek(0)
response = Response(csv.stream.read(), mimetype='application/csv')
cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
response.headers.add('Content-Disposition', cont_disp)
return response
# 4. formats rendered by a template / RSS & HTML
current_template = None current_template = None
previous_result = None previous_result = None
# output results = result_container.get_ordered_results()
for result in results: for result in results:
if output_format == 'html': if output_format == 'html':
if 'content' in result and result['content']: if 'content' in result and result['content']:
result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query) result['content'] = highlight_content(escape(result['content'][:1024]), search_query.query)
if 'title' in result and result['title']: if 'title' in result and result['title']:
result['title'] = highlight_content(escape(result['title'] or ''), search_query.query) result['title'] = highlight_content(escape(result['title'] or ''), search_query.query)
else:
if result.get('content'):
result['content'] = html_to_text(result['content']).strip()
# removing html content and whitespace duplications
result['title'] = ' '.join(html_to_text(result['title']).strip().split())
if 'url' in result: if 'url' in result:
result['pretty_url'] = prettify_url(result['url']) result['pretty_url'] = webutils.prettify_url(result['url'])
if result.get('publishedDate'): # do not try to get a date from an empty string or a None type if result.get('publishedDate'): # do not try to get a date from an empty string or a None type
try: # test if publishedDate >= 1900 (datetime module bug) try: # test if publishedDate >= 1900 (datetime module bug)
result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z') result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')
except ValueError: except ValueError:
result['publishedDate'] = None result['publishedDate'] = None
else: else:
result['publishedDate'] = searxng_l10n_timespan(result['publishedDate']) result['publishedDate'] = webutils.searxng_l10n_timespan(result['publishedDate'])
# set result['open_group'] = True when the template changes from the previous result # set result['open_group'] = True when the template changes from the previous result
# set result['close_group'] = True when the template changes on the next result # set result['close_group'] = True when the template changes on the next result
@ -750,42 +723,7 @@ def search():
if previous_result: if previous_result:
previous_result['close_group'] = True previous_result['close_group'] = True
if output_format == 'json': # 4.a RSS
x = {
'query': search_query.query,
'number_of_results': number_of_results,
'results': results,
'answers': list(result_container.answers),
'corrections': list(result_container.corrections),
'infoboxes': result_container.infoboxes,
'suggestions': list(result_container.suggestions),
'unresponsive_engines': __get_translated_errors(result_container.unresponsive_engines),
}
response = json.dumps(x, default=lambda item: list(item) if isinstance(item, set) else item)
return Response(response, mimetype='application/json')
if output_format == 'csv':
csv = UnicodeWriter(StringIO())
keys = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
csv.writerow(keys)
for row in results:
row['host'] = row['parsed_url'].netloc
row['type'] = 'result'
csv.writerow([row.get(key, '') for key in keys])
for a in result_container.answers:
row = {'title': a, 'type': 'answer'}
csv.writerow([row.get(key, '') for key in keys])
for a in result_container.suggestions:
row = {'title': a, 'type': 'suggestion'}
csv.writerow([row.get(key, '') for key in keys])
for a in result_container.corrections:
row = {'title': a, 'type': 'correction'}
csv.writerow([row.get(key, '') for key in keys])
csv.stream.seek(0)
response = Response(csv.stream.read(), mimetype='application/csv')
cont_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search_query.query)
response.headers.add('Content-Disposition', cont_disp)
return response
if output_format == 'rss': if output_format == 'rss':
response_rss = render( response_rss = render(
@ -795,11 +733,11 @@ def search():
corrections=result_container.corrections, corrections=result_container.corrections,
suggestions=result_container.suggestions, suggestions=result_container.suggestions,
q=request.form['q'], q=request.form['q'],
number_of_results=number_of_results, number_of_results=result_container.number_of_results,
) )
return Response(response_rss, mimetype='text/xml') return Response(response_rss, mimetype='text/xml')
# HTML output format # 4.b HTML
# suggestions: use RawTextQuery to get the suggestion URLs with the same bang # suggestions: use RawTextQuery to get the suggestion URLs with the same bang
suggestion_urls = list( suggestion_urls = list(
@ -827,14 +765,14 @@ def search():
selected_categories = search_query.categories, selected_categories = search_query.categories,
pageno = search_query.pageno, pageno = search_query.pageno,
time_range = search_query.time_range or '', time_range = search_query.time_range or '',
number_of_results = format_decimal(number_of_results), number_of_results = format_decimal(result_container.number_of_results),
suggestions = suggestion_urls, suggestions = suggestion_urls,
answers = result_container.answers, answers = result_container.answers,
corrections = correction_urls, corrections = correction_urls,
infoboxes = result_container.infoboxes, infoboxes = result_container.infoboxes,
engine_data = result_container.engine_data, engine_data = result_container.engine_data,
paging = result_container.paging, paging = result_container.paging,
unresponsive_engines = __get_translated_errors( unresponsive_engines = webutils.get_translated_errors(
result_container.unresponsive_engines result_container.unresponsive_engines
), ),
current_locale = request.preferences.get_value("locale"), current_locale = request.preferences.get_value("locale"),
@ -849,25 +787,6 @@ def search():
) )
def __get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
translated_errors = []
# make a copy unresponsive_engines to avoid "RuntimeError: Set changed size
# during iteration" it happens when an engine modifies the ResultContainer
# after the search_multiple_requests method has stopped waiting
for unresponsive_engine in unresponsive_engines:
error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
if not error_user_text:
error_user_text = exception_classname_to_text[None]
error_msg = gettext(error_user_text)
if unresponsive_engine.suspended:
error_msg = gettext('Suspended') + ': ' + error_msg
translated_errors.append((unresponsive_engine.engine, error_msg))
return sorted(translated_errors, key=lambda e: e[0])
@app.route('/about', methods=['GET']) @app.route('/about', methods=['GET'])
def about(): def about():
"""Redirect to about page""" """Redirect to about page"""

View File

@ -9,31 +9,80 @@ import hmac
import re import re
import inspect import inspect
import itertools import itertools
import json
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
from io import StringIO from io import StringIO
from codecs import getincrementalencoder from codecs import getincrementalencoder
from flask_babel import gettext, format_date from flask_babel import gettext, format_date # type: ignore
from searx import logger, settings from searx import logger, settings
from searx.engines import DEFAULT_CATEGORY from searx.engines import DEFAULT_CATEGORY
if TYPE_CHECKING: if TYPE_CHECKING:
from searx.enginelib import Engine from searx.enginelib import Engine
from searx.results import ResultContainer
from searx.search import SearchQuery
from searx.results import UnresponsiveEngine
VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$') VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')
logger = logger.getChild('webutils') logger = logger.getChild('webutils')
# Short, user-facing descriptions of the reasons an engine can be reported as
# unresponsive.  Texts that are shared by several error types are bound to a
# name once and reused in the table below.
timeout_text = gettext('timeout')
parsing_error_text = gettext('parsing error')
http_protocol_error_text = gettext('HTTP protocol error')
network_error_text = gettext('network error')
ssl_cert_error_text = gettext("SSL error: certificate validation has failed")

# Maps an error type (a dotted exception class name, the plain string
# 'timeout', or None) to its description.  get_translated_errors() looks up
# UnresponsiveEngine.error_type in this table and falls back to the ``None``
# entry when the type is not listed.
exception_classname_to_text = {
    None: gettext('unexpected crash'),
    'timeout': timeout_text,
    'asyncio.TimeoutError': timeout_text,
    'httpx.TimeoutException': timeout_text,
    'httpx.ConnectTimeout': timeout_text,
    'httpx.ReadTimeout': timeout_text,
    'httpx.WriteTimeout': timeout_text,
    'httpx.HTTPStatusError': gettext('HTTP error'),
    'httpx.ConnectError': gettext("HTTP connection error"),
    'httpx.RemoteProtocolError': http_protocol_error_text,
    'httpx.LocalProtocolError': http_protocol_error_text,
    'httpx.ProtocolError': http_protocol_error_text,
    'httpx.ReadError': network_error_text,
    'httpx.WriteError': network_error_text,
    'httpx.ProxyError': gettext("proxy error"),
    'searx.exceptions.SearxEngineCaptchaException': gettext("CAPTCHA"),
    'searx.exceptions.SearxEngineTooManyRequestsException': gettext("too many requests"),
    'searx.exceptions.SearxEngineAccessDeniedException': gettext("access denied"),
    'searx.exceptions.SearxEngineAPIException': gettext("server API error"),
    'searx.exceptions.SearxEngineXPathException': parsing_error_text,
    'KeyError': parsing_error_text,
    'json.decoder.JSONDecodeError': parsing_error_text,
    'lxml.etree.ParserError': parsing_error_text,
    'ssl.SSLCertVerificationError': ssl_cert_error_text,  # for Python > 3.7
    'ssl.CertificateError': ssl_cert_error_text,  # for Python 3.7
}
class UnicodeWriter:
""" def get_translated_errors(unresponsive_engines: Iterable[UnresponsiveEngine]):
A CSV writer which will write rows to CSV file "f", translated_errors = []
which is encoded in the given encoding.
""" for unresponsive_engine in unresponsive_engines:
error_user_text = exception_classname_to_text.get(unresponsive_engine.error_type)
if not error_user_text:
error_user_text = exception_classname_to_text[None]
error_msg = gettext(error_user_text)
if unresponsive_engine.suspended:
error_msg = gettext('Suspended') + ': ' + error_msg
translated_errors.append((unresponsive_engine.engine, error_msg))
return sorted(translated_errors, key=lambda e: e[0])
class CSVWriter:
"""A CSV writer which will write rows to CSV file "f", which is encoded in
the given encoding."""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue # Redirect output to a queue
@ -59,6 +108,58 @@ class UnicodeWriter:
self.writerow(row) self.writerow(row)
def write_csv_response(csv: CSVWriter, rc: ResultContainer) -> None:
    """Dump the outcome of a query as rows of a CSV table (``application/csv``).

    The first row written to ``csv`` holds the column names.  Every following
    row describes one item of the result container; its ``type`` column tells
    which kind of item it is:

    - ``result``
    - ``answer``
    - ``suggestion``
    - ``correction``

    Rows that are not of type ``result`` only fill the ``title`` and ``type``
    columns, all other columns are left empty.
    """
    columns = ('title', 'url', 'content', 'host', 'engine', 'score', 'type')
    ordered = rc.get_ordered_results()

    csv.writerow(columns)

    for item in ordered:
        # the derived columns are stored on the result dict itself
        item['host'] = item['parsed_url'].netloc
        item['type'] = 'result'
        csv.writerow([item.get(col, '') for col in columns])

    # (value of the "type" column, attribute of the container holding the texts)
    extras = (
        ('answer', 'answers'),
        ('suggestion', 'suggestions'),
        ('correction', 'corrections'),
    )
    for kind, attr_name in extras:
        for text in getattr(rc, attr_name):
            record = {'title': text, 'type': kind}
            csv.writerow([record.get(col, '') for col in columns])
def get_json_response(sq: SearchQuery, rc: ResultContainer) -> str:
    """Return the JSON string of the results to a query (``application/json``).

    Values the JSON encoder cannot handle natively are converted as follows:
    a ``set`` is emitted as a list and a ``datetime`` (e.g. the
    ``publishedDate`` of a result) as an ISO 8601 string.

    :param sq: the search query, only its ``query`` attribute is read.
    :param rc: the result container the response is built from.
    :return: the serialized JSON document.
    :raises TypeError: if a value has no JSON representation.
    """

    def _to_serializable(item):
        # json.dumps() calls this hook only for objects it cannot encode itself.
        if isinstance(item, set):
            return list(item)
        if isinstance(item, datetime):
            return item.isoformat()
        # Handing the object back unchanged would make the encoder call this
        # hook again with the very same object and abort with a misleading
        # "Circular reference detected" ValueError.  Raise TypeError instead,
        # which is what the ``default`` hook is expected to do.
        raise TypeError(f"Object of type {type(item).__name__} is not JSON serializable")

    number_of_results = rc.number_of_results
    x = {
        'query': sq.query,
        'number_of_results': number_of_results,
        'results': rc.get_ordered_results(),
        'answers': list(rc.answers),
        'corrections': list(rc.corrections),
        'infoboxes': rc.infoboxes,
        'suggestions': list(rc.suggestions),
        'unresponsive_engines': get_translated_errors(rc.unresponsive_engines),
    }
    return json.dumps(x, default=_to_serializable)
def get_themes(templates_path): def get_themes(templates_path):
"""Returns available themes list.""" """Returns available themes list."""
return os.listdir(templates_path) return os.listdir(templates_path)

View File

@ -60,7 +60,7 @@ Example to run it from python:
"infoboxes": [ {...} ], "infoboxes": [ {...} ],
"paging": true, "paging": true,
"results": [... ], "results": [... ],
"results_number": 820000000.0, "number_of_results": 820000000.0,
"search": { "search": {
"lang": "all", "lang": "all",
"pageno": 1, "pageno": 1,
@ -150,7 +150,7 @@ def to_dict(search_query: searx.search.SearchQuery) -> Dict[str, Any]:
"suggestions": list(result_container.suggestions), "suggestions": list(result_container.suggestions),
"answers": list(result_container.answers), "answers": list(result_container.answers),
"paging": result_container.paging, "paging": result_container.paging,
"results_number": result_container.results_number(), "number_of_results": result_container.number_of_results,
} }
return result_container_json return result_container_json

View File

@ -57,7 +57,7 @@ class StandaloneSearx(SearxTestCase):
'suggestions': [], 'suggestions': [],
'answers': [], 'answers': [],
'paging': False, 'paging': False,
'results_number': 0, 'number_of_results': 0,
}, },
) )
@ -73,7 +73,7 @@ class StandaloneSearx(SearxTestCase):
'infoboxes': m_search.infoboxes, 'infoboxes': m_search.infoboxes,
'paging': m_search.paging, 'paging': m_search.paging,
'results': m_search.get_ordered_results(), 'results': m_search.get_ordered_results(),
'results_number': m_search.results_number(), 'number_of_results': m_search.number_of_results,
'search': { 'search': {
'lang': m_sq.lang, 'lang': m_sq.lang,
'pageno': m_sq.pageno, 'pageno': m_sq.pageno,

View File

@ -69,7 +69,7 @@ class ViewsTestCase(SearxTestCase):
infoboxes=[], infoboxes=[],
unresponsive_engines=set(), unresponsive_engines=set(),
results=test_results, results=test_results,
results_number=lambda: 3, number_of_results=3,
results_length=lambda: len(test_results), results_length=lambda: len(test_results),
get_timings=lambda: timings, get_timings=lambda: timings,
redirect_url=None, redirect_url=None,

View File

@ -64,7 +64,7 @@ class TestWebUtils(SearxTestCase):
class TestUnicodeWriter(SearxTestCase): class TestUnicodeWriter(SearxTestCase):
def setUp(self): def setUp(self):
self.unicode_writer = webutils.UnicodeWriter(mock.MagicMock()) self.unicode_writer = webutils.CSVWriter(mock.MagicMock())
def test_write_row(self): def test_write_row(self):
row = [1, 2, 3] row = [1, 2, 3]