diff --git a/searx/utils.py b/searx/utils.py
index d8842c65f..0eb9f6a34 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -77,6 +77,10 @@ def highlight_content(content, query):
     return content
 
 
+class HTMLTextExtractorException(Exception):
+    pass
+
+
 class HTMLTextExtractor(HTMLParser):
 
     def __init__(self):
@@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
             return
 
         if tag != self.tags[-1]:
-            raise Exception("invalid html")
+            raise HTMLTextExtractorException()
 
         self.tags.pop()
 
@@ -128,7 +132,10 @@ def html_to_text(html):
     html = html.replace('\n', ' ')
     html = ' '.join(html.split())
     s = HTMLTextExtractor()
-    s.feed(html)
+    try:
+        s.feed(html)
+    except HTMLTextExtractorException:
+        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
     return s.get_text()
 
 
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 5f98511c3..08b759542 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
         self.assertIsNotNone(utils.html_to_text(html))
         self.assertEqual(utils.html_to_text(html), "Test text")
 
+    def test_html_to_text_invalid(self):
+        html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+        self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
+
     def test_prettify_url(self):
         data = (('https://searx.me/', 'https://searx.me/'),
                 ('https://searx.me/ű', 'https://searx.me/ű'),
@@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
         self.html_text_extractor.handle_entityref(entity)
         self.assertIn(entity, self.html_text_extractor.result)
 
+    def test_invalid_html(self):
+        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
+        with self.assertRaises(utils.HTMLTextExtractorException):
+            self.html_text_extractor.feed(text)
+
 
 class TestUnicodeWriter(SearxTestCase):
 