[fix] searx.utils.HTMLTextExtractor: invalid HTML doesn't raise an Exception

Close #2188
2024-01-01 19:24:07 +01:00 · 2020-09-11 10:23:56 +02:00 · 2020-09-11 10:23:56 +02:00 · 6deb85072a
commit 6deb85072a
parent ae07f4a211
2 changed files with 18 additions and 2 deletions
--- a/searx/utils.py
+++ b/searx/utils.py
@ -77,6 +77,10 @@ def highlight_content(content, query):
    return content
 class HTMLTextExtractorException(Exception):
    pass
 class HTMLTextExtractor(HTMLParser):
    def __init__(self):
@ -92,7 +96,7 @@ class HTMLTextExtractor(HTMLParser):
            return
        if tag != self.tags[-1]:
-            raise Exception("invalid html")
+            raise HTMLTextExtractorException()
        self.tags.pop()
@ -128,7 +132,10 @@ def html_to_text(html):
    html = html.replace('\n', ' ')
    html = ' '.join(html.split())
    s = HTMLTextExtractor()
-    s.feed(html)
+    try:
        s.feed(html)
    except HTMLTextExtractorException:
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html)
    return s.get_text()
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@ -52,6 +52,10 @@ class TestUtils(SearxTestCase):
        self.assertIsNotNone(utils.html_to_text(html))
        self.assertEqual(utils.html_to_text(html), "Test text")
    def test_html_to_text_invalid(self):
        html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
        self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
    def test_prettify_url(self):
        data = (('https://searx.me/', 'https://searx.me/'),
                ('https://searx.me/ű', 'https://searx.me/ű'),
@ -116,6 +120,11 @@ class TestHTMLTextExtractor(SearxTestCase):
        self.html_text_extractor.handle_entityref(entity)
        self.assertIn(entity, self.html_text_extractor.result)
    def test_invalid_html(self):
        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
        with self.assertRaises(utils.HTMLTextExtractorException):
            self.html_text_extractor.feed(text)
 class TestUnicodeWriter(SearxTestCase):