From eea673831bc83cd9ca85a8c86898df217cd91a28 Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarit.de>
Date: Wed, 18 Oct 2023 14:34:18 +0200
Subject: [PATCH] [fix] HTMLParser: undocumented not implemented method

In Python versions below 3.10 (<py3.10) there is an issue with the
undocumented method HTMLParser.error() [1][2], which was deprecated in
Python 3.4 and removed in Python 3.5.

To be compatible with newer versions (>=py3.10), an error() method is
implemented that raises an AssertionError, as the newer Python versions do [3].

[1] https://github.com/python/cpython/issues/76025
[2] https://bugs.python.org/issue31844
[3] https://github.com/python/cpython/pull/8562

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
---
 searx/utils.py           | 14 +++++++++++++-
 tests/unit/test_utils.py |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/searx/utils.py b/searx/utils.py
index 7f6017617..c009c3144 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from numbers import Number
 from os.path import splitext, join
 from random import choice
 from html.parser import HTMLParser
+from html import escape
 from urllib.parse import urljoin, urlparse
 from markdown_it import MarkdownIt
 
@@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception):
     """Internal exception raised when the HTML is invalid"""
 
 
-class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
+class _HTMLTextExtractor(HTMLParser):
     """Internal class to extract text from HTML"""
 
     def __init__(self):
@@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://b
     def get_text(self):
         return ''.join(self.result).strip()
 
+    def error(self, message):
+        # error handle is needed in <py3.10
+        # https://github.com/python/cpython/pull/8562/files
+        raise AssertionError(message)
+
 
 def html_to_text(html_str: str) -> str:
     """Extract text from a HTML string
@@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str:
 
         >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
         'Example'
+
+        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
+        'regexp: (?<![a-zA-Z]'
     """
     html_str = html_str.replace('\n', ' ').replace('\r', ' ')
     html_str = ' '.join(html_str.split())
     s = _HTMLTextExtractor()
     try:
         s.feed(html_str)
+    except AssertionError:
+        s = _HTMLTextExtractor()
+        s.feed(escape(html_str, quote=True))
     except _HTMLTextExtractorException:
         logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
     return s.get_text()
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 2ad4593a1..6398e63f0 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -41,6 +41,7 @@ class TestUtils(SearxTestCase):
         self.assertIsInstance(utils.html_to_text(html_str), str)
         self.assertIsNotNone(utils.html_to_text(html_str))
         self.assertEqual(utils.html_to_text(html_str), "Test text")
+        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
 
     def test_extract_text(self):
         html_str = """