[fix] HTMLParser: undocumented, unimplemented error() method

In Python versions < 3.10 there is an issue with the undocumented method
HTMLParser.error() [1][2], which was deprecated in Python 3.4 and removed
in Python 3.5.

To be compatible with newer versions (>= py3.10), an error() method is implemented
that raises an AssertionError, as the newer Python versions do [3].

[1] https://github.com/python/cpython/issues/76025
[2] https://bugs.python.org/issue31844
[3] https://github.com/python/cpython/pull/8562

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-10-18 14:34:18 +02:00 committed by MatthieuBarbu
parent 98cd1028b6
commit eea673831b
2 changed files with 14 additions and 1 deletions

View File

@ -15,6 +15,7 @@ from numbers import Number
from os.path import splitext, join from os.path import splitext, join
from random import choice from random import choice
from html.parser import HTMLParser from html.parser import HTMLParser
from html import escape
from urllib.parse import urljoin, urlparse from urllib.parse import urljoin, urlparse
from markdown_it import MarkdownIt from markdown_it import MarkdownIt
@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception):
"""Internal exception raised when the HTML is invalid""" """Internal exception raised when the HTML is invalid"""
class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844) class _HTMLTextExtractor(HTMLParser):
"""Internal class to extract text from HTML""" """Internal class to extract text from HTML"""
def __init__(self): def __init__(self):
@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
def get_text(self): def get_text(self):
return ''.join(self.result).strip() return ''.join(self.result).strip()
def error(self, message):
# error handle is needed in <py3.10
# https://github.com/python/cpython/pull/8562/files
raise AssertionError(message)
def html_to_text(html_str: str) -> str: def html_to_text(html_str: str) -> str:
"""Extract text from a HTML string """Extract text from a HTML string
@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example' 'Example'
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
'regexp: (?<![a-zA-Z]'
""" """
html_str = html_str.replace('\n', ' ').replace('\r', ' ') html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split()) html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor() s = _HTMLTextExtractor()
try: try:
s.feed(html_str) s.feed(html_str)
except AssertionError:
s = _HTMLTextExtractor()
s.feed(escape(html_str, quote=True))
except _HTMLTextExtractorException: except _HTMLTextExtractorException:
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str) logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
return s.get_text() return s.get_text()

View File

@ -41,6 +41,7 @@ class TestUtils(SearxTestCase):
self.assertIsInstance(utils.html_to_text(html_str), str) self.assertIsInstance(utils.html_to_text(html_str), str)
self.assertIsNotNone(utils.html_to_text(html_str)) self.assertIsNotNone(utils.html_to_text(html_str))
self.assertEqual(utils.html_to_text(html_str), "Test text") self.assertEqual(utils.html_to_text(html_str), "Test text")
self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
def test_extract_text(self): def test_extract_text(self):
html_str = """ html_str = """