mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	[fix] HTMLParser: undocumented not implemented method
In Python versions <py3.10 there is an issue with the undocumented method HTMLParser.error() [1][2], which was deprecated in Python 3.4 and removed in Python 3.5. To stay compatible with higher versions (>=py3.10), an error method is implemented that raises an AssertionError exception, as the higher Python versions do [3]. [1] https://github.com/python/cpython/issues/76025 [2] https://bugs.python.org/issue31844 [3] https://github.com/python/cpython/pull/8562 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									01b5b9cb8e
								
							
						
					
					
						commit
						ef56e1d684
					
				
					 2 changed files with 14 additions and 1 deletions
				
			
		| 
						 | 
				
			
			@ -15,6 +15,7 @@ from numbers import Number
 | 
			
		|||
from os.path import splitext, join
 | 
			
		||||
from random import choice
 | 
			
		||||
from html.parser import HTMLParser
 | 
			
		||||
from html import escape
 | 
			
		||||
from urllib.parse import urljoin, urlparse
 | 
			
		||||
from markdown_it import MarkdownIt
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception):
 | 
			
		|||
    """Internal exception raised when the HTML is invalid"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
 | 
			
		||||
class _HTMLTextExtractor(HTMLParser):
 | 
			
		||||
    """Internal class to extract text from HTML"""
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -137,6 +138,11 @@ class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://b
 | 
			
		|||
    def get_text(self):
 | 
			
		||||
        return ''.join(self.result).strip()
 | 
			
		||||
 | 
			
		||||
    def error(self, message):
 | 
			
		||||
        # error handle is needed in <py3.10
 | 
			
		||||
        # https://github.com/python/cpython/pull/8562/files
 | 
			
		||||
        raise AssertionError(message)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def html_to_text(html_str: str) -> str:
 | 
			
		||||
    """Extract text from a HTML string
 | 
			
		||||
| 
						 | 
				
			
			@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str:
 | 
			
		|||
 | 
			
		||||
        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
 | 
			
		||||
        'Example'
 | 
			
		||||
 | 
			
		||||
        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
 | 
			
		||||
        'regexp: (?<![a-zA-Z]'
 | 
			
		||||
    """
 | 
			
		||||
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
 | 
			
		||||
    html_str = ' '.join(html_str.split())
 | 
			
		||||
    s = _HTMLTextExtractor()
 | 
			
		||||
    try:
 | 
			
		||||
        s.feed(html_str)
 | 
			
		||||
    except AssertionError:
 | 
			
		||||
        s = _HTMLTextExtractor()
 | 
			
		||||
        s.feed(escape(html_str, quote=True))
 | 
			
		||||
    except _HTMLTextExtractorException:
 | 
			
		||||
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
 | 
			
		||||
    return s.get_text()
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -41,6 +41,7 @@ class TestUtils(SearxTestCase):
 | 
			
		|||
        self.assertIsInstance(utils.html_to_text(html_str), str)
 | 
			
		||||
        self.assertIsNotNone(utils.html_to_text(html_str))
 | 
			
		||||
        self.assertEqual(utils.html_to_text(html_str), "Test text")
 | 
			
		||||
        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
 | 
			
		||||
 | 
			
		||||
    def test_extract_text(self):
 | 
			
		||||
        html_str = """
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue