searx.utils.html_to_text: replace <br/> with a space

This commit is contained in:
Alexandre Flament 2022-04-15 13:37:27 +02:00 committed by Markus Heiser
parent 1a82e79b50
commit 4224607c62
1 changed file with 3 additions and 1 deletion

View File

@@ -88,6 +88,8 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
self.tags.append(tag) self.tags.append(tag)
if tag == 'br':
self.result.append(' ')
def handle_endtag(self, tag): def handle_endtag(self, tag):
if not self.tags: if not self.tags:
@@ -142,7 +144,7 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example' 'Example'
""" """
html_str = html_str.replace('\n', ' ') html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split()) html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor() s = _HTMLTextExtractor()
try: try: