mirror of https://github.com/searxng/searxng.git
searx.utils.html_to_text: replace <br/> with a space
This commit is contained in:
parent
1a82e79b50
commit
4224607c62
|
@@ -88,6 +88,8 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
self.tags.append(tag)
|
self.tags.append(tag)
|
||||||
|
if tag == 'br':
|
||||||
|
self.result.append(' ')
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
if not self.tags:
|
if not self.tags:
|
||||||
|
@@ -142,7 +144,7 @@ def html_to_text(html_str: str) -> str:
|
||||||
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
|
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
|
||||||
'Example'
|
'Example'
|
||||||
"""
|
"""
|
||||||
html_str = html_str.replace('\n', ' ')
|
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
|
||||||
html_str = ' '.join(html_str.split())
|
html_str = ' '.join(html_str.split())
|
||||||
s = _HTMLTextExtractor()
|
s = _HTMLTextExtractor()
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Reference in New Issue