forked from zaclys/searxng
[mod] utils.py: add markdown_to_text helper function
This commit is contained in:
parent
ac9c88094d
commit
8a39b8a12d
|
@ -42,10 +42,9 @@ Implementations
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
from markdown_it import MarkdownIt
|
|
||||||
from flask_babel import gettext
|
from flask_babel import gettext
|
||||||
|
|
||||||
from searx.utils import html_to_text
|
from searx.utils import markdown_to_text
|
||||||
|
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://lemmy.ml/',
|
"website": 'https://lemmy.ml/',
|
||||||
|
@ -78,11 +77,6 @@ def request(query, params):
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
def _format_content(content):
    """Turn a Markdown snippet into plain text.

    The snippet is first rendered to HTML with markdown-it (CommonMark rules
    plus smart quotes / typographic replacements), then the markup is stripped
    with ``html_to_text``.
    """
    renderer = MarkdownIt("commonmark", {"typographer": True})
    renderer.enable(["replacements", "smartquotes"])
    return html_to_text(renderer.render(content))
|
|
||||||
|
|
||||||
|
|
||||||
def _get_communities(json):
|
def _get_communities(json):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
|
@ -97,7 +91,7 @@ def _get_communities(json):
|
||||||
{
|
{
|
||||||
'url': result['community']['actor_id'],
|
'url': result['community']['actor_id'],
|
||||||
'title': result['community']['title'],
|
'title': result['community']['title'],
|
||||||
'content': _format_content(result['community'].get('description', '')),
|
'content': markdown_to_text(result['community'].get('description', '')),
|
||||||
'img_src': result['community'].get('icon', result['community'].get('banner')),
|
'img_src': result['community'].get('icon', result['community'].get('banner')),
|
||||||
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
||||||
'metadata': metadata,
|
'metadata': metadata,
|
||||||
|
@ -114,7 +108,7 @@ def _get_users(json):
|
||||||
{
|
{
|
||||||
'url': result['person']['actor_id'],
|
'url': result['person']['actor_id'],
|
||||||
'title': result['person']['name'],
|
'title': result['person']['name'],
|
||||||
'content': _format_content(result['person'].get('bio', '')),
|
'content': markdown_to_text(result['person'].get('bio', '')),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -140,7 +134,7 @@ def _get_posts(json):
|
||||||
|
|
||||||
content = result['post'].get('body', '').strip()
|
content = result['post'].get('body', '').strip()
|
||||||
if content:
|
if content:
|
||||||
content = _format_content(content)
|
content = markdown_to_text(content)
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
{
|
{
|
||||||
|
@ -164,7 +158,7 @@ def _get_comments(json):
|
||||||
|
|
||||||
content = result['comment'].get('content', '').strip()
|
content = result['comment'].get('content', '').strip()
|
||||||
if content:
|
if content:
|
||||||
content = _format_content(content)
|
content = markdown_to_text(content)
|
||||||
|
|
||||||
metadata = (
|
metadata = (
|
||||||
f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
|
f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
|
||||||
|
@ -176,7 +170,7 @@ def _get_comments(json):
|
||||||
{
|
{
|
||||||
'url': result['comment']['ap_id'],
|
'url': result['comment']['ap_id'],
|
||||||
'title': result['post']['name'],
|
'title': result['post']['name'],
|
||||||
'content': _format_content(result['comment']['content']),
|
'content': markdown_to_text(result['comment']['content']),
|
||||||
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
|
||||||
'metadata': metadata,
|
'metadata': metadata,
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,6 +15,7 @@ from os.path import splitext, join
|
||||||
from random import choice
|
from random import choice
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
from markdown_it import MarkdownIt
|
||||||
|
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
||||||
|
@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
|
||||||
return s.get_text()
|
return s.get_text()
|
||||||
|
|
||||||
|
|
||||||
|
def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    # Render Markdown to HTML first (CommonMark plus smart quotes and
    # typographic replacements), then strip the markup down to plain text.
    parser = MarkdownIt("commonmark", {"typographer": True})
    parser.enable(["replacements", "smartquotes"])
    return html_to_text(parser.render(markdown_str))
|
||||||
|
|
||||||
|
|
||||||
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
|
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
|
||||||
"""Extract text from a lxml result
|
"""Extract text from a lxml result
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue