mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
Initial video-results parsing; works only intermittently.
This commit is contained in:
parent
d57cdbc8a2
commit
ab2f49a538
1 changed file with 44 additions and 38 deletions
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
from urllib.parse import urlencode, urlparse, parse_qs
|
from urllib.parse import urlencode, urlparse, parse_qs
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from searx.utils import humanize_bytes
|
from searx.utils import humanize_bytes
|
||||||
|
@ -87,7 +88,7 @@ def request(query, params):
|
||||||
elif yandex_category == 'images':
|
elif yandex_category == 'images':
|
||||||
params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}"
|
params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}"
|
||||||
elif yandex_category == 'videos':
|
elif yandex_category == 'videos':
|
||||||
params['url'] = f"{base_url_videos}?{url_extension}&{urlencode(query_params_videos)}"
|
params['url'] = f"{base_url_videos}?{url_extension}{urlencode(query_params_videos)}"
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
@ -194,46 +195,51 @@ def response(resp):
|
||||||
raise SearxEngineCaptchaException()
|
raise SearxEngineCaptchaException()
|
||||||
|
|
||||||
html_data = html.fromstring(resp.text)
|
html_data = html.fromstring(resp.text)
|
||||||
text = unescape(html.tostring(html_data, encoding='unicode'))
|
html_sample = unescape(html.tostring(html_data, encoding='unicode'))
|
||||||
|
|
||||||
video_urls = re.findall(r'ner" href="(.*?)"', text)
|
start_tag = '{"pages":{"search":{"query":'
|
||||||
valid_urls = [url for url in video_urls if url.startswith('http')]
|
end_tag = '}}</noframes>'
|
||||||
thumbnail_urls = re.findall(r'"image":"//(.*?)"', text)
|
|
||||||
https_thumbnail_urls = ['https://' + url for url in thumbnail_urls]
|
start_index = html_sample.find(start_tag)
|
||||||
durations = re.findall(r'"durationText":"(.*?)"', text)
|
start_index = start_index if start_index != -1 else -1
|
||||||
titles = re.findall(r'"clear_title":"(.*?)"', text)
|
|
||||||
descriptions = re.findall(r'"clear_description":"(.*?)"', text)
|
end_index = html_sample.find(end_tag, start_index)
|
||||||
authors = re.findall(r'"clipChannel":{"name":"(.*?)",', text)
|
end_index = end_index + len(end_tag) if end_index != -1 else -1
|
||||||
raw_dates = re.findall(r'"time":"(.*?)"', text)
|
|
||||||
|
content_between_tags = html_sample[start_index:end_index] if start_index != -1 and end_index != -1 else None
|
||||||
|
|
||||||
|
json_resp = r'''{}'''.format(content_between_tags.rsplit('</noframes>', 1)[0])
|
||||||
|
|
||||||
|
# # save to a file
|
||||||
|
# with open('/home/user/Desktop/json_resp.txt', 'w') as f:
|
||||||
|
# sys.stdout = f
|
||||||
|
# print(json_resp)
|
||||||
|
|
||||||
|
json_resp2 = json.loads(json_resp.encode("UTF-8"))
|
||||||
|
|
||||||
|
# # save to a file
|
||||||
|
# with open('/home/user/Desktop/json_resp2.txt', 'w') as f:
|
||||||
|
# sys.stdout = f
|
||||||
|
# print(json_resp2)
|
||||||
|
|
||||||
embedded_urls = [get_youtube_iframe_src(url) for url in valid_urls]
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for url, title, description, author, raw_date, duration, thumbnail, embedded_url in zip(
|
|
||||||
valid_urls,
|
|
||||||
titles[::2],
|
|
||||||
descriptions,
|
|
||||||
authors,
|
|
||||||
raw_dates[::2],
|
|
||||||
durations,
|
|
||||||
https_thumbnail_urls[::2],
|
|
||||||
embedded_urls,
|
|
||||||
):
|
|
||||||
date_timestamp = datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d")
|
|
||||||
date_utc = datetime.utcfromtimestamp(date_timestamp.timestamp())
|
|
||||||
|
|
||||||
results.append(
|
for item in json_resp2['pages']['search']['viewerData']['organicSnippets']['0']:
|
||||||
{
|
if 'title' in item:
|
||||||
"url": url,
|
title = item['clear_title']
|
||||||
"title": title,
|
print(title)
|
||||||
"content": description,
|
url = item['url']
|
||||||
"author": author,
|
print(url)
|
||||||
"publishedDate": date_utc,
|
description = item['clear_description']
|
||||||
"length": duration,
|
thumbnail = item['thumb']['image']
|
||||||
"thumbnail": thumbnail,
|
|
||||||
"iframe_src": embedded_url,
|
|
||||||
"template": "videos.html",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
results.append({
|
||||||
|
"title": title,
|
||||||
|
"url": url,
|
||||||
|
"content": description,
|
||||||
|
"thumbnail": thumbnail,
|
||||||
|
"template": "videos.html",
|
||||||
|
})
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
Loading…
Add table
Reference in a new issue