initial videos parsing, works sometimes.

This commit is contained in:
Austin-Olacsi 2024-06-28 12:56:59 -06:00 committed by GitHub
parent d57cdbc8a2
commit ab2f49a538
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -4,6 +4,7 @@
import re
import sys
import json
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
from searx.utils import humanize_bytes
@ -87,7 +88,7 @@ def request(query, params):
elif yandex_category == 'images':
params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}"
elif yandex_category == 'videos':
params['url'] = f"{base_url_videos}?{url_extension}&{urlencode(query_params_videos)}"
params['url'] = f"{base_url_videos}?{url_extension}{urlencode(query_params_videos)}"
return params
@ -194,46 +195,51 @@ def response(resp):
raise SearxEngineCaptchaException()
html_data = html.fromstring(resp.text)
text = unescape(html.tostring(html_data, encoding='unicode'))
html_sample = unescape(html.tostring(html_data, encoding='unicode'))
video_urls = re.findall(r'ner" href="(.*?)"', text)
valid_urls = [url for url in video_urls if url.startswith('http')]
thumbnail_urls = re.findall(r'"image":"//(.*?)"', text)
https_thumbnail_urls = ['https://' + url for url in thumbnail_urls]
durations = re.findall(r'"durationText":"(.*?)"', text)
titles = re.findall(r'"clear_title":"(.*?)"', text)
descriptions = re.findall(r'"clear_description":"(.*?)"', text)
authors = re.findall(r'"clipChannel":{"name":"(.*?)",', text)
raw_dates = re.findall(r'"time":"(.*?)"', text)
start_tag = '{"pages":{"search":{"query":'
end_tag = '}}</noframes>'
start_index = html_sample.find(start_tag)
start_index = start_index if start_index != -1 else -1
end_index = html_sample.find(end_tag, start_index)
end_index = end_index + len(end_tag) if end_index != -1 else -1
content_between_tags = html_sample[start_index:end_index] if start_index != -1 and end_index != -1 else None
json_resp = r'''{}'''.format(content_between_tags.rsplit('</noframes>', 1)[0])
# # save to a file
# with open('/home/user/Desktop/json_resp.txt', 'w') as f:
# sys.stdout = f
# print(json_resp)
json_resp2 = json.loads(json_resp.encode("UTF-8"))
# # save to a file
# with open('/home/user/Desktop/json_resp2.txt', 'w') as f:
# sys.stdout = f
# print(json_resp2)
embedded_urls = [get_youtube_iframe_src(url) for url in valid_urls]
results = []
for url, title, description, author, raw_date, duration, thumbnail, embedded_url in zip(
valid_urls,
titles[::2],
descriptions,
authors,
raw_dates[::2],
durations,
https_thumbnail_urls[::2],
embedded_urls,
):
date_timestamp = datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d")
date_utc = datetime.utcfromtimestamp(date_timestamp.timestamp())
results.append(
{
"url": url,
for item in json_resp2['pages']['search']['viewerData']['organicSnippets']['0']:
if 'title' in item:
title = item['clear_title']
print(title)
url = item['url']
print(url)
description = item['clear_description']
thumbnail = item['thumb']['image']
results.append({
"title": title,
"url": url,
"content": description,
"author": author,
"publishedDate": date_utc,
"length": duration,
"thumbnail": thumbnail,
"iframe_src": embedded_url,
"template": "videos.html",
}
)
})
return results