Initial video-results parsing; works only intermittently.

This commit is contained in:
Austin-Olacsi 2024-06-28 12:56:59 -06:00 committed by GitHub
parent d57cdbc8a2
commit ab2f49a538
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -4,6 +4,7 @@
import re import re
import sys import sys
import json
from urllib.parse import urlencode, urlparse, parse_qs from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html from lxml import html
from searx.utils import humanize_bytes from searx.utils import humanize_bytes
@@ -87,7 +88,7 @@ def request(query, params):
elif yandex_category == 'images': elif yandex_category == 'images':
params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}" params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}"
elif yandex_category == 'videos': elif yandex_category == 'videos':
params['url'] = f"{base_url_videos}?{url_extension}&{urlencode(query_params_videos)}" params['url'] = f"{base_url_videos}?{url_extension}{urlencode(query_params_videos)}"
return params return params
@@ -194,46 +195,51 @@ def response(resp):
raise SearxEngineCaptchaException() raise SearxEngineCaptchaException()
html_data = html.fromstring(resp.text) html_data = html.fromstring(resp.text)
text = unescape(html.tostring(html_data, encoding='unicode')) html_sample = unescape(html.tostring(html_data, encoding='unicode'))
video_urls = re.findall(r'ner" href="(.*?)"', text) start_tag = '{"pages":{"search":{"query":'
valid_urls = [url for url in video_urls if url.startswith('http')] end_tag = '}}</noframes>'
thumbnail_urls = re.findall(r'"image":"//(.*?)"', text)
https_thumbnail_urls = ['https://' + url for url in thumbnail_urls] start_index = html_sample.find(start_tag)
durations = re.findall(r'"durationText":"(.*?)"', text) start_index = start_index if start_index != -1 else -1
titles = re.findall(r'"clear_title":"(.*?)"', text)
descriptions = re.findall(r'"clear_description":"(.*?)"', text) end_index = html_sample.find(end_tag, start_index)
authors = re.findall(r'"clipChannel":{"name":"(.*?)",', text) end_index = end_index + len(end_tag) if end_index != -1 else -1
raw_dates = re.findall(r'"time":"(.*?)"', text)
content_between_tags = html_sample[start_index:end_index] if start_index != -1 and end_index != -1 else None
json_resp = r'''{}'''.format(content_between_tags.rsplit('</noframes>', 1)[0])
# # save to a file
# with open('/home/user/Desktop/json_resp.txt', 'w') as f:
# sys.stdout = f
# print(json_resp)
json_resp2 = json.loads(json_resp.encode("UTF-8"))
# # save to a file
# with open('/home/user/Desktop/json_resp2.txt', 'w') as f:
# sys.stdout = f
# print(json_resp2)
embedded_urls = [get_youtube_iframe_src(url) for url in valid_urls]
results = [] results = []
for url, title, description, author, raw_date, duration, thumbnail, embedded_url in zip(
valid_urls,
titles[::2],
descriptions,
authors,
raw_dates[::2],
durations,
https_thumbnail_urls[::2],
embedded_urls,
):
date_timestamp = datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d")
date_utc = datetime.utcfromtimestamp(date_timestamp.timestamp())
results.append( for item in json_resp2['pages']['search']['viewerData']['organicSnippets']['0']:
{ if 'title' in item:
"url": url, title = item['clear_title']
"title": title, print(title)
"content": description, url = item['url']
"author": author, print(url)
"publishedDate": date_utc, description = item['clear_description']
"length": duration, thumbnail = item['thumb']['image']
"thumbnail": thumbnail,
"iframe_src": embedded_url,
"template": "videos.html",
}
)
return results results.append({
"title": title,
"url": url,
"content": description,
"thumbnail": thumbnail,
"template": "videos.html",
})
return results