more WIP video results parsing

This commit is contained in:
Austin-Olacsi 2024-06-30 18:18:42 -06:00 committed by GitHub
parent 1d97ecdb56
commit 2a07441707
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -5,6 +5,7 @@
import re import re
import sys import sys
import json import json
import time
from urllib.parse import urlencode, urlparse, parse_qs from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html from lxml import html
from searx.utils import humanize_bytes from searx.utils import humanize_bytes
@ -93,23 +94,14 @@ def request(query, params):
return params return params
def get_youtube_iframe_src(url): # get embedded youtube links
def _get_iframe_src(url):
parsed_url = urlparse(url) parsed_url = urlparse(url)
if parsed_url.path == '/watch' and parsed_url.query:
# Check for http://www.youtube.com/v/videoid format video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
if (
parsed_url.netloc.endswith('youtube.com')
and parsed_url.path.startswith('/v/')
and len(parsed_url.path.split('/')) == 3
):
video_id = parsed_url.path.split('/')[-1]
return 'https://www.youtube-nocookie.com/embed/' + video_id
# Check for http://www.youtube.com/watch?v=videoid format
elif parsed_url.netloc.endswith('youtube.com') and parsed_url.path == '/watch' and parsed_url.query:
video_id = parse_qs(parsed_url.query).get('v', [])
if video_id: if video_id:
return 'https://www.youtube-nocookie.com/embed/' + video_id[0] return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
return None
def response(resp): def response(resp):
if yandex_category == 'web': if yandex_category == 'web':
@ -215,10 +207,31 @@ def response(resp):
# sys.stdout = f # sys.stdout = f
# print(json_resp) # print(json_resp)
#the json.loads below sometimes fails, because some keys in the json_resp (especially the clear_description) may contain unescaped double quotes, for example, in which case the json is not valid... #sometimes json_resp is valid json, sometimes not.
json_resp2 = json.loads(json_resp.encode("UTF-8")) #but we can (usually) validate the json by removing the values in the clear_description and clear_title keys.
#we don't need them and they may contain unescaped characters that make the decoding fail. so for now...
# # save to a file # Step 1: Remove everything between "clear_description":" and ","
pattern_desc = r'("clear_description":")(.*?)(",")'
json_resp = re.sub(pattern_desc, r'\1\3', json_resp)
# Step 2: Remove everything between "clear_title":" and ","
pattern_title = r'("clear_title":")(.*?)(",")'
json_resp = re.sub(pattern_title, r'\1\3', json_resp)
#to do: when the search query is butterfly, yandex videos page 2 is broken
# save to a file
# with open('/home/user/Desktop/json_resp.txt', 'w') as f:
# sys.stdout = f
# print(json_resp)
json_resp2 = json.loads(json_resp.encode("UTF-8"))
# json_resp2 = json.loads(json_resp)
# save to a file
# with open('/home/user/Desktop/json_resp2.txt', 'w') as f: # with open('/home/user/Desktop/json_resp2.txt', 'w') as f:
# sys.stdout = f # sys.stdout = f
# print(json_resp2) # print(json_resp2)
@ -226,20 +239,28 @@ def response(resp):
results = [] results = []
for item in json_resp2['pages']['search']['viewerData']['organicSnippets']['0']: for snippet_key in json_resp2['pages']['search']['viewerData']['organicSnippets']:
if 'title' in item: for item in json_resp2['pages']['search']['viewerData']['organicSnippets'][snippet_key]:
title = item['clear_title'] if 'title' in item:
print(title) title = item['title']['text']
url = item['url'] url = item['url']
print(url) description = item['description']
description = item['clear_description']
thumbnail = item['thumb']['image'] thumbnail = item['thumb']['image']
length = item['thumb']['duration']
channel = item['channel']['name']
release_time = item['time']
release_date = datetime.strptime(release_time.split("T")[0], "%Y-%m-%d")
formatted_date = datetime.utcfromtimestamp(release_date.timestamp())
results.append({ results.append({
"title": title, "title": title,
"url": url, "url": url,
"author": channel,
"publishedDate": formatted_date,
"length": length,
"content": description, "content": description,
"thumbnail": thumbnail, "thumbnail": thumbnail,
"iframe_src": _get_iframe_src(url),
"template": "videos.html", "template": "videos.html",
}) })