Initial video results parsing; works only intermittently.

2024-01-01 19:24:07 +01:00 · 2024-06-28 12:56:59 -06:00 · 2024-06-28 12:56:59 -06:00 · ab2f49a538
commit ab2f49a538
parent d57cdbc8a2
1 changed file with 44 additions and 38 deletions
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -4,6 +4,7 @@
 import re
 import sys
 import json
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
 from searx.utils import humanize_bytes
@@ -87,7 +88,7 @@ def request(query, params):
    elif yandex_category == 'images':
        params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}"
    elif yandex_category == 'videos':
-        params['url'] = f"{base_url_videos}?{url_extension}&{urlencode(query_params_videos)}"
+        params['url'] = f"{base_url_videos}?{url_extension}{urlencode(query_params_videos)}"
    return params
@@ -194,46 +195,51 @@ def response(resp):
            raise SearxEngineCaptchaException()
        html_data = html.fromstring(resp.text)
-        text = unescape(html.tostring(html_data, encoding='unicode'))
+        html_sample = unescape(html.tostring(html_data, encoding='unicode'))
-        video_urls = re.findall(r'ner" href="(.*?)"', text)
+        start_tag = '{"pages":{"search":{"query":'
-        valid_urls = [url for url in video_urls if url.startswith('http')]
+        end_tag = '}}</noframes>'
-        thumbnail_urls = re.findall(r'"image":"//(.*?)"', text)
+
-        https_thumbnail_urls = ['https://' + url for url in thumbnail_urls]
+        start_index = html_sample.find(start_tag)
-        durations = re.findall(r'"durationText":"(.*?)"', text)
+        start_index = start_index if start_index != -1 else -1
-        titles = re.findall(r'"clear_title":"(.*?)"', text)
+
-        descriptions = re.findall(r'"clear_description":"(.*?)"', text)
+        end_index = html_sample.find(end_tag, start_index)
-        authors = re.findall(r'"clipChannel":{"name":"(.*?)",', text)
+        end_index = end_index + len(end_tag) if end_index != -1 else -1
-        raw_dates = re.findall(r'"time":"(.*?)"', text)
+
        content_between_tags = html_sample[start_index:end_index] if start_index != -1 and end_index != -1 else None
        json_resp = r'''{}'''.format(content_between_tags.rsplit('</noframes>', 1)[0])
 #      # save to a file
 #        with open('/home/user/Desktop/json_resp.txt', 'w') as f:
 #         sys.stdout = f
 #         print(json_resp)
        json_resp2 = json.loads(json_resp.encode("UTF-8"))
 #      # save to a file
 #        with open('/home/user/Desktop/json_resp2.txt', 'w') as f:
 #         sys.stdout = f
 #         print(json_resp2)
        embedded_urls = [get_youtube_iframe_src(url) for url in valid_urls]
        results = []
        for url, title, description, author, raw_date, duration, thumbnail, embedded_url in zip(
            valid_urls,
            titles[::2],
            descriptions,
            authors,
            raw_dates[::2],
            durations,
            https_thumbnail_urls[::2],
            embedded_urls,
        ):
            date_timestamp = datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d")
            date_utc = datetime.utcfromtimestamp(date_timestamp.timestamp())
-            results.append(
+        for item in json_resp2['pages']['search']['viewerData']['organicSnippets']['0']:
-                {
+         if 'title' in item:
-                    "url": url,
+             title = item['clear_title']
-                    "title": title,
+             print(title)
-                    "content": description,
+             url = item['url']
-                    "author": author,
+             print(url)
-                    "publishedDate": date_utc,
+             description = item['clear_description']
-                    "length": duration,
+             thumbnail = item['thumb']['image']
                    "thumbnail": thumbnail,
                    "iframe_src": embedded_url,
                    "template": "videos.html",
                }
            )
-        return results
+             results.append({
                "title": title,
                "url": url,
                "content": description,
                "thumbnail": thumbnail,
                "template": "videos.html",
            })
    return results