mirror of https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00

initial videos parsing, works sometimes.

This commit is contained in:
    parent d57cdbc8a2
    commit ab2f49a538

Authored by Austin-Olacsi. 1 changed file with 44 additions and 38 deletions.
@@ -4,6 +4,7 @@
 
 import re
 import sys
+import json
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
 from searx.utils import humanize_bytes
@@ -87,7 +88,7 @@ def request(query, params):
     elif yandex_category == 'images':
         params['url'] = f"{base_url_images}?{url_extension}{urlencode(query_params_images)}"
     elif yandex_category == 'videos':
-        params['url'] = f"{base_url_videos}?{url_extension}&{urlencode(query_params_videos)}"
+        params['url'] = f"{base_url_videos}?{url_extension}{urlencode(query_params_videos)}"
 
     return params
 
@@ -194,46 +195,51 @@ def response(resp):
             raise SearxEngineCaptchaException()
 
         html_data = html.fromstring(resp.text)
-        text = unescape(html.tostring(html_data, encoding='unicode'))
+        html_sample = unescape(html.tostring(html_data, encoding='unicode'))
 
-        video_urls = re.findall(r'ner" href="(.*?)"', text)
-        valid_urls = [url for url in video_urls if url.startswith('http')]
-        thumbnail_urls = re.findall(r'"image":"//(.*?)"', text)
-        https_thumbnail_urls = ['https://' + url for url in thumbnail_urls]
-        durations = re.findall(r'"durationText":"(.*?)"', text)
-        titles = re.findall(r'"clear_title":"(.*?)"', text)
-        descriptions = re.findall(r'"clear_description":"(.*?)"', text)
-        authors = re.findall(r'"clipChannel":{"name":"(.*?)",', text)
-        raw_dates = re.findall(r'"time":"(.*?)"', text)
+        start_tag = '{"pages":{"search":{"query":'
+        end_tag = '}}</noframes>'
+
+        start_index = html_sample.find(start_tag)
+        start_index = start_index if start_index != -1 else -1
+
+        end_index = html_sample.find(end_tag, start_index)
+        end_index = end_index + len(end_tag) if end_index != -1 else -1
+
+        content_between_tags = html_sample[start_index:end_index] if start_index != -1 and end_index != -1 else None
+
+        json_resp = r'''{}'''.format(content_between_tags.rsplit('</noframes>', 1)[0])
+
+#      # save to a file
+#        with open('/home/user/Desktop/json_resp.txt', 'w') as f:
+#         sys.stdout = f
+#         print(json_resp)
+
+        json_resp2 = json.loads(json_resp.encode("UTF-8"))
+
+#      # save to a file
+#        with open('/home/user/Desktop/json_resp2.txt', 'w') as f:
+#         sys.stdout = f
+#         print(json_resp2)
+
-        embedded_urls = [get_youtube_iframe_src(url) for url in valid_urls]
-
         results = []
-        for url, title, description, author, raw_date, duration, thumbnail, embedded_url in zip(
-            valid_urls,
-            titles[::2],
-            descriptions,
-            authors,
-            raw_dates[::2],
-            durations,
-            https_thumbnail_urls[::2],
-            embedded_urls,
-        ):
-            date_timestamp = datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d")
-            date_utc = datetime.utcfromtimestamp(date_timestamp.timestamp())
-
-            results.append(
-                {
-                    "url": url,
+        for item in json_resp2['pages']['search']['viewerData']['organicSnippets']['0']:
+         if 'title' in item:
+             title = item['clear_title']
+             print(title)
+             url = item['url']
+             print(url)
+             description = item['clear_description']
+             thumbnail = item['thumb']['image']
+
+             results.append({
+                "title": title,
+                "url": url,
+                "content": description,
-                    "author": author,
-                    "publishedDate": date_utc,
-                    "length": duration,
+                "thumbnail": thumbnail,
-                    "iframe_src": embedded_url,
+                "template": "videos.html",
-                }
-            )
+            })
 
     return results
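
The change replaces per-field regex scraping with parsing the page-state JSON that Yandex embeds in a <noframes> block. A minimal standalone sketch of that extraction, assuming the page still carries the exact markers used in the diff (extract_embedded_json is a hypothetical helper, not part of the engine):

import json

# Markers copied from the diff above.
START_TAG = '{"pages":{"search":{"query":'
END_TAG = '}}</noframes>'

def extract_embedded_json(html_text):
    """Slice the embedded page-state JSON out of the raw HTML.

    Hypothetical helper sketching the commit's approach; returns
    None instead of raising when the markers are missing.
    """
    start = html_text.find(START_TAG)
    if start == -1:
        return None
    end = html_text.find(END_TAG, start)
    if end == -1:
        return None
    # Keep the JSON object but drop the trailing '</noframes>'.
    raw = html_text[start:end + len(END_TAG)].rsplit('</noframes>', 1)[0]
    return json.loads(raw)

Unlike the committed code, this returns None when either marker is absent; in the diff, content_between_tags is None in that case and the unconditional .rsplit() call raises AttributeError, which is likely one reason the parsing only "works sometimes". The no-op reassignment start_index = start_index if start_index != -1 else -1 is also dropped.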
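
The new result loop indexes json_resp2['pages']['search']['viewerData']['organicSnippets']['0'] and assumes every snippet carries clear_title, url, clear_description and thumb.image; a single missing key raises KeyError and discards the whole page of results. A hedged sketch of a more defensive loop over the same (assumed) structure, with the leftover debug print() calls removed:

def parse_video_snippets(page_state):
    """Build video results from the embedded page state.

    Assumes the 'organicSnippets' layout seen in the diff and
    skips malformed items instead of raising.
    """
    results = []
    snippets = (
        page_state.get('pages', {})
        .get('search', {})
        .get('viewerData', {})
        .get('organicSnippets', {})
        .get('0', [])
    )
    for item in snippets:
        if 'title' not in item:
            continue
        try:
            results.append({
                "title": item['clear_title'],
                "url": item['url'],
                "content": item['clear_description'],
                # The old regex path prefixed protocol-relative
                # thumbnails with 'https://'; the JSON value may
                # still need the same treatment.
                "thumbnail": item['thumb']['image'],
                "template": "videos.html",
            })
        except KeyError:
            # Skip snippets missing an expected field rather
            # than failing the whole page.
            continue
    return results

Note that author, publishedDate, length and iframe_src are dropped by this commit rather than re-read from the JSON, so video results lose those fields until they are wired up again.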