forked from zaclys/searxng
		
	[refactor] digg - improve results and clean up source code
- strip HTML tags and superfluous quotation marks from content
- remove unneeded cookie from request
- remove superfluous imports

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									6b0a896f01
								
							
						
					
					
						commit
						bef185723a
					
				
					 1 changed file with 32 additions and 35 deletions
				
			
		|  | @ -1,7 +1,7 @@ | |||
| """ | ||||
|  Digg (News, Social media) | ||||
| 
 | ||||
|  @website     https://digg.com/ | ||||
|  @website     https://digg.com | ||||
|  @provide-api no | ||||
| 
 | ||||
|  @using-api   no | ||||
|  | @ -11,59 +11,56 @@ | |||
| """ | ||||
| # pylint: disable=missing-function-docstring | ||||
| 
 | ||||
| import random | ||||
| import string | ||||
| from json import loads | ||||
| from urllib.parse import urlencode | ||||
| from datetime import datetime | ||||
| 
 | ||||
| from lxml import html | ||||
| 
 | ||||
| # engine dependent config | ||||
| categories = ['news', 'social media'] | ||||
| paging = True | ||||
| base_url = 'https://digg.com' | ||||
| 
 | ||||
| # search-url | ||||
| base_url = 'https://digg.com/' | ||||
| search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html' | ||||
| search_url = base_url + ( | ||||
|     '/api/search/' | ||||
|     '?{query}' | ||||
|     '&from={position}' | ||||
|     '&size=20' | ||||
|     '&format=html' | ||||
| ) | ||||
| 
 | ||||
| # specific xpath variables | ||||
| results_xpath = '//article' | ||||
| link_xpath = './/small[@class="time"]//a' | ||||
| title_xpath = './/h2//a//text()' | ||||
| content_xpath = './/p//text()' | ||||
| pubdate_xpath = './/time' | ||||
| 
 | ||||
| digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\ | ||||
|     string.digits + "+_" | ||||
| 
 | ||||
| 
 | ||||
| # do search-request | ||||
| def request(query, params): | ||||
|     offset = (params['pageno'] - 1) * 20 | ||||
|     params['url'] = search_url.format(position=offset, | ||||
|                                       query=urlencode({'q': query})) | ||||
|     params['cookies']['frontend.auid'] = ''.join(random.choice( | ||||
|         digg_cookie_chars) for _ in range(22)) | ||||
|     params['url'] = search_url.format( | ||||
|         query = urlencode({'q': query}), | ||||
|         position = offset, | ||||
|     ) | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| # get response from search-request | ||||
| def response(resp): | ||||
|     results = [] | ||||
| 
 | ||||
|     search_result = loads(resp.text) | ||||
| 
 | ||||
|     # parse results | ||||
|     for result in search_result['mapped']: | ||||
|     for result in loads(resp.text)['mapped']: | ||||
| 
 | ||||
|         # strip html tags and superfluous quotation marks from content | ||||
|         content = html.document_fromstring( | ||||
|             result['excerpt'] | ||||
|         ).text_content() | ||||
| 
 | ||||
|         # 'created': {'ISO': '2020-10-16T14:09:55Z', ...} | ||||
|         published = datetime.strptime(result['created']['ISO'], "%Y-%m-%dT%H:%M:%SZ") | ||||
|         # append result | ||||
|         results.append({'url': result['url'], | ||||
|         published = datetime.strptime( | ||||
|             result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ' | ||||
|         ) | ||||
|         results.append({ | ||||
|             'url': result['url'], | ||||
|             'title': result['title'], | ||||
|                         'content': result['excerpt'], | ||||
|             'content' : content, | ||||
|             'template': 'videos.html', | ||||
|             'publishedDate': published, | ||||
|                         'thumbnail': result['images']['thumbImage']}) | ||||
|             'thumbnail': result['images']['thumbImage'], | ||||
|         }) | ||||
| 
 | ||||
|     # return results | ||||
|     return results | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Markus Heiser
						Markus Heiser