[refactor] digg - improve results and clean up source code

- strip HTML tags and superfluous quotation marks from content
- remove unneeded cookie from request
- remove superfluous imports

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2020-12-02 21:54:27 +01:00
parent 6b0a896f01
commit bef185723a
1 changed file with 32 additions and 35 deletions

View File

@ -1,7 +1,7 @@
""" """
Digg (News, Social media) Digg (News, Social media)
@website https://digg.com/ @website https://digg.com
@provide-api no @provide-api no
@using-api no @using-api no
@ -11,59 +11,56 @@
""" """
# pylint: disable=missing-function-docstring # pylint: disable=missing-function-docstring
import random
import string
from json import loads from json import loads
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from lxml import html
# engine dependent config # engine dependent config
categories = ['news', 'social media'] categories = ['news', 'social media']
paging = True paging = True
base_url = 'https://digg.com'
# search-url # search-url
base_url = 'https://digg.com/' search_url = base_url + (
search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html' '/api/search/'
'?{query}'
'&from={position}'
'&size=20'
'&format=html'
)
# specific xpath variables
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'
digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
string.digits + "+_"
# do search-request
def request(query, params): def request(query, params):
offset = (params['pageno'] - 1) * 20 offset = (params['pageno'] - 1) * 20
params['url'] = search_url.format(position=offset, params['url'] = search_url.format(
query=urlencode({'q': query})) query = urlencode({'q': query}),
params['cookies']['frontend.auid'] = ''.join(random.choice( position = offset,
digg_cookie_chars) for _ in range(22)) )
return params return params
# get response from search-request
def response(resp): def response(resp):
results = [] results = []
search_result = loads(resp.text)
# parse results # parse results
for result in search_result['mapped']: for result in loads(resp.text)['mapped']:
# strip html tags and superfluous quotation marks from content
content = html.document_fromstring(
result['excerpt']
).text_content()
# 'created': {'ISO': '2020-10-16T14:09:55Z', ...} # 'created': {'ISO': '2020-10-16T14:09:55Z', ...}
published = datetime.strptime(result['created']['ISO'], "%Y-%m-%dT%H:%M:%SZ") published = datetime.strptime(
# append result result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ'
results.append({'url': result['url'], )
'title': result['title'], results.append({
'content': result['excerpt'], 'url': result['url'],
'template': 'videos.html', 'title': result['title'],
'publishedDate': published, 'content' : content,
'thumbnail': result['images']['thumbImage']}) 'template': 'videos.html',
'publishedDate': published,
'thumbnail': result['images']['thumbImage'],
})
# return results
return results return results