Fix Stackoverflow engine result parsing and add comments

This commit is contained in:
Thomas Pointhuber 2014-09-02 18:49:42 +02:00
parent 80f98d6041
commit a46bbb4042
2 changed files with 38 additions and 11 deletions

View File

@@ -1,30 +1,58 @@
## Stackoverflow (It)
#
# @website https://stackoverflow.com/
# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, content
from urlparse import urljoin from urlparse import urljoin
from cgi import escape from cgi import escape
from urllib import urlencode from urllib import urlencode
from lxml import html from lxml import html
# engine dependent config
categories = ['it'] categories = ['it']
url = 'http://stackoverflow.com/'
search_url = url+'search?{query}&page={pageno}'
result_xpath = './/div[@class="excerpt"]//text()'
paging = True paging = True
# search-url
url = 'http://stackoverflow.com/'
search_url = url+'search?{query}&page={pageno}'
# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
title_xpath = './/text()'
content_xpath = './/div[@class="excerpt"]//text()'
# do search-request
def request(query, params): def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query}), params['url'] = search_url.format(query=urlencode({'q': query}),
pageno=params['pageno']) pageno=params['pageno'])
return params return params
# get response from search-request
def response(resp): def response(resp):
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="question-summary search-result"]'):
link = result.xpath('.//div[@class="result-link"]//a')[0] # parse results
for result in dom.xpath(results_xpath):
link = result.xpath(link_xpath)[0]
href = urljoin(url, link.attrib.get('href')) href = urljoin(url, link.attrib.get('href'))
title = escape(' '.join(link.xpath('.//text()'))) title = escape(' '.join(link.xpath(title_xpath)))
content = escape(' '.join(result.xpath(result_xpath))) content = escape(' '.join(result.xpath(content_xpath)))
results.append({'url': href, 'title': title, 'content': content})
# append result
results.append({'url': href,
'title': title,
'content': content})
# return results
return results return results

View File

@@ -90,7 +90,6 @@ engines:
- name : stackoverflow - name : stackoverflow
engine : stackoverflow engine : stackoverflow
categories : it
shortcut : st shortcut : st
- name : startpage - name : startpage