Merge pull request #88 from pointhi/engines

update and fix search engines
Adam Tauber 2014-09-01 18:30:55 +02:00
commit f36d1e28fa
11 changed files with 406 additions and 106 deletions


@@ -1,48 +1,81 @@
## Bing (Web)
#
# @website https://www.bing.com
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 queries/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content
#
# @todo publishedDate
from urllib import urlencode
from cgi import escape
from lxml import html

# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sa_cc"]'):
        link = result.xpath('.//h3/a')[0]

        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//p//text()')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results if something is found
    if results:
        return results

    # parse results again if nothing is found yet
    for result in dom.xpath('//li[@class="b_algo"]'):
        link = result.xpath('.//h2/a')[0]

        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//p//text()')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results
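For reference, a minimal sketch of how an engine module with this request()/response() contract can be driven on its own; the import path and driver loop are assumptions, not searx's actual engine loader:

# hypothetical standalone driver -- not searx's real engine runner
import requests
from searx.engines import bing  # assumed module path

params = {'pageno': 1, 'language': 'all', 'cookies': {}}
params = bing.request('test query', params)

resp = requests.get(params['url'], cookies=params['cookies'])
for r in bing.response(resp):
    print('%s - %s' % (r['url'], r['title']))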


@@ -0,0 +1,81 @@
## Bing (Images)
#
# @website https://www.bing.com/images
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 queries/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, img_src
#
# @todo currently up to 35 images are received per page, because bing does not respect count=10; response is limited to 10 images
from urllib import urlencode
from cgi import escape
from lxml import html
from yaml import load
import re

# engine dependent config
categories = ['images']
paging = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'images/search?{query}&count=10&first={offset}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    # required for cookie
    language = 'en-US'

    search_path = search_string.format(
        query=urlencode({'q': query}),
        offset=offset)

    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # init regex for yaml-parsing
    p = re.compile('({|,)([a-z]+):(")')

    # parse results
    for result in dom.xpath('//div[@class="dg_u"]'):
        link = result.xpath('./a')[0]

        # parse yaml-data (it is required to add a space, to make it parsable)
        yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))

        title = link.attrib.get('t1')
        #url = 'http://' + link.attrib.get('t3')
        url = yaml_data.get('surl')
        img_src = yaml_data.get('imgurl')

        # append result
        results.append({'template': 'images.html',
                        'url': url,
                        'title': title,
                        'content': '',
                        'img_src': img_src})

        # stop parsing if 10 images are found
        if len(results) >= 10:
            break

    # return results
    return results
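To illustrate the regex above: Bing embeds result metadata in the link's m attribute as a mapping with unquoted keys, and the substitution inserts the space YAML needs after each key. A made-up sample attribute:

import re
from yaml import load

p = re.compile('({|,)([a-z]+):(")')

# made-up sample of the attribute shape the engine expects
m_attr = '{ns:"images",surl:"http://example.com/page",imgurl:"http://example.com/img.jpg"}'

print(load(p.sub(r'\1\2: \3', m_attr)).get('imgurl'))
# -> http://example.com/img.jpg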


@@ -1,50 +1,86 @@
## Bing (News)
#
# @website https://www.bing.com/news
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 queries/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]

        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')))

        # parse publishedDate
        publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()')))

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(minutes=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            publishedDate = parser.parse(publishedDate)

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
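A worked example of the relative-date branch above (sample value invented):

from datetime import datetime, timedelta
import re

publishedDate = '2 hours, 15 minutes ago'  # invented sample

if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
    timeNumbers = re.findall(r'\d+', publishedDate)
    publishedDate = datetime.now()\
        - timedelta(hours=int(timeNumbers[0]))\
        - timedelta(minutes=int(timeNumbers[1]))

print(publishedDate)  # roughly 2h15m before now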


@@ -1,45 +1,61 @@
## Dailymotion (Videos)
#
# @website https://www.dailymotion.com
# @provide-api yes (http://www.dailymotion.com/developer)
#
# @using-api yes
# @results JSON
# @stable yes
# @parse url, title, thumbnail
#
# @todo set content-parameter with correct data
from urllib import urlencode
from json import loads
from lxml import html

# engine dependent config
categories = ['videos']
locale = 'en_US'
paging = True

# search-url
# see http://www.dailymotion.com/doc/api/obj-video.html
search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa


# do search-request
def request(query, params):
    params['url'] = search_url.format(
        query=urlencode({'search': query, 'localization': locale}),
        pageno=params['pageno'])

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if 'list' not in search_res:
        return []

    # parse results
    for res in search_res['list']:
        title = res['title']
        url = res['url']
        #content = res['description']
        content = ''
        thumbnail = res['thumbnail_360_url']

        # append result
        results.append({'template': 'videos.html',
                        'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail})

    # return results
    return results
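For clarity, the URL request() builds for page 2 of a query (sample values; parameter order in the query string may vary):

from urllib import urlencode

search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa

print(search_url.format(
    pageno=2,
    query=urlencode({'search': 'test', 'localization': 'en_US'})))
# -> ...&page=2&search=test&localization=en_US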


@@ -1,37 +1,57 @@
## Google (Web)
#
# @website https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
#
# @using-api yes
# @results JSON
# @stable yes (but deprecated)
# @parse url, title, content
from urllib import urlencode
from json import loads

# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 8

    language = 'en-US'
    if params['language'] != 'all':
        language = params['language'].replace('_', '-')

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}),
                                      language=language)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if not search_res.get('responseData', {}).get('results'):
        return []

    # parse results
    for result in search_res['responseData']['results']:
        # append result
        results.append({'url': result['unescapedUrl'],
                        'title': result['titleNoFormatting'],
                        'content': result['content']})

    # return results
    return results
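A sketch of the JSON shape response() consumes; the payload below is illustrative, not real API output:

from json import loads

body = '''{"responseData": {"results": [
    {"unescapedUrl": "http://example.com/",
     "titleNoFormatting": "Example",
     "content": "An example result."}]}}'''

search_res = loads(body)
for result in search_res.get('responseData', {}).get('results', []):
    print(result['unescapedUrl'])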


@@ -1,37 +1,58 @@
## Google (Images)
#
# @website https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
#
# @using-api yes
# @results JSON
# @stable yes (but deprecated)
# @parse url, title, img_src
from urllib import urlencode
from json import loads

# engine dependent config
categories = ['images']
paging = True

# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'  # noqa


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 8

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if not search_res.get('responseData', {}).get('results'):
        return []

    # parse results
    for result in search_res['responseData']['results']:
        href = result['originalContextUrl']
        title = result['title']
        if not result['url']:
            continue

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': '',
                        'img_src': result['url'],
                        'template': 'images.html'})

    # return results
    return results


@@ -1,43 +1,62 @@
## Google (News)
#
# @website https://www.google.com
# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
#
# @using-api yes
# @results JSON
# @stable yes (but deprecated)
# @parse url, title, content, publishedDate
from urllib import urlencode
from json import loads
from dateutil import parser

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 8

    language = 'en-US'
    if params['language'] != 'all':
        language = params['language'].replace('_', '-')

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}),
                                      language=language)

    return params


# get response from search-request
def response(resp):
    results = []

    search_res = loads(resp.text)

    # return empty array if there are no results
    if not search_res.get('responseData', {}).get('results'):
        return []

    # parse results
    for result in search_res['responseData']['results']:
        # parse publishedDate
        # example: Mon, 10 Mar 2014 16:26:15 -0700
        publishedDate = parser.parse(result['publishedDate'])

        # append result
        results.append({'url': result['unescapedUrl'],
                        'title': result['titleNoFormatting'],
                        'publishedDate': publishedDate,
                        'content': result['content']})

    # return results
    return results
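The publishedDate comment shows the date format the API returns; dateutil handles it directly:

from dateutil import parser

publishedDate = parser.parse('Mon, 10 Mar 2014 16:26:15 -0700')
print(publishedDate.isoformat())
# -> 2014-03-10T16:26:15-07:00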


@@ -1,43 +1,58 @@
## Vimeo (Videos)
#
# @website https://vimeo.com/
# @provide-api yes (http://developer.vimeo.com/api), they have a maximum count of queries/hour
#
# @using-api no (TODO, rewrite to api)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, publishedDate, thumbnail
#
# @todo rewrite to api
# @todo set content-parameter with correct data
from urllib import urlencode
from HTMLParser import HTMLParser
from lxml import html
from searx.engines.xpath import extract_text
from dateutil import parser

# engine dependent config
categories = ['videos']
paging = True

# search-url
base_url = 'https://vimeo.com'
search_url = base_url + '/search/page:{pageno}?{query}'

# specific xpath variables
url_xpath = './a/@href'
content_xpath = './a/img/@src'
title_xpath = './a/div[@class="data"]/p[@class="title"]/text()'
results_xpath = '//div[@id="browse_content"]/ol/li'
publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'


# do search-request
def request(query, params):
    params['url'] = search_url.format(pageno=params['pageno'],
                                      query=urlencode({'q': query}))

    # TODO required?
    params['cookies']['__utma'] = \
        '00000000.000#0000000.0000000000.0000000000.0000000000.0'

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        url = base_url + result.xpath(url_xpath)[0]
        title = p.unescape(extract_text(result.xpath(title_xpath)))
@@ -45,10 +60,13 @@ def response(resp):
        publishedDate = parser.parse(extract_text(
            result.xpath(publishedDate_xpath)[0]))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': '',
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
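The new search_url places the page number in the path; with sample values:

from urllib import urlencode

base_url = 'https://vimeo.com'
search_url = base_url + '/search/page:{pageno}?{query}'

print(search_url.format(pageno=3, query=urlencode({'q': 'test'})))
# -> https://vimeo.com/search/page:3?q=test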


@@ -1,64 +1,99 @@
## Yahoo (Web)
#
# @website https://search.yahoo.com/web
# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
#
# @using-api no (because pricing)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, suggestion
from urllib import urlencode
from urlparse import unquote
from lxml import html
from searx.engines.xpath import extract_text, extract_url

# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'

# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
suggestion_xpath = '//div[@id="satat"]//a'


# remove yahoo-specific tracking-url
def parse_url(url_string):
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=')+1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    end = min(endpositions)
    return unquote(url_string[start:end])


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].split('_')[0]

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'p': query}),
                                      lang=language)

    # TODO required?
    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
        .format(lang=language)

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # if no suggestion found, return results
    if not suggestion_xpath:
        return results

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
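parse_url() unwraps Yahoo's redirect links; a synthetic example of the wrapper shape it handles (real tracking URLs differ in detail):

from urlparse import unquote

def parse_url(url_string):  # same function as above
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=')+1)
    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)
    end = min(endpositions)
    return unquote(url_string[start:end])

# synthetic wrapped URL, not a real Yahoo redirect
wrapped = 'http://r.search.yahoo.com/_ylt=A0/RU=http%3a%2f%2fexample.com%2f/RK=0/RS=abc'
print(parse_url(wrapped))
# -> http://example.com/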


@@ -1,4 +1,12 @@
## Yahoo (News)
#
# @website https://news.yahoo.com
# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
#
# @using-api no (because pricing)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
from urllib import urlencode
from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
import re
from dateutil import parser

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'

# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]'
publishedDate_xpath = './/span[@class="timestamp"]'
suggestion_xpath = '//div[@id="satat"]//a'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en'
    else:
        language = params['language'].split('_')[0]

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'p': query}),
                                      lang=language)

    # TODO required?
    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
        .format(lang=language)

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results


@@ -20,6 +20,11 @@ engines:
    locale : en-US
    shortcut : bi

  - name : bing images
    engine : bing_images
    locale : en-US
    shortcut : bii

  - name : bing news
    engine : bing_news
    locale : en-US
@@ -148,11 +153,7 @@ engines:
  - name : vimeo
    engine : vimeo
    locale : en-US
    shortcut : vm
locales: locales: