Merge pull request #89 from pointhi/engines

update search engines and add comments to it
Adam Tauber 2014-09-02 22:00:30 +02:00
commit f825752145
16 changed files with 412 additions and 121 deletions

View File

@@ -23,6 +23,7 @@ language_support = True
 base_url = 'https://www.bing.com/'
 search_string = 'search?{query}&first={offset}'
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

View File

@@ -24,6 +24,7 @@ paging = True
 base_url = 'https://www.bing.com/'
 search_string = 'images/search?{query}&count=10&first={offset}'
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

View File

@@ -24,6 +24,7 @@ language_support = True
 base_url = 'https://www.bing.com/'
 search_string = 'news/search?{query}&first={offset}'
+
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

View File

@@ -1,35 +1,61 @@
+## Deviantart (Images)
+#
+# @website https://www.deviantart.com/
+# @provide-api yes (https://www.deviantart.com/developers/) (RSS)
+#
+# @using-api no (TODO, rewrite to api)
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, thumbnail, img_src
+#
+# @todo rewrite to api
+
 from urllib import urlencode
 from urlparse import urljoin
 from lxml import html

+# engine dependent config
 categories = ['images']
+paging = True

+# search-url
 base_url = 'https://www.deviantart.com/'
 search_url = base_url+'search?offset={offset}&{query}'
-paging = True

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 24

     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}))

     return params

+# get response from search-request
 def response(resp):
     results = []

+    # return empty array if a redirection code is returned
     if resp.status_code == 302:
-        return results
+        return []

     dom = html.fromstring(resp.text)

+    # parse results
     for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
         link = result.xpath('.//a[contains(@class, "thumb")]')[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')  # noqa
         title = ''.join(title_links[0].xpath('.//text()'))
         img_src = link.xpath('.//img')[0].attrib['src']

+        # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'template': 'images.html'})

+    # return results
     return results

View File

@@ -1,65 +1,69 @@
+## DuckDuckGo (Web)
+#
+# @website https://duckduckgo.com/
+# @provide-api yes (https://duckduckgo.com/api), but not all results from search-site
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo rewrite to api
+# @todo language support (the current used site does not support language-change)
+
 from urllib import urlencode
 from lxml.html import fromstring
 from searx.utils import html_to_text

-url = 'https://duckduckgo.com/html?{query}&s={offset}'
+# engine dependent config
+categories = ['general']
+paging = True
 locale = 'us-en'

+# search-url
+url = 'https://duckduckgo.com/html?{query}&s={offset}'

+# specific xpath variables
+result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
+url_xpath = './/a[@class="large"]/@href'
+title_xpath = './/a[@class="large"]//text()'
+content_xpath = './/div[@class="snippet"]//text()'

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 30
-    q = urlencode({'q': query,
-                   'l': locale})
-    params['url'] = url.format(query=q, offset=offset)
+
+    params['url'] = url.format(
+        query=urlencode({'q': query, 'l': locale}),
+        offset=offset)
+
     return params

+# get response from search-request
 def response(resp):
-    result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
-    url_xpath = './/a[@class="large"]/@href'
-    title_xpath = './/a[@class="large"]//text()'
-    content_xpath = './/div[@class="snippet"]//text()'
     results = []

     doc = fromstring(resp.text)

+    # parse results
     for r in doc.xpath(result_xpath):
         try:
             res_url = r.xpath(url_xpath)[-1]
         except:
             continue

         if not res_url:
             continue

         title = html_to_text(''.join(r.xpath(title_xpath)))
         content = html_to_text(''.join(r.xpath(content_xpath)))

+        # append result
         results.append({'title': title,
                         'content': content,
                         'url': res_url})

+    # return results
     return results

-#from json import loads
-#search_url = url + 'd.js?{query}&p=1&s={offset}'
-#
-#paging = True
-#
-#
-#def request(query, params):
-#    offset = (params['pageno'] - 1) * 30
-#    q = urlencode({'q': query,
-#                   'l': locale})
-#    params['url'] = search_url.format(query=q, offset=offset)
-#    return params
-#
-#
-#def response(resp):
-#    results = []
-#    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
-#    for r in search_res:
-#        if not r.get('t'):
-#            continue
-#        results.append({'title': r['t'],
-#                        'content': html_to_text(r['a']),
-#                        'url': r['u']})
-#    return results

View File

@@ -1,6 +1,14 @@
+## Dummy
+#
+# @results empty array
+# @stable yes
+
+# do search-request
 def request(query, params):
     return params

+# get response from search-request
 def response(resp):
     return []
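dummy.py is the smallest instance of the request()/response() contract that every engine in this commit implements. As an illustration only (not part of the commit, and assuming a searx checkout is importable), the pair can be exercised by hand with a made-up params dict whose keys mirror the ones used throughout this diff:

    # illustration: drive an engine module directly (sample values, not searx internals)
    from searx.engines import dummy

    params = {'pageno': 1, 'language': 'all', 'method': 'GET',
              'url': '', 'headers': {}, 'cookies': {}, 'data': {}}

    print(dummy.request('searx', params))  # params are passed through unchanged
    print(dummy.response(None))            # always returns the empty result list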

View File

@@ -1,35 +1,60 @@
+## General Files (Files)
+#
+# @website http://www.general-files.org
+# @provide-api no (nothing found)
+#
+# @using-api no (because nothing found)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo detect torrents?
+
 from lxml import html

+# engine dependent config
+categories = ['files']
+paging = True

+# search-url
 base_url = 'http://www.general-file.com'
 search_url = base_url + '/files-{letter}/{query}/{pageno}'

+# specific xpath variables
 result_xpath = '//table[@class="block-file"]'
 title_xpath = './/h2/a//text()'
 url_xpath = './/h2/a/@href'
 content_xpath = './/p//text()'
-paging = True

+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=query,
                                       letter=query[0],
                                       pageno=params['pageno'])
     return params

+# get response from search-request
 def response(resp):
     results = []

     dom = html.fromstring(resp.text)

+    # parse results
     for result in dom.xpath(result_xpath):
         url = result.xpath(url_xpath)[0]

         # skip fast download links
         if not url.startswith('/'):
             continue

+        # append result
         results.append({'url': base_url + url,
                         'title': ''.join(result.xpath(title_xpath)),
                         'content': ''.join(result.xpath(content_xpath))})

+    # return results
     return results

View File

@@ -1,31 +1,59 @@
+## Github (It)
+#
+# @website https://github.com/
+# @provide-api yes (https://developer.github.com/v3/)
+#
+# @using-api yes
+# @results JSON
+# @stable yes (using api)
+# @parse url, title, content
+
 from urllib import urlencode
 from json import loads
 from cgi import escape

+# engine dependent config
 categories = ['it']

+# search-url
 search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'  # noqa

 accept_header = 'application/vnd.github.preview.text-match+json'

+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}))

     params['headers']['Accept'] = accept_header

     return params

+# get response from search-request
 def response(resp):
     results = []

     search_res = loads(resp.text)

+    # check if items are received
     if not 'items' in search_res:
-        return results
+        return []

+    # parse results
     for res in search_res['items']:
         title = res['name']
         url = res['html_url']

         if res['description']:
             content = escape(res['description'][:500])
         else:
             content = ''
-        results.append({'url': url, 'title': title, 'content': content})

+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})

+    # return results
     return results
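For orientation only (not part of the commit): because this engine talks to the GitHub API directly, the request it builds can be reproduced standalone with the requests library; the URL and Accept header below are copied from the hunk above, while the query string 'searx' and the result slicing are made-up example values.

    # illustration: the same API call the github engine issues, done by hand
    import requests
    from urllib import urlencode

    search_url = 'https://api.github.com/search/repositories?sort=stars&order=desc&{query}'
    accept_header = 'application/vnd.github.preview.text-match+json'

    resp = requests.get(search_url.format(query=urlencode({'q': 'searx'})),
                        headers={'Accept': accept_header})
    for item in resp.json().get('items', [])[:3]:
        print(item['html_url'])  # repository links, the engine's 'url' field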

View File

@@ -1,39 +1,61 @@
+## Piratebay (Videos, Music, Files)
+#
+# @website https://thepiratebay.se
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, seed, leech, magnetlink
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter

-categories = ['videos', 'music']
+# engine dependent config
+categories = ['videos', 'music', 'files']
+paging = True

+# search-url
 url = 'https://thepiratebay.se/'
 search_url = url + 'search/{search_term}/{pageno}/99/{search_type}'

-search_types = {'videos': '200',
-                'music': '100',
-                'files': '0'}
+# piratebay specific type-definitions
+search_types = {'files': '0',
+                'music': '100',
+                'videos': '200'}

+# specific xpath variables
 magnet_xpath = './/a[@title="Download this torrent using magnet"]'
 content_xpath = './/font[@class="detDesc"]//text()'
-paging = True

+# do search-request
 def request(query, params):
-    search_type = search_types.get(params['category'], '200')
+    search_type = search_types.get(params['category'], '0')

     params['url'] = search_url.format(search_term=quote(query),
                                       search_type=search_type,
                                       pageno=params['pageno'] - 1)
     return params

+# get response from search-request
 def response(resp):
     results = []

     dom = html.fromstring(resp.text)
     search_res = dom.xpath('//table[@id="searchResult"]//tr')

+    # return empty array if nothing is found
     if not search_res:
-        return results
+        return []

+    # parse results
     for result in search_res[1:]:
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))

@@ -41,17 +63,21 @@ def response(resp):
         content = escape(' '.join(result.xpath(content_xpath)))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

+        # convert seed to int if possible
         if seed.isdigit():
             seed = int(seed)
         else:
             seed = 0

+        # convert leech to int if possible
         if leech.isdigit():
             leech = int(leech)
         else:
             leech = 0

         magnetlink = result.xpath(magnet_xpath)[0]

+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': content,

@@ -60,4 +86,5 @@ def response(resp):
                         'magnetlink': magnetlink.attrib['href'],
                         'template': 'torrent.html'})

+    # return results sorted by seeder
     return sorted(results, key=itemgetter('seed'), reverse=True)
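A note on the seed/leech handling above: both cells are strings scraped from the HTML, so non-numeric values fall back to 0 before the final sort by seeders. A small illustration with made-up sample values (not taken from the site):

    # illustration: string-to-int fallback and sorting by seeders
    from operator import itemgetter

    raw = [('12', '3'), ('N/A', '0'), ('7', '1')]   # (seed, leech) samples
    results = []
    for seed, leech in raw:
        results.append({'seed': int(seed) if seed.isdigit() else 0,
                        'leech': int(leech) if leech.isdigit() else 0})

    print(sorted(results, key=itemgetter('seed'), reverse=True))
    # -> entries ordered 12, 7, 0 seeders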

View File

@@ -1,30 +1,55 @@
+## Soundcloud (Music)
+#
+# @website https://soundcloud.com
+# @provide-api yes (https://developers.soundcloud.com/)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content
+
 from json import loads
 from urllib import urlencode

+# engine dependent config
 categories = ['music']
-
-guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'
-url = 'https://api.soundcloud.com/'
-search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id=' + guest_client_id  # noqa
 paging = True

+# api-key
+guest_client_id = 'b45b1aa10f1ac2941910a7f0d10f8e28'

+# search-url
+url = 'https://api.soundcloud.com/'
+search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id={client_id}'  # noqa

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 20

     params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      offset=offset)
+                                      offset=offset,
+                                      client_id=guest_client_id)

     return params

+# get response from search-request
 def response(resp):
     results = []

     search_res = loads(resp.text)

+    # parse results
     for result in search_res.get('collection', []):
         if result['kind'] in ('track', 'playlist'):
             title = result['title']
             content = result['description']

+            # append result
             results.append({'url': result['permalink_url'],
                             'title': title,
                             'content': content})

+    # return results
     return results

View File

@@ -1,30 +1,58 @@
+## Stackoverflow (It)
+#
+# @website https://stackoverflow.com/
+# @provide-api not clear (https://api.stackexchange.com/docs/advanced-search)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+
 from urlparse import urljoin
 from cgi import escape
 from urllib import urlencode
 from lxml import html

+# engine dependent config
 categories = ['it']
-
-url = 'http://stackoverflow.com/'
-search_url = url+'search?{query}&page={pageno}'
-result_xpath = './/div[@class="excerpt"]//text()'
 paging = True

+# search-url
+url = 'http://stackoverflow.com/'
+search_url = url+'search?{query}&page={pageno}'

+# specific xpath variables
+results_xpath = '//div[contains(@class,"question-summary")]'
+link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
+title_xpath = './/text()'
+content_xpath = './/div[@class="excerpt"]//text()'

+# do search-request
 def request(query, params):
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       pageno=params['pageno'])
     return params

+# get response from search-request
 def response(resp):
     results = []

     dom = html.fromstring(resp.text)

-    for result in dom.xpath('//div[@class="question-summary search-result"]'):
-        link = result.xpath('.//div[@class="result-link"]//a')[0]
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(' '.join(link.xpath('.//text()')))
-        content = escape(' '.join(result.xpath(result_xpath)))
-        results.append({'url': href, 'title': title, 'content': content})
+        title = escape(' '.join(link.xpath(title_xpath)))
+        content = escape(' '.join(result.xpath(content_xpath)))

+        # append result
+        results.append({'url': href,
+                        'title': title,
+                        'content': content})

+    # return results
     return results

View File

@@ -1,47 +1,79 @@
+## Startpage (Web)
+#
+# @website https://startpage.com
+# @provide-api no (nothing found)
+#
+# @using-api no
+# @results HTML
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo paging
+
 from urllib import urlencode
 from lxml import html
 from cgi import escape
+import re

-base_url = None
-search_url = None
+# engine dependent config
+categories = ['general']
+# there is a mechanism to block "bot" searches (probably the parameter qid),
+# which requires storing qid's between multiple search calls
+# paging = False
+language_support = True

-# TODO paging
-paging = False
-# TODO complete list of country mapping
-country_map = {'en_US': 'eng',
-               'en_UK': 'uk',
-               'nl_NL': 'ned'}
+# search-url
+base_url = 'https://startpage.com/'
+search_url = base_url + 'do/search'

+# specific xpath variables
+# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
+# not ads: div[@class="result"] are the direct childs of div[@id="results"]
+results_xpath = '//div[@class="result"]'
+link_xpath = './/h3/a'

+# do search-request
 def request(query, params):
+    offset = (params['pageno'] - 1) * 10
     query = urlencode({'q': query})[2:]

     params['url'] = search_url
     params['method'] = 'POST'
     params['data'] = {'query': query,
-                      'startat': (params['pageno'] - 1) * 10}  # offset
+                      'startat': offset}

-    country = country_map.get(params['language'], 'eng')
-    params['cookies']['preferences'] = \
-        'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country)  # noqa
+    # set language if specified
+    if params['language'] != 'all':
+        params['data']['with_language'] = 'lang_' + params['language'].split('_')[0]

     return params

+# get response from search-request
 def response(resp):
     results = []

     dom = html.fromstring(resp.content)

-    # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
-    # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-    for result in dom.xpath('//div[@class="result"]'):
-        link = result.xpath('.//h3/a')[0]
+    # parse results
+    for result in dom.xpath(results_xpath):
+        link = result.xpath(link_xpath)[0]
         url = link.attrib.get('href')
-        if url.startswith('http://www.google.')\
-                or url.startswith('https://www.google.'):
-            continue
         title = escape(link.text_content())

-        content = ''
+        # block google-ad url's
+        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+            continue

         if result.xpath('./p[@class="desc"]'):
             content = escape(result.xpath('./p[@class="desc"]')[0].text_content())
+        else:
+            content = ''

-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})

+    # return results
     return results

View File

@@ -1,30 +1,63 @@
+## Twitter (Social media)
+#
+# @website https://twitter.com/
+# @provide-api yes (https://dev.twitter.com/docs/using-search)
+#
+# @using-api no
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content
+#
+# @todo publishedDate
+
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
 from cgi import escape

+# engine dependent config
 categories = ['social media']
+language_support = True

+# search-url
 base_url = 'https://twitter.com/'
 search_url = base_url+'search?'

+# specific xpath variables
+results_xpath = '//li[@data-item-type="tweet"]'
+link_xpath = './/small[@class="time"]//a'
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
 content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'

+# do search-request
 def request(query, params):
     params['url'] = search_url + urlencode({'q': query})

+    # set language if specified
+    if params['language'] != 'all':
+        params['cookies']['lang'] = params['language'].split('_')[0]

     return params

+# get response from search-request
 def response(resp):
     results = []

     dom = html.fromstring(resp.text)

-    for tweet in dom.xpath('//li[@data-item-type="tweet"]'):
-        link = tweet.xpath('.//small[@class="time"]//a')[0]
+    # parse results
+    for tweet in dom.xpath(results_xpath):
+        link = tweet.xpath(link_xpath)[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath(title_xpath))
         content = escape(''.join(tweet.xpath(content_xpath)))

+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content})

+    # return results
     return results

View File

@@ -1,30 +1,67 @@
+## Wikipedia (Web)
+#
+# @website http://www.wikipedia.org
+# @provide-api yes (http://www.mediawiki.org/wiki/API:Search)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title
+#
+# @todo content
+
 from json import loads
 from urllib import urlencode, quote

-url = 'https://{language}.wikipedia.org/'
-search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}'  # noqa
-number_of_results = 10
+# engine dependent config
+categories = ['general']
 language_support = True
+paging = True
+number_of_results = 1

+# search-url
+url = 'https://{language}.wikipedia.org/'
+search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}&srlimit={limit}'  # noqa

+# do search-request
 def request(query, params):
-    offset = (params['pageno'] - 1) * 10
+    offset = (params['pageno'] - 1) * number_of_results

     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]

+    # write search-language back to params, required in response
     params['language'] = language

     params['url'] = search_url.format(query=urlencode({'srsearch': query}),
                                       offset=offset,
+                                      limit=number_of_results,
                                       language=language)

     return params

+# get response from search-request
 def response(resp):
+    results = []

     search_results = loads(resp.text)

-    res = search_results.get('query', {}).get('search', [])
-
-    return [{'url': url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8')),  # noqa
-             'title': result['title']} for result in res[:int(number_of_results)]]
+    # return empty array if there are no results
+    if not search_results.get('query', {}).get('search'):
+        return []

+    # parse results
+    for result in search_results['query']['search']:
+        res_url = url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))

+        # append result
+        results.append({'url': res_url,
+                        'title': result['title'],
+                        'content': ''})

+    # return results
+    return results
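For illustration, with made-up inputs (query 'searx', pageno 2, language 'de_DE'), request() above ends up building the API URL shown below; the snippet just replays the string formatting from the hunk:

    # illustration: the URL produced by request() for sample inputs
    from urllib import urlencode

    number_of_results = 1
    language = 'de_DE'.split('_')[0]                  # -> 'de'
    offset = (2 - 1) * number_of_results              # pageno 2 -> offset 1

    url = 'https://{language}.wikipedia.org/'
    search_url = url + 'w/api.php?action=query&list=search&{query}&srprop=timestamp&format=json&sroffset={offset}&srlimit={limit}'

    print(search_url.format(query=urlencode({'srsearch': 'searx'}),
                            offset=offset,
                            limit=number_of_results,
                            language=language))
    # https://de.wikipedia.org/w/api.php?action=query&list=search&srsearch=searx&srprop=timestamp&format=json&sroffset=1&srlimit=1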

View File

@@ -1,42 +1,69 @@
+## Youtube (Videos)
+#
+# @website https://www.youtube.com/
+# @provide-api yes (http://gdata-samples-youtube-search-py.appspot.com/)
+#
+# @using-api yes
+# @results JSON
+# @stable yes
+# @parse url, title, content, publishedDate, thumbnail
+
 from json import loads
 from urllib import urlencode
 from dateutil import parser

+# engine dependent config
 categories = ['videos']
-
-search_url = ('https://gdata.youtube.com/feeds/api/videos'
-              '?alt=json&{query}&start-index={index}&max-results=25')  # noqa
 paging = True
+language_support = True

+# search-url
+base_url = 'https://gdata.youtube.com/feeds/api/videos'
+search_url = base_url + '?alt=json&{query}&start-index={index}&max-results=5'  # noqa

+# do search-request
 def request(query, params):
-    index = (params['pageno'] - 1) * 25 + 1
+    index = (params['pageno'] - 1) * 5 + 1

     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       index=index)

+    # add language tag if specified
+    if params['language'] != 'all':
+        params['url'] += '&lr=' + params['language'].split('_')[0]

     return params

+# get response from search-request
 def response(resp):
     results = []

     search_results = loads(resp.text)

+    # return empty array if there are no results
     if not 'feed' in search_results:
-        return results
+        return []

     feed = search_results['feed']

+    # parse results
     for result in feed['entry']:
         url = [x['href'] for x in result['link'] if x['type'] == 'text/html']

         if not url:
             return

         # remove tracking
         url = url[0].replace('feature=youtube_gdata', '')
         if url.endswith('&'):
             url = url[:-1]

         title = result['title']['$t']
         content = ''
         thumbnail = ''

-        #"2013-12-31T15:22:51.000Z"
         pubdate = result['published']['$t']
         publishedDate = parser.parse(pubdate)

@@ -49,6 +76,7 @@ def response(resp):
         else:
             content = result['content']['$t']

+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,

@@ -56,4 +84,5 @@ def response(resp):
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})

+    # return results
     return results

View File

@@ -11,9 +11,8 @@ server:
 engines:
   - name : wikipedia
     engine : wikipedia
-    number_of_results : 1
-    paging : False
     shortcut : wp
+# number_of_results : 1 # default is 1

   - name : bing
     engine : bing

@@ -37,7 +36,6 @@ engines:
   - name : deviantart
     engine : deviantart
-    categories : images
     shortcut : da
     timeout: 3.0

@@ -47,7 +45,6 @@ engines:
   - name : duckduckgo
     engine : duckduckgo
-    locale : en-us
     shortcut : ddg

 # down - website is under criminal investigation by the UK

@@ -64,12 +61,10 @@ engines:
   - name : general-file
     engine : generalfile
-    categories : files
     shortcut : gf

   - name : github
     engine : github
-    categories : it
     shortcut : gh

   - name : google

@@ -86,23 +81,18 @@ engines:
   - name : piratebay
     engine : piratebay
-    categories : videos, music, files
     shortcut : tpb

   - name : soundcloud
     engine : soundcloud
-    categories : music
     shortcut : sc

   - name : stackoverflow
     engine : stackoverflow
-    categories : it
     shortcut : st

   - name : startpage
     engine : startpage
-    base_url : 'https://startpage.com/'
-    search_url : 'https://startpage.com/do/search'
     shortcut : sp

 # +30% page load time

@@ -113,7 +103,6 @@ engines:
   - name : twitter
     engine : twitter
-    categories : social media
     shortcut : tw

 # maybe in a fun category

@@ -142,13 +131,10 @@ engines:
   - name : youtube
     engine : youtube
-    categories : videos
     shortcut : yt

   - name : dailymotion
     engine : dailymotion
-    locale : en_US
-    categories : videos
     shortcut : dm

   - name : vimeo