Merge pull request #16 from dalf/master

bug fixes
This commit is contained in:
Adam Tauber 2014-01-05 05:49:39 -08:00
commit 5c9f6d5174
9 changed files with 85 additions and 41 deletions

View File

@@ -5,7 +5,7 @@ number_of_results = 1
[bing] [bing]
engine = bing engine = bing
language = en-us locale = en-US
[cc] [cc]
engine=currency_convert engine=currency_convert
@@ -20,6 +20,7 @@ engine = duckduckgo_definitions
[duckduckgo] [duckduckgo]
engine = duckduckgo engine = duckduckgo
locale = en-us
[flickr] [flickr]
engine = flickr engine = flickr
@@ -63,17 +64,17 @@ categories = social media
[urbandictionary] [urbandictionary]
engine = xpath engine = xpath
search_url = http://www.urbandictionary.com/define.php?term={query} search_url = http://www.urbandictionary.com/define.php?term={query}
url_xpath = //div[@id="entries"]//div[@class="word"]//a url_xpath = //div[@id="entries"]//div[@class="word"]/a/@href
title_xpath = //div[@id="entries"]//div[@class="word"]//span//text() title_xpath = //div[@id="entries"]//div[@class="word"]/span
content_xpath = //div[@id="entries"]//div[@class="text"]//div[@class="definition"]//text() content_xpath = //div[@id="entries"]//div[@class="text"]/div[@class="definition"]
[yahoo] [yahoo]
engine = xpath engine = xpath
search_url = http://search.yahoo.com/search?p={query} search_url = http://search.yahoo.com/search?p={query}
results_xpath = //div[@class="res"] results_xpath = //div[@class="res"]
url_xpath = .//span[@class="url"]//text() url_xpath = .//h3/a/@href
content_xpath = .//div[@class="abstr"]//text() title_xpath = .//h3/a
title_xpath = .//h3/a//text() content_xpath = .//div[@class="abstr"]
suggestion_xpath = //div[@id="satat"]//a suggestion_xpath = //div[@id="satat"]//a
[youtube] [youtube]
@@ -82,5 +83,6 @@ categories = videos
[dailymotion] [dailymotion]
engine = dailymotion engine = dailymotion
locale = en_US
categories = videos categories = videos

View File

@@ -4,11 +4,11 @@ from cgi import escape
base_url = 'http://www.bing.com/' base_url = 'http://www.bing.com/'
search_string = 'search?{query}' search_string = 'search?{query}'
language = 'en-us' # see http://msdn.microsoft.com/en-us/library/dd251064.aspx locale = 'en-US' # see http://msdn.microsoft.com/en-us/library/dd251064.aspx
def request(query, params): def request(query, params):
search_path = search_string.format(query=urlencode({'q': query, 'setmkt': language})) search_path = search_string.format(query=urlencode({'q': query, 'setmkt': locale}))
#if params['category'] == 'images': #if params['category'] == 'images':
# params['url'] = base_url + 'images/' + search_path # params['url'] = base_url + 'images/' + search_path
params['url'] = base_url + search_path params['url'] = base_url + search_path

View File

@@ -1,16 +1,17 @@
from urllib import urlencode from urllib import urlencode
from lxml import html
from json import loads from json import loads
from cgi import escape from cgi import escape
categories = ['videos'] categories = ['videos']
localization = 'en' locale = 'en_US'
# see http://www.dailymotion.com/doc/api/obj-video.html # see http://www.dailymotion.com/doc/api/obj-video.html
search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page=1&{query}' search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page=1&{query}'
def request(query, params): def request(query, params):
global search_url global search_url
params['url'] = search_url.format(query=urlencode({'search': query, 'localization': localization })) params['url'] = search_url.format(query=urlencode({'search': query, 'localization': locale }))
return params return params
@@ -27,6 +28,11 @@ def response(resp):
else: else:
content = '' content = ''
if res['description']: if res['description']:
content += escape(res['description'][:500]) description = text_content_from_html(res['description'])
content += description[:500]
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results
def text_content_from_html(html_string):
desc_html = html.fragment_fromstring(html_string, create_parent=True)
return desc_html.text_content()

View File

@@ -3,10 +3,11 @@ from urllib import urlencode
from searx.utils import html_to_text from searx.utils import html_to_text
url = 'https://duckduckgo.com/' url = 'https://duckduckgo.com/'
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0' search_url = url + 'd.js?{query}&p=1&s=0'
locale = 'us-en'
def request(query, params): def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query})) params['url'] = search_url.format(query=urlencode({'q': query, 'l': locale}))
return params return params

View File

@@ -1,7 +1,7 @@
import json import json
from urllib import urlencode from urllib import urlencode
url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0' url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1'
def request(query, params): def request(query, params):
params['url'] = url.format(query=urlencode({'q': query})) params['url'] = url.format(query=urlencode({'q': query}))

0
searx/engines/flickr.py Executable file → Normal file
View File

0
searx/engines/google_images.py Executable file → Normal file
View File

View File

@@ -19,14 +19,13 @@ def response(resp):
global base_url global base_url
results = [] results = []
dom = html.fromstring(resp.content) dom = html.fromstring(resp.content)
for result in dom.xpath('//div[@class="result"]'): # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads : div[@class="result"] are the direct childs of div[@id="results"]
for result in dom.xpath('//div[@id="results"]/div[@class="result"]'):
link = result.xpath('.//h3/a')[0] link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href') url = link.attrib.get('href')
parsed_url = urlparse(url) parsed_url = urlparse(url)
# TODO better google link detection title = link.text_content()
if parsed_url.netloc.find('www.google.com') >= 0: content = result.xpath('./p[@class="desc"]')[0].text_content()
continue
title = ' '.join(link.xpath('.//text()'))
content = escape(' '.join(result.xpath('.//p[@class="desc"]//text()')))
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results

View File

@@ -1,5 +1,5 @@
from lxml import html from lxml import html
from urllib import urlencode from urllib import urlencode, unquote
from urlparse import urlparse, urljoin from urlparse import urlparse, urljoin
from cgi import escape from cgi import escape
from lxml.etree import _ElementStringResult from lxml.etree import _ElementStringResult
@@ -11,32 +11,64 @@ title_xpath = None
suggestion_xpath = '' suggestion_xpath = ''
results_xpath = '' results_xpath = ''
def extract_url(xpath_results): '''
url = '' if xpath_results is list, extract the text from each result and concat the list
parsed_search_url = urlparse(search_url) if xpath_results is a xml element, extract all the text node from it ( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''
def extract_text(xpath_results):
if type(xpath_results) == list: if type(xpath_results) == list:
# it's list of result : concat everything using recursive call
if not len(xpath_results): if not len(xpath_results):
raise Exception('Empty url resultset') raise Exception('Empty url resultset')
if type(xpath_results[0]) == _ElementStringResult: result = ''
url = ''.join(xpath_results) for e in xpath_results:
if url.startswith('//'): result = result + extract_text(e)
url = parsed_search_url.scheme+url return result
elif url.startswith('/'): elif type(xpath_results) == _ElementStringResult:
url = urljoin(search_url, url) # it's a string
#TODO return ''.join(xpath_results)
else:
url = xpath_results[0].attrib.get('href')
else: else:
url = xpath_results.attrib.get('href') # it's a element
if not url.startswith('http://') and not url.startswith('https://'): return xpath_results.text_content()
url = 'http://'+url
def extract_url(xpath_results):
url = extract_text(xpath_results)
if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
url = parsed_search_url.scheme+url
elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url)
# normalize url
url = normalize_url(url)
return url
def normalize_url(url):
parsed_url = urlparse(url) parsed_url = urlparse(url)
# add a / at this end of the url if there is no path
if not parsed_url.netloc: if not parsed_url.netloc:
raise Exception('Cannot parse url') raise Exception('Cannot parse url')
if not parsed_url.path: if not parsed_url.path:
url += '/' url += '/'
# FIXME : hack for yahoo
if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
return unquote(p[mark+3:]).decode('utf-8')
return url return url
def request(query, params): def request(query, params):
query = urlencode({'q': query})[2:] query = urlencode({'q': query})[2:]
params['url'] = search_url.format(query=query) params['url'] = search_url.format(query=query)
@@ -50,15 +82,19 @@ def response(resp):
if results_xpath: if results_xpath:
for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath)) url = extract_url(result.xpath(url_xpath))
title = ' '.join(result.xpath(title_xpath)) title = extract_text(result.xpath(title_xpath)[0 ])
content = escape(' '.join(result.xpath(content_xpath))) content = extract_text(result.xpath(content_xpath)[0])
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
else: else:
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)): for url, title, content in zip(
map(extract_url, dom.xpath(url_xpath)), \
map(extract_text, dom.xpath(title_xpath)), \
map(extract_text, dom.xpath(content_xpath)), \
):
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath: if not suggestion_xpath:
return results return results
for suggestion in dom.xpath(suggestion_xpath): for suggestion in dom.xpath(suggestion_xpath):
results.append({'suggestion': escape(''.join(suggestion.xpath('.//text()')))}) results.append({'suggestion': extract_text(suggestion)})
return results return results