This commit is contained in:
pw3t 2014-01-05 17:57:55 +01:00
commit 0d93ad2018
14 changed files with 98 additions and 55 deletions

View File

@ -5,7 +5,7 @@ number_of_results = 1
[bing] [bing]
engine = bing engine = bing
language = en-us locale = en-US
[cc] [cc]
engine=currency_convert engine=currency_convert
@ -20,6 +20,7 @@ engine = duckduckgo_definitions
[duckduckgo] [duckduckgo]
engine = duckduckgo engine = duckduckgo
locale = en-us
[flickr] [flickr]
engine = flickr engine = flickr
@ -63,17 +64,17 @@ categories = social media
[urbandictionary] [urbandictionary]
engine = xpath engine = xpath
search_url = http://www.urbandictionary.com/define.php?term={query} search_url = http://www.urbandictionary.com/define.php?term={query}
url_xpath = //div[@id="entries"]//div[@class="word"]//a url_xpath = //div[@id="entries"]//div[@class="word"]/a/@href
title_xpath = //div[@id="entries"]//div[@class="word"]//span//text() title_xpath = //div[@id="entries"]//div[@class="word"]/span
content_xpath = //div[@id="entries"]//div[@class="text"]//div[@class="definition"]//text() content_xpath = //div[@id="entries"]//div[@class="text"]/div[@class="definition"]
[yahoo] [yahoo]
engine = xpath engine = xpath
search_url = http://search.yahoo.com/search?p={query} search_url = http://search.yahoo.com/search?p={query}
results_xpath = //div[@class="res"] results_xpath = //div[@class="res"]
url_xpath = .//span[@class="url"]//text() url_xpath = .//h3/a/@href
content_xpath = .//div[@class="abstr"]//text() title_xpath = .//h3/a
title_xpath = .//h3/a//text() content_xpath = .//div[@class="abstr"]
suggestion_xpath = //div[@id="satat"]//a suggestion_xpath = //div[@id="satat"]//a
[youtube] [youtube]
@ -82,5 +83,6 @@ categories = videos
[dailymotion] [dailymotion]
engine = dailymotion engine = dailymotion
locale = en_US
categories = videos categories = videos

View File

@ -261,7 +261,7 @@ def get_engines_stats():
for engine in errors: for engine in errors:
if max_errors: if max_errors:
engine['percentage'] = int(engine['avg']/max_errors*100) engine['percentage'] = int(float(engine['avg'])/max_errors*100)
else: else:
engine['percentage'] = 0 engine['percentage'] = 0

View File

@ -4,11 +4,11 @@ from cgi import escape
base_url = 'http://www.bing.com/' base_url = 'http://www.bing.com/'
search_string = 'search?{query}' search_string = 'search?{query}'
language = 'en-us' # see http://msdn.microsoft.com/en-us/library/dd251064.aspx locale = 'en-US' # see http://msdn.microsoft.com/en-us/library/dd251064.aspx
def request(query, params): def request(query, params):
search_path = search_string.format(query=urlencode({'q': query, 'setmkt': language})) search_path = search_string.format(query=urlencode({'q': query, 'setmkt': locale}))
#if params['category'] == 'images': #if params['category'] == 'images':
# params['url'] = base_url + 'images/' + search_path # params['url'] = base_url + 'images/' + search_path
params['url'] = base_url + search_path params['url'] = base_url + search_path

View File

@ -1,16 +1,17 @@
from urllib import urlencode from urllib import urlencode
from lxml import html
from json import loads from json import loads
from cgi import escape from cgi import escape
categories = ['videos'] categories = ['videos']
localization = 'en' locale = 'en_US'
# see http://www.dailymotion.com/doc/api/obj-video.html # see http://www.dailymotion.com/doc/api/obj-video.html
search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page=1&{query}' search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page=1&{query}'
def request(query, params): def request(query, params):
global search_url global search_url
params['url'] = search_url.format(query=urlencode({'search': query, 'localization': localization })) params['url'] = search_url.format(query=urlencode({'search': query, 'localization': locale }))
return params return params
@ -27,6 +28,11 @@ def response(resp):
else: else:
content = '' content = ''
if res['description']: if res['description']:
content += escape(res['description'][:500]) description = text_content_from_html(res['description'])
content += description[:500]
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results
def text_content_from_html(html_string):
desc_html = html.fragment_fromstring(html_string, create_parent=True)
return desc_html.text_content()

View File

@ -3,10 +3,11 @@ from urllib import urlencode
from searx.utils import html_to_text from searx.utils import html_to_text
url = 'https://duckduckgo.com/' url = 'https://duckduckgo.com/'
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0' search_url = url + 'd.js?{query}&p=1&s=0'
locale = 'us-en'
def request(query, params): def request(query, params):
params['url'] = search_url.format(query=urlencode({'q': query})) params['url'] = search_url.format(query=urlencode({'q': query, 'l': locale}))
return params return params

View File

@ -1,7 +1,7 @@
import json import json
from urllib import urlencode from urllib import urlencode
url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0' url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1'
def request(query, params): def request(query, params):
params['url'] = url.format(query=urlencode({'q': query})) params['url'] = url.format(query=urlencode({'q': query}))

View File

@ -1,6 +1,4 @@
from json import loads
from urllib import urlencode from urllib import urlencode
from searx.utils import html_to_text
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
url = 'http://www.filecrop.com/' url = 'http://www.filecrop.com/'

0
searx/engines/flickr.py Executable file → Normal file
View File

0
searx/engines/google_images.py Executable file → Normal file
View File

View File

@ -19,14 +19,13 @@ def response(resp):
global base_url global base_url
results = [] results = []
dom = html.fromstring(resp.content) dom = html.fromstring(resp.content)
for result in dom.xpath('//div[@class="result"]'): # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] elements are the direct children of div[@id="results"]
for result in dom.xpath('//div[@id="results"]/div[@class="result"]'):
link = result.xpath('.//h3/a')[0] link = result.xpath('.//h3/a')[0]
url = link.attrib.get('href') url = link.attrib.get('href')
parsed_url = urlparse(url) parsed_url = urlparse(url)
# TODO better google link detection title = link.text_content()
if parsed_url.netloc.find('www.google.com') >= 0: content = result.xpath('./p[@class="desc"]')[0].text_content()
continue
title = ' '.join(link.xpath('.//text()'))
content = escape(' '.join(result.xpath('.//p[@class="desc"]//text()')))
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results

View File

@ -1,5 +1,5 @@
from lxml import html from lxml import html
from urllib import urlencode from urllib import urlencode, unquote
from urlparse import urlparse, urljoin from urlparse import urlparse, urljoin
from cgi import escape from cgi import escape
from lxml.etree import _ElementStringResult from lxml.etree import _ElementStringResult
@ -11,32 +11,64 @@ title_xpath = None
suggestion_xpath = '' suggestion_xpath = ''
results_xpath = '' results_xpath = ''
def extract_url(xpath_results): '''
url = '' if xpath_results is list, extract the text from each result and concat the list
parsed_search_url = urlparse(search_url) if xpath_results is a xml element, extract all the text node from it ( text_content() method from lxml )
if xpath_results is a string element, then it's already done
'''
def extract_text(xpath_results):
if type(xpath_results) == list: if type(xpath_results) == list:
# it's a list of results: concatenate everything using recursive calls
if not len(xpath_results): if not len(xpath_results):
raise Exception('Empty url resultset') raise Exception('Empty url resultset')
if type(xpath_results[0]) == _ElementStringResult: result = ''
url = ''.join(xpath_results) for e in xpath_results:
result = result + extract_text(e)
return result
elif type(xpath_results) == _ElementStringResult:
# it's a string
return ''.join(xpath_results)
else:
# it's an element
return xpath_results.text_content()
def extract_url(xpath_results):
url = extract_text(xpath_results)
if url.startswith('//'): if url.startswith('//'):
# add http or https to this kind of url //example.com/
parsed_search_url = urlparse(search_url)
url = parsed_search_url.scheme+url url = parsed_search_url.scheme+url
elif url.startswith('/'): elif url.startswith('/'):
# fix relative url to the search engine
url = urljoin(search_url, url) url = urljoin(search_url, url)
#TODO
else: # normalize url
url = xpath_results[0].attrib.get('href') url = normalize_url(url)
else:
url = xpath_results.attrib.get('href') return url
if not url.startswith('http://') and not url.startswith('https://'):
url = 'http://'+url
def normalize_url(url):
parsed_url = urlparse(url) parsed_url = urlparse(url)
# add a / at the end of the url if there is no path
if not parsed_url.netloc: if not parsed_url.netloc:
raise Exception('Cannot parse url') raise Exception('Cannot parse url')
if not parsed_url.path: if not parsed_url.path:
url += '/' url += '/'
# FIXME : hack for yahoo
if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
p = parsed_url.path
mark = p.find('/**')
if mark != -1:
return unquote(p[mark+3:]).decode('utf-8')
return url return url
def request(query, params): def request(query, params):
query = urlencode({'q': query})[2:] query = urlencode({'q': query})[2:]
params['url'] = search_url.format(query=query) params['url'] = search_url.format(query=query)
@ -50,15 +82,19 @@ def response(resp):
if results_xpath: if results_xpath:
for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath)) url = extract_url(result.xpath(url_xpath))
title = ' '.join(result.xpath(title_xpath)) title = extract_text(result.xpath(title_xpath)[0 ])
content = escape(' '.join(result.xpath(content_xpath))) content = extract_text(result.xpath(content_xpath)[0])
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
else: else:
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)): for url, title, content in zip(
map(extract_url, dom.xpath(url_xpath)), \
map(extract_text, dom.xpath(title_xpath)), \
map(extract_text, dom.xpath(content_xpath)), \
):
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
if not suggestion_xpath: if not suggestion_xpath:
return results return results
for suggestion in dom.xpath(suggestion_xpath): for suggestion in dom.xpath(suggestion_xpath):
results.append({'suggestion': escape(''.join(suggestion.xpath('.//text()')))}) results.append({'suggestion': extract_text(suggestion)})
return results return results

View File

@ -1,5 +1,5 @@
from json import loads from json import loads
from urllib import urlencode, quote from urllib import urlencode
url = 'http://localhost:8090' url = 'http://localhost:8090'
search_url = '/yacysearch.json?{query}&maximumRecords=10' search_url = '/yacysearch.json?{query}&maximumRecords=10'

View File

@ -37,7 +37,7 @@
<p>It's ok if you don't trust us regarding the logs, <a href="https://github.com/asciimoo/searx">take the code</a> and run it yourself! decentralize!</p> <p>It's ok if you don't trust us regarding the logs, <a href="https://github.com/asciimoo/searx">take the code</a> and run it yourself! decentralize!</p>
<h3>How to add to firefox?</h3> <h3>How to add to firefox?</h3>
<p><a href="#" onclick="window.external.AddSearchProvider(window.location.protocol + '//' + window.location.host + '/opensearch.xml')">Install</a> searx as a search engine on any version of Firefox! (javascript required)</p> <p><a href="#" onclick="window.external.AddSearchProvider(window.location.protocol + '//' + window.location.host + '/opensearch.xml')">Install</a> searx as a search engine on any version of Firefox! (javascript required)</p>
<h2 id="faq">Developer FAQ</h2> <h2 id="dev_faq">Developer FAQ</h2>
<h3>New engines?</h3> <h3>New engines?</h3>
<p><ul> <p><ul>
<li>Edit your engines.cfg, see <a href="https://raw.github.com/asciimoo/searx/master/engines.cfg_sample">sample config</a></li> <li>Edit your engines.cfg, see <a href="https://raw.github.com/asciimoo/searx/master/engines.cfg_sample">sample config</a></li>

View File

@ -152,7 +152,8 @@ def preferences():
selected_categories.append(category) selected_categories.append(category)
if selected_categories: if selected_categories:
resp = make_response(redirect('/')) resp = make_response(redirect('/'))
resp.set_cookie('categories', ','.join(selected_categories)) # cookie max age: 4 weeks
resp.set_cookie('categories', ','.join(selected_categories), max_age=60*60*24*7*4)
return resp return resp
return render('preferences.html') return render('preferences.html')