From 17bf00ee42583910e45794e1438a2bab459225ad Mon Sep 17 00:00:00 2001
From: asciimoo
Date: Sat, 9 Nov 2013 18:39:20 +0100
Subject: [PATCH] [enh] removing result html tags

---
 searx/engines/duckduckgo.py | 3 ++-
 searx/engines/startpage.py  | 4 ++--
 searx/engines/twitter.py    | 3 ++-
 searx/engines/xpath.py      | 3 +--
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py
index 33f56f469..d591854a5 100644
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -1,5 +1,6 @@
 from json import loads
 from urllib import urlencode
+from searx.utils import html_to_text
 
 url = 'https://duckduckgo.com/'
 search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
@@ -16,7 +17,7 @@ def response(resp):
         if not r.get('t'):
             continue
         results.append({'title': r['t']
-                       ,'content': r['a']
+                       ,'content': html_to_text(r['a'])
                        ,'url': r['u']
                        })
     return results
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 47273e6e7..061c8158d 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -1,4 +1,4 @@
-from urllib import quote
+from urllib import urlencode
 from lxml import html
 from urlparse import urlparse
 from cgi import escape
@@ -8,7 +8,7 @@ search_url = base_url+'do/search'
 
 def request(query, params):
     global search_url
-    query = quote(query.replace(' ', '+'), safe='+')
+    query = urlencode({'q': query})[2:]
     params['url'] = search_url
     params['method'] = 'POST'
     params['data'] = {'query': query}
diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py
index d0a0aef17..f9d9e26ad 100644
--- a/searx/engines/twitter.py
+++ b/searx/engines/twitter.py
@@ -1,6 +1,7 @@
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
+from cgi import escape
 
 categories = ['social media']
 
@@ -21,6 +22,6 @@ def response(resp):
         link = tweet.xpath('.//small[@class="time"]//a')[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
-        content = ''.join(map(html.tostring, tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//*')))
+        content = escape(''.join(tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//text()')))
         results.append({'url': url, 'title': title, 'content': content})
     return results
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index 8c2e04d5c..2743dc2a0 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -46,12 +46,11 @@ def request(query, params):
 def response(resp):
     results = []
     dom = html.fromstring(resp.text)
-    query = resp.search_params['query']
     if results_xpath:
         for result in dom.xpath(results_xpath):
             url = extract_url(result.xpath(url_xpath))
             title = ' '.join(result.xpath(title_xpath))
-            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+            content = escape(' '.join(result.xpath(content_xpath)))
             results.append({'url': url, 'title': title, 'content': content})
     else:
         for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
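
Note on the new dependency: the duckduckgo.py hunk above calls searx.utils.html_to_text() to strip markup from the raw 'a' snippet field. That helper lives in searx/utils.py and is not part of this patch; purely as an illustration (not the actual searx implementation), a minimal Python 2 tag stripper along these lines would cover the conversion the engines now rely on:

    # Illustrative sketch only -- not the code shipped in searx/utils.py.
    # A minimal HTMLParser subclass that keeps text nodes and drops tags,
    # so '<b>foo</b> bar' comes back as 'foo bar'.
    from HTMLParser import HTMLParser


    class _TextExtractor(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.parts = []

        def handle_data(self, data):
            # collect the text found between tags
            self.parts.append(data)


    def html_to_text(html_fragment):
        """Return the plain text content of an HTML fragment."""
        parser = _TextExtractor()
        parser.feed(html_fragment)
        parser.close()
        return ''.join(parser.parts)

With something like this in place, html_to_text('<b>searx</b> is a metasearch engine') returns 'searx is a metasearch engine', which is the kind of cleaned-up 'content' value the engines above now produce; entity references and malformed markup would need extra handling that this sketch omits.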