+ This is the content étude à€ + + + + http://link.in.tweet + + + + + + + pic.twitter.com/rbFsfeE0l3 + +
+ + +diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index bd9a8c2fc..0e35e6188 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -13,8 +13,8 @@ from urlparse import urljoin from urllib import urlencode from lxml import html -from cgi import escape from datetime import datetime +from searx.engines.xpath import extract_text # engine dependent config categories = ['social media'] @@ -22,12 +22,12 @@ language_support = True # search-url base_url = 'https://twitter.com/' -search_url = base_url+'search?' +search_url = base_url + 'search?' # specific xpath variables results_xpath = '//li[@data-item-type="tweet"]' link_xpath = './/small[@class="time"]//a' -title_xpath = './/span[@class="username js-action-profile-name"]//text()' +title_xpath = './/span[@class="username js-action-profile-name"]' content_xpath = './/p[@class="js-tweet-text tweet-text"]' timestamp_xpath = './/span[contains(@class,"_timestamp")]' @@ -39,6 +39,8 @@ def request(query, params): # set language if specified if params['language'] != 'all': params['cookies']['lang'] = params['language'].split('_')[0] + else: + params['cookies']['lang'] = 'en' return params @@ -53,8 +55,9 @@ def response(resp): for tweet in dom.xpath(results_xpath): link = tweet.xpath(link_xpath)[0] url = urljoin(base_url, link.attrib.get('href')) - title = ''.join(tweet.xpath(title_xpath)) - content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8")) + title = extract_text(tweet.xpath(title_xpath)) + content = extract_text(tweet.xpath(content_xpath)[0]) + pubdate = tweet.xpath(timestamp_xpath) if len(pubdate) > 0: timestamp = float(pubdate[0].attrib.get('data-time')) diff --git a/searx/tests/engines/test_twitter.py b/searx/tests/engines/test_twitter.py new file mode 100644 index 000000000..b444b48ee --- /dev/null +++ b/searx/tests/engines/test_twitter.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +import mock +from searx.engines import twitter +from searx.testing import SearxTestCase + + +class TestTwitterEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 0 + dicto['language'] = 'fr_FR' + params = twitter.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['url']) + self.assertIn('twitter.com', params['url']) + self.assertIn('cookies', params) + self.assertIn('lang', params['cookies']) + self.assertIn('fr', params['cookies']['lang']) + + dicto['language'] = 'all' + params = twitter.request(query, dicto) + self.assertIn('cookies', params) + self.assertIn('lang', params['cookies']) + self.assertIn('en', params['cookies']['lang']) + + def test_response(self): + self.assertRaises(AttributeError, twitter.response, None) + self.assertRaises(AttributeError, twitter.response, []) + self.assertRaises(AttributeError, twitter.response, '') + self.assertRaises(AttributeError, twitter.response, '[]') + + response = mock.Mock(text='') + self.assertEqual(twitter.response(response), []) + + html = """ +
+ This is the content étude à€ + + + + http://link.in.tweet + + + + + + + pic.twitter.com/rbFsfeE0l3 + +
+ + ++ This is the content étude à€ + + + + http://link.in.tweet + + + + + + + pic.twitter.com/rbFsfeE0l3 + +
+ + ++ This should be the content.
+