[fix] bing_news based on RSS output format

This commit is contained in:
Dalf 2015-06-04 18:30:08 +02:00
parent cdf931f949
commit 62cc2a5658
3 changed files with 157 additions and 238 deletions

View File

@ -6,18 +6,17 @@
max. 5000 query/month max. 5000 query/month
@using-api no (because of query limit) @using-api no (because of query limit)
@results HTML (using search portal) @results RSS (using search portal)
@stable no (HTML can change) @stable yes (except perhaps for the images)
@parse url, title, content, publishedDate @parse url, title, content, publishedDate, thumbnail
""" """
from urllib import urlencode from urllib import urlencode
from cgi import escape from urlparse import urlparse, parse_qsl
from lxml import html from datetime import datetime
from datetime import datetime, timedelta
from dateutil import parser from dateutil import parser
import re from lxml import etree
from searx.engines.xpath import extract_text from searx.utils import list_get
# engine dependent config # engine dependent config
categories = ['news'] categories = ['news']
@ -26,7 +25,25 @@ language_support = True
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}' search_string = 'news/search?{query}&first={offset}&format=RSS'
# remove click
def url_cleanup(url_string):
    """Strip Bing's click-tracking redirect from a news result link.

    Links of the form ``https://www.bing.com/news/apiclick.aspx?...&url=<target>``
    are replaced by the decoded ``url`` query parameter. Any other link is
    returned unchanged, and so is a tracking link that carries no ``url``
    parameter (rather than returning ``None`` and producing a result
    without a URL).
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc != 'www.bing.com' or parsed_url.path != '/news/apiclick.aspx':
        return url_string
    query = dict(parse_qsl(parsed_url.query))
    # parse_qsl drops blank values, so an empty "url=" also falls back here
    return query.get('url', url_string)
# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
def image_url_cleanup(url_string):
    """Rewrite a ``http://*bing4.com/th?id=...`` thumbnail to the HTTPS Bing host.

    When the host ends with ``bing4.com``, the path is ``/th`` and an ``id``
    query parameter is present, return ``https://www.bing.com/th?id=<id>``
    (all other query parameters are dropped). Otherwise return
    ``url_string`` unchanged.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        image_id = query.get('id')
        # without an id there is nothing to rewrite; concatenating None would raise TypeError
        if image_id is not None:
            return "https://www.bing.com/th?id=" + image_id
    return url_string
# do search-request # do search-request
@ -42,8 +59,6 @@ def request(query, params):
query=urlencode({'q': query, 'setmkt': language}), query=urlencode({'q': query, 'setmkt': language}),
offset=offset) offset=offset)
params['cookies']['_FP'] = "ui=en-US"
params['url'] = base_url + search_path params['url'] = base_url + search_path
return params return params
@ -53,50 +68,44 @@ def request(query, params):
def response(resp):
    """Parse a Bing News RSS response into a list of result dicts.

    Every ``<item>`` under ``<channel>`` yields a dict with the keys ``url``,
    ``title``, ``content`` and ``publishedDate``. Items that carry a
    ``News:Image`` element additionally get ``thumbnail`` and
    ``template`` set to ``videos.html``. Items without a usable link are
    skipped.

    Errors raised by ``etree.fromstring`` on malformed XML are not caught.
    """
    results = []

    rss = etree.fromstring(resp.content)

    # keep only prefixed namespaces: lxml's XPath rejects a None (default) prefix
    ns = dict((prefix, uri) for prefix, uri in rss.nsmap.items() if prefix)

    # parse results
    for item in rss.xpath('./channel/item'):
        # url / title / content
        link = list_get(item.xpath('./link/text()'), 0)
        if link is None:
            # an item without a link cannot be turned into a result
            continue
        url = url_cleanup(link)
        if not url:
            continue
        title = list_get(item.xpath('./title/text()'), 0, url)
        content = list_get(item.xpath('./description/text()'), 0, '')

        # publishedDate: fall back to "now" when the date is missing or unparsable
        publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
        try:
            publishedDate = parser.parse(publishedDate, dayfirst=False)
        except (TypeError, ValueError):
            publishedDate = datetime.now()

        # thumbnail: only query the News: prefix when the feed declares it,
        # an undeclared prefix makes the XPath evaluation fail
        thumbnail = None
        if 'News' in ns:
            thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
        if thumbnail is not None:
            thumbnail = image_url_cleanup(thumbnail)

        # append result
        result = {'url': url,
                  'title': title,
                  'publishedDate': publishedDate,
                  'content': content}
        if thumbnail is not None:
            result['template'] = 'videos.html'
            result['thumbnail'] = thumbnail
        results.append(result)

    # return results
    return results

View File

@ -2,6 +2,7 @@ from collections import defaultdict
import mock import mock
from searx.engines import bing_news from searx.engines import bing_news
from searx.testing import SearxTestCase from searx.testing import SearxTestCase
import lxml
class TestBingNewsEngine(SearxTestCase): class TestBingNewsEngine(SearxTestCase):
@ -16,14 +17,10 @@ class TestBingNewsEngine(SearxTestCase):
self.assertIn(query, params['url']) self.assertIn(query, params['url'])
self.assertIn('bing.com', params['url']) self.assertIn('bing.com', params['url'])
self.assertIn('fr', params['url']) self.assertIn('fr', params['url'])
self.assertIn('_FP', params['cookies'])
self.assertIn('en', params['cookies']['_FP'])
dicto['language'] = 'all' dicto['language'] = 'all'
params = bing_news.request(query, dicto) params = bing_news.request(query, dicto)
self.assertIn('en', params['url']) self.assertIn('en', params['url'])
self.assertIn('_FP', params['cookies'])
self.assertIn('en', params['cookies']['_FP'])
def test_response(self): def test_response(self):
self.assertRaises(AttributeError, bing_news.response, None) self.assertRaises(AttributeError, bing_news.response, None)
@ -37,200 +34,105 @@ class TestBingNewsEngine(SearxTestCase):
response = mock.Mock(content='<html></html>') response = mock.Mock(content='<html></html>')
self.assertEqual(bing_news.response(response), []) self.assertEqual(bing_news.response(response), [])
html = """ html = """<?xml version="1.0" encoding="utf-8" ?>
<div class="sn_r"> <rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
<div class="newstitle"> <channel>
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1"> <title>python - Bing News</title>
Title <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> <description>Search results</description>
</div> <image>
<div class="sn_img"> <url>http://10.53.64.9/rsslogo.gif</url>
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1"> <title>test</title>
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" /> <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> </image>
</div> <copyright>Copyright</copyright>
<div class="sn_txt"> <item>
<div class="sn_oi"> <title>Title</title>
<span class="sn_snip">Article Content</span> <link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&amp;aid=&amp;tid=c237eccc50bd4758b106a5e3c94fce09&amp;url=http%3a%2f%2furl.of.article%2f&amp;c=xxxxxxxxx&amp;mkt=en-us</link>
<div class="sn_ST"> <description>Article Content</description>
<cite class="sn_src">metronews.fr</cite> <pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
&nbsp;&#0183;&#32; <News:Source>Infoworld</News:Source>
<span class="sn_tm">44 minutes ago</span> <News:Image>http://a1.bing4.com/th?id=ON.13371337133713371337133713371337&amp;pid=News</News:Image>
</div> <News:ImageSize>w={0}&amp;h={1}&amp;c=7</News:ImageSize>
</div> <News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
</div> <News:ImageMaxWidth>620</News:ImageMaxWidth>
</div> <News:ImageMaxHeight>413</News:ImageMaxHeight>
""" </item>
<item>
<title>Another Title</title>
<link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&amp;aid=&amp;tid=c237eccc50bd4758b106a5e3c94fce09&amp;url=http%3a%2f%2fanother.url.of.article%2f&amp;c=xxxxxxxxx&amp;mkt=en-us</link>
<description>Another Article Content</description>
<pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
</item>
</channel>
</rss>""" # noqa
response = mock.Mock(content=html)
results = bing_news.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 2)
self.assertEqual(results[0]['title'], 'Title')
self.assertEqual(results[0]['url'], 'http://url.of.article/')
self.assertEqual(results[0]['content'], 'Article Content')
self.assertEqual(results[0]['thumbnail'], 'https://www.bing.com/th?id=ON.13371337133713371337133713371337')
self.assertEqual(results[1]['title'], 'Another Title')
self.assertEqual(results[1]['url'], 'http://another.url.of.article/')
self.assertEqual(results[1]['content'], 'Another Article Content')
self.assertNotIn('thumbnail', results[1])
html = """<?xml version="1.0" encoding="utf-8" ?>
<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
<channel>
<title>python - Bing News</title>
<link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
<description>Search results</description>
<image>
<url>http://10.53.64.9/rsslogo.gif</url>
<title>test</title>
<link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</image>
<copyright>Copyright</copyright>
<item>
<title>Title</title>
<link>http://another.url.of.article/</link>
<description>Article Content</description>
<pubDate>garbage</pubDate>
<News:Source>Infoworld</News:Source>
<News:Image>http://another.bing.com/image</News:Image>
<News:ImageSize>w={0}&amp;h={1}&amp;c=7</News:ImageSize>
<News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
<News:ImageMaxWidth>620</News:ImageMaxWidth>
<News:ImageMaxHeight>413</News:ImageMaxHeight>
</item>
</channel>
</rss>""" # noqa
response = mock.Mock(content=html) response = mock.Mock(content=html)
results = bing_news.response(response) results = bing_news.response(response)
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
self.assertEqual(results[0]['title'], 'Title') self.assertEqual(results[0]['title'], 'Title')
self.assertEqual(results[0]['url'], 'http://url.of.article/') self.assertEqual(results[0]['url'], 'http://another.url.of.article/')
self.assertEqual(results[0]['content'], 'Article Content') self.assertEqual(results[0]['content'], 'Article Content')
self.assertEqual(results[0]['thumbnail'], 'http://another.bing.com/image')
html = """ html = """<?xml version="1.0" encoding="utf-8" ?>
<div class="sn_r"> <rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
<div class="newstitle"> <channel>
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1"> <title>python - Bing News</title>
Title <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> <description>Search results</description>
</div> <image>
<div class="sn_img"> <url>http://10.53.64.9/rsslogo.gif</url>
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1"> <title>test</title>
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" /> <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> </image>
</div> </channel>
<div class="sn_txt"> </rss>""" # noqa
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">44 minutes ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">3 hours, 44 minutes ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">44 hours ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">2 days ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">27/01/2015</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">Il y a 3 heures</span>
</div>
</div>
</div>
</div>
"""
response = mock.Mock(content=html)
results = bing_news.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 6)
html = """
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">44 minutes ago</span>
</div>
</div>
</div>
"""
response = mock.Mock(content=html) response = mock.Mock(content=html)
results = bing_news.response(response) results = bing_news.response(response)
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
html = """<?xml version="1.0" encoding="utf-8" ?>gabarge"""
response = mock.Mock(content=html)
self.assertRaises(lxml.etree.XMLSyntaxError, bing_news.response, response)

View File

@ -228,6 +228,14 @@ def prettify_url(url):
return url return url
# get element in list or default value
def list_get(a_list, index, default=None):
    """Return ``a_list[index]``, or ``default`` when the index is out of range.

    Negative indices follow normal sequence semantics (``-1`` is the last
    element) and also yield ``default`` when they fall outside the sequence.
    """
    try:
        return a_list[index]
    except IndexError:
        return default
def get_blocked_engines(engines, cookies): def get_blocked_engines(engines, cookies):
if 'blocked_engines' not in cookies: if 'blocked_engines' not in cookies:
return [(engine_name, category) for engine_name in engines return [(engine_name, category) for engine_name in engines