Merge pull request #208 from pointhi/new_engines

add 1x.com engine, improve yacy-engine
This commit is contained in:
Adam Tauber 2015-02-01 14:07:34 +01:00
commit 03137eebd9
5 changed files with 157 additions and 13 deletions

82
searx/engines/www1x.py Normal file
View File

@ -0,0 +1,82 @@
## 1x (Images)
#
# @website http://1x.com/
# @provide-api no
#
# @using-api no
# @results HTML
# @stable no (HTML can change)
# @parse url, title, thumbnail, img_src, content
from urllib import urlencode
from urlparse import urljoin
from lxml import html
import string
import re
# engine dependent config: this engine only serves the image category
# and does not support result paging
categories = ['images']
paging = False

# search-url: '{query}' is filled with the url-encoded query in request()
base_url = 'http://1x.com'
search_url = base_url + '/backend/search.php?{query}'
# do search-request
def request(query, params):
    """Store the search URL for ``query`` in ``params`` and return it.

    The query is url-encoded as the ``q`` parameter and substituted into
    the module-level ``search_url`` template; the result is written to
    ``params['url']``. The same ``params`` object is returned.
    """
    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(query=encoded_query)

    return params
# get response from search-request
def response(resp):
    """Extract image results from a 1x.com search response.

    The response text is split at anchor start/end markers and every
    ``<a ...>...</a>`` fragment is parsed on its own with lxml. A fragment
    yields a result only when its resolved URL contains ``/photo/`` and the
    anchor wraps an ``<img>`` carrying a ``src`` attribute; every other
    fragment is skipped instead of aborting the whole parse.

    :param resp: response object; only its ``text`` attribute is read.
    :returns: list of result dicts with the keys ``url``, ``title``,
        ``img_src``, ``content``, ``thumbnail_src`` and ``template``.
    :raises AttributeError: if ``resp`` has no ``text`` attribute.
    """
    results = []

    # get links from result-text
    # The capture group keeps the delimiters in the split output; ``\b``
    # prevents '<a' from also matching tags such as '<abbr' or '<area'.
    regex = re.compile(r'(</a>|<a\b)')
    results_parts = re.split(regex, resp.text)

    # text of the anchor currently being collected, None outside an anchor
    cur_element = None

    # iterate over link parts
    for result_part in results_parts:
        # start of a link: begin collecting a new anchor
        if result_part == '<a':
            cur_element = result_part
            continue

        # text outside of any anchor (including a stray '</a>') is ignored
        if cur_element is None:
            continue

        cur_element += result_part

        # keep collecting until the end of the link is reached
        if result_part != '</a>':
            continue

        # the anchor is complete; reset the collector so that a later stray
        # '</a>' cannot re-process the same anchor
        element, cur_element = cur_element, None

        # fix xml-error
        element = element.replace('"></a>', '"/></a>')

        dom = html.fromstring(element)

        links = dom.xpath('//a')
        if not links:
            continue
        link = links[0]

        url = urljoin(base_url, link.attrib.get('href'))

        # check if the url points to a photo; done before touching the
        # image so that non-photo links without an <img> are skipped cleanly
        if '/photo/' not in url:
            continue

        title = link.attrib.get('title', '')

        # a photo link without a usable thumbnail cannot be rendered
        images = link.xpath('.//img')
        thumbnail_path = images[0].attrib.get('src') if images else None
        if not thumbnail_path:
            continue

        thumbnail_src = urljoin(base_url, thumbnail_path)
        # TODO: get image with higher resolution
        img_src = thumbnail_src

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': '',
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results

View File

@ -68,9 +68,18 @@ def response(resp):
search_results = raw_search_results.get('channels', {})[0].get('items', [])
if resp.search_params['category'] == 'general':
for result in search_results:
# parse image results
if result.get('image'):
# append result
results.append({'url': result['url'],
'title': result['title'],
'content': '',
'img_src': result['image'],
'template': 'images.html'})
# parse general results
for result in search_results:
else:
publishedDate = parser.parse(result['pubDate'])
# append result
@ -79,17 +88,7 @@ def response(resp):
'content': result['description'],
'publishedDate': publishedDate})
elif resp.search_params['category'] == 'images':
# parse image results
for result in search_results:
# append result
results.append({'url': result['url'],
'title': result['title'],
'content': '',
'img_src': result['image'],
'template': 'images.html'})
#TODO parse video, audio and file results
#TODO parse video, audio and file results
# return results
return results

View File

@ -83,6 +83,11 @@ engines:
engine : www500px
shortcut : px
- name : 1x
engine : www1x
shortcut : 1x
disabled : True
- name : flickr
categories : images
shortcut : fl

View File

@ -0,0 +1,57 @@
from collections import defaultdict
import mock
from searx.engines import www1x
from searx.testing import SearxTestCase
class TestWww1xEngine(SearxTestCase):
    """Tests for the 1x.com image engine (``searx.engines.www1x``)."""

    def test_request(self):
        """The request URL targets 1x.com and contains the query."""
        query = 'test_query'
        params = www1x.request(query, defaultdict(dict))
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('1x.com', params['url'])

    def test_response(self):
        """Only anchors pointing to a photo page become image results."""
        # objects without a ``text`` attribute are rejected
        self.assertRaises(AttributeError, www1x.response, None)
        self.assertRaises(AttributeError, www1x.response, [])
        self.assertRaises(AttributeError, www1x.response, '')
        self.assertRaises(AttributeError, www1x.response, '[]')

        # a page without any anchors yields no results
        response = mock.Mock(text='<html></html>')
        self.assertEqual(www1x.response(response), [])

        # one photo link and one member-profile link: only the first counts
        search_html = """
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters
[
<!ELEMENT characters (character*) >
<!ELEMENT character (#PCDATA ) >
<!ENTITY iexcl "&#161;" >
<!ENTITY cent "&#162;" >
<!ENTITY pound "&#163;" >
]
><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%">
<tr>
<td style="min-width: 220px;" valign="top">
<div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div>
<div>
<a href="/photo/123456" class="dynamiclink">
<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;">
</a>
<a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink">
<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg">
</a>
</div>
</td>
</table>
]]></searchresult></root>
"""
        response = mock.Mock(text=search_html)
        results = www1x.response(response)
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456')
        # the photo anchor carries no title attribute
        self.assertEqual(results[0]['title'], '')
        self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
        # the engine currently reuses the thumbnail as the full image
        self.assertEqual(results[0]['img_src'], results[0]['thumbnail_src'])
        self.assertEqual(results[0]['content'], '')
        self.assertEqual(results[0]['template'], 'images.html')

View File

@ -1,2 +1,3 @@
from searx.tests.engines.test_dummy import * # noqa
from searx.tests.engines.test_github import * # noqa
from searx.tests.engines.test_www1x import * # noqa