From 5d764f95cf44ab4c1ba83d7055297e3c4ea48c98 Mon Sep 17 00:00:00 2001 From: asciimoo Date: Sat, 26 Oct 2013 13:45:43 +0200 Subject: [PATCH] [enh] xpath engine absolute xpath support --- searx/engines/xpath.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 61672b8cf..00fc3fac2 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -5,10 +5,10 @@ from cgi import escape from lxml.etree import _ElementStringResult search_url = None -results_xpath = None url_xpath = None content_xpath = None title_xpath = None +results_xpath = '' def extract_url(xpath_results): url = '' @@ -26,7 +26,7 @@ def extract_url(xpath_results): else: url = xpath_results[0].attrib.get('href') else: - raise Exception('Cannot handle xpath url resultset') + url = xpath_results.attrib.get('href') if not url.startswith('http://') or not url.startswith('https://'): url = 'http://'+url parsed_url = urlparse(url) @@ -45,10 +45,15 @@ def response(resp): results = [] dom = html.fromstring(resp.text) query = resp.search_params['query'] - for result in dom.xpath(results_xpath): - url = extract_url(result.xpath(url_xpath)) - title = ' '.join(result.xpath(title_xpath)) - content = escape(' '.join(result.xpath(content_xpath))).replace(query, '{0}'.format(query)) - results.append({'url': url, 'title': title, 'content': content}) + if results_xpath: + for result in dom.xpath(results_xpath): + url = extract_url(result.xpath(url_xpath)) + title = ' '.join(result.xpath(title_xpath)) + content = escape(' '.join(result.xpath(content_xpath))).replace(query, '{0}'.format(query)) + results.append({'url': url, 'title': title, 'content': content}) + else: + for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)): + results.append({'url': url, 'title': title, 'content': content}) + return results