diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 291fee04d..bffbbe466 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -8,79 +8,85 @@ # @stable no # @parse answer -from re import search, sub +from cgi import escape from json import loads +from time import time from urllib import urlencode -from lxml import html -import HTMLParser + +from searx.poolrequests import get as http_get # search-url -url = 'http://www.wolframalpha.com/' +url = 'https://www.wolframalpha.com/' search_url = url + 'input/?{query}' +search_url = url + 'input/json.jsp'\ + '?async=true'\ + '&banners=raw'\ + '&debuggingdata=false'\ + '&format=image,plaintext,imagemap,minput,moutput'\ + '&formattimeout=2'\ + '&{query}'\ + '&output=JSON'\ + '&parsetimeout=2'\ + '&proxycode={token}'\ + '&scantimeout=0.5'\ + '&sponsorcategories=true'\ + '&statemethod=deploybutton' + # xpath variables scripts_xpath = '//script' title_xpath = '//title' failure_xpath = '//p[attribute::class="pfail"]' +token = {'value': '', + 'last_updated': None} + + +# it seems WolframAlpha resets its token every hour, so 'last_updated' is rounded down to the start of the current hour +def obtain_token(): + update_time = time() - (time() % 3600) + token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0) + token['value'] = loads(token_response.text)['code'] + token['last_updated'] = update_time + return token + + +obtain_token() # do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'i': query})) + # obtain a new token once more than an hour has passed since the recorded (hour-aligned) update time + if time() - token['last_updated'] > 3600: + obtain_token() + params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) + params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query return params # get response from search-request def response(resp): - results = [] - line = None + resp_json = loads(resp.text) - dom = 
html.fromstring(resp.text) - scripts = dom.xpath(scripts_xpath) + if not resp_json['queryresult']['success']: + return [] - # the answer is inside a js function - # answer can be located in different 'pods', although by default it should be in pod_0200 - possible_locations = ['pod_0200\.push\((.*)', - 'pod_0100\.push\((.*)'] + # TODO handle resp_json['queryresult']['assumptions'] + result_chunks = [] + for pod in resp_json['queryresult']['pods']: + pod_title = pod.get('title', '') + if 'subpods' not in pod: + continue + for subpod in pod['subpods']: + if 'img' in subpod: + result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>' + .format(escape(pod_title or subpod['img']['alt']), + escape(subpod['img']['src']), + escape(subpod['img']['alt']))) - # failed result - if dom.xpath(failure_xpath): - return results + if not result_chunks: + return [] - # get line that matches the pattern - for pattern in possible_locations: - for script in scripts: - try: - line = search(pattern, script.text_content()).group(1) - break - except AttributeError: - continue - if line: - break - - if line: - # extract answer from json - answer = line[line.find('{'):line.rfind('}') + 1] - try: - answer = loads(answer) - except Exception: - answer = loads(answer.encode('unicode-escape')) - answer = answer['stringified'] - - # clean plaintext answer - h = HTMLParser.HTMLParser() - answer = h.unescape(answer.decode('unicode-escape')) - answer = sub(r'\\', '', answer) - - results.append({'answer': answer}) - - # user input is in first part of title - title = dom.xpath(title_xpath)[0].text.encode('utf-8') - result_url = request(title[:-16], {})['url'] - - # append result - results.append({'url': result_url, - 'title': title.decode('utf-8')}) - - return results + return [{'url': resp.request.headers['Referer'], + 'title': 'Wolframalpha', + 'content': ''.join(result_chunks)}] diff --git a/tests/unit/engines/test_wolframalpha_noapi.py b/tests/unit/engines/test_wolframalpha_noapi.py index cad9593f2..37f3a9059 100644 --- a/tests/unit/engines/test_wolframalpha_noapi.py +++ b/tests/unit/engines/test_wolframalpha_noapi.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from collections import defaultdict -import mock from searx.engines import wolframalpha_noapi from searx.testing import SearxTestCase @@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase): self.assertRaises(AttributeError, wolframalpha_noapi.response, []) self.assertRaises(AttributeError, wolframalpha_noapi.response, '') self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]') - - html = """ - - Parangaricutirimícuaro - Wolfram|Alpha - - - 
-

Wolfram|Alpha doesn't know how to interpret your input.

-
-
- Tip:  - Check your spelling, and use English - -
-
-
- - - """ - # test failed query - response = mock.Mock(text=html) - self.assertEqual(wolframalpha_noapi.response(response), []) - - html = """ - - sqrt(-1) - Wolfram|Alpha - - - - - - """ - # test plaintext - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertEquals('i', results[0]['answer']) - self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url']) - - html = """ - - integral 1/x - Wolfram|Alpha - - - - - - """ - # test integral - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('log(x)+c', results[0]['answer']) - self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url']) - - html = """ - - ∫1/x x - Wolfram|Alpha - - - - - - """ - # test input in mathematical notation - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('log(x)+c', results[0]['answer']) - self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url']) - - html = """ - - 1 euro to yen - Wolfram|Alpha - - - - - - """ - # test output with htmlentity - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('¥'.decode('utf-8'), results[0]['answer']) - self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', 
results[1]['url']) - - html = """ - - distance from nairobi to kyoto in inches - Wolfram|Alpha - - - - - - """ - # test output with utf-8 character - response = mock.Mock(text=html) - results = wolframalpha_noapi.response(response) - self.assertEqual(type(results), list) - self.assertEqual(len(results), 2) - self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer']) - self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title']) - self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches', - results[1]['url']) + # TODO