From 16bdc0baf4f2b56af000337c4a2fa1e689f1220c Mon Sep 17 00:00:00 2001
From: Adam Tauber <asciimoo@gmail.com>
Date: Fri, 9 Dec 2016 11:44:24 +0100
Subject: [PATCH] [mod] do not escape html content in engines

---
 searx/engines/archlinux.py          |  3 +--
 searx/engines/base.py               |  3 +--
 searx/engines/bing.py               |  5 ++---
 searx/engines/btdigg.py             |  5 ++---
 searx/engines/dailymotion.py        |  3 +--
 searx/engines/deezer.py             |  9 +++++----
 searx/engines/dictzone.py           |  5 ++---
 searx/engines/digg.py               |  3 +--
 searx/engines/fdroid.py             |  3 +--
 searx/engines/flickr.py             | 14 +++-----------
 searx/engines/flickr_noapi.py       |  7 +++----
 searx/engines/gigablast.py          |  5 ++---
 searx/engines/github.py             |  3 +--
 searx/engines/google.py             |  5 ++---
 searx/engines/kickass.py            |  3 +--
 searx/engines/nyaa.py               |  6 ++----
 searx/engines/piratebay.py          |  3 +--
 searx/engines/reddit.py             |  3 +--
 searx/engines/searchcode_doc.py     | 12 ++----------
 searx/engines/seedpeer.py           |  1 -
 searx/engines/spotify.py            |  9 +++++----
 searx/engines/stackoverflow.py      |  5 ++---
 searx/engines/startpage.py          |  5 ++---
 searx/engines/subtitleseeker.py     |  5 ++---
 searx/engines/swisscows.py          |  9 ++++-----
 searx/engines/tokyotoshokan.py      |  1 -
 searx/engines/torrentz.py           |  1 -
 searx/engines/translated.py         | 11 +++++------
 searx/engines/wolframalpha_noapi.py |  1 -
 searx/engines/yandex.py             |  5 ++---
 30 files changed, 56 insertions(+), 97 deletions(-)

diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py
index b846934f7..5ba512766 100644
--- a/searx/engines/archlinux.py
+++ b/searx/engines/archlinux.py
@@ -12,7 +12,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -135,7 +134,7 @@ def response(resp):
     for result in dom.xpath(xpath_results):
         link = result.xpath(xpath_link)[0]
         href = urljoin(base_url, link.attrib.get('href'))
-        title = escape(extract_text(link))
+        title = extract_text(link)

         results.append({'url': href, 'title': title})

diff --git a/searx/engines/base.py b/searx/engines/base.py
index 66491d395..a552453ce 100755
--- a/searx/engines/base.py
+++ b/searx/engines/base.py
@@ -16,7 +16,6 @@
 from lxml import etree
 from urllib import urlencode
 from searx.utils import searx_useragent
-from cgi import escape
 from datetime import datetime
 import re

@@ -94,7 +93,7 @@ def response(resp):
                 url = item.text

             elif item.attrib["name"] == "dcdescription":
-                content = escape(item.text[:300])
+                content = item.text[:300]
                 if len(item.text) > 300:
                     content += "..."
diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 540597162..58db61251 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -14,7 +14,6 @@
 """

 from urllib import urlencode
-from cgi import escape
 from lxml import html
 from searx.engines.xpath import extract_text

@@ -61,7 +60,7 @@ def response(resp):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))

         # append result
         results.append({'url': url,
@@ -73,7 +72,7 @@ def response(resp):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))

         # append result
         results.append({'url': url,

diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py
index ea6baf1c8..33c8355de 100644
--- a/searx/engines/btdigg.py
+++ b/searx/engines/btdigg.py
@@ -11,7 +11,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -51,8 +50,8 @@ def response(resp):
     for result in search_res:
         link = result.xpath('.//td[@class="torrent_name"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(extract_text(link))
-        content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
+        title = extract_text(link)
+        content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
         content = "<br />".join(content.split("\n"))
".join(content.split("\n")) filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0] diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 4eb894725..317f34f59 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -14,7 +14,6 @@ from urllib import urlencode from json import loads -from cgi import escape from datetime import datetime # engine dependent config @@ -57,7 +56,7 @@ def response(resp): for res in search_res['list']: title = res['title'] url = res['url'] - content = escape(res['description']) + content = res['description'] thumbnail = res['thumbnail_360_url'] publishedDate = datetime.fromtimestamp(res['created_time'], None) embedded = embedded_url.format(videoid=res['id']) diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py index 0530bc072..8e87bbeec 100644 --- a/searx/engines/deezer.py +++ b/searx/engines/deezer.py @@ -51,10 +51,11 @@ def response(resp): if url.startswith('http://'): url = 'https' + url[4:] - content = result['artist']['name'] +\ - " • " +\ - result['album']['title'] +\ - " • " + result['title'] + content = '{} - {} - {}'.format( + result['artist']['name'], + result['album']['title'], + result['title']) + embedded = embedded_url.format(audioid=result['id']) # append result diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 9765d5f60..20a9a8980 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -12,7 +12,6 @@ import re from urlparse import urljoin from lxml import html -from cgi import escape from searx.utils import is_valid_lang categories = ['general'] @@ -62,8 +61,8 @@ def response(resp): results.append({ 'url': urljoin(resp.url, '?%d' % k), - 'title': escape(from_result.text_content()), - 'content': escape('; '.join(to_results)) + 'title': from_result.text_content(), + 'content': '; '.join(to_results) }) return results diff --git a/searx/engines/digg.py b/searx/engines/digg.py index a10b38bb6..238b466a0 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -13,7 +13,6 @@ from urllib import quote_plus from json import loads from lxml import html -from cgi import escape from dateutil import parser # engine dependent config @@ -56,7 +55,7 @@ def response(resp): url = result.attrib.get('data-contenturl') thumbnail = result.xpath('.//img')[0].attrib.get('src') title = ''.join(result.xpath(title_xpath)) - content = escape(''.join(result.xpath(content_xpath))) + content = ''.join(result.xpath(content_xpath)) pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime') publishedDate = parser.parse(pubdate) diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index 0b16773e3..6d470a4eb 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -9,7 +9,6 @@ @parse url, title, content """ -from cgi import escape from urllib import urlencode from searx.engines.xpath import extract_text from lxml import html @@ -43,7 +42,7 @@ def response(resp): img_src = app.xpath('.//img/@src')[0] content = extract_text(app.xpath('./p')[0]) - content = escape(content.replace(title, '', 1).strip()) + content = content.replace(title, '', 1).strip() results.append({'url': url, 'title': title, diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py index 68d45bc17..1c3eef789 100644 --- a/searx/engines/flickr.py +++ b/searx/engines/flickr.py @@ -77,21 +77,13 @@ def response(resp): url = build_flickr_url(photo['owner'], photo['id']) - title = photo['title'] - - content = '' +\ - photo['ownername'] +\ - '
-                  '<span class="description">' +\
-                  photo['description']['_content'] +\
-                  '</span>'
-
         # append result
         results.append({'url': url,
-                        'title': title,
+                        'title': photo['title'],
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': photo['description']['_content'],
+                        'author': photo['ownername'],
                         'template': 'images.html'})

     # return results

diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
index 5c4193c11..68be139be 100644
--- a/searx/engines/flickr_noapi.py
+++ b/searx/engines/flickr_noapi.py
@@ -102,16 +102,15 @@ def response(resp):

         title = photo.get('title', '')

-        content = '<span class="photo-author">' +\
-                  photo['username'] +\
-                  '</span><br />'
+        author = photo['username']

         # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': '',
+                        'author': author,
                         'template': 'images.html'})

     return results

diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py
index 6e4e24b68..5430eb3ba 100644
--- a/searx/engines/gigablast.py
+++ b/searx/engines/gigablast.py
@@ -10,7 +10,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from json import loads
 from random import randint
 from time import time
@@ -78,8 +77,8 @@ def response(resp):
     for result in response_json['results']:
         # append result
         results.append({'url': result['url'],
-                        'title': escape(result['title']),
-                        'content': escape(result['sum'])})
+                        'title': result['title'],
+                        'content': result['sum']})

     # return results
     return results

diff --git a/searx/engines/github.py b/searx/engines/github.py
index cc1fc470c..7adef3be9 100644
--- a/searx/engines/github.py
+++ b/searx/engines/github.py
@@ -12,7 +12,6 @@

 from urllib import urlencode
 from json import loads
-from cgi import escape

 # engine dependent config
 categories = ['it']
@@ -48,7 +47,7 @@ def response(resp):
         url = res['html_url']

         if res['description']:
-            content = escape(res['description'][:500])
+            content = res['description'][:500]
         else:
             content = ''

diff --git a/searx/engines/google.py b/searx/engines/google.py
index ea93bc94f..0e2d522f4 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -9,7 +9,6 @@
 # @parse url, title, content, suggestion

 import re
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
@@ -155,7 +154,7 @@ def parse_url(url_string, google_hostname):
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return escape(extract_text(r[0]))
+        return extract_text(r[0])
     return None

@@ -264,7 +263,7 @@ def response(resp):
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': escape(extract_text(suggestion))})
+        results.append({'suggestion': extract_text(suggestion)})

     # return results
     return results

diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py
index 9cd8284da..059fa2a66 100644
--- a/searx/engines/kickass.py
+++ b/searx/engines/kickass.py
@@ -11,7 +11,6 @@
 """

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -57,7 +56,7 @@ def response(resp):
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
         leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
         filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))

diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py
index cda8231f7..4ca5b3171 100644
--- a/searx/engines/nyaa.py
+++ b/searx/engines/nyaa.py
@@ -9,7 +9,6 @@
 @parse url, title, content, seed, leech, torrentfile
 """

-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -78,7 +77,7 @@ def response(resp):

         # torrent title
         page_a = result.xpath(xpath_title)[0]
-        title = escape(extract_text(page_a))
+        title = extract_text(page_a)

         # link to the page
         href = page_a.attrib.get('href')
@@ -90,7 +89,7 @@ def response(resp):
         try:
             file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
             file_size = int(float(file_size) * get_filesize_mul(suffix))
-        except Exception as e:
+        except:
             file_size = None

         # seed count
@@ -105,7 +104,6 @@ def response(resp):
         # content string contains all information not included into template
         content = 'Category: "{category}". Downloaded {downloads} times.'
         content = content.format(category=category, downloads=downloads)
-        content = escape(content)

         results.append({'url': href,
                         'title': title,

diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py
index 55446b410..ca21a3bb2 100644
--- a/searx/engines/piratebay.py
+++ b/searx/engines/piratebay.py
@@ -9,7 +9,6 @@
 # @parse url, title, content, seed, leech, magnetlink

 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -62,7 +61,7 @@ def response(resp):
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

         # convert seed to int if possible

diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py
index 3ca7e44f6..b29792a3a 100644
--- a/searx/engines/reddit.py
+++ b/searx/engines/reddit.py
@@ -11,7 +11,6 @@
 """

 import json
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, urljoin
 from datetime import datetime
@@ -68,7 +67,7 @@ def response(resp):
             img_results.append(params)
         else:
             created = datetime.fromtimestamp(data['created_utc'])
-            content = escape(data['selftext'])
+            content = data['selftext']
             if len(content) > 500:
                 content = content[:500] + '...'
             params['content'] = content

diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py
index f24fe6f90..6c1acdcdd 100644
--- a/searx/engines/searchcode_doc.py
+++ b/searx/engines/searchcode_doc.py
@@ -44,20 +44,12 @@ def response(resp):
     # parse results
     for result in search_results.get('results', []):
         href = result['url']
-        title = "[" + result['type'] + "] " +\
-                result['namespace'] +\
-                " " + result['name']
-        content = '[' +\
-                  result['type'] + "] " +\
-                  result['name'] + " " +\
-                  result['synopsis'] +\
-                  "<br />" +\
" +\ - result['description'] + title = "[{}] {} {}".format(result['type'], result['namespace'], result['name']) # append result results.append({'url': href, 'title': title, - 'content': content}) + 'content': result['description']}) # return results return results diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py index 854ebba03..e1309a9b5 100644 --- a/searx/engines/seedpeer.py +++ b/searx/engines/seedpeer.py @@ -9,7 +9,6 @@ # @parse url, title, content, seed, leech, magnetlink from urlparse import urljoin -from cgi import escape from urllib import quote from lxml import html from operator import itemgetter diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index f75796e83..0e8e69961 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -46,10 +46,11 @@ def response(resp): if result['type'] == 'track': title = result['name'] url = result['external_urls']['spotify'] - content = result['artists'][0]['name'] +\ - " • " +\ - result['album']['name'] +\ - " • " + result['name'] + content = '{} - {} - {}'.format( + result['artists'][0]['name'], + result['album']['name'], + result['name']) + embedded = embedded_url.format(audioid=result['id']) # append result diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index fdd3711a9..5e7ab2901 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -11,7 +11,6 @@ """ from urlparse import urljoin -from cgi import escape from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text @@ -48,8 +47,8 @@ def response(resp): for result in dom.xpath(results_xpath): link = result.xpath(link_xpath)[0] href = urljoin(url, link.attrib.get('href')) - title = escape(extract_text(link)) - content = escape(extract_text(result.xpath(content_xpath))) + title = extract_text(link) + content = extract_text(result.xpath(content_xpath)) # append result results.append({'url': href, diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index d8b702c4d..6f6eae1cf 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -11,7 +11,6 @@ # @todo paging from lxml import html -from cgi import escape from dateutil import parser from datetime import datetime, timedelta import re @@ -79,10 +78,10 @@ def response(resp): if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): continue - title = escape(extract_text(link)) + title = extract_text(link) if result.xpath('./p[@class="desc clk"]'): - content = escape(extract_text(result.xpath('./p[@class="desc clk"]'))) + content = extract_text(result.xpath('./p[@class="desc clk"]')) else: content = '' diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index 47d27d0b2..daba68be7 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -10,7 +10,6 @@ @parse url, title, content """ -from cgi import escape from urllib import quote_plus from lxml import html from searx.languages import language_codes @@ -59,7 +58,7 @@ def response(resp): elif search_lang: href = href + search_lang + '/' - title = escape(extract_text(link)) + title = extract_text(link) content = extract_text(result.xpath('.//div[contains(@class,"red")]')) content = content + " - " @@ -75,7 +74,7 @@ def response(resp): # append result results.append({'url': href, 'title': title, - 'content': escape(content)}) + 'content': content}) # return results return results diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index 
index 1a94ed64e..72184e428 100644
--- a/searx/engines/swisscows.py
+++ b/searx/engines/swisscows.py
@@ -10,7 +10,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from json import loads
 from urllib import urlencode, unquote
 import re
@@ -78,7 +77,7 @@ def response(resp):

             # append result
             results.append({'url': result['SourceUrl'],
-                            'title': escape(result['Title']),
+                            'title': result['Title'],
                             'content': '',
                             'img_src': img_url,
                             'template': 'images.html'})
@@ -90,8 +89,8 @@ def response(resp):

             # append result
             results.append({'url': result_url,
-                            'title': escape(result_title),
-                            'content': escape(result_content)})
+                            'title': result_title,
+                            'content': result_content})

     # parse images
     for result in json.get('Images', []):
@@ -100,7 +99,7 @@ def response(resp):

         # append result
         results.append({'url': result['SourceUrl'],
-                        'title': escape(result['Title']),
+                        'title': result['Title'],
                         'content': '',
                         'img_src': img_url,
                         'template': 'images.html'})

diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py
index e2990e153..52b2cbe07 100644
--- a/searx/engines/tokyotoshokan.py
+++ b/searx/engines/tokyotoshokan.py
@@ -11,7 +11,6 @@
 """

 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text

diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py
index 92fbe7013..f9c832651 100644
--- a/searx/engines/torrentz.py
+++ b/searx/engines/torrentz.py
@@ -12,7 +12,6 @@
 """

 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text

diff --git a/searx/engines/translated.py b/searx/engines/translated.py
index 02047bc93..e78db0d8e 100644
--- a/searx/engines/translated.py
+++ b/searx/engines/translated.py
@@ -9,7 +9,6 @@
 @parse url, title, content
 """
 import re
-from cgi import escape
 from searx.utils import is_valid_lang

 categories = ['general']
@@ -52,14 +51,14 @@ def request(query, params):
 def response(resp):
     results = []
     results.append({
-        'url': escape(web_url.format(
+        'url': web_url.format(
            from_lang=resp.search_params['from_lang'][2],
            to_lang=resp.search_params['to_lang'][2],
-           query=resp.search_params['query'])),
+           query=resp.search_params['query']),
-        'title': escape('[{0}-{1}] {2}'.format(
+        'title': '[{0}-{1}] {2}'.format(
            resp.search_params['from_lang'][1],
            resp.search_params['to_lang'][1],
-           resp.search_params['query'])),
+           resp.search_params['query']),
-        'content': escape(resp.json()['responseData']['translatedText'])
+        'content': resp.json()['responseData']['translatedText']
     })

     return results

diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py
index e318d93e6..1534501b3 100644
--- a/searx/engines/wolframalpha_noapi.py
+++ b/searx/engines/wolframalpha_noapi.py
@@ -8,7 +8,6 @@
 # @stable no
 # @parse url, infobox

-from cgi import escape
 from json import loads
 from time import time
 from urllib import urlencode

diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py
index be3ec36ce..938fdd184 100644
--- a/searx/engines/yandex.py
+++ b/searx/engines/yandex.py
@@ -9,7 +9,6 @@
 @parse url, title, content
 """

-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.search import logger
@@ -52,8 +51,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         try:
             res = {'url': result.xpath(url_xpath)[0],
-                   'title': escape(''.join(result.xpath(title_xpath))),
-                   'content': escape(''.join(result.xpath(content_xpath)))}
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
         except:
             logger.exception('yandex parse crash')
             continue
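
Note: this patch only removes cgi.escape() from the engines; raw strings such
as '<b>bold</b>' in a title would render as markup unless escaping is still
applied exactly once further down the pipeline, in searx's result handling or
template layer. The snippet below is a minimal sketch of that idea, not
searx's actual post-processing code; sanitize_result() and its field list are
hypothetical names used only for illustration.

    # Sketch: escape the free-text fields of an engine result once,
    # centrally, instead of in every engine. cgi.escape exists on the
    # Python 2 line this patch targets (use html.escape on Python 3).
    from cgi import escape


    def sanitize_result(result):
        # Fields that templates render as plain text. Anything meant to
        # stay raw HTML (e.g. an 'embedded' player snippet) is skipped,
        # so escaping happens neither zero times nor twice.
        for field in ('title', 'content', 'author'):
            if result.get(field):
                result[field] = escape(result[field], quote=True)
        return result


    raw = {'url': 'https://example.com',
           'title': '<b>catchy</b> title',
           'content': 'a & b'}
    print(sanitize_result(raw))
    # {'url': 'https://example.com',
    #  'title': '&lt;b&gt;catchy&lt;/b&gt; title',
    #  'content': 'a &amp; b'}

Centralizing the escape also explains the flickr/flickr_noapi hunks above:
author markup that engines used to build by hand ('<span class="photo-author">'
plus '<br />') is replaced by a structured 'author' field, leaving all HTML
generation to the templates.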