[fix] highlighting only html

This commit is contained in:
asciimoo 2014-01-10 23:38:08 +01:00
parent 04c408389d
commit 7b4ec5c5e9
3 changed files with 35 additions and 28 deletions

View File

@@ -25,7 +25,6 @@ from urlparse import urlparse
from searx import settings from searx import settings
import ConfigParser import ConfigParser
import sys import sys
import re
from datetime import datetime from datetime import datetime
engine_dir = dirname(realpath(__file__)) engine_dir = dirname(realpath(__file__))
@@ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params):
results[engine_name] = cb_res results[engine_name] = cb_res
return process_callback return process_callback
def highlight_content(content, query):
    """Wrap occurrences of *query* in ``<b>`` tags inside *content*.

    Returns ``None`` when there is nothing to highlight, and returns
    *content* untouched when it looks like it already contains HTML.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content
    query = query.decode('utf-8')
    if query.lower() in content.lower():
        # The whole query occurs verbatim: bold it as one piece.
        pattern = u'({0})'.format(re.escape(query))
    else:
        # Bold each whitespace-separated chunk on its own; single
        # characters only as standalone tokens (non-word delimited).
        parts = [
            u'\W+{0}\W+'.format(re.escape(chunk)) if len(chunk) == 1
            else u'{0}'.format(re.escape(chunk))
            for chunk in query.split()
        ]
        pattern = u'({0})'.format('|'.join(parts))
    return re.sub(pattern, '<b>\\1</b>', content, flags=re.I | re.U)
def score_results(results): def score_results(results):
flat_res = filter(None, chain.from_iterable(izip_longest(*results.values()))) flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
flat_len = len(flat_res) flat_len = len(flat_res)
@@ -218,8 +192,6 @@ def search(query, request, selected_engines):
results = score_results(results) results = score_results(results)
for result in results: for result in results:
if 'content' in result:
result['content'] = highlight_content(result['content'], query)
for res_engine in result['engines']: for res_engine in result['engines']:
engines[result['engine']].stats['score_count'] += result['score'] engines[result['engine']].stats['score_count'] += result['score']

View File

@@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
import csv import csv
import codecs import codecs
import cStringIO import cStringIO
import re
def highlight_content(content, query):
    """Return *content* with occurrences of *query* wrapped in <b> tags.

    Returns None when content is empty/None.  Content that appears to
    contain HTML markup is returned unchanged so existing tags are not
    corrupted.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection than a bare '<' check
    if '<' in content:
        return content
    # Accept both byte strings and already-decoded text: calling
    # .decode() on text would crash (py3) or mis-encode (py2 unicode
    # with non-ASCII), so only decode real byte strings.
    if isinstance(query, bytes):
        query = query.decode('utf-8')
    if query.lower() in content.lower():
        # The whole query occurs verbatim: highlight it in one piece.
        query_regex = u'({0})'.format(re.escape(query))
        return re.sub(query_regex, '<b>\\1</b>', content,
                      flags=re.I | re.U)
    # Otherwise highlight each whitespace-separated chunk on its own.
    regex_parts = []
    for chunk in query.split():
        if len(chunk) == 1:
            # Single characters only as standalone tokens, so common
            # letters are not bolded inside every word.  Raw string:
            # '\W' is an invalid escape in a plain literal on py3.
            regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
        else:
            regex_parts.append(re.escape(chunk))
    query_regex = u'({0})'.format('|'.join(regex_parts))
    return re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
class HTMLTextExtractor(HTMLParser): class HTMLTextExtractor(HTMLParser):
def __init__(self): def __init__(self):

View File

@@ -29,6 +29,7 @@ import json
import cStringIO import cStringIO
from searx.utils import UnicodeWriter from searx.utils import UnicodeWriter
from flask import send_from_directory from flask import send_from_directory
from searx.utils import highlight_content, html_to_text
@@ -104,6 +105,14 @@ def index():
results, suggestions = search(query, request, selected_engines) results, suggestions = search(query, request, selected_engines)
for result in results: for result in results:
if request_data.get('format', 'html') == 'html':
if 'content' in result:
result['content'] = highlight_content(result['content'], query)
result['title'] = highlight_content(result['title'], query)
else:
if 'content' in result:
result['content'] = html_to_text(result['content']).strip()
result['title'] = html_to_text(result['title']).strip()
if len(result['url']) > 74: if len(result['url']) > 74:
result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:] result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
else: else: