[enh] add infoboxes and answers

This commit is contained in:
Dalf 2014-09-28 16:51:41 +02:00
parent e39d9fe542
commit 6bfd566353
10 changed files with 525 additions and 130 deletions

View file

@ -38,17 +38,14 @@ def default_request_params():
# create a callback wrapper for the search engine results
def make_callback(engine_name, results, suggestions, callback, params):
def make_callback(engine_name, results, suggestions, answers, infoboxes, callback, params):
# creating a callback wrapper for the search engine results
def process_callback(response, **kwargs):
cb_res = []
response.search_params = params
# update stats with current page-load-time
engines[engine_name].stats['page_load_time'] += \
(datetime.now() - params['started']).total_seconds()
# callback
try:
search_results = callback(response)
except Exception, e:
@ -61,6 +58,7 @@ def make_callback(engine_name, results, suggestions, callback, params):
engine_name, str(e))
return
# add results
for result in search_results:
result['engine'] = engine_name
@ -70,21 +68,38 @@ def make_callback(engine_name, results, suggestions, callback, params):
suggestions.add(result['suggestion'])
continue
# if it is an answer, add it to list of answers
if 'answer' in result:
answers.add(result['answer'])
continue
# if it is an infobox, add it to list of infoboxes
if 'infobox' in result:
infoboxes.append(result)
print result
continue
# append result
cb_res.append(result)
results[engine_name] = cb_res
# update stats with current page-load-time
engines[engine_name].stats['page_load_time'] += \
(datetime.now() - params['started']).total_seconds()
return process_callback
# return the meaningful length of the content for a result
def content_result_len(result):
if isinstance(result.get('content'), basestring):
content = re.sub('[,;:!?\./\\\\ ()-_]', '', result.get('content'))
def content_result_len(content):
if isinstance(content, basestring):
content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
return len(content)
else:
return 0
# score results and remove duplications
def score_results(results):
# calculate scoring parameters
@ -138,7 +153,7 @@ def score_results(results):
# merge duplicates together
if duplicated:
# using content with more text
if content_result_len(res) > content_result_len(duplicated):
if content_result_len(res.get('content', '')) > content_result_len(duplicated.get('content', '')):
duplicated['content'] = res['content']
# increase result-score
@ -197,6 +212,64 @@ def score_results(results):
return gresults
def merge_two_infoboxes(infobox1, infobox2):
if 'urls' in infobox2:
urls1 = infobox1.get('urls', None)
if urls1 == None:
urls1 = []
infobox1.set('urls', urls1)
urlSet = set()
for url in infobox1.get('urls', []):
urlSet.add(url.get('url', None))
for url in infobox2.get('urls', []):
if url.get('url', None) not in urlSet:
urls1.append(url)
if 'attributes' in infobox2:
attributes1 = infobox1.get('attributes', None)
if attributes1 == None:
attributes1 = []
infobox1.set('attributes', attributes1)
attributeSet = set()
for attribute in infobox1.get('attributes', []):
if attribute.get('label', None) not in attributeSet:
attributeSet.add(attribute.get('label', None))
for attribute in infobox2.get('attributes', []):
attributes1.append(attribute)
if 'content' in infobox2:
content1 = infobox1.get('content', None)
content2 = infobox2.get('content', '')
if content1 != None:
if content_result_len(content2) > content_result_len(content1):
infobox1['content'] = content2
else:
infobox1.set('content', content2)
def merge_infoboxes(infoboxes):
results = []
infoboxes_id = {}
for infobox in infoboxes:
add_infobox = True
infobox_id = infobox.get('id', None)
if infobox_id != None:
existingIndex = infoboxes_id.get(infobox_id, None)
if existingIndex != None:
merge_two_infoboxes(results[existingIndex], infobox)
add_infobox=False
if add_infobox:
results.append(infobox)
infoboxes_id[infobox_id] = len(results)-1
return results
class Search(object):
"""Search information container"""
@ -219,6 +292,8 @@ class Search(object):
self.results = []
self.suggestions = []
self.answers = []
self.infoboxes = []
self.request_data = {}
# set specific language if set
@ -350,6 +425,8 @@ class Search(object):
requests = []
results = {}
suggestions = set()
answers = set()
infoboxes = []
# increase number of searches
number_of_searches += 1
@ -394,6 +471,8 @@ class Search(object):
selected_engine['name'],
results,
suggestions,
answers,
infoboxes,
engine.response,
request_params
)
@ -431,11 +510,14 @@ class Search(object):
# score results and remove duplications
results = score_results(results)
# merge infoboxes according to their ids
infoboxes = merge_infoboxes(infoboxes)
# update engine stats, using calculated score
for result in results:
for res_engine in result['engines']:
engines[result['engine']]\
.stats['score_count'] += result['score']
# return results and suggestions
return results, suggestions
# return results, suggestions, answers and infoboxes
return results, suggestions, answers, infoboxes