[enh] add infoboxes and answers

This commit is contained in:
Dalf 2014-09-28 16:51:41 +02:00
parent e39d9fe542
commit 6bfd566353
10 changed files with 525 additions and 130 deletions

View File

@ -38,16 +38,14 @@ def response(resp):
except: except:
return results return results
title = '{0} {1} in {2} is {3}'.format( answer = '{0} {1} = {2} {3} (1 {1} = {4} {3})'.format(
resp.search_params['ammount'], resp.search_params['ammount'],
resp.search_params['from'], resp.search_params['from'],
resp.search_params['ammount'] * conversion_rate,
resp.search_params['to'], resp.search_params['to'],
resp.search_params['ammount'] * conversion_rate conversion_rate
) )
content = '1 {0} is {1} {2}'.format(resp.search_params['from'],
conversion_rate,
resp.search_params['to'])
now_date = datetime.now().strftime('%Y%m%d') now_date = datetime.now().strftime('%Y%m%d')
url = 'http://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html' # noqa url = 'http://finance.yahoo.com/currency/converter-results/{0}/{1}-{2}-to-{3}.html' # noqa
url = url.format( url = url.format(
@ -56,6 +54,7 @@ def response(resp):
resp.search_params['from'].lower(), resp.search_params['from'].lower(),
resp.search_params['to'].lower() resp.search_params['to'].lower()
) )
results.append({'title': title, 'content': content, 'url': url})
results.append({'answer' : answer, 'url': url})
return results return results

View File

@ -1,10 +1,25 @@
import json import json
from urllib import urlencode from urllib import urlencode
from lxml import html
from searx.engines.xpath import extract_text
url = 'http://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1' url = 'https://api.duckduckgo.com/?{query}&format=json&pretty=0&no_redirect=1&d=1'
def result_to_text(url, text, htmlResult):
    """Return a short label for a DuckDuckGo related-topic entry.

    The label is the text of the first ``<a>`` element found in
    ``htmlResult``; when there is no usable HTML or no link in it, the
    plain ``text`` is returned instead.

    ``url`` is accepted for interface compatibility and is not used.
    """
    # TODO : remove result ending with "Meaning" or "Category"
    # Callers pass ``Result`` straight from the JSON payload, so it can be
    # None or empty; lxml raises on such input, so fall back to the text.
    if not htmlResult:
        return text
    dom = html.fromstring(htmlResult)
    links = dom.xpath('//a')
    if links:
        return extract_text(links[0])
    return text
def html_to_text(htmlFragment):
    """Parse an HTML fragment and return its text via ``extract_text``."""
    return extract_text(html.fromstring(htmlFragment))
def request(query, params): def request(query, params):
# TODO add kl={locale}
params['url'] = url.format(query=urlencode({'q': query})) params['url'] = url.format(query=urlencode({'q': query}))
return params return params
@ -12,12 +27,104 @@ def request(query, params):
def response(resp): def response(resp):
search_res = json.loads(resp.text) search_res = json.loads(resp.text)
results = [] results = []
content = ''
heading = search_res.get('Heading', '')
attributes = []
urls = []
infobox_id = None
relatedTopics = []
# add answer if there is one
answer = search_res.get('Answer', '')
if answer != '':
results.append({ 'answer' : html_to_text(answer) })
# add infobox
if 'Definition' in search_res: if 'Definition' in search_res:
if search_res.get('AbstractURL'): content = content + search_res.get('Definition', '')
res = {'title': search_res.get('Heading', ''),
'content': search_res.get('Definition', ''), if 'Abstract' in search_res:
'url': search_res.get('AbstractURL', ''), content = content + search_res.get('Abstract', '')
'class': 'definition_result'}
results.append(res)
# image
image = search_res.get('Image', '')
image = None if image == '' else image
# attributes
if 'Infobox' in search_res:
infobox = search_res.get('Infobox', None)
if 'content' in infobox:
for info in infobox.get('content'):
attributes.append({'label': info.get('label'), 'value': info.get('value')})
# urls
for ddg_result in search_res.get('Results', []):
if 'FirstURL' in ddg_result:
firstURL = ddg_result.get('FirstURL', '')
text = ddg_result.get('Text', '')
urls.append({'title':text, 'url':firstURL})
results.append({'title':heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', None):
if 'FirstURL' in ddg_result:
suggestion = result_to_text(ddg_result.get('FirstURL', None), ddg_result.get('Text', None), ddg_result.get('Result', None))
if suggestion != heading:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({ 'name' : ddg_result.get('Name', ''), 'suggestions': suggestions })
for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('FirstURL', None), topic_result.get('Text', None), topic_result.get('Result', None))
if suggestion != heading:
suggestions.append(suggestion)
# abstract
abstractURL = search_res.get('AbstractURL', '')
if abstractURL != '':
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'), 'url': abstractURL})
# definition
definitionURL = search_res.get('DefinitionURL', '')
if definitionURL != '':
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'), 'url': definitionURL})
# entity
entity = search_res.get('Entity', None)
# TODO continent / country / department / location / waterfall / mountain range : link to map search, get weather, near by locations
# TODO musician : link to music search
# TODO concert tour : ??
# TODO film / actor / television / media franchise : links to IMDB / rottentomatoes (or scrap result)
# TODO music : link tu musicbrainz / last.fm
# TODO book : ??
# TODO artist / playwright : ??
# TODO compagny : ??
# TODO software / os : ??
# TODO software engineer : ??
# TODO prepared food : ??
# TODO website : ??
# TODO performing art : ??
# TODO prepared food : ??
# TODO programming language : ??
# TODO file format : ??
if len(heading)>0:
# TODO get infobox.meta.value where .label='article_title'
results.append({
'infobox': heading,
'id': infobox_id,
'entity': entity,
'content': content,
'img_src' : image,
'attributes': attributes,
'urls': urls,
'relatedTopics': relatedTopics
})
return results return results

193
searx/engines/wikidata.py Normal file
View File

@ -0,0 +1,193 @@
import json
from datetime import datetime
from requests import get
from urllib import urlencode
# Value sent as ``srlimit`` in the search request; response() issues one
# detail request per search hit returned.
resultCount=2
# Wikidata search endpoint; ``{query}`` is replaced by the url-encoded
# ``srsearch`` / ``srlimit`` parameters in request().
urlSearch = 'https://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectionsnippet&{query}'
# Wikidata entity endpoint; ``{query}`` is replaced by the url-encoded
# ``ids`` / ``languages`` parameters in getDetail().
urlDetail = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=labels%7Cinfo%7Csitelinks%7Csitelinks%2Furls%7Cdescriptions%7Cclaims&{query}'
# NOTE(review): the original note reads like an open TODO -- find the right URL for urlMap
# Map link template filled in by get_geolink().
urlMap = 'http://www.openstreetmap.org/?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
def request(query, params):
    """Fill ``params['url']`` with the Wikidata search URL for ``query``.

    The number of requested hits is limited to ``resultCount``. The
    (mutated) ``params`` dict is returned.
    """
    # The leftover debug ``print`` of the URL was dropped: it wrote every
    # user query to stdout.
    search_args = urlencode({'srsearch': query, 'srlimit': resultCount})
    params['url'] = urlSearch.format(query=search_args)
    return params
def response(resp):
    """Turn a Wikidata search response into searx results.

    ``resp.text`` is parsed as JSON; for every search hit the entity
    details are fetched with ``getDetail`` and the returned result dicts
    are collected into a single list.
    """
    results = []
    search_res = json.loads(resp.text)

    # TODO parallel http queries
    for hit in search_res.get('query', {}).get('search', []):
        wikidata_id = hit.get('title', '')
        # A hit without a title has no entity id to look up; skip it
        # rather than issue a pointless detail request.
        if not wikidata_id:
            continue
        # extend() instead of ``results = results + ...`` avoids copying
        # the accumulated list on every hit.
        results.extend(getDetail(wikidata_id))

    return results
def getDetail(wikidata_id, language='fr'):
    """Fetch one Wikidata entity and convert it into searx results.

    Performs a blocking HTTP request to ``urlDetail`` for ``wikidata_id``
    asking for labels/descriptions in ``language`` and English.

    Returns a list containing, in order:
      * a regular result pointing at the official website (only when the
        entity has a ``P856`` claim), and
      * one infobox dict with keys ``infobox``, ``id``, ``content``,
        ``attributes`` and ``urls``.

    ``language`` defaults to ``'fr'``, the value that used to be
    hard-coded in the body.
    """
    url = urlDetail.format(query=urlencode({'ids': wikidata_id,
                                            'languages': language + '|en'}))
    # Named ``http_response`` so the module-level response() is not shadowed.
    http_response = get(url)
    result = json.loads(http_response.content)
    result = result.get('entities', {}).get(wikidata_id, {})

    # Prefer the requested language, fall back to English, then to the id.
    labels = result.get('labels', {})
    title = labels.get(language, {}).get('value', None)
    if title is None:
        title = labels.get('en', {}).get('value', wikidata_id)

    results = []
    urls = []
    attributes = []

    descriptions = result.get('descriptions', {})
    description = descriptions.get(language, {}).get('value', '')
    if description == '':
        description = descriptions.get('en', {}).get('value', '')

    claims = result.get('claims', {})

    # Truthiness test: an empty string is no more usable than None here.
    official_website = get_string(claims, 'P856', None)
    if official_website:
        urls.append({'title': 'Official site', 'url': official_website})
        results.append({'title': title, 'url': official_website})

    if language != 'en':
        add_url(urls, 'Wikipedia (' + language + ')',
                get_wikilink(result, language + 'wiki'))
    wikipedia_en_link = get_wikilink(result, 'enwiki')
    add_url(urls, 'Wikipedia (en)', wikipedia_en_link)

    if language != 'en':
        add_url(urls, 'Wiki voyage (' + language + ')',
                get_wikilink(result, language + 'wikivoyage'))
    add_url(urls, 'Wiki voyage (en)', get_wikilink(result, 'enwikivoyage'))

    if language != 'en':
        add_url(urls, 'Wikiquote (' + language + ')',
                get_wikilink(result, language + 'wikiquote'))
    add_url(urls, 'Wikiquote (en)', get_wikilink(result, 'enwikiquote'))

    add_url(urls, 'Commons wiki', get_wikilink(result, 'commonswiki'))

    add_url(urls, 'Location', get_geolink(claims, 'P625', None))

    add_url(urls, 'Wikidata',
            'https://www.wikidata.org/wiki/' + wikidata_id + '?uselang=' + language)

    postal_code = get_string(claims, 'P281', None)
    if postal_code:
        attributes.append({'label': 'Postal code(s)', 'value': postal_code})

    date_of_birth = get_time(claims, 'P569', None)
    if date_of_birth:
        attributes.append({'label': 'Date of birth', 'value': date_of_birth})

    date_of_death = get_time(claims, 'P570', None)
    if date_of_death:
        attributes.append({'label': 'Date of death', 'value': date_of_death})

    # The English Wikipedia link is used as the infobox id (may be None).
    results.append({
        'infobox': title,
        'id': wikipedia_en_link,
        'content': description,
        'attributes': attributes,
        'urls': urls
    })

    return results
def add_url(urls, title, url):
    """Append a ``{'title', 'url'}`` entry to ``urls`` unless ``url`` is None."""
    if url is None:
        return
    entry = {'title': title, 'url': url}
    urls.append(entry)
def get_mainsnak(claims, propertyName):
    """Return the ``mainsnak`` of the first statement of ``propertyName``.

    Returns None when the property has no statements or when the first
    statement carries no ``mainsnak`` entry.
    """
    statements = claims.get(propertyName, {})
    if len(statements) < 1:
        return None
    first_statement = statements[0]
    return first_statement.get('mainsnak', None)
def get_string(claims, propertyName, defaultValue=None):
    """Return the values of a string-like property joined by ``', '``.

    Every statement of ``propertyName`` in ``claims`` contributes the
    ``value`` of its main snak's ``datavalue``. Statements without a
    ``datavalue`` or with an empty value are skipped. ``defaultValue`` is
    returned when the property is absent or yields no value at all.
    """
    statements = claims.get(propertyName, {})
    if len(statements) == 0:
        return defaultValue

    values = []
    for statement in statements:
        mainsnak = statement.get('mainsnak', {})
        # ``or {}`` covers both a missing key and an explicit null.
        datavalue = mainsnak.get('datavalue') or {}
        value = datavalue.get('value', '')
        # Skipping empty values keeps the joined result free of blank
        # fragments (e.g. 'a, ') and stops '' being returned as a value.
        if value:
            values.append(value)

    if len(values) == 0:
        return defaultValue
    return ', '.join(values)
def get_time(claims, propertyName, defaultValue=None):
    """Return the ``time`` strings of a time property joined by ``', '``.

    Every statement of ``propertyName`` in ``claims`` contributes the
    ``time`` field of its main snak's value. Statements without a value
    or without a ``time`` field are skipped. ``defaultValue`` is returned
    when the property is absent or yields no time at all.
    """
    statements = claims.get(propertyName, {})
    if len(statements) == 0:
        return defaultValue

    times = []
    for statement in statements:
        mainsnak = statement.get('mainsnak', {})
        datavalue = mainsnak.get('datavalue') or {}
        # The previous default for a missing value was '' (a str), on
        # which ``.get('time')`` raised AttributeError; use a dict.
        value = datavalue.get('value') or {}
        time_string = value.get('time', '')
        if time_string:
            times.append(time_string)

    if len(times) == 0:
        return defaultValue
    return ', '.join(times)
def get_geolink(claims, propertyName, defaultValue=''):
    """Build a map link (``urlMap``) from a globe-coordinate property.

    Uses the main snak of the first statement of ``propertyName``.
    ``defaultValue`` is returned when the property is absent, is not of
    datatype ``globe-coordinate``, or carries no coordinate value.
    """
    mainsnak = get_mainsnak(claims, propertyName)

    if mainsnak is None:
        return defaultValue

    datatype = mainsnak.get('datatype', '')
    if datatype != 'globe-coordinate':
        return defaultValue

    datavalue = mainsnak.get('datavalue') or {}
    value = datavalue.get('value') or {}
    if not value:
        # Without a value the old code produced a link to lat=0 / lon=0.
        return defaultValue

    # ``precision`` may be missing or null in the payload; use the default.
    precision = value.get('precision') or 0.0002

    # there is no zoom information, deduce from precision (error prone)
    # samples :
    # 13 --> 5
    # 1 --> 6
    # 0.016666666666667 --> 9
    # 0.00027777777777778 --> 19
    # wolframalpha : quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
    # 14.1186-8.8322 x+0.625447 x^2
    if precision < 0.0003:
        zoom = 19
    else:
        zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
        # The fitted parabola dips below zero for precisions of roughly
        # 2..12 and exceeds 19 for large ones; clamp to the 0..19 range.
        zoom = max(0, min(19, zoom))

    return urlMap.format(latitude=value.get('latitude', 0),
                         longitude=value.get('longitude', 0),
                         zoom=zoom)
def get_wikilink(result, wikiid):
    """Return the https URL of the sitelink ``wikiid`` of an entity, or None.

    ``http://`` URLs are rewritten to ``https://`` and protocol-relative
    (``//host/...``) URLs get an ``https:`` prefix; anything else is
    returned unchanged.
    """
    sitelinks = result.get('sitelinks', {})
    link = sitelinks.get(wikiid, {}).get('url', None)
    if link is None:
        return None
    if link.startswith('http://'):
        return link.replace('http://', 'https://')
    if link.startswith('//'):
        return 'https:' + link
    return link

View File

@ -38,17 +38,14 @@ def default_request_params():
# create a callback wrapper for the search engine results # create a callback wrapper for the search engine results
def make_callback(engine_name, results, suggestions, callback, params): def make_callback(engine_name, results, suggestions, answers, infoboxes, callback, params):
# creating a callback wrapper for the search engine results # creating a callback wrapper for the search engine results
def process_callback(response, **kwargs): def process_callback(response, **kwargs):
cb_res = [] cb_res = []
response.search_params = params response.search_params = params
# update stats with current page-load-time # callback
engines[engine_name].stats['page_load_time'] += \
(datetime.now() - params['started']).total_seconds()
try: try:
search_results = callback(response) search_results = callback(response)
except Exception, e: except Exception, e:
@ -61,6 +58,7 @@ def make_callback(engine_name, results, suggestions, callback, params):
engine_name, str(e)) engine_name, str(e))
return return
# add results
for result in search_results: for result in search_results:
result['engine'] = engine_name result['engine'] = engine_name
@ -70,21 +68,38 @@ def make_callback(engine_name, results, suggestions, callback, params):
suggestions.add(result['suggestion']) suggestions.add(result['suggestion'])
continue continue
# if it is an answer, add it to list of answers
if 'answer' in result:
answers.add(result['answer'])
continue
# if it is an infobox, add it to list of infoboxes
if 'infobox' in result:
infoboxes.append(result)
print result
continue
# append result # append result
cb_res.append(result) cb_res.append(result)
results[engine_name] = cb_res results[engine_name] = cb_res
# update stats with current page-load-time
engines[engine_name].stats['page_load_time'] += \
(datetime.now() - params['started']).total_seconds()
return process_callback return process_callback
# return the meaningful length of the content for a result # return the meaningful length of the content for a result
def content_result_len(result): def content_result_len(content):
if isinstance(result.get('content'), basestring): if isinstance(content, basestring):
content = re.sub('[,;:!?\./\\\\ ()-_]', '', result.get('content')) content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
return len(content) return len(content)
else: else:
return 0 return 0
# score results and remove duplications # score results and remove duplications
def score_results(results): def score_results(results):
# calculate scoring parameters # calculate scoring parameters
@ -138,7 +153,7 @@ def score_results(results):
# merge duplicates together # merge duplicates together
if duplicated: if duplicated:
# using content with more text # using content with more text
if content_result_len(res) > content_result_len(duplicated): if content_result_len(res.get('content', '')) > content_result_len(duplicated.get('content', '')):
duplicated['content'] = res['content'] duplicated['content'] = res['content']
# increase result-score # increase result-score
@ -197,6 +212,64 @@ def score_results(results):
return gresults return gresults
def merge_two_infoboxes(infobox1, infobox2):
    """Merge the data of ``infobox2`` into ``infobox1`` in place.

    * ``urls``: entries of ``infobox2`` whose ``url`` is not yet present
      in ``infobox1`` are appended.
    * ``attributes``: entries of ``infobox2`` whose ``label`` is not yet
      present in ``infobox1`` are appended.
    * ``content``: ``infobox1`` keeps whichever content is longer
      according to ``content_result_len``; if it had none, it takes the
      content of ``infobox2``.

    Nothing is returned; ``infobox2`` is left untouched.
    """
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            # dicts have no ``set`` method; plain item assignment is needed.
            infobox1['urls'] = urls1

        known_urls = set(entry.get('url', None) for entry in urls1)
        for entry in infobox2.get('urls') or []:
            target = entry.get('url', None)
            if target not in known_urls:
                urls1.append(entry)
                known_urls.add(target)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        # The label set used to be built but never consulted, so every
        # attribute of infobox2 was appended, duplicates included.
        known_labels = set(attribute.get('label', None)
                           for attribute in attributes1)
        for attribute in infobox2.get('attributes') or []:
            label = attribute.get('label', None)
            if label not in known_labels:
                attributes1.append(attribute)
                known_labels.add(label)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
def merge_infoboxes(infoboxes):
    """Collapse infoboxes sharing the same ``id`` into one entry.

    The first infobox seen for an id is kept (order preserved) and later
    ones with that id are folded into it with ``merge_two_infoboxes``.
    Infoboxes without an id are never merged.
    """
    merged = []
    position_by_id = {}

    for box in infoboxes:
        box_id = box.get('id', None)

        # Only boxes that carry an id can match an earlier one.
        position = None
        if box_id is not None:
            position = position_by_id.get(box_id, None)

        if position is not None:
            merge_two_infoboxes(merged[position], box)
            continue

        merged.append(box)
        position_by_id[box_id] = len(merged) - 1

    return merged
class Search(object): class Search(object):
"""Search information container""" """Search information container"""
@ -219,6 +292,8 @@ class Search(object):
self.results = [] self.results = []
self.suggestions = [] self.suggestions = []
self.answers = []
self.infoboxes = []
self.request_data = {} self.request_data = {}
# set specific language if set # set specific language if set
@ -350,6 +425,8 @@ class Search(object):
requests = [] requests = []
results = {} results = {}
suggestions = set() suggestions = set()
answers = set()
infoboxes = []
# increase number of searches # increase number of searches
number_of_searches += 1 number_of_searches += 1
@ -394,6 +471,8 @@ class Search(object):
selected_engine['name'], selected_engine['name'],
results, results,
suggestions, suggestions,
answers,
infoboxes,
engine.response, engine.response,
request_params request_params
) )
@ -431,11 +510,14 @@ class Search(object):
# score results and remove duplications # score results and remove duplications
results = score_results(results) results = score_results(results)
# merge infoboxes according to their ids
infoboxes = merge_infoboxes(infoboxes)
# update engine stats, using calculated score # update engine stats, using calculated score
for result in results: for result in results:
for res_engine in result['engines']: for res_engine in result['engines']:
engines[result['engine']]\ engines[result['engine']]\
.stats['score_count'] += result['score'] .stats['score_count'] += result['score']
# return results and suggestions # return results, suggestions, answers and infoboxes
return results, suggestions return results, suggestions, answers, infoboxes

View File

@ -1,7 +1,7 @@
server: server:
port : 8888 port : 8888
secret_key : "ultrasecretkey" # change this! secret_key : "ultrasecretkey" # change this!
debug : False # Debug mode, only for development debug : True # Debug mode, only for development
request_timeout : 2.0 # seconds request_timeout : 2.0 # seconds
base_url : False # Set custom base_url. Possible values: False or "https://your.custom.host/location/" base_url : False # Set custom base_url. Possible values: False or "https://your.custom.host/location/"
themes_path : "" # Custom ui themes path themes_path : "" # Custom ui themes path

File diff suppressed because one or more lines are too long

View File

@ -235,6 +235,17 @@ a {
max-width: 54em; max-width: 54em;
word-wrap:break-word; word-wrap:break-word;
line-height: 1.24; line-height: 1.24;
img {
float: left;
margin-right: 5px;
max-width: 200px;
max-height: 100px;
}
br.last {
clear: both;
}
} }
.url { .url {
@ -384,15 +395,14 @@ tr {
} }
} }
#suggestions { #suggestions, #answers {
margin-top: 20px; margin-top: 20px;
span { }
display: inline;
margin: 0 2px 2px 2px; #suggestions, #answers, #infoboxes {
padding: 0;
}
input { input {
padding: 0; padding: 0;
margin: 3px; margin: 3px;
@ -402,6 +412,7 @@ tr {
color: @color-result-search-url-font; color: @color-result-search-url-font;
cursor: pointer; cursor: pointer;
} }
input[type="submit"] { input[type="submit"] {
text-decoration: underline; text-decoration: underline;
} }
@ -411,6 +422,53 @@ tr {
} }
} }
// Side column holding the infoboxes, pinned to the right of the results.
#infoboxes {
    position: absolute;
    top: 220px;
    right: 20px;
    margin: 0px 2px 5px 5px;
    padding: 0px 2px 2px;
    max-width: 21em;

    .infobox {
        margin: 10px 0 10px;
        border: 1px solid #ddd;
        padding: 5px;
        font-size: 0.8em;

        img {
            max-width: 20em;
            // was "max-heigt": an unknown property, so the limit never applied
            max-height: 12em;
            display: block;
            margin: 5px;
            padding: 5px;
        }

        h2 {
            margin: 0;
        }

        table {
            width: auto;

            td {
                vertical-align: top;
            }
        }

        input {
            font-size: 1em;
        }

        br {
            clear: both;
        }
    }
}
#search_url { #search_url {
margin-top: 8px; margin-top: 8px;
@ -453,16 +511,6 @@ tr {
@media screen and (max-width: @results-width) { @media screen and (max-width: @results-width) {
#categories {
font-size: 90%;
clear: both;
.checkbox_container {
margin-top: 2px;
margin: auto;
}
}
#results { #results {
margin: auto; margin: auto;
padding: 0; padding: 0;
@ -483,7 +531,33 @@ tr {
} }
} }
@media screen and (max-width: 70em) { @media screen and (max-width: 75em) {
#infoboxes {
position: inherit;
max-width: inherit;
.infobox {
clear:both;
img {
float: left;
max-width: 10em;
}
}
}
#categories {
font-size: 90%;
clear: both;
.checkbox_container {
margin-top: 2px;
margin: auto;
}
}
.right { .right {
display: none; display: none;
postion: fixed !important; postion: fixed !important;
@ -515,12 +589,6 @@ tr {
.result { .result {
border-top: 1px solid @color-result-top-border; border-top: 1px solid @color-result-top-border;
margin: 7px 0 6px 0; margin: 7px 0 6px 0;
img {
max-width: 90%;
width: auto;
height: auto
}
} }
} }

View File

@ -8,6 +8,8 @@
<h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3> <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
<p class="url">{{ result.pretty_url }} <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}">cached</a></p> <p class="url">{{ result.pretty_url }} <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}">cached</a></p>
{% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %} {% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %}
<p class="content">{% if result.content %}{{ result.content|safe }}<br />{% endif %}</p> <p class="content">
{% if result.img_src %}<img src="{{ result.img_src|safe }}" class="image" />{% endif %}
{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
</div> </div>
</div> </div>

View File

@ -30,6 +30,14 @@
</div> </div>
</div> </div>
{% if answers %}
<div id="answers"><span>{{ _('Answers') }}</span>
{% for answer in answers %}
<span>{{ answer }}</span>
{% endfor %}
</div>
{% endif %}
{% if suggestions %} {% if suggestions %}
<div id="suggestions"><span>{{ _('Suggestions') }}</span> <div id="suggestions"><span>{{ _('Suggestions') }}</span>
{% for suggestion in suggestions %} {% for suggestion in suggestions %}
@ -41,6 +49,14 @@
</div> </div>
{% endif %} {% endif %}
{% if infoboxes %}
<div id="infoboxes">
{% for infobox in infoboxes %}
{% include 'default/infobox.html' %}
{% endfor %}
</div>
{% endif %}
{% for result in results %} {% for result in results %}
{% if result['template'] %} {% if result['template'] %}
{% include 'default/result_templates/'+result['template'] %} {% include 'default/result_templates/'+result['template'] %}

View File

@ -198,7 +198,7 @@ def index():
'index.html', 'index.html',
) )
search.results, search.suggestions = search.search(request) search.results, search.suggestions, search.answers, search.infoboxes = search.search(request)
for result in search.results: for result in search.results:
@ -291,6 +291,8 @@ def index():
pageno=search.pageno, pageno=search.pageno,
base_url=get_base_url(), base_url=get_base_url(),
suggestions=search.suggestions, suggestions=search.suggestions,
answers=search.answers,
infoboxes=search.infoboxes,
theme=get_current_theme_name() theme=get_current_theme_name()
) )