Mirror of https://github.com/searxng/searxng, synced 2024-01-01 18:24:07 +00:00

Merge pull request #540 from a01200356/wikipedia_infobox

[enh] wikipedia infobox

This commit is contained in: commit f46057feb2
@@ -43,3 +43,4 @@ generally made searx better:
 - Kang-min Liu
 - Kirill Isakov
 - Guilhem Bonnefille
+- Marc Abonce Seguin
@@ -1,5 +1,6 @@
 import json
 from urllib import urlencode
+from re import compile, sub
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
@@ -7,6 +8,8 @@ from searx.engines.xpath import extract_text
 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
+http_regex = compile(r'^http:')
+
 
 def result_to_text(url, text, htmlResult):
     # TODO : remove result ending with "Meaning" or "Category"
@@ -19,8 +22,8 @@ def result_to_text(url, text, htmlResult):
 
 
 def request(query, params):
-    # TODO add kl={locale}
     params['url'] = url.format(query=urlencode({'q': query}))
+    params['headers']['Accept-Language'] = params['language']
     return params
 
 
@@ -103,6 +106,10 @@ def response(resp):
         urls.append({'title': search_res.get('DefinitionSource'),
                      'url': definitionURL})
 
+    # to merge with wikidata's infobox
+    if infobox_id:
+        infobox_id = http_regex.sub('https:', infobox_id)
+
     # entity
     entity = search_res.get('Entity', None)
     # TODO continent / country / department / location / waterfall /
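Aside: the http_regex added above only rewrites the scheme of DuckDuckGo's infobox id, so the id lines up with the https-based ids used elsewhere and the infoboxes can be merged. A minimal standalone sketch of that normalization (the id value is illustrative):

    from re import compile

    http_regex = compile(r'^http:')

    # hypothetical infobox id as DuckDuckGo might return it
    infobox_id = 'http://definition.url'
    # same substitution as in the hunk above: force https so ids match when merging
    infobox_id = http_regex.sub('https:', infobox_id)
    print(infobox_id)  # -> https://definition.url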
@@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
         results.append({'title': title, 'url': official_website})
 
     wikipedia_link_count = 0
-    if language != 'en':
-        wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (' + language + ')',
-                                        get_wikilink(result, language +
-                                                     'wiki'))
-    wikipedia_en_link = get_wikilink(result, 'enwiki')
+    wikipedia_link = get_wikilink(result, language + 'wiki')
     wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (en)',
-                                    wikipedia_en_link)
+                                    'Wikipedia (' + language + ')',
+                                    wikipedia_link)
+    if language != 'en':
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
+        wikipedia_link_count += add_url(urls,
+                                        'Wikipedia (en)',
+                                        wikipedia_en_link)
     if wikipedia_link_count == 0:
         misc_language = get_wiki_firstlanguage(result, 'wiki')
         if misc_language is not None:
@@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     else:
         results.append({
                        'infobox': title,
-                       'id': wikipedia_en_link,
+                       'id': wikipedia_link,
                        'content': description,
                        'attributes': attributes,
                        'urls': urls
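For context, the reordered block above prefers the localized Wikipedia article as the infobox id and only adds the English article as an extra URL when the UI language is not English. A rough standalone sketch of that selection logic (the plain-dict wikilinks helper and sample URLs are illustrative, not searx API):

    def pick_wikipedia_links(wikilinks, language):
        # wikilinks is an illustrative plain dict, e.g.
        # {'frwiki': 'https://fr.wikipedia.org/wiki/Paris',
        #  'enwiki': 'https://en.wikipedia.org/wiki/Paris'}
        urls = []
        wikipedia_link = wikilinks.get(language + 'wiki')
        if wikipedia_link:
            urls.append(('Wikipedia (' + language + ')', wikipedia_link))
        if language != 'en' and wikilinks.get('enwiki'):
            urls.append(('Wikipedia (en)', wikilinks['enwiki']))
        # the localized link (when found) also serves as the infobox id
        return wikipedia_link, urls

    print(pick_wikipedia_links({'frwiki': 'https://fr.wikipedia.org/wiki/Paris',
                                'enwiki': 'https://en.wikipedia.org/wiki/Paris'}, 'fr'))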
searx/engines/wikipedia.py (new file, 114 lines)
@@ -0,0 +1,114 @@
+"""
+ Wikipedia (Web)
+
+ @website https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api yes
+ @results JSON
+ @stable yes
+ @parse url, infobox
+"""
+
+from json import loads
+from urllib import urlencode, quote
+
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_postfix = 'w/api.php?'\
+    'action=query'\
+    '&format=json'\
+    '&{query}'\
+    '&prop=extracts|pageimages'\
+    '&exintro'\
+    '&explaintext'\
+    '&pithumbsize=300'\
+    '&redirects'
+
+
+# set language in base_url
+def url_lang(lang):
+    if lang == 'all':
+        language = 'en'
+    else:
+        language = lang.split('_')[0]
+
+    return base_url.format(language=language)
+
+
+# do search-request
+def request(query, params):
+    if query.islower():
+        query += '|' + query.title()
+
+    params['url'] = url_lang(params['language']) \
+        + search_postfix.format(query=urlencode({'titles': query}))
+
+    return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+    first_paragraph = None
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+
+        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            first_paragraph = paragraph
+            break
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.content)
+
+    # wikipedia article's unique id
+    # first valid id is assumed to be the requested article
+    for article_id in search_result['query']['pages']:
+        page = search_result['query']['pages'][article_id]
+        if int(article_id) > 0:
+            break
+
+    if int(article_id) < 0:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    extract = page.get('extract')
+
+    summary = extract_first_paragraph(extract, title, image)
+    if not summary:
+        return []
+
+    # link to wikipedia article
+    # parenthesis are not quoted to make infobox mergeable with wikidata's
+    wikipedia_link = url_lang(resp.search_params['language']) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
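The extract_first_paragraph heuristic in the new engine can be tried in isolation. A minimal sketch with the same thresholds but made-up inputs (the function name and sample extracts are illustrative, stdlib only):

    def first_meaningful_paragraph(content, title, has_image):
        # same idea as extract_first_paragraph above: skip short lead-in lines
        # (hatnotes, disambiguation entries) and give up after a few failed attempts
        failed_attempts = 0
        for paragraph in content.split('\n'):
            starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
            length = len(paragraph)
            if length >= 200 or (starts_with_title >= 0 and (has_image or length >= 150)):
                return paragraph
            failed_attempts += 1
            if failed_attempts > 3:
                return None
        return None

    # a long lead paragraph passes the filter...
    print(first_meaningful_paragraph('Paris is the capital of France. ' * 10, 'Paris', True))
    # ...while a short disambiguation-style extract without an image is rejected
    print(first_meaningful_paragraph('Paris can be:\nParis, France\nParis, Texas\n'
                                     'Paris, Ontario\nParis, Kentucky', 'Paris', False))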
@@ -37,7 +37,7 @@ def merge_two_infoboxes(infobox1, infobox2):
         urls1 = infobox1.get('urls', None)
         if urls1 is None:
             urls1 = []
-            infobox1.set('urls', urls1)
+            infobox1['urls'] = urls1
 
         urlSet = set()
         for url in infobox1.get('urls', []):
@@ -47,11 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
             if url.get('url', None) not in urlSet:
                 urls1.append(url)
 
+    if 'img_src' in infobox2:
+        img1 = infobox1.get('img_src', None)
+        img2 = infobox2.get('img_src')
+        if img1 is None:
+            infobox1['img_src'] = img2
+
     if 'attributes' in infobox2:
         attributes1 = infobox1.get('attributes', None)
         if attributes1 is None:
             attributes1 = []
-            infobox1.set('attributes', attributes1)
+            infobox1['attributes'] = attributes1
 
         attributeSet = set()
         for attribute in infobox1.get('attributes', []):
@@ -68,7 +74,7 @@ def merge_two_infoboxes(infobox1, infobox2):
             if result_content_len(content2) > result_content_len(content1):
                 infobox1['content'] = content2
         else:
-            infobox1.set('content', content2)
+            infobox1['content'] = content2
 
 
 def result_score(result):
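The new img_src branch in merge_two_infoboxes keeps the first infobox's image and only falls back to the second infobox's image. A minimal dict-based sketch of that rule (the helper name and sample data are illustrative):

    def merge_img_src(infobox1, infobox2):
        # mirrors the branch above: only take infobox2's image when infobox1 has none
        if 'img_src' in infobox2:
            if infobox1.get('img_src', None) is None:
                infobox1['img_src'] = infobox2.get('img_src')
        return infobox1

    wikidata_box = {'infobox': 'The Title', 'img_src': None}
    wikipedia_box = {'infobox': 'The Title', 'img_src': 'img_src.jpg'}
    print(merge_img_src(wikidata_box, wikipedia_box))  # picks up 'img_src.jpg'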
@@ -43,10 +43,9 @@ engines:
     shortcut : bs
 
   - name : wikipedia
-    engine : mediawiki
+    engine : wikipedia
     shortcut : wp
     base_url : 'https://{language}.wikipedia.org/'
-    number_of_results : 1
 
   - name : bing
     engine : bing
@@ -93,6 +92,7 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+    disabled : True
 
   - name : digg
     engine : digg
@@ -1,8 +1,9 @@
 <div class="panel panel-default infobox">
     <div class="panel-heading">
-        <h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4>
+        <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi>
     </div>
     <div class="panel-body">
+        <bdi>
         {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %}
         {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %}
 
@@ -28,5 +29,6 @@
         {% endfor %}
         </div>
         {% endif %}
+        </bdi>
     </div>
 </div>
@@ -123,7 +123,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
         self.assertEqual(results[1]['url'], 'result first url')
         self.assertEqual(results[2]['suggestion'], 'text')
         self.assertEqual(results[3]['infobox'], 'heading')
-        self.assertEqual(results[3]['id'], 'http://definition.url')
+        self.assertEqual(results[3]['id'], 'https://definition.url')
         self.assertEqual(results[3]['entity'], 'Entity')
         self.assertIn('abstract', results[3]['content'])
         self.assertIn('this is the definition', results[3]['content'])
@@ -240,7 +240,7 @@ class TestDDGDefinitionsEngine(SearxTestCase):
         self.assertEqual(type(results), list)
         self.assertEqual(len(results), 1)
         self.assertEqual(results[0]['infobox'], 'heading')
-        self.assertEqual(results[0]['id'], 'http://definition.url')
+        self.assertEqual(results[0]['id'], 'https://definition.url')
         self.assertEqual(results[0]['entity'], 'Entity')
         self.assertIn('abstract', results[0]['content'])
         self.assertIn('this is the definition', results[0]['content'])
tests/unit/engines/test_wikipedia.py (new file, 160 lines)
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import wikipedia
+from searx.testing import SearxTestCase
+
+
+class TestWikipediaEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr_FR'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('test_query', params['url'])
+        self.assertIn('Test_Query', params['url'])
+        self.assertIn('fr.wikipedia.org', params['url'])
+
+        query = 'Test_Query'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('Test_Query', params['url'])
+        self.assertNotIn('test_query', params['url'])
+
+        dicto['language'] = 'all'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('en', params['url'])
+
+    def test_response(self):
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr'
+
+        self.assertRaises(AttributeError, wikipedia.response, None)
+        self.assertRaises(AttributeError, wikipedia.response, [])
+        self.assertRaises(AttributeError, wikipedia.response, '')
+        self.assertRaises(AttributeError, wikipedia.response, '[]')
+
+        # page not found
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "-1": {
+                        "ns": 0,
+                        "title": "",
+                        "missing": ""
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        self.assertEqual(wikipedia.response(response), [])
+
+        # normal case
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is...",
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[0]['url'])
+        self.assertEqual(results[1]['infobox'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[1]['id'])
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], 'img_src.jpg')
+
+        # disambiguation page
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title can be:\\nThe Title 1\\nThe Title 2\\nThe Title 3\\nThe Title 4......................................................................................................................................." """  # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 0)
+
+        # no image
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is......................................................................................................................................................................................." """  # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], None)
+
+        # title not in first paragraph
+        json = u"""
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "披頭四樂隊",
+                        "extract": "披头士乐队....................................................................................................................................................................................................\\n披頭四樂隊...", """  # noqa
+        json += """
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[1]['infobox'], u'披頭四樂隊')
+        self.assertIn(u'披头士乐队...', results[1]['content'])