From ad58b14be7cc9a1e95858e150e9d8005734d9232 Mon Sep 17 00:00:00 2001 From: marc Date: Mon, 27 Jun 2016 23:35:43 -0500 Subject: [PATCH] [fix] merge infoboxes based on weight also minor changes in attributes and images from wikidata --- searx/engines/wikidata.py | 24 +++++++++++++++++++----- searx/results.py | 18 +++++++++++++++++- searx/settings.yml | 2 ++ tests/unit/engines/test_wikidata.py | 22 ++++++++++++---------- 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index f10fc13f4..91040e218 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -35,7 +35,7 @@ url_detail = wikidata_api\ url_map = 'https://www.openstreetmap.org/'\ + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' -url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500' +url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' # xpaths wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title' @@ -162,6 +162,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): # INFOBOX ATTRIBUTES (ROWS) + # DATES # inception date add_attribute(attributes, result, 'P571', date=True) # dissolution date @@ -170,11 +171,14 @@ def getDetail(jsonresponse, wikidata_id, language, locale): add_attribute(attributes, result, 'P580', date=True) # end date add_attribute(attributes, result, 'P582', date=True) - # date of birth add_attribute(attributes, result, 'P569', date=True) # date of death add_attribute(attributes, result, 'P570', date=True) + # date of spacecraft launch + add_attribute(attributes, result, 'P619', date=True) + # date of spacecraft landing + add_attribute(attributes, result, 'P620', date=True) # nationality add_attribute(attributes, result, 'P27') @@ -201,7 +205,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale): # area add_attribute(attributes, result, 'P2046') # currency - add_attribute(attributes, result, 'P38') + add_attribute(attributes, result, 'P38', trim=True) # heigth (building) add_attribute(attributes, result, 'P2048') @@ -230,6 +234,10 @@ def getDetail(jsonresponse, wikidata_id, language, locale): add_attribute(attributes, result, 'P264') # publisher add_attribute(attributes, result, 'P123') + # original network + add_attribute(attributes, result, 'P449') + # distributor + add_attribute(attributes, result, 'P750') # composer add_attribute(attributes, result, 'P86') # publication date @@ -266,6 +274,10 @@ def getDetail(jsonresponse, wikidata_id, language, locale): add_attribute(attributes, result, 'P112') # legal form (company/organization) add_attribute(attributes, result, 'P1454') + # operator + add_attribute(attributes, result, 'P137') + # crew members (tripulation) + add_attribute(attributes, result, 'P1029') # taxon add_attribute(attributes, result, 'P225') # chemical formula @@ -300,8 +312,8 @@ def getDetail(jsonresponse, wikidata_id, language, locale): # only returns first match def add_image(result): - # P18: image, P154: logo, P242: map, P41: flag, P2716: collage, P2910: icon - property_ids = ['P18', 'P154', 'P242', 'P41', 'P2716', 'P2910'] + # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon + property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910'] for property_id in property_ids: image = result.xpath(property_xpath.replace('{propertyid}', property_id)) @@ -320,6 +332,7 @@ def add_attribute(attributes, result, property_id, default_label=None, date=Fals label = default_label else: label = extract_text(attribute[0].xpath(label_xpath)) + label = label[0].upper() + label[1:] if date: trim = True @@ -369,6 +382,7 @@ def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, dom_element = dom_element[0] if not default_label: label = extract_text(dom_element.xpath(label_xpath)) + label = label[0].upper() + label[1:] if link_type == 'geo': links.append(get_geolink(dom_element)) diff --git a/searx/results.py b/searx/results.py index bf4067b41..9a4ec0b28 100644 --- a/searx/results.py +++ b/searx/results.py @@ -43,6 +43,19 @@ def compare_urls(url_a, url_b): def merge_two_infoboxes(infobox1, infobox2): + # get engines weights + if hasattr(engines[infobox1['engine']], 'weight'): + weight1 = engines[infobox1['engine']].weight + else: + weight1 = 1 + if hasattr(engines[infobox2['engine']], 'weight'): + weight2 = engines[infobox2['engine']].weight + else: + weight2 = 1 + + if weight2 > weight1: + infobox1['engine'] = infobox2['engine'] + if 'urls' in infobox2: urls1 = infobox1.get('urls', None) if urls1 is None: @@ -64,6 +77,8 @@ def merge_two_infoboxes(infobox1, infobox2): img2 = infobox2.get('img_src') if img1 is None: infobox1['img_src'] = img2 + elif weight2 > weight1: + infobox1['img_src'] = img2 if 'attributes' in infobox2: attributes1 = infobox1.get('attributes', None) @@ -77,7 +92,8 @@ def merge_two_infoboxes(infobox1, infobox2): attributeSet.add(attribute.get('label', None)) for attribute in infobox2.get('attributes', []): - attributes1.append(attribute) + if attribute.get('label', None) not in attributeSet: + attributes1.append(attribute) if 'content' in infobox2: content1 = infobox1.get('content', None) diff --git a/searx/settings.yml b/searx/settings.yml index 34393e7c1..38e9f4752 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -105,6 +105,7 @@ engines: - name : ddg definitions engine : duckduckgo_definitions shortcut : ddd + weight : 2 disabled : True - name : digg @@ -127,6 +128,7 @@ engines: - name : wikidata engine : wikidata shortcut : wd + weight : 2 - name : duckduckgo engine : duckduckgo diff --git a/tests/unit/engines/test_wikidata.py b/tests/unit/engines/test_wikidata.py index 99d8540cf..ec5f52ef9 100644 --- a/tests/unit/engines/test_wikidata.py +++ b/tests/unit/engines/test_wikidata.py @@ -95,14 +95,14 @@ class TestWikidataEngine(SearxTestCase): results = wikidata.getDetail(response, "Q123", "yua", "yua_MX") self.assertEqual(len(results), 2) - self.assertEqual(results[0]['title'], 'official website') + self.assertEqual(results[0]['title'], 'Official website') self.assertEqual(results[0]['url'], 'https://officialsite.com') self.assertEqual(results[1]['infobox'], 'Test') self.assertEqual(results[1]['id'], None) self.assertEqual(results[1]['content'], 'Description') self.assertEqual(results[1]['attributes'], []) - self.assertEqual(results[1]['urls'][0]['title'], 'official website') + self.assertEqual(results[1]['urls'][0]['title'], 'Official website') self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com') self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)') self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test') @@ -141,7 +141,8 @@ class TestWikidataEngine(SearxTestCase): html_etree = fromstring(html) image_src = wikidata.add_image(html_etree) - self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500") + self.assertEqual(image_src, + "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400") html = u"""
@@ -196,7 +197,8 @@ class TestWikidataEngine(SearxTestCase): html_etree = fromstring(html) image_src = wikidata.add_image(html_etree) - self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500") + self.assertEqual(image_src, + "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400") def test_add_attribute(self): html = u""" @@ -234,7 +236,7 @@ class TestWikidataEngine(SearxTestCase): wikidata.add_attribute(attributes, html_etree, "P27") self.assertEqual(len(attributes), 1) - self.assertEqual(attributes[0]["label"], "country of citizenship") + self.assertEqual(attributes[0]["label"], "Country of citizenship") self.assertEqual(attributes[0]["value"], "United Kingdom") html = u""" @@ -269,7 +271,7 @@ class TestWikidataEngine(SearxTestCase): html_etree = fromstring(html) wikidata.add_attribute(attributes, html_etree, "P569", date=True) self.assertEqual(len(attributes), 1) - self.assertEqual(attributes[0]["label"], "date of birth") + self.assertEqual(attributes[0]["label"], "Date of birth") self.assertEqual(attributes[0]["value"], "27 January 1832") html = u""" @@ -317,7 +319,7 @@ class TestWikidataEngine(SearxTestCase): html_etree = fromstring(html) wikidata.add_attribute(attributes, html_etree, "P6") self.assertEqual(len(attributes), 1) - self.assertEqual(attributes[0]["label"], "head of government") + self.assertEqual(attributes[0]["label"], "Head of government") self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister") attributes = [] @@ -355,7 +357,7 @@ class TestWikidataEngine(SearxTestCase): html_etree = fromstring(html) wikidata.add_url(urls, html_etree, 'P856') self.assertEquals(len(urls), 1) - self.assertIn({'title': 'official website', 'url': 'https://searx.me/'}, urls) + self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls) urls = [] results = [] wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results) @@ -403,8 +405,8 @@ class TestWikidataEngine(SearxTestCase): html_etree = fromstring(html) wikidata.add_url(urls, html_etree, 'P856') self.assertEquals(len(urls), 2) - self.assertIn({'title': 'official website', 'url': 'http://www.worldofwarcraft.com'}, urls) - self.assertIn({'title': 'official website', 'url': 'http://eu.battle.net/wow/en/'}, urls) + self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls) + self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls) def test_get_imdblink(self): html = u"""