[mod] result.py: merge infobox URLs and attributes that have the same label or the same entity

Entities are Wikidata entities (like "Q42" for "Douglas Adams"; see https://www.wikidata.org/wiki/Q42).
This commit is contained in:
Alexandre Flament 2020-10-26 19:22:19 +01:00
parent 23f4203dfb
commit 382fded665
1 changed files with 26 additions and 8 deletions

View File

@ -20,6 +20,18 @@ def result_content_len(content):
def compare_urls(url_a, url_b): def compare_urls(url_a, url_b):
"""Lazy compare between two URL.
"www.example.com" and "example.com" are equals.
"www.example.com/path/" and "www.example.com/path" are equals.
"https://www.example.com/" and "http://www.example.com/" are equals.
Args:
url_a (ParseResult): first URL
url_b (ParseResult): second URL
Returns:
bool: True if url_a and url_b are equals
"""
# ignore www. in comparison # ignore www. in comparison
if url_a.netloc.startswith('www.'): if url_a.netloc.startswith('www.'):
host_a = url_a.netloc.replace('www.', '', 1) host_a = url_a.netloc.replace('www.', '', 1)
@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
for url2 in infobox2.get('urls', []): for url2 in infobox2.get('urls', []):
unique_url = True unique_url = True
parsed_url2 = urlparse(url2.get('url', '')) parsed_url2 = urlparse(url2.get('url', ''))
entity_url2 = url2.get('entity')
for url1 in urls1: for url1 in urls1:
if compare_urls(urlparse(url1.get('url', '')), parsed_url2): if (entity_url2 is not None and url1.get('entity') == entity_url2)\
or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
unique_url = False unique_url = False
break break
if unique_url: if unique_url:
@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['img_src'] = img2 infobox1['img_src'] = img2
if 'attributes' in infobox2: if 'attributes' in infobox2:
attributes1 = infobox1.get('attributes', None) attributes1 = infobox1.get('attributes')
if attributes1 is None: if attributes1 is None:
attributes1 = [] infobox1['attributes'] = attributes1 = []
infobox1['attributes'] = attributes1
attributeSet = set() attributeSet = set()
for attribute in infobox1.get('attributes', []): for attribute in attributes1:
if attribute.get('label', None) not in attributeSet: label = attribute.get('label')
attributeSet.add(attribute.get('label', None)) if label not in attributeSet:
attributeSet.add(label)
entity = attribute.get('entity')
if entity not in attributeSet:
attributeSet.add(entity)
for attribute in infobox2.get('attributes', []): for attribute in infobox2.get('attributes', []):
if attribute.get('label', None) not in attributeSet: if attribute.get('label') not in attributeSet\
and attribute.get('entity') not in attributeSet:
attributes1.append(attribute) attributes1.append(attribute)
if 'content' in infobox2: if 'content' in infobox2: