forked from zaclys/searxng
		
	[mod] result.py: merge infobox URLs and attributes when they have the same label or the same entity
Entities are Wikidata entity IDs (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42).
This commit is contained in:
		
							parent
							
								
									23f4203dfb
								
							
						
					
					
						commit
						382fded665
					
				
					 1 changed files with 26 additions and 8 deletions
				
			
		| 
						 | 
					@ -20,6 +20,18 @@ def result_content_len(content):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def compare_urls(url_a, url_b):
 | 
					def compare_urls(url_a, url_b):
 | 
				
			||||||
 | 
					    """Lazy compare between two URL.
 | 
				
			||||||
 | 
					    "www.example.com" and "example.com" are equals.
 | 
				
			||||||
 | 
					    "www.example.com/path/" and "www.example.com/path" are equals.
 | 
				
			||||||
 | 
					    "https://www.example.com/" and "http://www.example.com/" are equals.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Args:
 | 
				
			||||||
 | 
					        url_a (ParseResult): first URL
 | 
				
			||||||
 | 
					        url_b (ParseResult): second URL
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Returns:
 | 
				
			||||||
 | 
					        bool: True if url_a and url_b are equals
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
    # ignore www. in comparison
 | 
					    # ignore www. in comparison
 | 
				
			||||||
    if url_a.netloc.startswith('www.'):
 | 
					    if url_a.netloc.startswith('www.'):
 | 
				
			||||||
        host_a = url_a.netloc.replace('www.', '', 1)
 | 
					        host_a = url_a.netloc.replace('www.', '', 1)
 | 
				
			||||||
| 
						 | 
					@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
 | 
				
			||||||
        for url2 in infobox2.get('urls', []):
 | 
					        for url2 in infobox2.get('urls', []):
 | 
				
			||||||
            unique_url = True
 | 
					            unique_url = True
 | 
				
			||||||
            parsed_url2 = urlparse(url2.get('url', ''))
 | 
					            parsed_url2 = urlparse(url2.get('url', ''))
 | 
				
			||||||
 | 
					            entity_url2 = url2.get('entity')
 | 
				
			||||||
            for url1 in urls1:
 | 
					            for url1 in urls1:
 | 
				
			||||||
                if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
 | 
					                if (entity_url2 is not None and url1.get('entity') == entity_url2)\
 | 
				
			||||||
 | 
					                   or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
 | 
				
			||||||
                    unique_url = False
 | 
					                    unique_url = False
 | 
				
			||||||
                    break
 | 
					                    break
 | 
				
			||||||
            if unique_url:
 | 
					            if unique_url:
 | 
				
			||||||
| 
						 | 
					@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
 | 
				
			||||||
            infobox1['img_src'] = img2
 | 
					            infobox1['img_src'] = img2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if 'attributes' in infobox2:
 | 
					    if 'attributes' in infobox2:
 | 
				
			||||||
        attributes1 = infobox1.get('attributes', None)
 | 
					        attributes1 = infobox1.get('attributes')
 | 
				
			||||||
        if attributes1 is None:
 | 
					        if attributes1 is None:
 | 
				
			||||||
            attributes1 = []
 | 
					            infobox1['attributes'] = attributes1 = []
 | 
				
			||||||
            infobox1['attributes'] = attributes1
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        attributeSet = set()
 | 
					        attributeSet = set()
 | 
				
			||||||
        for attribute in infobox1.get('attributes', []):
 | 
					        for attribute in attributes1:
 | 
				
			||||||
            if attribute.get('label', None) not in attributeSet:
 | 
					            label = attribute.get('label')
 | 
				
			||||||
                attributeSet.add(attribute.get('label', None))
 | 
					            if label not in attributeSet:
 | 
				
			||||||
 | 
					                attributeSet.add(label)
 | 
				
			||||||
 | 
					            entity = attribute.get('entity')
 | 
				
			||||||
 | 
					            if entity not in attributeSet:
 | 
				
			||||||
 | 
					                attributeSet.add(entity)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for attribute in infobox2.get('attributes', []):
 | 
					        for attribute in infobox2.get('attributes', []):
 | 
				
			||||||
            if attribute.get('label', None) not in attributeSet:
 | 
					            if attribute.get('label') not in attributeSet\
 | 
				
			||||||
 | 
					               and attribute.get('entity') not in attributeSet:
 | 
				
			||||||
                attributes1.append(attribute)
 | 
					                attributes1.append(attribute)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if 'content' in infobox2:
 | 
					    if 'content' in infobox2:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue