forked from zaclys/searxng
		
	[mod] result.py: merge infobox URLs and attributes when they have the same label or the same entity
Entities are Wikidata entities (like "Q42" for "Douglas Adams", see https://www.wikidata.org/wiki/Q42)
This commit is contained in:
		
							parent
							
								
									23f4203dfb
								
							
						
					
					
						commit
						382fded665
					
				
					 1 changed files with 26 additions and 8 deletions
				
			
		|  | @ -20,6 +20,18 @@ def result_content_len(content): | |||
| 
 | ||||
| 
 | ||||
| def compare_urls(url_a, url_b): | ||||
|     """Lazy compare between two URL. | ||||
|     "www.example.com" and "example.com" are equals. | ||||
|     "www.example.com/path/" and "www.example.com/path" are equals. | ||||
|     "https://www.example.com/" and "http://www.example.com/" are equals. | ||||
| 
 | ||||
|     Args: | ||||
|         url_a (ParseResult): first URL | ||||
|         url_b (ParseResult): second URL | ||||
| 
 | ||||
|     Returns: | ||||
|         bool: True if url_a and url_b are equals | ||||
|     """ | ||||
|     # ignore www. in comparison | ||||
|     if url_a.netloc.startswith('www.'): | ||||
|         host_a = url_a.netloc.replace('www.', '', 1) | ||||
|  | @ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2): | |||
|         for url2 in infobox2.get('urls', []): | ||||
|             unique_url = True | ||||
|             parsed_url2 = urlparse(url2.get('url', '')) | ||||
|             entity_url2 = url2.get('entity') | ||||
|             for url1 in urls1: | ||||
|                 if compare_urls(urlparse(url1.get('url', '')), parsed_url2): | ||||
|                 if (entity_url2 is not None and url1.get('entity') == entity_url2)\ | ||||
|                    or compare_urls(urlparse(url1.get('url', '')), parsed_url2): | ||||
|                     unique_url = False | ||||
|                     break | ||||
|             if unique_url: | ||||
|  | @ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2): | |||
|             infobox1['img_src'] = img2 | ||||
| 
 | ||||
|     if 'attributes' in infobox2: | ||||
|         attributes1 = infobox1.get('attributes', None) | ||||
|         attributes1 = infobox1.get('attributes') | ||||
|         if attributes1 is None: | ||||
|             attributes1 = [] | ||||
|             infobox1['attributes'] = attributes1 | ||||
|             infobox1['attributes'] = attributes1 = [] | ||||
| 
 | ||||
|         attributeSet = set() | ||||
|         for attribute in infobox1.get('attributes', []): | ||||
|             if attribute.get('label', None) not in attributeSet: | ||||
|                 attributeSet.add(attribute.get('label', None)) | ||||
|         for attribute in attributes1: | ||||
|             label = attribute.get('label') | ||||
|             if label not in attributeSet: | ||||
|                 attributeSet.add(label) | ||||
|             entity = attribute.get('entity') | ||||
|             if entity not in attributeSet: | ||||
|                 attributeSet.add(entity) | ||||
| 
 | ||||
|         for attribute in infobox2.get('attributes', []): | ||||
|             if attribute.get('label', None) not in attributeSet: | ||||
|             if attribute.get('label') not in attributeSet\ | ||||
|                and attribute.get('entity') not in attributeSet: | ||||
|                 attributes1.append(attribute) | ||||
| 
 | ||||
|     if 'content' in infobox2: | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Alexandre Flament
						Alexandre Flament