mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	Perso branch - added cjk check and enclosement
This commit is contained in:
		
							parent
							
								
									afd71a6c0f
								
							
						
					
					
						commit
						6c421110b5
					
				
					 1 changed files with 50 additions and 16 deletions
				
			
		|  | @ -113,31 +113,65 @@ def prettify_url(url, max_length=74): | |||
|         return url | ||||
| 
 | ||||
| 
 | ||||
# Precompiled once at import time: a single character class spanning the
# Unicode blocks this module treats as Chinese, Japanese or Korean text.
_CJK_PATTERN = re.compile(
    '['
    '\u4e00-\u9fff'  # CJK Unified Ideographs (Chinese hanzi, Japanese kanji)
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\uac00-\ud7af'  # Korean hangul syllables
    '\u1100-\u11ff'  # Korean hangul jamo
    ']'
)


def contains_cjko(s: str) -> bool:
    """Check whether a string contains Chinese, Japanese or Korean characters.

    The check is a regex search against a character class made of the
    Unicode ranges for CJK unified ideographs, hiragana, katakana, hangul
    syllables and hangul jamo.

    Args:
        s (str): string to be checked.

    Returns:
        bool: True if at least one character of s falls in one of those
        ranges, False otherwise (including for the empty string).
    """
    return bool(_CJK_PATTERN.search(s))
| 
 | ||||
| 
 | ||||
def regex_highlight_cjk(word: str) -> str:
    """Generate the regex pattern used to highlight a given word.

    The pattern depends on whether the word contains CJK characters:

    - If it does, the pattern matches the word anywhere in the text,
      because CJK scripts do not separate words with whitespace.
    - If it does not, the pattern matches the word only as a standalone
      token, i.e. when it is neither preceded nor followed by a word
      character.

    In both cases the word is regex-escaped and wrapped in one capturing
    group.

    Args:
        word (str): the word to be matched with regex pattern.

    Returns:
        str: the regex pattern for the word.
    """
    rword = re.escape(word)
    # Escaping only adds backslashes, so checking the raw word is equivalent.
    if contains_cjko(word):
        return fr'({rword})'
    # A negative lookbehind is used instead of ``\b``: for a word starting
    # with a word character both are equivalent, but ``\b`` in front of a
    # word starting with punctuation (e.g. ".net") would *require* a
    # preceding word character and so never match the standalone word.
    return fr'(?<!\w)({rword})(?!\w)'
| 
 | ||||
| 
 | ||||
def highlight_content(content, query):
    """Wrap every occurrence of the query words in a highlight ``<span>``.

    The query is split on whitespace and double quotes are removed from
    each word. Each remaining word is turned into a pattern by
    ``regex_highlight_cjk`` and all patterns are applied, ignoring case,
    in one substitution pass over the content.

    Args:
        content: text to highlight.
        query: the search query whose words are highlighted.

    Returns:
        None if content is empty; content unchanged if it contains ``<``
        or if the query has no usable word; otherwise content with every
        match wrapped in ``<span class="highlight">...</span>``.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Strip double quotes so that quoted phrases still highlight their words.
    words = [chunk.replace('"', '') for chunk in query.split()]
    # Deduplicate while keeping order, then try longer words first so that a
    # short word cannot pre-empt a longer one starting at the same position.
    words = sorted(dict.fromkeys(w for w in words if w), key=len, reverse=True)
    if not words:
        return content

    pattern = '|'.join(regex_highlight_cjk(word) for word in words)

    # One pass only: the inserted markup is never re-scanned, so query words
    # such as "span" or "class" cannot match inside it. A function is used as
    # replacement so that backslashes in the matched text are not interpreted
    # as template escapes.
    return re.sub(
        pattern,
        lambda match: f'<span class="highlight">{match.group(0)}</span>',
        content,
        flags=re.I | re.U,
    )
| 
 | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 ahmad-alkadri
						ahmad-alkadri