mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	Perso branch - added cjk check and enclosement
This commit is contained in:
		
							parent
							
								
									afd71a6c0f
								
							
						
					
					
						commit
						6c421110b5
					
				
					 1 changed files with 50 additions and 16 deletions
				
			
		|  | @ -113,31 +113,65 @@ def prettify_url(url, max_length=74): | ||||||
|         return url |         return url | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
# Compiled once at import time instead of on every call.  The escapes are in
# non-raw strings on purpose so that the actual characters (not the backslash
# sequences) end up inside the character class.
_CJKO_PATTERN = re.compile(
    '['
    '\u4e00-\u9fff'  # CJK Unified Ideographs (Chinese hanzi, Japanese kanji)
    '\u3400-\u4dbf'  # CJK Unified Ideographs Extension A
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\uac00-\ud7af'  # Korean hangul syllables
    '\u1100-\u11ff'  # Korean hangul jamo
    '\u3130-\u318f'  # Korean hangul compatibility jamo
    ']'
)


def contains_cjko(s: str) -> bool:
    """Check whether a string contains Chinese, Japanese or Korean characters.

    The check is a regex search against a character class built from the
    Unicode ranges listed in ``_CJKO_PATTERN``.

    Args:
        s (str): string to be checked.

    Returns:
        bool: True if ``s`` contains at least one character from the ranges,
        False otherwise (including for the empty string).
    """
    return bool(_CJKO_PATTERN.search(s))
|  | 
 | ||||||
|  | 
 | ||||||
def regex_highlight_cjk(word: str) -> str:
    """Generate the regex pattern used to highlight a given word.

    The pattern depends on whether the word contains CJK characters:

    - If the word contains CJK characters, the pattern matches the word
      anywhere in the text, because such scripts are not written with
      whitespace between words.
    - Otherwise the pattern matches the word only when it stands alone,
      i.e. when it is neither preceded nor followed by a word character.

    In both cases the word itself is captured as group 1.

    Args:
        word (str): the word to be matched with regex pattern.

    Returns:
        str: the regex pattern for the word.
    """
    rword = re.escape(word)
    if contains_cjko(word):
        return fr'({rword})'
    # ``(?<!\w)`` instead of ``\b``: for a word starting with a word character
    # both are equivalent, but ``\b`` can never match in front of a term that
    # starts with a non-word character (e.g. ".net" or "#tag") when it follows
    # whitespace or the start of the text.
    return fr'(?<!\w)({rword})(?!\w)'
|  | 
 | ||||||
|  | 
 | ||||||
def highlight_content(content, query):
    """Wrap every occurrence of the query terms in ``content`` in a highlight span.

    Args:
        content: text to highlight.  Falsy content yields ``None``; content
            containing ``<`` is treated as HTML and returned unchanged.
        query: search query; it is split on whitespace and each term is
            highlighted according to ``regex_highlight_cjk``.

    Returns:
        ``None`` for falsy content, otherwise the content with each match
        wrapped in ``<span class="highlight">…</span>``.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Double quotes are query syntax (phrase search), not text to look for.
    terms = [chunk.replace('"', '') for chunk in query.split()]
    # Drop empty terms and duplicates; try longer terms first so that a term
    # which is a prefix of another one does not shadow the longer match.
    terms = sorted(dict.fromkeys(term for term in terms if term), key=len, reverse=True)
    if not terms:
        return content

    query_regex = '|'.join(regex_highlight_cjk(term) for term in terms)

    def _wrap(match):
        # A callable replacement keeps backslashes in the matched text literal;
        # in a replacement template they would be interpreted as escapes.
        return f'<span class="highlight">{match.group(0)}</span>'

    # One pass over the original text: the markup inserted for one term can
    # never be matched again by another term such as "span" or "class".
    return re.sub(query_regex, _wrap, content, flags=re.I | re.U)
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 ahmad-alkadri
						ahmad-alkadri