mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	[enh] add re-usable func to filter text
This commit is contained in:
		
							parent
							
								
									0fb3f0e4ae
								
							
						
					
					
						commit
						0fa81fc782
					
				
					 6 changed files with 53 additions and 25 deletions
				
			
		|  | @ -132,6 +132,7 @@ from lxml import html | |||
| from searx import locales | ||||
| from searx.utils import ( | ||||
|     extract_text, | ||||
|     extr, | ||||
|     eval_xpath, | ||||
|     eval_xpath_list, | ||||
|     eval_xpath_getindex, | ||||
|  | @ -252,11 +253,7 @@ def response(resp): | |||
|     if brave_category in ('search', 'goggles'): | ||||
|         return _parse_search(resp) | ||||
| 
 | ||||
|     datastr = "" | ||||
|     for line in resp.text.split("\n"): | ||||
|         if "const data = " in line: | ||||
|             datastr = line.replace("const data = ", "").strip()[:-1] | ||||
|             break | ||||
|     datastr = extr(resp.text, "const data = ", ";\n").strip() | ||||
| 
 | ||||
|     json_data = js_variable_to_python(datastr) | ||||
|     json_resp = json_data[1]['data']['body']['response'] | ||||
|  |  | |||
|  | @ -392,7 +392,9 @@ def fetch_traits(engine_traits: EngineTraits): | |||
|     SearXNG's locale. | ||||
| 
 | ||||
|     """ | ||||
|     # pylint: disable=too-many-branches, too-many-statements | ||||
|     # pylint: disable=too-many-branches, too-many-statements, disable=import-outside-toplevel | ||||
|     from searx.utils import extr, js_variable_to_python | ||||
| 
 | ||||
|     # fetch regions | ||||
| 
 | ||||
|     engine_traits.all_locale = 'wt-wt' | ||||
|  | @ -403,11 +405,9 @@ def fetch_traits(engine_traits: EngineTraits): | |||
|     if not resp.ok:  # type: ignore | ||||
|         print("ERROR: response from DuckDuckGo is not OK.") | ||||
| 
 | ||||
|     pos = resp.text.find('regions:{') + 8  # type: ignore | ||||
|     js_code = resp.text[pos:]  # type: ignore | ||||
|     pos = js_code.find('}') + 1 | ||||
|     regions = json.loads(js_code[:pos]) | ||||
|     js_code = extr(resp.text, 'regions:', ',snippetLengths') | ||||
| 
 | ||||
|     regions = json.loads(js_code) | ||||
|     for eng_tag, name in regions.items(): | ||||
| 
 | ||||
|         if eng_tag == 'wt-wt': | ||||
|  | @ -439,12 +439,9 @@ def fetch_traits(engine_traits: EngineTraits): | |||
| 
 | ||||
|     engine_traits.custom['lang_region'] = {} | ||||
| 
 | ||||
|     pos = resp.text.find('languages:{') + 10  # type: ignore | ||||
|     js_code = resp.text[pos:]  # type: ignore | ||||
|     pos = js_code.find('}') + 1 | ||||
|     js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') | ||||
|     languages = json.loads(js_code) | ||||
|     js_code = extr(resp.text, 'languages:', ',regions') | ||||
| 
 | ||||
|     languages = js_variable_to_python(js_code) | ||||
|     for eng_lang, name in languages.items(): | ||||
| 
 | ||||
|         if eng_lang == 'wt_WT': | ||||
|  |  | |||
|  | @ -312,13 +312,12 @@ def fetch_traits(engine_traits: EngineTraits): | |||
|     # pylint: disable=import-outside-toplevel | ||||
|     from searx import network | ||||
|     from searx.locales import region_tag | ||||
|     from searx.utils import extr | ||||
| 
 | ||||
|     resp = network.get(about['website']) | ||||
|     text = resp.text | ||||
|     text = text[text.find('INITIAL_PROPS') :] | ||||
|     text = text[text.find('{') : text.find('</script>')] | ||||
|     json_string = extr(resp.text, 'INITIAL_PROPS = ', '</script>') | ||||
| 
 | ||||
|     q_initial_props = loads(text) | ||||
|     q_initial_props = loads(json_string) | ||||
|     q_locales = q_initial_props.get('locales') | ||||
|     eng_tag_list = set() | ||||
| 
 | ||||
|  |  | |||
|  | @ -7,6 +7,8 @@ from urllib.parse import urlencode | |||
| from json import loads | ||||
| from dateutil import parser | ||||
| 
 | ||||
| from searx.utils import extr | ||||
| 
 | ||||
| # about | ||||
| about = { | ||||
|     "website": 'https://vimeo.com/', | ||||
|  | @ -23,7 +25,7 @@ paging = True | |||
| 
 | ||||
| # search-url | ||||
| base_url = 'https://vimeo.com/' | ||||
| search_url = base_url + '/search/page:{pageno}?{query}' | ||||
| search_url = base_url + 'search/page:{pageno}?{query}' | ||||
| 
 | ||||
| 
 | ||||
| # do search-request | ||||
|  | @ -36,9 +38,8 @@ def request(query, params): | |||
| # get response from search-request | ||||
| def response(resp): | ||||
|     results = [] | ||||
|     data_start_pos = resp.text.find('{"filtered"') | ||||
|     data_end_pos = resp.text.find(';\n', data_start_pos + 1) | ||||
|     data = loads(resp.text[data_start_pos:data_end_pos]) | ||||
| 
 | ||||
|     data = loads(extr(resp.text, 'var data = ', ';\n')) | ||||
| 
 | ||||
|     # parse results | ||||
|     for result in data['filtered']['data']: | ||||
|  |  | |||
|  | @ -7,6 +7,8 @@ from functools import reduce | |||
| from json import loads, dumps | ||||
| from urllib.parse import quote_plus | ||||
| 
 | ||||
| from searx.utils import extr | ||||
| 
 | ||||
| # about | ||||
| about = { | ||||
|     "website": 'https://www.youtube.com/', | ||||
|  | @ -109,8 +111,8 @@ def parse_next_page_response(response_text): | |||
| 
 | ||||
| def parse_first_page_response(response_text): | ||||
|     results = [] | ||||
|     results_data = response_text[response_text.find('ytInitialData') :] | ||||
|     results_data = results_data[results_data.find('{') : results_data.find(';</script>')] | ||||
|     results_data = extr(response_text, 'ytInitialData = ', ';</script>') | ||||
| 
 | ||||
|     results_json = loads(results_data) if results_data else {} | ||||
|     sections = ( | ||||
|         results_json.get('contents', {}) | ||||
|  |  | |||
|  | @ -2,6 +2,9 @@ | |||
| """Utility functions for the engines | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| from __future__ import annotations | ||||
| 
 | ||||
| import re | ||||
| import importlib | ||||
| import importlib.util | ||||
|  | @ -371,6 +374,35 @@ def convert_str_to_int(number_str: str) -> int: | |||
|     return 0 | ||||
| 
 | ||||
| 
 | ||||
| def extr(txt: str, begin: str, end: str, default: str = ""): | ||||
|     """Extract the string between ``begin`` and ``end`` from ``txt`` | ||||
| 
 | ||||
|     :param txt:     String to search in | ||||
|     :param begin:   First string to be searched for | ||||
|     :param end:     Second string to be searched for after ``begin`` | ||||
|     :param default: Default value if one of ``begin`` or ``end`` is not | ||||
|                     found.  Defaults to an empty string. | ||||
|     :return: The string between the two search-strings ``begin`` and ``end``. | ||||
|              If at least one of ``begin`` or ``end`` is not found, the value of | ||||
|              ``default`` is returned. | ||||
| 
 | ||||
|     Examples: | ||||
|       >>> extr("abcde", "a", "e") | ||||
|       "bcd" | ||||
|       >>> extr("abcde", "a", "z", default="nothing") | ||||
|       "nothing" | ||||
| 
 | ||||
|     """ | ||||
| 
 | ||||
|     # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129 | ||||
| 
 | ||||
|     try: | ||||
|         first = txt.index(begin) + len(begin) | ||||
|         return txt[first : txt.index(end, first)] | ||||
|     except ValueError: | ||||
|         return default | ||||
| 
 | ||||
| 
 | ||||
| def int_or_zero(num: Union[List[str], str]) -> int: | ||||
|     """Convert num to int or 0. num can be either a str or a list. | ||||
|     If num is a list, the first element is converted to int (or return 0 if the list is empty). | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Allen
						Allen