mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	Merge pull request #547 from return42/fix-442
[fix] google & google video engines
This commit is contained in:
		
						commit
						328473befd
					
				
					 2 changed files with 32 additions and 36 deletions
				
			
		|  | @ -138,12 +138,7 @@ content_xpath = './/div[@class="IsZvec"]' | |||
| 
 | ||||
| # Suggestions are links placed in a *card-section*, we extract only the text | ||||
| # from the links, not the links themselves. | ||||
| suggestion_xpath = '//div[contains(@class, "card-section")]//a' | ||||
| 
 | ||||
| # Since google does *auto-correction* on the first query these are not really | ||||
| # *spelling suggestions*, we use them anyway. | ||||
| spelling_suggestion_xpath = '//div[@class="med"]/p/a' | ||||
| 
 | ||||
| suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' | ||||
| 
 | ||||
| def get_lang_info(params, lang_list, custom_aliases, supported_any_language): | ||||
|     """Composing various language properties for the google engines. | ||||
|  | @ -322,7 +317,6 @@ def response(resp): | |||
| 
 | ||||
|     # convert the text to dom | ||||
|     dom = html.fromstring(resp.text) | ||||
| 
 | ||||
|     # results --> answer | ||||
|     answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') | ||||
|     if answer_list: | ||||
|  | @ -379,9 +373,6 @@ def response(resp): | |||
|         # append suggestion | ||||
|         results.append({'suggestion': extract_text(suggestion)}) | ||||
| 
 | ||||
|     for correction in eval_xpath_list(dom, spelling_suggestion_xpath): | ||||
|         results.append({'correction': extract_text(correction)}) | ||||
| 
 | ||||
|     # return results | ||||
|     return results | ||||
| 
 | ||||
|  |  | |||
|  | @ -31,13 +31,9 @@ from searx.engines.google import ( | |||
|     get_lang_info, | ||||
|     time_range_dict, | ||||
|     filter_mapping, | ||||
|     results_xpath, | ||||
|     g_section_with_header, | ||||
|     title_xpath, | ||||
|     href_xpath, | ||||
|     content_xpath, | ||||
|     suggestion_xpath, | ||||
|     spelling_suggestion_xpath, | ||||
|     detect_google_sorry, | ||||
| ) | ||||
| 
 | ||||
|  | @ -74,11 +70,27 @@ def _re(regexpr): | |||
|     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) | ||||
|     return RE_CACHE[regexpr] | ||||
| 
 | ||||
| 
 | ||||
| def scrap_out_thumbs_src(dom): | ||||
|     ret_val = {} | ||||
|     thumb_name = 'dimg_' | ||||
|     for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): | ||||
|         _script = script.text | ||||
|         # "dimg_35":"https://i.ytimg.c....", | ||||
|         _dimurl = _re("s='([^']*)").findall( _script) | ||||
|         for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script): | ||||
|             v = v.replace(r'\u003d','=') | ||||
|             v = v.replace(r'\u0026','&') | ||||
|             ret_val[k] = v | ||||
|     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) | ||||
|     return ret_val | ||||
| 
 | ||||
| 
 | ||||
| def scrap_out_thumbs(dom): | ||||
|     """Scrape thumbnail data out of <script> tags. | ||||
|     """ | ||||
|     ret_val = {} | ||||
|     thumb_name = 'vidthumb' | ||||
|     thumb_name = 'dimg_' | ||||
| 
 | ||||
|     for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): | ||||
|         _script = script.text | ||||
|  | @ -88,20 +100,11 @@ def scrap_out_thumbs(dom): | |||
|         if not _imgdata: | ||||
|             continue | ||||
| 
 | ||||
|         # var ii=['vidthumb4','vidthumb7'] | ||||
|         # var ii=['dimg_17'] | ||||
|         for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): | ||||
|             # At least the equal sign in the URL needs to be decoded | ||||
|             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") | ||||
| 
 | ||||
|     # {google.ldidly=-1;google.ldi={"vidthumb8":"https://... | ||||
|     for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): | ||||
|         _script = script.text | ||||
|         for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) : | ||||
|             match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val) | ||||
|             if match: | ||||
|                 # At least the equal sign in the URL needs to be decoded | ||||
|                 ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=") | ||||
| 
 | ||||
|     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) | ||||
|     return ret_val | ||||
| 
 | ||||
|  | @ -145,9 +148,11 @@ def response(resp): | |||
|     # convert the text to dom | ||||
|     dom = html.fromstring(resp.text) | ||||
|     vidthumb_imgdata = scrap_out_thumbs(dom) | ||||
|     thumbs_src = scrap_out_thumbs_src(dom) | ||||
|     logger.debug(str(thumbs_src)) | ||||
| 
 | ||||
|     # parse results | ||||
|     for result in eval_xpath_list(dom, results_xpath): | ||||
|     for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): | ||||
| 
 | ||||
|         # google *sections* | ||||
|         if extract_text(eval_xpath(result, g_section_with_header)): | ||||
|  | @ -155,21 +160,24 @@ def response(resp): | |||
|             continue | ||||
| 
 | ||||
|         title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) | ||||
|         url = eval_xpath_getindex(result, href_xpath, 0) | ||||
|         c_node = eval_xpath_getindex(result, content_xpath, 0) | ||||
|         url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) | ||||
| 
 | ||||
|         # <img id="vidthumb1" ...> | ||||
|         img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None) | ||||
|         img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) | ||||
|         if img_id is None: | ||||
|             logger.error("no img_id for: %s" % result) | ||||
|             continue | ||||
| 
 | ||||
|         img_src = vidthumb_imgdata.get(img_id, None) | ||||
|         if not img_src: | ||||
|             logger.error("no vidthumb imgdata for: %s" % img_id) | ||||
|             img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0) | ||||
|             img_src = thumbs_src.get(img_id, "") | ||||
| 
 | ||||
|         length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]')) | ||||
|         content = extract_text(eval_xpath(c_node, './/div[2]/span')) | ||||
|         pub_info = extract_text(eval_xpath(c_node, './/div[2]/div')) | ||||
|         length = extract_text(eval_xpath( | ||||
|             result, './/div[contains(@class, "P7xzyf")]/span/span')) | ||||
|         c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) | ||||
|         content = extract_text(c_node) | ||||
|         pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) | ||||
| 
 | ||||
|         results.append({ | ||||
|             'url':         url, | ||||
|  | @ -186,7 +194,4 @@ def response(resp): | |||
|         # append suggestion | ||||
|         results.append({'suggestion': extract_text(suggestion)}) | ||||
| 
 | ||||
|     for correction in eval_xpath_list(dom, spelling_suggestion_xpath): | ||||
|         results.append({'correction': extract_text(correction)}) | ||||
| 
 | ||||
|     return results | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Alexandre Flament
						Alexandre Flament