[fix] google video engine - rework of the HTML parser

The Google video response has changed slightly; a rework of the parser was needed.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

parent 488ace1da9
commit 1ce09df9aa

1 changed file with 31 additions and 22 deletions
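The core of the rework is a new helper, scrap_out_thumbs_src(), which collects thumbnail URLs from the inline google.ldi={...} <script> block instead of only the old vidthumb lookup. As a minimal sketch of that extraction, the snippet below applies the same regular expression and the same \u003d / \u0026 unescaping to an invented sample payload (the payload is made up for illustration; real pages embed a much larger object, and the engine caches the compiled pattern via its _re() helper):

import re

# Invented example payload; real pages embed a large google.ldi={...} object.
sample_script = r'''google.ldi={"dimg_35":"https://i.ytimg.com/vi/abc\u003dxyz?a\u00261"};'''

thumb_name = 'dimg_'
thumbs = {}
# same pattern the new helper uses: ("dimg_<n>") : ("http...")
for k, v in re.findall('(' + thumb_name + '[0-9]*)":"(http[^"]*)', sample_script):
    # Google escapes '=' and '&' inside the JS string literal
    v = v.replace(r'\u003d', '=')
    v = v.replace(r'\u0026', '&')
    thumbs[k] = v

print(thumbs)   # {'dimg_35': 'https://i.ytimg.com/vi/abc=xyz?a&1'}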
				
			
@@ -31,11 +31,8 @@ from searx.engines.google import (
     get_lang_info,
     time_range_dict,
     filter_mapping,
-    results_xpath,
     g_section_with_header,
     title_xpath,
-    href_xpath,
-    content_xpath,
     suggestion_xpath,
     detect_google_sorry,
 )
@@ -73,11 +70,27 @@ def _re(regexpr):
     RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
     return RE_CACHE[regexpr]
 
+
+def scrap_out_thumbs_src(dom):
+    ret_val = {}
+    thumb_name = 'dimg_'
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
+        _script = script.text
+        # "dimg_35":"https://i.ytimg.c....",
+        _dimurl = _re("s='([^']*)").findall( _script)
+        for k,v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)' ).findall(_script):
+            v = v.replace(r'\u003d','=')
+            v = v.replace(r'\u0026','&')
+            ret_val[k] = v
+    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
+    return ret_val
+
+
 def scrap_out_thumbs(dom):
     """Scrap out thumbnail data from <script> tags.
     """
     ret_val = {}
-    thumb_name = 'vidthumb'
+    thumb_name = 'dimg_'
 
     for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
         _script = script.text
@@ -87,20 +100,11 @@ def scrap_out_thumbs(dom):
         if not _imgdata:
             continue
 
-        # var ii=['vidthumb4','vidthumb7']
+        # var ii=['dimg_17']
         for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
             # At least the equal sign in the URL needs to be decoded
             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
 
-    # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
-    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
-        _script = script.text
-        for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
-            match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
-            if match:
-                # At least the equal sign in the URL needs to be decoded
-                ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
-
     logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
     return ret_val
 
@@ -144,9 +148,11 @@ def response(resp):
     # convert the text to dom
     dom = html.fromstring(resp.text)
     vidthumb_imgdata = scrap_out_thumbs(dom)
+    thumbs_src = scrap_out_thumbs_src(dom)
+    logger.debug(str(thumbs_src))
 
     # parse results
-    for result in eval_xpath_list(dom, results_xpath):
+    for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
 
         # google *sections*
         if extract_text(eval_xpath(result, g_section_with_header)):
@@ -154,21 +160,24 @@
             continue
 
         title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
-        url = eval_xpath_getindex(result, href_xpath, 0)
-        c_node = eval_xpath_getindex(result, content_xpath, 0)
+        url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
 
         # <img id="vidthumb1" ...>
-        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
         if img_id is None:
+            logger.error("no img_id for: %s" % result)
             continue
         img_src = vidthumb_imgdata.get(img_id, None)
         if not img_src:
             logger.error("no vidthumb imgdata for: %s" % img_id)
-            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
+            img_src = thumbs_src.get(img_id, "")
 
-        length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
-        content = extract_text(eval_xpath(c_node, './/div[2]/span'))
-        pub_info = extract_text(eval_xpath(c_node, './/div[2]/div'))
+        length = extract_text(eval_xpath(
+            result, './/div[contains(@class, "P7xzyf")]/span/span'))
+        c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
+        content = extract_text(c_node)
+        pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
 
         results.append({
             'url':         url,
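As a rough sanity check of the reworked selectors, the sketch below runs the same XPath expressions against a hand-written HTML fragment. The fragment is not real Google markup; it merely mimics the class names the parser now relies on (the "g " result container, dXiKIc, P7xzyf, Uroaid, Zg1NU) and a g-img/img element:

from lxml import html

fragment = """
<div class="g foo">
  <div class="dXiKIc"><a href="https://www.youtube.com/watch?v=xyz">Example video</a></div>
  <g-img><img id="dimg_35"></g-img>
  <div class="P7xzyf"><span><span>4:20</span></span></div>
  <div class="Uroaid">Some video description ...</div>
  <div class="Zg1NU">ExampleChannel - 2 weeks ago</div>
</div>
"""

dom = html.fromstring("<html><body>%s</body></html>" % fragment)

# same selectors as in the reworked response() above
result = dom.xpath('//div[contains(@class, "g ")]')[0]
url = result.xpath('.//div[@class="dXiKIc"]//a/@href')[0]
img_id = result.xpath('.//g-img/img/@id')[0]
length = result.xpath('.//div[contains(@class, "P7xzyf")]/span/span/text()')[0]
content = result.xpath('.//div[@class="Uroaid"]/text()')[0]
pub_info = result.xpath('.//div[@class="Zg1NU"]/text()')[0]

print(url, img_id, length, content.strip(), pub_info, sep="\n")

The engine itself reads these values through its eval_xpath*/extract_text helpers; plain xpath() calls are used here only to keep the sketch self-contained.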