mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	[fix] engine: google has changed the layout of its response
Since 28. March google has changed its response, this patch fixes the google engine to scrap out the results & images from the new designed response. closes: https://github.com/searxng/searxng/issues/2287 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									1498202b0b
								
							
						
					
					
						commit
						6f9e678346
					
				
					 1 changed files with 26 additions and 45 deletions
				
			
		| 
						 | 
					@ -59,13 +59,10 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
 | 
				
			||||||
# specific xpath variables
 | 
					# specific xpath variables
 | 
				
			||||||
# ------------------------
 | 
					# ------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
results_xpath = './/div[@data-sokoban-container]'
 | 
					results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
 | 
				
			||||||
title_xpath = './/a/h3[1]'
 | 
					title_xpath = './/a/h3[1]'
 | 
				
			||||||
href_xpath = './/a[h3]/@href'
 | 
					href_xpath = './/a[h3]/@href'
 | 
				
			||||||
content_xpath = './/div[@data-content-feature]'
 | 
					content_xpath = './/div[@data-sncf]'
 | 
				
			||||||
 | 
					 | 
				
			||||||
# google *sections* are no usual *results*, we ignore them
 | 
					 | 
				
			||||||
g_section_with_header = './g-section-with-header'
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Suggestions are links placed in a *card-section*, we extract only the text
 | 
					# Suggestions are links placed in a *card-section*, we extract only the text
 | 
				
			||||||
# from the links not the links itself.
 | 
					# from the links not the links itself.
 | 
				
			||||||
| 
						 | 
					@ -303,21 +300,18 @@ def request(query, params):
 | 
				
			||||||
    return params
 | 
					    return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# (function(){var s='data:image/jpeg;base64,/9j/4AAQ ...
 | 
					# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
 | 
				
			||||||
# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
 | 
					# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
 | 
				
			||||||
RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
 | 
					RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _parse_data_images(dom):
 | 
					def _parse_data_images(dom):
 | 
				
			||||||
    data_image_map = {}
 | 
					    data_image_map = {}
 | 
				
			||||||
    for _script in eval_xpath_list(dom, "//script[@nonce]"):
 | 
					    for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
 | 
				
			||||||
        script = _script.text
 | 
					        end_pos = data_image.rfind('=')
 | 
				
			||||||
        if not script:
 | 
					        if end_pos > 0:
 | 
				
			||||||
            continue
 | 
					            data_image = data_image[: end_pos + 1]
 | 
				
			||||||
        script = RE_DATA_IMAGE.search(script)
 | 
					        data_image_map[img_id] = data_image
 | 
				
			||||||
        if not script:
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
        data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
 | 
					 | 
				
			||||||
    logger.debug('data:image objects --> %s', list(data_image_map.keys()))
 | 
					    logger.debug('data:image objects --> %s', list(data_image_map.keys()))
 | 
				
			||||||
    return data_image_map
 | 
					    return data_image_map
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -331,11 +325,7 @@ def response(resp):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # convert the text to dom
 | 
					    # convert the text to dom
 | 
				
			||||||
    dom = html.fromstring(resp.text)
 | 
					    dom = html.fromstring(resp.text)
 | 
				
			||||||
 | 
					    data_image_map = _parse_data_images(dom)
 | 
				
			||||||
    data_image_map = {}
 | 
					 | 
				
			||||||
    if '_fmt:html' in UI_ASYNC:
 | 
					 | 
				
			||||||
        # in this format images are embedded by a bse64 encoded 'data:image'
 | 
					 | 
				
			||||||
        data_image_map = _parse_data_images(dom)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # results --> answer
 | 
					    # results --> answer
 | 
				
			||||||
    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
 | 
					    answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
 | 
				
			||||||
| 
						 | 
					@ -349,11 +339,6 @@ def response(resp):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks
 | 
					    for result in eval_xpath_list(dom, results_xpath):  # pylint: disable=too-many-nested-blocks
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # google *sections*
 | 
					 | 
				
			||||||
        if extract_text(eval_xpath(result, g_section_with_header)):
 | 
					 | 
				
			||||||
            logger.debug("ignoring <g-section-with-header>")
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
 | 
					            title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
 | 
				
			||||||
            if title_tag is None:
 | 
					            if title_tag is None:
 | 
				
			||||||
| 
						 | 
					@ -361,33 +346,29 @@ def response(resp):
 | 
				
			||||||
                logger.debug('ignoring item from the result_xpath list: missing title')
 | 
					                logger.debug('ignoring item from the result_xpath list: missing title')
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            title = extract_text(title_tag)
 | 
					            title = extract_text(title_tag)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            url = eval_xpath_getindex(result, href_xpath, 0, None)
 | 
					            url = eval_xpath_getindex(result, href_xpath, 0, None)
 | 
				
			||||||
            if url is None:
 | 
					            if url is None:
 | 
				
			||||||
 | 
					                logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            content = []
 | 
					            content_nodes = eval_xpath(result, content_xpath)
 | 
				
			||||||
            img_list = []
 | 
					            content = extract_text(content_nodes)
 | 
				
			||||||
            for content_feature in eval_xpath(result, content_xpath):
 | 
					 | 
				
			||||||
                val = content_feature.attrib['data-content-feature']
 | 
					 | 
				
			||||||
                if val in ['1', '2']:
 | 
					 | 
				
			||||||
                    txt = extract_text(content_feature, allow_none=True)
 | 
					 | 
				
			||||||
                    if txt:
 | 
					 | 
				
			||||||
                        content.append(txt)
 | 
					 | 
				
			||||||
                elif '0' in val:
 | 
					 | 
				
			||||||
                    img = content_feature.xpath('.//img/@src')
 | 
					 | 
				
			||||||
                    if img:
 | 
					 | 
				
			||||||
                        img = img[0]
 | 
					 | 
				
			||||||
                        if img.startswith('data:image'):
 | 
					 | 
				
			||||||
                            img_id = content_feature.xpath('.//img/@id')
 | 
					 | 
				
			||||||
                            if img_id:
 | 
					 | 
				
			||||||
                                img = data_image_map.get(img_id[0])
 | 
					 | 
				
			||||||
                        img_list.append(img)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if not content:
 | 
					            if not content:
 | 
				
			||||||
                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
 | 
					                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            content = ' / '.join(content)
 | 
					
 | 
				
			||||||
            img_src = img_list[0] if img_list else None
 | 
					            img_src = content_nodes[0].xpath('.//img/@src')
 | 
				
			||||||
 | 
					            if img_src:
 | 
				
			||||||
 | 
					                img_src = img_src[0]
 | 
				
			||||||
 | 
					                if img_src.startswith('data:image'):
 | 
				
			||||||
 | 
					                    img_id = content_nodes[0].xpath('.//img/@id')
 | 
				
			||||||
 | 
					                    if img_id:
 | 
				
			||||||
 | 
					                        img_src = data_image_map.get(img_id[0])
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                img_src = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
 | 
					            results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        except Exception as e:  # pylint: disable=broad-except
 | 
					        except Exception as e:  # pylint: disable=broad-except
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue