[fix] google video engine - rework of the HTML parser

The Google video response has changed slightly, so a rework of the parser was
needed.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2021-11-26 01:14:17 +01:00
parent 488ace1da9
commit 1ce09df9aa
1 changed files with 31 additions and 22 deletions

View File

@ -31,11 +31,8 @@ from searx.engines.google import (
get_lang_info, get_lang_info,
time_range_dict, time_range_dict,
filter_mapping, filter_mapping,
results_xpath,
g_section_with_header, g_section_with_header,
title_xpath, title_xpath,
href_xpath,
content_xpath,
suggestion_xpath, suggestion_xpath,
detect_google_sorry, detect_google_sorry,
) )
@ -73,11 +70,27 @@ def _re(regexpr):
RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
return RE_CACHE[regexpr] return RE_CACHE[regexpr]
def scrap_out_thumbs_src(dom):
    """Scrap out thumbnail URLs from ``google.ldi`` <script> tags.

    Every <script> element whose text contains ``google.ldi={`` is scanned
    for entries of the form ``"dimg_35":"https://i.ytimg.c...."``.

    :param dom: document node handed to ``eval_xpath_list``
    :returns: dict mapping the image id (``dimg_<n>``) to its URL, with the
        escaped equal sign and ampersand sequences decoded
    """
    ret_val = {}
    thumb_name = 'dimg_'
    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
        _script = script.text
        # "dimg_35":"https://i.ytimg.c....",
        for img_id, url in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
            # The URL is embedded in JavaScript; decode the escaped '=' and '&'
            url = url.replace(r'\u003d', '=')
            url = url.replace(r'\u0026', '&')
            ret_val[img_id] = url
    logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
    return ret_val
def scrap_out_thumbs(dom): def scrap_out_thumbs(dom):
"""Scrap out thumbnail data from <script> tags. """Scrap out thumbnail data from <script> tags.
""" """
ret_val = {} ret_val = {}
thumb_name = 'vidthumb' thumb_name = 'dimg_'
for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
_script = script.text _script = script.text
@ -87,20 +100,11 @@ def scrap_out_thumbs(dom):
if not _imgdata: if not _imgdata:
continue continue
# var ii=['vidthumb4','vidthumb7'] # var ii=['dimg_17']
for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
# At least the equal sign in the URL needs to be decoded # At least the equal sign in the URL needs to be decoded
ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
# {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
_script = script.text
for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
if match:
# At least the equal sign in the URL needs to be decoded
ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=")
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
return ret_val return ret_val
@ -144,9 +148,11 @@ def response(resp):
# convert the text to dom # convert the text to dom
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
vidthumb_imgdata = scrap_out_thumbs(dom) vidthumb_imgdata = scrap_out_thumbs(dom)
thumbs_src = scrap_out_thumbs_src(dom)
logger.debug(str(thumbs_src))
# parse results # parse results
for result in eval_xpath_list(dom, results_xpath): for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
# google *sections* # google *sections*
if extract_text(eval_xpath(result, g_section_with_header)): if extract_text(eval_xpath(result, g_section_with_header)):
@ -154,21 +160,24 @@ def response(resp):
continue continue
title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
url = eval_xpath_getindex(result, href_xpath, 0) url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
c_node = eval_xpath_getindex(result, content_xpath, 0)
# <img id="vidthumb1" ...> # <img id="vidthumb1" ...>
img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None) img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
if img_id is None: if img_id is None:
logger.error("no img_id for: %s" % result)
continue continue
img_src = vidthumb_imgdata.get(img_id, None) img_src = vidthumb_imgdata.get(img_id, None)
if not img_src: if not img_src:
logger.error("no vidthumb imgdata for: %s" % img_id) logger.error("no vidthumb imgdata for: %s" % img_id)
img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0) img_src = thumbs_src.get(img_id, "")
length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]')) length = extract_text(eval_xpath(
content = extract_text(eval_xpath(c_node, './/div[2]/span')) result, './/div[contains(@class, "P7xzyf")]/span/span'))
pub_info = extract_text(eval_xpath(c_node, './/div[2]/div')) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
content = extract_text(c_node)
pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
results.append({ results.append({
'url': url, 'url': url,