[fix] google-videos engine: ignore news articles

In the video search, Google sometimes also includes news articles.  E.g. in the
DE language, when you search for `!gov paris`, Google adds an article from a German
newspaper (FAZ); I assume these are sponsored links (not tagged as advertisements?).

Those links do not have an image.  This patch ignores *video links* without an
image ID.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2021-11-26 17:11:20 +01:00
parent cfb5eda4d1
commit 6e06618e0c
1 changed files with 5 additions and 7 deletions

View File

@ -154,25 +154,23 @@ def response(resp):
# parse results # parse results
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
# google *sections* # ignore google *sections*
if extract_text(eval_xpath(result, g_section_with_header)): if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ingoring <g-section-with-header>") logger.debug("ingoring <g-section-with-header>")
continue continue
title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) # ingnore articles without an image id / e.g. news articles
url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
# <img id="vidthumb1" ...>
img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
if img_id is None: if img_id is None:
logger.error("no img_id for: %s" % result) logger.error("no img_id found in item %s (news article?)", len(results) + 1)
continue continue
img_src = vidthumb_imgdata.get(img_id, None) img_src = vidthumb_imgdata.get(img_id, None)
if not img_src: if not img_src:
logger.error("no vidthumb imgdata for: %s" % img_id)
img_src = thumbs_src.get(img_id, "") img_src = thumbs_src.get(img_id, "")
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
length = extract_text(eval_xpath( length = extract_text(eval_xpath(
result, './/div[contains(@class, "P7xzyf")]/span/span')) result, './/div[contains(@class, "P7xzyf")]/span/span'))
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)