[fix] google engine: remove <script> tags from result items

In some results, Google returns a <script> tag that must be removed before
extracting the content.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2024-11-27 13:35:21 +01:00 committed by Markus Heiser
parent 55481a6377
commit 342d321196

View file

@@ -59,11 +59,6 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables # specific xpath variables
# ------------------------ # ------------------------
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[contains(@data-sncf, "1")]'
# Suggestions are links placed in a *card-section*, we extract only the text # Suggestions are links placed in a *card-section*, we extract only the text
# from the links, not the links themselves. # from the links, not the links themselves.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a' suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
@@ -345,22 +340,27 @@ def response(resp):
# parse results # parse results
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
# pylint: disable=too-many-nested-blocks
try: try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
if title_tag is None: if title_tag is None:
# this is not one of the common google results *section* # this is not one of the common google results *section*
logger.debug('ignoring item from the result_xpath list: missing title') logger.debug('ignoring item from the result_xpath list: missing title')
continue continue
title = extract_text(title_tag) title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None) url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
if url is None: if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title) logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue continue
content_nodes = eval_xpath(result, content_xpath) content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
for item in content_nodes:
for script in item.xpath(".//script"):
script.getparent().remove(script)
content = extract_text(content_nodes) content = extract_text(content_nodes)
if not content: if not content: