mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
[fix] google engine: remove <script> tags from result items
In some results, Google returns a <script> tag that must be removed before extracting the content.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
55481a6377
commit
342d321196
1 changed file with 9 additions and 9 deletions
|
@@ -59,11 +59,6 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
|
||||||
# specific xpath variables
|
# specific xpath variables
|
||||||
# ------------------------
|
# ------------------------
|
||||||
|
|
||||||
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
|
|
||||||
title_xpath = './/a/h3[1]'
|
|
||||||
href_xpath = './/a[h3]/@href'
|
|
||||||
content_xpath = './/div[contains(@data-sncf, "1")]'
|
|
||||||
|
|
||||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||||
# from the links not the links itself.
|
# from the links not the links itself.
|
||||||
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
|
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
|
||||||
|
@@ -345,22 +340,27 @@ def response(resp):
|
||||||
|
|
||||||
# parse results
|
# parse results
|
||||||
|
|
||||||
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
|
for result in eval_xpath_list(dom, './/div[contains(@jscontroller, "SC7lYd")]'):
|
||||||
|
# pylint: disable=too-many-nested-blocks
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
|
title_tag = eval_xpath_getindex(result, './/a/h3[1]', 0, default=None)
|
||||||
if title_tag is None:
|
if title_tag is None:
|
||||||
# this not one of the common google results *section*
|
# this not one of the common google results *section*
|
||||||
logger.debug('ignoring item from the result_xpath list: missing title')
|
logger.debug('ignoring item from the result_xpath list: missing title')
|
||||||
continue
|
continue
|
||||||
title = extract_text(title_tag)
|
title = extract_text(title_tag)
|
||||||
|
|
||||||
url = eval_xpath_getindex(result, href_xpath, 0, None)
|
url = eval_xpath_getindex(result, './/a[h3]/@href', 0, None)
|
||||||
if url is None:
|
if url is None:
|
||||||
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
|
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
content_nodes = eval_xpath(result, content_xpath)
|
content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
|
||||||
|
for item in content_nodes:
|
||||||
|
for script in item.xpath(".//script"):
|
||||||
|
script.getparent().remove(script)
|
||||||
|
|
||||||
content = extract_text(content_nodes)
|
content = extract_text(content_nodes)
|
||||||
|
|
||||||
if not content:
|
if not content:
|
||||||
|
|
Loading…
Add table
Reference in a new issue