Merge pull request #2306 from return42/fix-1959

[fix] engine google-News: fix decoding of URLs
This commit is contained in:
Markus Heiser 2023-04-02 08:02:37 +02:00 committed by GitHub
commit a5155a32c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 11 additions and 30 deletions

View File

@@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`:
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import binascii
import re
from urllib.parse import urlencode from urllib.parse import urlencode
from base64 import b64decode import base64
from lxml import html from lxml import html
import babel import babel
@@ -144,34 +142,17 @@ def response(resp):
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
# The first <a> tag in the <article> contains the link to the # The first <a> tag in the <article> contains the link to the article
# article The href attribute of the <a> is a google internal link, # The href attribute of the <a> tag is a google internal link, we have
# we can't use. The real link is hidden in the jslog attribute: # to decode
#
# <a ...
# jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
# href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
# ... />
jslog = eval_xpath_getindex(result, './article/a/@jslog', 0) href = eval_xpath_getindex(result, './article/a/@href', 0)
url = re.findall('http[^;]*', jslog) href = href.split('?')[0]
if url: href = href.split('/')[-1]
url = url[0] href = base64.urlsafe_b64decode(href + '====')
else: href = href[4:].split(b'\xd2')[0]
# The real URL is base64 encoded in the json attribute: href = href.decode()
# jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
jslog = jslog.split(";")[1].split(':')[1].strip()
try:
padding = (4 - (len(jslog) % 4)) * "="
jslog = b64decode(jslog + padding)
except binascii.Error:
# URL can't be read, skip this result
continue
# now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
url = re.findall('http[^;"]*', str(jslog))[0]
# the first <h3> tag in the <article> contains the title of the link
title = extract_text(eval_xpath(result, './article/h3[1]')) title = extract_text(eval_xpath(result, './article/h3[1]'))
# The pub_date is mostly a string like 'yesterday', not a real # The pub_date is mostly a string like 'yesterday', not a real
@@ -189,7 +170,7 @@ def response(resp):
results.append( results.append(
{ {
'url': url, 'url': href,
'title': title, 'title': title,
'content': content, 'content': content,
'img_src': img_src, 'img_src': img_src,