searxngRebrandZaclys/searx/engines/google_news.py


# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google news engine. The google news API
ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored
- save_ : is ignored / Google-News results are always *SafeSearch*
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
# pylint: disable=invalid-name, missing-function-docstring
import binascii
from datetime import datetime
import re
from urllib.parse import urlencode
from base64 import b64decode
from lxml import html
from searx import logger
from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)
# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
)
# pylint: enable=unused-import

from searx.engines.google import (
    get_lang_info,
    detect_google_sorry,
)

# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
logger = logger.getChild('google news')
# Compared to other google engines, google-news has different time range
# support: the time range is included in the search term.
time_range_dict = {
    'day': 'when:1d',
    'week': 'when:7d',
    'month': 'when:1m',
    'year': 'when:1y',
}
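
# For example (search term chosen purely for illustration), a 'week' search
# for "climate" is sent to google-news as the single query string
# 'climate when:7d'.
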
# engine dependent config
2014-03-04 12:11:04 +00:00
categories = ['news']
paging = False
use_locale_domain = True
time_range_support = True
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise the checker will report safesearch-errors::
#
#  safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = False

def request(query, params):
    """Google-News search request"""

    lang_info = get_lang_info(
        # pylint: disable=undefined-variable
        params, supported_languages, language_aliases, False
    )
    logger.debug(
        "HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])

    # google news has only one domain
    lang_info['subdomain'] = 'news.google.com'

    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])

    # google news redirects en to en-US
    if lang_info['params']['hl'] == 'en':
        lang_info['params']['hl'] = 'en-US'

    # In contrast to other google engines, the time range is included in the
    # search term itself.
    if params['time_range']:
        query += ' ' + time_range_dict[params['time_range']]

    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
        'q': query,
        **lang_info['params'],
        'ie': "utf8",
        'oe': "utf8",
        'gl': lang_info['country'],
    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded

    params['url'] = query_url
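
    # A rough sketch of the resulting URL, assuming an en-US locale and the
    # illustrative query 'climate when:7d' (all values here are examples only):
    #
    #   https://news.google.com/search?q=climate+when%3A7d&...&gl=US&ceid=US:en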

    params['headers'].update(lang_info['headers'])
    params['headers']['Accept'] = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    params['headers']['Cookie'] = "CONSENT=YES+cb.%s-14-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
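    # The rendered cookie varies only in its date part; e.g. on 2021-06-18 the
    # header value would read 'CONSENT=YES+cb.20210618-14-p0.en+F+941;'.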

    return params


def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the article.
        # The href attribute of the <a> is a google-internal link we can't use.
        # The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
        #      ... />
        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]
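
            # A minimal sketch of this decode path with a made-up jslog value
            # (the URL and base64 payload are illustrative, not from a real page):
            #
            #   >>> jslog = '95014; 5:W251bGwsImh0dHBzOi8vZXhhbXBsZS5jb20vIl0; track:click'
            #   >>> payload = jslog.split(";")[1].split(':')[1].strip()
            #   >>> b64decode(payload + (4 - (len(payload) % 4)) * "=")
            #   b'[null,"https://example.com/"]'
            #   >>> re.findall('http[^;"]*', str(b'[null,"https://example.com/"]'))[0]
            #   'https://example.com/'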

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content
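
        # Illustrative values only: with pub_origin 'CNN' and pub_date
        # 'Yesterday', content becomes 'CNN, Yesterday: <original content>'.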

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    # return results
    return results