Merge pull request #2483 from return42/fix-google-news

[fix] revise of the google-News engine
Alexandre Flament 2021-01-23 20:21:09 +01:00 committed by GitHub
commit 7d24850d49
4 changed files with 167 additions and 68 deletions

Makefile

@@ -177,7 +177,9 @@ PYLINT_FILES=\
 	searx/testing.py \
 	searx/engines/gigablast.py \
 	searx/engines/deviantart.py \
-	searx/engines/digg.py
+	searx/engines/digg.py \
+	searx/engines/google.py \
+	searx/engines/google_news.py
 
 test.pylint: pyenvinstall
 	$(call cmd,pylint,$(PYLINT_FILES))

searx/engines/google.py

@@ -155,6 +155,11 @@ def get_lang_country(params, lang_list, custom_aliases):
     return language, country, lang_country
 
 
+def detect_google_sorry(resp):
+    resp_url = urlparse(resp.url)
+    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+        raise SearxEngineCaptchaException()
+
+
 def request(query, params):
     """Google search request"""
@@ -200,16 +205,10 @@ def request(query, params):
 
 def response(resp):
     """Get response from google's search request"""
+    detect_google_sorry(resp)
+
     results = []
 
-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
-
     # which subdomain ?
     # subdomain = resp.search_params.get('google_subdomain')
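For illustration only, a minimal standalone sketch of how the new shared helper behaves; SorryDetected and FakeResponse are stand-ins invented for this sketch, whereas the engine itself raises SearxEngineCaptchaException and receives the real HTTP response object:

from urllib.parse import urlparse

class SorryDetected(Exception):
    """Stand-in for searx's SearxEngineCaptchaException."""

def detect_google_sorry(resp):
    # same check as the helper added above: sorry host or any /sorry* path
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
        raise SorryDetected()

class FakeResponse:
    """Hypothetical response stub carrying only the final URL."""
    def __init__(self, url):
        self.url = url

detect_google_sorry(FakeResponse('https://www.google.com/search?q=searx'))   # passes
try:
    detect_google_sorry(FakeResponse('https://www.google.com/sorry/index?continue=x'))
except SorryDetected:
    print('CAPTCHA / sorry page detected')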

searx/engines/google_images.py

@@ -12,10 +12,9 @@ Definitions`_.
     Header set Content-Security-Policy "img-src 'self' data: ;"
 """
 
-from urllib.parse import urlencode, urlparse, unquote
+from urllib.parse import urlencode, unquote
 from lxml import html
 from searx import logger
-from searx.exceptions import SearxEngineCaptchaException
 from searx.utils import extract_text, eval_xpath
 from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA  # pylint: disable=unused-import
@@ -23,6 +22,7 @@ from searx.engines.google import (
     get_lang_country,
     google_domains,
     time_range_dict,
+    detect_google_sorry,
 )
 
 logger = logger.getChild('google images')
@@ -123,13 +123,7 @@ def response(resp):
     """Get response from google's search request"""
     results = []
 
-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
-
+    detect_google_sorry(resp)
+
     # which subdomain ?
     # subdomain = resp.search_params.get('google_subdomain')

searx/engines/google_news.py

@@ -1,12 +1,45 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Google (News)
+"""Google (News)
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.  Not all parameters can be applied, e.g. num_ (the number of
+search results to return) is ignored.
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+
+.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
 """
 
+# pylint: disable=invalid-name, missing-function-docstring
+
+import binascii
+import re
 from urllib.parse import urlencode
+from base64 import b64decode
 from lxml import html
-from searx.utils import match_language
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA  # pylint: disable=unused-import
+
+from searx import logger
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url,
+    _fetch_supported_languages,
+    detect_google_sorry,
+)
+# pylint: enable=unused-import
+
+from searx.engines.google import (
+    get_lang_country,
+    filter_mapping,
+)
 
 # about
 about = {
@@ -18,72 +51,143 @@ about = {
     "results": 'HTML',
 }
 
-# search-url
+logger = logger.getChild('google news')
+
+# compared to other google engines google-news has a different time range
+# support.  The time range is included in the search term.
+time_range_dict = {
+    'day': 'when:1d',
+    'week': 'when:7d',
+    'month': 'when:1m',
+    'year': 'when:1y',
+}
+
+# engine dependent config
 categories = ['news']
-paging = True
+paging = False
 language_support = True
-safesearch = True
+use_locale_domain = True
 time_range_support = True
-number_of_results = 10
-
-search_url = 'https://www.google.com/search'\
-    '?{query}'\
-    '&tbm=nws'\
-    '&gws_rd=cr'\
-    '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_dict = {'day': 'd',
-                   'week': 'w',
-                   'month': 'm',
-                   'year': 'y'}
-
-
-# do search-request
+safesearch = True  # not really, but it is not generated by google
+
+
 def request(query, params):
-    search_options = {
-        'start': (params['pageno'] - 1) * number_of_results
-    }
+    """Google-News search request"""
+
+    language, country, lang_country = get_lang_country(
+        # pylint: disable=undefined-variable
+        params, supported_languages, language_aliases
+    )
+    subdomain = 'news.google.com'
 
-    if params['time_range'] in time_range_dict:
-        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
+    if params['time_range']:  # in time_range_dict:
+        query += ' ' + time_range_dict[params['time_range']]
 
-    if safesearch and params['safesearch']:
-        search_options['safe'] = 'on'
+    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+        'q': query,
+        'hl': lang_country,
+        'lr': "lang_" + language,
+        'ie': "utf8",
+        'oe': "utf8",
+        'ceid': "%s:%s" % (country, language),
+        'gl': country,
+    })
 
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      search_options=urlencode(search_options))
+    if params['safesearch']:
+        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    if params['language'] != 'all':
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
-        if language:
-            params['url'] += '&hl=' + language
+    params['url'] = query_url
+    logger.debug("query_url --> %s", query_url)
+
+    # en-US,en;q=0.8,en;q=0.5
+    params['headers']['Accept-Language'] = (
+        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
+    )
+    logger.debug("HTTP header Accept-Language --> %s",
+                 params['headers']['Accept-Language'])
+    params['headers']['Accept'] = (
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+    )
+    # params['google_subdomain'] = subdomain
 
     return params
 
 
-# get response from search-request
 def response(resp):
+    """Get response from google's search request"""
     results = []
+    detect_google_sorry(resp)
+
+    # which subdomain ?
+    # subdomain = resp.search_params.get('google_subdomain')
 
+    # convert the text to dom
     dom = html.fromstring(resp.text)
 
-    # parse results
-    for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
-        try:
-            r = {
-                'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"),
-                'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
-                'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
-            }
-        except:
-            continue
-
-        imgs = result.xpath('.//img/@src')
-        if len(imgs) and not imgs[0].startswith('data'):
-            r['img_src'] = imgs[0]
-
-        results.append(r)
+    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
+
+        # The first <a> tag in the <article> contains the link to the article.
+        # The href attribute of the <a> is a google internal link we can't
+        # use.  The real link is hidden in the jslog attribute:
+        #
+        #   <a ...
+        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
+        #      href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
+        #      ... />
+
+        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
+        url = re.findall('http[^;]*', jslog)
+        if url:
+            url = url[0]
+        else:
+            # The real URL is base64 encoded in the json attribute:
+            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
+            jslog = jslog.split(";")[1].split(':')[1].strip()
+            try:
+                padding = (4 - (len(jslog) % 4)) * "="
+                jslog = b64decode(jslog + padding)
+            except binascii.Error:
+                # URL can't be read, skip this result
+                continue
+
+            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
+            url = re.findall('http[^;"]*', str(jslog))[0]
+
+        # the first <h3> tag in the <article> contains the title of the link
+        title = extract_text(eval_xpath(result, './article/h3[1]'))
+
+        # the first <div> tag in the <article> contains the content of the link
+        content = extract_text(eval_xpath(result, './article/div[1]'))
+
+        # the second <div> tag contains origin publisher and the publishing date
+        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
+        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))
+
+        pub_info = []
+        if pub_origin:
+            pub_info.append(pub_origin)
+        if pub_date:
+            # The pub_date is mostly a string like 'yesterday', not a real
+            # timezone date or time.  Therefore we can't use publishedDate.
+            pub_info.append(pub_date)
+        pub_info = ', '.join(pub_info)
+        if pub_info:
+            content = pub_info + ': ' + content
+
+        # The image URL is located in a preceding sibling <img> tag, e.g.:
+        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
+        # These URLs are long but not personalized (double checked via tor).
+        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
+
+        results.append({
+            'url': url,
+            'title': title,
+            'content': content,
+            'img_src': img_src,
+        })
 
     # return results
     return results
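For illustration, a standalone sketch of the kind of URL the rewritten request() produces; the locale values ('en', 'US', 'en-US') stand in for what get_lang_country() would return, and 'active' is only a placeholder for a filter_mapping entry, not a value taken from searx:

from urllib.parse import urlencode

query, language, country, lang_country = 'climate change', 'en', 'US', 'en-US'
time_range, safesearch = 'week', 1

time_range_dict = {'day': 'when:1d', 'week': 'when:7d',
                   'month': 'when:1m', 'year': 'when:1y'}
if time_range:
    query += ' ' + time_range_dict[time_range]   # the time range rides along in the search term

query_url = 'https://news.google.com/search?' + urlencode({
    'q': query,
    'hl': lang_country,
    'lr': 'lang_' + language,
    'ie': 'utf8',
    'oe': 'utf8',
    'ceid': '%s:%s' % (country, language),
    'gl': country,
})
if safesearch:
    query_url += '&' + urlencode({'safe': 'active'})   # placeholder safe-search value

print(query_url)
# https://news.google.com/search?q=climate+change+when%3A7d&hl=en-US&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US&safe=active

Note that paging is disabled in the new engine (paging = False) and the time range is carried inside the q parameter itself, instead of the old tbs=qdr:* search option.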
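The least obvious part of the new response() is recovering the article URL from the jslog attribute. Below is a self-contained sketch of that fallback path; the sample value is generated on the fly so the round trip is self-consistent, and the URL is illustrative only (the real attribute comes from Google's markup):

import re
import binascii
from base64 import b64decode, b64encode

# build a sample jslog value of the base64 variant ("5:<payload>")
sample = '[null, null, "https://www.example.org/news/article.html"]'
token = b64encode(sample.encode()).decode().rstrip('=')
jslog = '95014; 5:%s; track:click' % token

url = re.findall('http[^;]*', jslog)           # plain-URL variant ("4:https://...")
if not url:
    token = jslog.split(';')[1].split(':')[1].strip()
    padding = '=' * (-len(token) % 4)          # add back any '=' padding google strips
    try:
        decoded = b64decode(token + padding)
    except binascii.Error:
        decoded = b''
    url = re.findall('http[^;"]*', str(decoded))

print(url[0] if url else 'no URL found')
# -> https://www.example.org/news/article.html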