forked from zaclys/searxng

Merge pull request #2483 from return42/fix-google-news

[fix] revise of the google-News engine

commit 7d24850d49

4 changed files with 167 additions and 68 deletions

Makefile

@@ -177,7 +177,9 @@ PYLINT_FILES=\
 	searx/testing.py \
 	searx/engines/gigablast.py \
 	searx/engines/deviantart.py \
-	searx/engines/digg.py
+	searx/engines/digg.py \
+	searx/engines/google.py \
+	searx/engines/google_news.py
 
 test.pylint: pyenvinstall
 	$(call cmd,pylint,$(PYLINT_FILES))

searx/engines/google.py

@@ -155,6 +155,11 @@ def get_lang_country(params, lang_list, custom_aliases):
 
     return language, country, lang_country
 
+def detect_google_sorry(resp):
+    resp_url = urlparse(resp.url)
+    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
+        raise SearxEngineCaptchaException()
+
 
 def request(query, params):
     """Google search request"""
@@ -200,16 +205,10 @@ def request(query, params):
 
 def response(resp):
     """Get response from google's search request"""
 
+    detect_google_sorry(resp)
+
     results = []
 
-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
-
     # which subdomain ?
     # subdomain = resp.search_params.get('google_subdomain')

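The new detect_google_sorry() helper centralizes the CAPTCHA-redirect check that each Google engine previously duplicated in its response() function. A minimal sketch of how the helper behaves, using a made-up stand-in for the HTTP response object (in searx the argument is the response returned by the network layer):

# Sketch only, not part of the diff: exercise the shared helper added above.
# FakeResponse is a hypothetical stand-in; the helper only reads resp.url.
from searx.engines.google import detect_google_sorry
from searx.exceptions import SearxEngineCaptchaException

class FakeResponse:
    def __init__(self, url):
        self.url = url

try:
    # a redirect to Google's "sorry" page means a CAPTCHA is being requested
    detect_google_sorry(FakeResponse('https://sorry.google.com/index'))
except SearxEngineCaptchaException:
    print('CAPTCHA redirect detected')
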
searx/engines/google_images.py

@@ -12,10 +12,9 @@ Definitions`_.
      Header set Content-Security-Policy "img-src 'self' data: ;"
 """
 
-from urllib.parse import urlencode, urlparse, unquote
+from urllib.parse import urlencode, unquote
 from lxml import html
 from searx import logger
-from searx.exceptions import SearxEngineCaptchaException
 from searx.utils import extract_text, eval_xpath
 from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
 
@@ -23,6 +22,7 @@ from searx.engines.google import (
     get_lang_country,
     google_domains,
     time_range_dict,
+    detect_google_sorry,
 )
 
 logger = logger.getChild('google images')
@@ -123,13 +123,7 @@ def response(resp):
     """Get response from google's search request"""
     results = []
 
-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
+    detect_google_sorry(resp)
 
     # which subdomain ?
     # subdomain = resp.search_params.get('google_subdomain')

searx/engines/google_news.py

@@ -1,12 +1,45 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Google (News)
+"""Google (News)
+
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.  Not all parameters can be applied, e.g. num_ (the number of
+search results to return) is ignored.
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+
+.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+
 """
 
+# pylint: disable=invalid-name, missing-function-docstring
+
+import binascii
+import re
 from urllib.parse import urlencode
+from base64 import b64decode
 from lxml import html
-from searx.utils import match_language
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+
+from searx import logger
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url,
+    _fetch_supported_languages,
+    detect_google_sorry,
+)
+# pylint: enable=unused-import
+
+from searx.engines.google import (
+    get_lang_country,
+    filter_mapping,
+)
 
 # about
 about = {
@@ -18,72 +51,143 @@ about = {
     "results": 'HTML',
 }
 
-# search-url
+logger = logger.getChild('google news')
+
+# compared to other google engines google-news has a different time range
+# support.  The time range is included in the search term.
+time_range_dict = {
+    'day': 'when:1d',
+    'week': 'when:7d',
+    'month': 'when:1m',
+    'year': 'when:1y',
+}
+
 # engine dependent config
+
 categories = ['news']
-paging = True
+paging = False
 language_support = True
-safesearch = True
+use_locale_domain = True
 time_range_support = True
-number_of_results = 10
+safesearch = True # not really, but it is not generated by google
 
-search_url = 'https://www.google.com/search'\
-    '?{query}'\
-    '&tbm=nws'\
-    '&gws_rd=cr'\
-    '&{search_options}'
-time_range_attr = "qdr:{range}"
-time_range_dict = {'day': 'd',
-                   'week': 'w',
-                   'month': 'm',
-                   'year': 'y'}
-
 
-# do search-request
 def request(query, params):
+    """Google-News search request"""
 
-    search_options = {
-        'start': (params['pageno'] - 1) * number_of_results
-    }
+    language, country, lang_country = get_lang_country(
+        # pylint: disable=undefined-variable
+        params, supported_languages, language_aliases
+    )
+    subdomain = 'news.google.com'
 
-    if params['time_range'] in time_range_dict:
-        search_options['tbs'] = time_range_attr.format(range=time_range_dict[params['time_range']])
+    if params['time_range']: # in time_range_dict:
+        query += ' ' + time_range_dict[params['time_range']]
 
-    if safesearch and params['safesearch']:
-        search_options['safe'] = 'on'
+    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+        'q': query,
+        'hl': lang_country,
+        'lr': "lang_" + language,
+        'ie': "utf8",
+        'oe': "utf8",
+        'ceid' : "%s:%s" % (country, language),
+        'gl' : country,
+    })
 
-    params['url'] = search_url.format(query=urlencode({'q': query}),
-                                      search_options=urlencode(search_options))
+    if params['safesearch']:
+        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    if params['language'] != 'all':
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
-        if language:
-            params['url'] += '&hl=' + language
+    params['url'] = query_url
+    logger.debug("query_url --> %s", query_url)
+
+    # en-US,en;q=0.8,en;q=0.5
+    params['headers']['Accept-Language'] = (
+        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
+        )
+    logger.debug("HTTP header Accept-Language --> %s",
+                 params['headers']['Accept-Language'])
+    params['headers']['Accept'] = (
+        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+        )
+    #params['google_subdomain'] = subdomain
 
     return params
 
 
-# get response from search-request
 def response(resp):
+    """Get response from google's search request"""
     results = []
 
+    detect_google_sorry(resp)
+
+    # which subdomain ?
+    # subdomain = resp.search_params.get('google_subdomain')
+
     # convert the text to dom
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
-        try:
-            r = {
-                'url': result.xpath('.//a[@class="l lLrAF"]')[0].attrib.get("href"),
-                'title': ''.join(result.xpath('.//a[@class="l lLrAF"]//text()')),
-                'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
-            }
-        except:
-            continue
+    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
 
-        imgs = result.xpath('.//img/@src')
-        if len(imgs) and not imgs[0].startswith('data'):
-            r['img_src'] = imgs[0]
+        # The first <a> tag in the <article> contains the link to the
+        # article.  The href attribute of the <a> is a google internal link
+        # we can't use.  The real link is hidden in the jslog attribute:
+        #
+        #   <a ...
+        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
+        #      href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen"
+        #      ... />
 
-        results.append(r)
+        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
+        url = re.findall('http[^;]*', jslog)
+        if url:
+            url = url[0]
+        else:
+            # The real URL is base64 encoded in the json attribute:
+            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
+            jslog = jslog.split(";")[1].split(':')[1].strip()
+            try:
+                padding = (4 -(len(jslog) % 4)) * "="
+                jslog = b64decode(jslog + padding)
+            except binascii.Error:
+                # URL can't be read, skip this result
+                continue
+
+            # now we have: b'[null, ... null,"https://www.cnn.com/.../index.html"]'
+            url = re.findall('http[^;"]*', str(jslog))[0]
+
+        # the first <h3> tag in the <article> contains the title of the link
+        title = extract_text(eval_xpath(result, './article/h3[1]'))
+
+        # the first <div> tag in the <article> contains the content of the link
+        content = extract_text(eval_xpath(result, './article/div[1]'))
+
+        # the second <div> tag contains the origin publisher and the publishing date
+
+        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
+        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))
+
+        pub_info = []
+        if pub_origin:
+            pub_info.append(pub_origin)
+        if pub_date:
+            # The pub_date is mostly a string like 'yesterday', not a real
+            # timezone date or time.  Therefore we can't use publishedDate.
+            pub_info.append(pub_date)
+        pub_info = ', '.join(pub_info)
+        if pub_info:
+            content = pub_info + ': ' + content
+
+        # The image URL is located in a preceding sibling <img> tag, e.g.:
+        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
+        # These URLs are long but not personalized (double checked via tor).
+
+        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
+
+        results.append({
+            'url':      url,
+            'title':    title,
+            'content':  content,
+            'img_src':  img_src,
+        })
 
     # return results
     return results

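The most involved step in the new parser is recovering the real article URL from the jslog attribute: it either appears in clear text after the "4:" marker or is base64 encoded in the "5:" payload. Below is a standalone sketch of that decoding step, mirroring the logic added above; the sample attribute value passed at the end is made up for illustration.

# Standalone sketch of the jslog URL recovery used by the new google_news parser.
import binascii
import re
from base64 import b64decode

def url_from_jslog(jslog):
    # Case 1: the URL appears in clear text,
    # e.g. jslog="95014; 4:https://example.com/a.html; track:click"
    url = re.findall('http[^;]*', jslog)
    if url:
        return url[0]
    # Case 2: the URL is base64 encoded in the second field,
    # e.g. jslog="95014; 5:W251bGwsbnVsbCxudW...; track:click"
    payload = jslog.split(";")[1].split(':')[1].strip()
    padding = (4 - (len(payload) % 4)) * "="
    try:
        decoded = b64decode(payload + padding)
    except binascii.Error:
        return None  # URL can't be read, the caller skips this result
    # decoded looks like b'[null, ... null,"https://example.com/a.html"]'
    return re.findall('http[^;"]*', str(decoded))[0]

# hypothetical attribute value (clear-text case)
print(url_from_jslog('95014; 4:https://example.com/news/article.html; track:click'))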