mirror of https://github.com/searxng/searxng
[fix] normalize the language & region aspects of all google engines

BTW: make the engines ready for search.checker:

- replace eval_xpath by eval_xpath_getindex and eval_xpath_list
- google_images: remove outer try/except block

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent 923b490022
commit b1fefec40d

4 changed files with 187 additions and 179 deletions
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Google (Web)
 
 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.
 
 .. _Query Parameter Definitions:
-https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 """
 
 # pylint: disable=invalid-name, missing-function-docstring
@@ -16,7 +16,6 @@ from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.exceptions import SearxEngineCaptchaException
 
-
 logger = logger.getChild('google engine')
 
 # about
@@ -56,7 +55,7 @@ google_domains = {
     'NZ': 'google.co.nz',   # New Zealand
     'PH': 'google.com.ph',  # Philippines
     'SG': 'google.com.sg',  # Singapore
-    # 'US': 'google.us',    # United States, redirect to .com
+    'US': 'google.com',     # United States (google.us) redirects to .com
     'ZA': 'google.co.za',   # South Africa
     'AR': 'google.com.ar',  # Argentina
     'CL': 'google.cl',      # Chile
@@ -87,7 +86,7 @@ google_domains = {
     'TH': 'google.co.th',   # Thailand
     'TR': 'google.com.tr',  # Turkey
     'UA': 'google.com.ua',  # Ukraine
-    # 'CN': 'google.cn',    # China, only from China ?
+    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk',  # Hong Kong
     'TW': 'google.com.tw'   # Taiwan
 }
@@ -134,26 +133,58 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
 spelling_suggestion_xpath = '//div[@class="med"]/p/a'
 
 
-def get_lang_country(params, lang_list, custom_aliases):
-    """Returns a tuple with *langauage* on its first and *country* on its second
-    position."""
-    language = params['language']
-    if language == 'all':
-        language = 'en-US'
+def get_lang_info(params, lang_list, custom_aliases):
+    ret_val = {}
 
-    language_array = language.split('-')
+    _lang = params['language']
+    if _lang.lower() == 'all':
+        _lang = 'en-US'
 
-    if len(language_array) == 2:
-        country = language_array[1]
+    language = match_language(_lang, lang_list, custom_aliases)
+    ret_val['language'] = language
+
+    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
+    _l = _lang.split('-')
+
+    # the country code (US, AT, CA)
+    if len(_l) == 2:
+        country = _l[1]
     else:
-        country = language_array[0].upper()
+        country = _l[0].upper()
+        if country == 'EN':
+            country = 'US'
 
-    language = match_language(language, lang_list, custom_aliases)
+    ret_val['country'] = country
+
+    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
     lang_country = '%s-%s' % (language, country)
-    if lang_country == 'en-EN':
-        lang_country = 'en'
 
-    return language, country, lang_country
+    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
+    ret_val['Accept-Language'] = ','.join([
+        lang_country,
+        language + ';q=0.8,',
+        'en;q=0.6',
+        '*;q=0.5',
+    ])
+
+    # subdomain
+    ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    # hl parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
+    # Interface Language:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+
+    ret_val['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    # Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+
+    ret_val['lr'] = "lang_" + lang_list.get(lang_country, language)
+
+    return ret_val
 
 def detect_google_sorry(resp):
     resp_url = urlparse(resp.url)
@@ -165,17 +196,17 @@ def request(query, params):
     """Google search request"""
 
     offset = (params['pageno'] - 1) * 10
-    language, country, lang_country = get_lang_country(
+
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'start': offset,
@@ -186,19 +217,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain
 
     return params
 
@@ -209,8 +235,6 @@ def response(resp):
     detect_google_sorry(resp)
 
     results = []
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
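A usage illustration (not part of the commit): the engines now build their query
URLs entirely from the dict returned by get_lang_info(). A minimal sketch with an
assumed lang_info for a German request; real values depend on the fetched
supported_languages mapping::

  # assumed values; in the engine this dict comes from get_lang_info()
  from urllib.parse import urlencode

  lang_info = {'subdomain': 'www.google.de', 'hl': 'de', 'lr': 'lang_de'}

  query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
      'q': 'corona',
      'hl': lang_info['hl'],
      'lr': lang_info['lr'],
      'ie': "utf8",
      'oe': "utf8",
      'start': 0,
  })
  print(query_url)
  # https://www.google.de/search?q=corona&hl=de&lr=lang_de&ie=utf8&oe=utf8&start=0

The same dict also carries the country code and a ready-made Accept-Language
header value, so all google engines now send consistent language hints.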
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -10,35 +10,50 @@ Definitions`_.
   ``data:` scheme).::
 
     Header set Content-Security-Policy "img-src 'self' data: ;"
+
+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 """
 
 from urllib.parse import urlencode, unquote
 from lxml import html
+
 from searx import logger
-from searx.utils import extract_text, eval_xpath
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
 
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
    time_range_dict,
     detect_google_sorry,
 )
 
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    ,  _fetch_supported_languages
+)
+# pylint: enable=unused-import
+
 logger = logger.getChild('google images')
 
 # about
 about = {
-    "website": 'https://images.google.com/',
+    "website": 'https://images.google.com',
     "wikidata_id": 'Q521550',
-    "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions',  # NOQA
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }
 
 # engine dependent config
-
 categories = ['images']
 paging = False
 language_support = True
@@ -84,17 +99,16 @@ def scrap_img_by_id(script, data_id):
 def request(query, params):
     """Google-Video search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
         'tbm': "isch",
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'num': 30,
@@ -105,17 +119,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain
     return params
@@ -125,13 +136,11 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
     img_bas64_map = scrap_out_thumbs(dom)
-    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
+    img_src_script = eval_xpath_getindex(
+        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
 
     # parse results
     #
@@ -156,55 +165,47 @@ def response(resp):
         return results
 
     root = root[0]
-    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
+    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
 
-        try:
-            img_alt = eval_xpath(img_node, '@alt')[0]
+        img_alt = eval_xpath_getindex(img_node, '@alt', 0)
 
-            img_base64_id = eval_xpath(img_node, '@data-iid')
-            if img_base64_id:
-                img_base64_id = img_base64_id[0]
-                thumbnail_src = img_bas64_map[img_base64_id]
+        img_base64_id = eval_xpath(img_node, '@data-iid')
+        if img_base64_id:
+            img_base64_id = img_base64_id[0]
+            thumbnail_src = img_bas64_map[img_base64_id]
+        else:
+            thumbnail_src = eval_xpath(img_node, '@src')
+            if not thumbnail_src:
+                thumbnail_src = eval_xpath(img_node, '@data-src')
+            if thumbnail_src:
+                thumbnail_src = thumbnail_src[0]
             else:
-                thumbnail_src = eval_xpath(img_node, '@src')
-                if not thumbnail_src:
-                    thumbnail_src = eval_xpath(img_node, '@data-src')
-                if thumbnail_src:
-                    thumbnail_src = thumbnail_src[0]
-                else:
-                    thumbnail_src = ''
+                thumbnail_src = ''
 
-            link_node = eval_xpath(img_node, '../../../a[2]')[0]
-            url = eval_xpath(link_node, '@href')[0]
+        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
+        url = eval_xpath_getindex(link_node, '@href', 0)
 
-            pub_nodes = eval_xpath(link_node, './div/div')
-            pub_descr = img_alt
-            pub_source = ''
-            if pub_nodes:
-                pub_descr = extract_text(pub_nodes[0])
-                pub_source = extract_text(pub_nodes[1])
+        pub_nodes = eval_xpath(link_node, './div/div')
+        pub_descr = img_alt
+        pub_source = ''
+        if pub_nodes:
+            pub_descr = extract_text(pub_nodes[0])
+            pub_source = extract_text(pub_nodes[1])
 
-            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
-            src_url = scrap_img_by_id(img_src_script, img_src_id)
-            if not src_url:
-                src_url = thumbnail_src
+        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
+        src_url = scrap_img_by_id(img_src_script, img_src_id)
+        if not src_url:
+            src_url = thumbnail_src
 
-            results.append({
-                'url': url,
-                'title': img_alt,
-                'content': pub_descr,
-                'source': pub_source,
-                'img_src': src_url,
-                # 'img_format': img_format,
-                'thumbnail_src': thumbnail_src,
-                'template': 'images.html'
-            })
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error(e, exc_info=True)
-            # from lxml import etree
-            # logger.debug(etree.tostring(img_node, pretty_print=True))
-            # import pdb
-            # pdb.set_trace()
-            continue
+        results.append({
+            'url': url,
+            'title': img_alt,
+            'content': pub_descr,
+            'source': pub_source,
+            'img_src': src_url,
+            # 'img_format': img_format,
+            'thumbnail_src': thumbnail_src,
+            'template': 'images.html'
+        })
 
     return results
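Note on the checker-readiness change above: bare eval_xpath(...)[0] indexing
raises a plain IndexError when a node is missing, while eval_xpath_getindex()
either returns a default (as the google_videos diff below uses with
default=None) or raises an error the checker can attribute. A simplified
stand-in for the searx.utils helper, just to show the semantics; the real
implementation differs in its exception type::

  from lxml import html

  _NOTSET = object()

  def eval_xpath_getindex(element, xpath_spec, index, default=_NOTSET):
      # evaluate the XPath and return the result at `index`; fall back to
      # `default` when the index is out of range instead of crashing
      result = element.xpath(xpath_spec)
      if -len(result) <= index < len(result):
          return result[index]
      if default is not _NOTSET:
          return default
      raise ValueError('XPath %r: no result at index %d' % (xpath_spec, index))

  dom = html.fromstring('<div><img alt="a thumbnail" class="rg_i"/></div>')
  print(eval_xpath_getindex(dom, '//img/@alt', 0))               # a thumbnail
  print(eval_xpath_getindex(dom, '//img/@id', 0, default=None))  # None

This is also why the outer try/except block in google_images could go: a
scraping failure now surfaces as a precise exception instead of being caught
and logged per result.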
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -2,13 +2,16 @@
 """Google (News)
 
 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.  Not all parameters can be appied, e.g. num_ (the number of
-search results to return) is ignored.
+Definitions`_.  Not all parameters can be appied:
+
+- num_ : the number of search results is ignored
+- save_ : is ignored / Google-News results are always *SafeSearch*
 
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
 
 """
@@ -32,20 +35,19 @@ from searx.utils import (
 from searx.engines.google import (
     supported_languages_url,
     _fetch_supported_languages,
-    detect_google_sorry,
 )
 # pylint: enable=unused-import
 
 from searx.engines.google import (
-    get_lang_country,
-    filter_mapping,
+    get_lang_info,
+    detect_google_sorry,
 )
 
 # about
 about = {
     "website": 'https://news.google.com',
     "wikidata_id": 'Q12020',
-    "official_api_documentation": None,
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
@@ -69,51 +71,53 @@ paging = False
 language_support = True
 use_locale_domain = True
 time_range_support = True
-safesearch = True # not really, but it is not generated by google
+
+# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
+# False here, otherwise checker will report safesearch-errors::
+#
+#  safesearch : results are identitical for safesearch=0 and safesearch=2
+safesearch = False
 
 def request(query, params):
     """Google-News search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'news.google.com'
 
-    if params['time_range']: # in time_range_dict:
+    # google news has only one domain
+    lang_info['subdomain'] = 'news.google.com'
+
+    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+
+    # google news redirects en to en-US
+    if lang_info['hl'] == 'en':
+        lang_info['hl'] = 'en-US'
+
+    # Very special to google-news compared to other google engines, the time
+    # range is included in the search term.
+    if params['time_range']:
         query += ' ' + time_range_dict[params['time_range']]
 
-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': language,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
-        'ceid' : "%s:%s" % (country, language),
-        'gl' : country,
-    })
+        'gl': lang_info['country'],
+    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded
 
-    if params['safesearch']:
-        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
-
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-        )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
 
-    # hl=en redirect to hl=en-US / en-CA ...
-    params['soft_max_redirects'] = 1
-
-    #params['google_subdomain'] = subdomain
-
     return params
@@ -123,9 +127,6 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
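A detail worth calling out in the news request above: ceid is concatenated
after urlencode() because, as the in-line comment says, its ':' separator must
not be percent-escaped. A quick demonstration with assumed values::

  from urllib.parse import urlencode

  country, language = 'US', 'en'   # assumed lang_info values
  ceid = "%s:%s" % (country, language)

  print(urlencode({'ceid': ceid}))   # ceid=US%3Aen  (colon escaped)
  print('&ceid=%s' % ceid)           # &ceid=US:en   (what the engine appends)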
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
-Google (Viedo)
+"""Google (Video)
 
 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.  Not all parameters can be appied.
@@ -22,20 +21,19 @@ Definitions`_.  Not all parameters can be appied.
 # pylint: disable=invalid-name, missing-function-docstring
 
 import re
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
 from lxml import html
 
 from searx import logger
-from searx.exceptions import SearxEngineCaptchaException
 from searx.utils import (
     eval_xpath,
     eval_xpath_list,
+    eval_xpath_getindex,
     extract_text,
 )
 
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     filter_mapping,
     results_xpath,
@@ -44,7 +42,8 @@ from searx.engines.google import (
     href_xpath,
     content_xpath,
     suggestion_xpath,
-    spelling_suggestion_xpath
+    spelling_suggestion_xpath,
+    detect_google_sorry,
 )
 
 # pylint: disable=unused-import
@@ -58,12 +57,10 @@ from searx.engines.google import (
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q219885',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
-    "template": 'video.html',
-    "parse": ('url', 'title', 'content', 'thumbnail')
 }
 
 logger = logger.getChild('google video')
@@ -90,7 +87,7 @@ def scrap_out_thumbs(dom):
     ret_val = dict()
     thumb_name = 'vidthumb'
 
-    for script in eval_xpath(dom, '//script[contains(., "_setImagesSrc")]'):
+    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
         _script = script.text
 
         # var s='data:image/jpeg;base64, ...'
@@ -104,7 +101,7 @@ def scrap_out_thumbs(dom):
             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
 
     # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
-    for script in eval_xpath(dom, '//script[contains(., "google.ldi={")]'):
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
         _script = script.text
         for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
             match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
@@ -119,17 +116,16 @@ def scrap_out_thumbs(dom):
 def request(query, params):
     """Google-Video search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q':   query,
         'tbm': "vid",
-        'hl':  lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
     })
@@ -139,18 +135,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
-    #params['google_subdomain'] = subdomain
     return params
@@ -158,16 +150,7 @@ def response(resp):
     """Get response from google's search request"""
     results = []
 
-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
-
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
+    detect_google_sorry(resp)
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
@@ -181,19 +164,18 @@ def response(resp):
             logger.debug("ingoring <g-section-with-header>")
             continue
 
-        title = extract_text(eval_xpath(result, title_xpath)[0])
-        url = eval_xpath(result, href_xpath)[0]
-        c_node = eval_xpath(result, content_xpath)[0]
+        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
+        url = eval_xpath_getindex(result, href_xpath, 0)
+        c_node = eval_xpath_getindex(result, content_xpath, 0)
 
         # <img id="vidthumb1" ...>
-        img_id = eval_xpath(c_node, './div[1]//a/g-img/img/@id')
-        if not img_id:
+        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        if img_id is None:
             continue
-        img_id = img_id[0]
         img_src = vidthumb_imgdata.get(img_id, None)
         if not img_src:
             logger.error("no vidthumb imgdata for: %s" % img_id)
-            img_src = eval_xpath(c_node, './div[1]//a/g-img/img/@src')[0]
+            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
 
         length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
         content = extract_text(eval_xpath(c_node, './/div[2]/span'))
@@ -210,11 +192,11 @@ def response(resp):
             })
 
     # parse suggestion
-    for suggestion in eval_xpath(dom, suggestion_xpath):
+    for suggestion in eval_xpath_list(dom, suggestion_xpath):
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath(dom, spelling_suggestion_xpath):
+    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
         results.append({'correction': extract_text(correction)})
 
     return results
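For orientation, a simplified sketch (not the committed code) of the
_setImagesSrc scraping that scrap_out_thumbs() performs, run over a made-up
inline-script sample; the real function iterates all matching <script> nodes
via eval_xpath_list() and handles a second google.ldi variant::

  import re

  # made-up sample of an inline script block, as hinted by the comments above
  script = r"var s='data:image/jpeg;base64,iVBORw0\x3d\x3d';var ii=['vidthumb4'];_setImagesSrc(ii,s);"

  _imgdata = re.findall(r"s='([^']*)'", script)
  _vidthumb = re.findall(r"ii=\['(vidthumb\d+)'\]", script)

  thumbs = {}
  if _imgdata and _vidthumb:
      # undo the \x3d escaping of the base64 '=' padding
      thumbs[_vidthumb[0]] = _imgdata[0].replace(r"\x3d", "=")

  print(thumbs)   # {'vidthumb4': 'data:image/jpeg;base64,iVBORw0=='}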