Mirror of https://github.com/searxng/searxng, synced 2024-01-01 19:24:07 +01:00
[fix] normalize the language & region aspects of all google engines

BTW: make the engines ready for search.checker:

- replace eval_xpath by eval_xpath_getindex and eval_xpath_list
- google_images: remove outer try/except block

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent 923b490022
commit b1fefec40d

4 changed files with 187 additions and 179 deletions
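The heart of the change is the new get_lang_info() helper in searx/engines/google.py (first diff below): instead of the old (language, country, lang_country) tuple, each google engine now receives a single dict that bundles every language/region aspect of a request. A minimal sketch of the dict shape, assuming a request language of 'fr-CH', with 'fr' in the engine's supported-language list and google_domains mapping 'CH' to 'google.ch'::

    lang_info = {
        'language': 'fr',              # normalized via match_language()
        'country': 'CH',               # region part of the requested locale
        'subdomain': 'www.google.ch',  # google_domains.get('CH', 'google.com')
        'hl': 'fr',                    # interface-language URL parameter
        'lr': 'lang_fr',               # language-restrict URL parameter
        'Accept-Language': 'fr-CH,fr;q=0.8,,en;q=0.6,*;q=0.5',
    }

Note the doubled comma in the Accept-Language value: the ','.join() in get_lang_info() operates on a 'fr;q=0.8,' entry that already carries a trailing comma.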
searx/engines/google.py

@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Google (Web)
 
- For detailed description of the *REST-full* API see: `Query Parameter
- Definitions`_.
+For detailed description of the *REST-full* API see: `Query Parameter
+Definitions`_.
 
- .. _Query Parameter Definitions:
+.. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 """
 
@@ -16,7 +16,6 @@ from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.exceptions import SearxEngineCaptchaException
 
-
 logger = logger.getChild('google engine')
 
 # about
@@ -56,7 +55,7 @@ google_domains = {
     'NZ': 'google.co.nz',   # New Zealand
     'PH': 'google.com.ph',  # Philippines
     'SG': 'google.com.sg',  # Singapore
-    # 'US': 'google.us',    # United States, redirect to .com
+    'US': 'google.com',     # United States (google.us) redirects to .com
     'ZA': 'google.co.za',   # South Africa
     'AR': 'google.com.ar',  # Argentina
     'CL': 'google.cl',      # Chile
@@ -87,7 +86,7 @@ google_domains = {
     'TH': 'google.co.th',   # Thailand
     'TR': 'google.com.tr',  # Turkey
     'UA': 'google.com.ua',  # Ukraine
-    # 'CN': 'google.cn',    # China, only from China ?
+    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk',  # Hong Kong
     'TW': 'google.com.tw'   # Taiwan
 }
@@ -134,26 +133,58 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
 spelling_suggestion_xpath = '//div[@class="med"]/p/a'
 
 
-def get_lang_country(params, lang_list, custom_aliases):
-    """Returns a tuple with *langauage* on its first and *country* on its second
-    position."""
-    language = params['language']
-    if language == 'all':
-        language = 'en-US'
+def get_lang_info(params, lang_list, custom_aliases):
+    ret_val = {}
 
-    language_array = language.split('-')
+    _lang = params['language']
+    if _lang.lower() == 'all':
+        _lang = 'en-US'
 
-    if len(language_array) == 2:
-        country = language_array[1]
+    language = match_language(_lang, lang_list, custom_aliases)
+    ret_val['language'] = language
+
+    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
+    _l = _lang.split('-')
+
+    # the country code (US, AT, CA)
+    if len(_l) == 2:
+        country = _l[1]
     else:
-        country = language_array[0].upper()
+        country = _l[0].upper()
         if country == 'EN':
             country = 'US'
 
-    language = match_language(language, lang_list, custom_aliases)
+    ret_val['country'] = country
+
+    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
     lang_country = '%s-%s' % (language, country)
     if lang_country == 'en-EN':
         lang_country = 'en'
 
-    return language, country, lang_country
+    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
+    ret_val['Accept-Language'] = ','.join([
+        lang_country,
+        language + ';q=0.8,',
+        'en;q=0.6',
+        '*;q=0.5',
+    ])
+
+    # subdomain
+    ret_val['subdomain']  = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    # hl parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
+    # Interface Language:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+
+    ret_val['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    # Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+
+    ret_val['lr'] = "lang_" + lang_list.get(lang_country, language)
+
+    return ret_val
 
 def detect_google_sorry(resp):
     resp_url = urlparse(resp.url)
@@ -165,17 +196,17 @@ def request(query, params):
     """Google search request"""
 
     offset = (params['pageno'] - 1) * 10
-    language, country, lang_country = get_lang_country(
+
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'start': offset,
@@ -186,19 +217,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain
 
     return params
 
@@ -209,8 +235,6 @@ def response(resp):
     detect_google_sorry(resp)
 
     results = []
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
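With lang_info in place, request() assembles the query URL from the resolved subdomain plus the hl and lr parameters. A short sketch of the resulting URL, reusing the 'fr-CH' values from the note above (the real code appends the safe filter separately when safesearch is enabled)::

    from urllib.parse import urlencode

    query_url = 'https://www.google.ch/search?' + urlencode({
        'q': 'corona', 'hl': 'fr', 'lr': 'lang_fr',
        'ie': 'utf8', 'oe': 'utf8', 'start': 0,
    })
    # https://www.google.ch/search?q=corona&hl=fr&lr=lang_fr&ie=utf8&oe=utf8&start=0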
searx/engines/google_images.py

@@ -10,35 +10,50 @@ Definitions`_.
    ``data:` scheme).::
 
      Header set Content-Security-Policy "img-src 'self' data: ;"
 
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 .. _data URLs:
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 """
 
 from urllib.parse import urlencode, unquote
 from lxml import html
 
 from searx import logger
-from searx.utils import extract_text, eval_xpath
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
 
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     detect_google_sorry,
 )
 
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    ,  _fetch_supported_languages
+)
+# pylint: enable=unused-import
+
 logger = logger.getChild('google images')
 
 # about
 about = {
-    "website": 'https://images.google.com/',
+    "website": 'https://images.google.com',
     "wikidata_id": 'Q521550',
-    "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions',  # NOQA
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }
 
 # engine dependent config
+
 categories = ['images']
 paging = False
 language_support = True
@@ -84,17 +99,16 @@ def scrap_img_by_id(script, data_id):
 def request(query, params):
     """Google-Video search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
        'q': query,
        'tbm': "isch",
-       'hl': lang_country,
-       'lr': "lang_" + language,
+       'hl': lang_info['hl'],
+       'lr': lang_info['lr'],
        'ie': "utf8",
        'oe': "utf8",
        'num': 30,
@@ -105,17 +119,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain
     return params
 
 
@@ -125,13 +136,11 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
     img_bas64_map = scrap_out_thumbs(dom)
-    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
+    img_src_script = eval_xpath_getindex(
+        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text
 
     # parse results
     #
@@ -156,10 +165,9 @@ def response(resp):
         return results
 
     root = root[0]
-    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
+    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
 
-        try:
-            img_alt = eval_xpath(img_node, '@alt')[0]
+        img_alt = eval_xpath_getindex(img_node, '@alt', 0)
 
         img_base64_id = eval_xpath(img_node, '@data-iid')
         if img_base64_id:
@@ -174,8 +182,8 @@ def response(resp):
             else:
                 thumbnail_src = ''
 
-            link_node = eval_xpath(img_node, '../../../a[2]')[0]
-            url = eval_xpath(link_node, '@href')[0]
+        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
+        url = eval_xpath_getindex(link_node, '@href', 0)
 
         pub_nodes = eval_xpath(link_node, './div/div')
         pub_descr = img_alt
@@ -184,7 +192,7 @@ def response(resp):
             pub_descr = extract_text(pub_nodes[0])
             pub_source = extract_text(pub_nodes[1])
 
-            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
+        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
         src_url = scrap_img_by_id(img_src_script, img_src_id)
         if not src_url:
             src_url = thumbnail_src
@@ -199,12 +207,5 @@ def response(resp):
             'thumbnail_src': thumbnail_src,
             'template': 'images.html'
         })
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error(e, exc_info=True)
-            # from lxml import etree
-            # logger.debug(etree.tostring(img_node, pretty_print=True))
-            # import pdb
-            # pdb.set_trace()
-            continue
 
     return results
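google_images no longer wraps the whole result loop in a broad try/except; out-of-range XPath results are handled by eval_xpath_getindex(), which either raises immediately or returns a caller-supplied default. The real helper lives in searx.utils (and raises a searx-specific exception); the following stand-in, with hypothetical test markup, only sketches the pattern::

    from lxml import html

    _NOTSET = object()

    def eval_xpath_getindex(element, xpath_spec, index, default=_NOTSET):
        # evaluate the XPath and pick a single result from the list
        results = element.xpath(xpath_spec)
        if -len(results) <= index < len(results):
            return results[index]
        if default is _NOTSET:
            # no default given: fail loudly, as the old ``[0]`` indexing did
            raise IndexError('no result at index %d for %r' % (index, xpath_spec))
        return default

    dom = html.fromstring('<div><img alt="a thumbnail"/></div>')
    img = eval_xpath_getindex(dom, '//img', 0)                  # the <img> node
    alt = eval_xpath_getindex(img, '@alt', 0)                   # 'a thumbnail'
    vid = eval_xpath_getindex(dom, '//video', 0, default=None)  # None, no raise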
searx/engines/google_news.py

@@ -2,13 +2,16 @@
 """Google (News)
 
 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.  Not all parameters can be appied, e.g. num_ (the number of
-search results to return) is ignored.
+Definitions`_.  Not all parameters can be appied:
+
+- num_ : the number of search results is ignored
+- save_ : is ignored / Google-News results are always *SafeSearch*
 
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
 
 """
@@ -32,20 +35,19 @@ from searx.utils import (
 from searx.engines.google import (
     supported_languages_url,
     _fetch_supported_languages,
-    detect_google_sorry,
 )
 # pylint: enable=unused-import
 
 from searx.engines.google import (
-    get_lang_country,
     filter_mapping,
+    get_lang_info,
+    detect_google_sorry,
 )
 
 # about
 about = {
     "website": 'https://news.google.com',
     "wikidata_id": 'Q12020',
-    "official_api_documentation": None,
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
@@ -69,51 +71,53 @@ paging = False
 language_support = True
 use_locale_domain = True
 time_range_support = True
-safesearch = True # not really, but it is not generated by google
+
+# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
+# False here, otherwise checker will report safesearch-errors::
+#
+#  safesearch : results are identitical for safesearch=0 and safesearch=2
+safesearch = False
 
 def request(query, params):
     """Google-News search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'news.google.com'
 
-    if params['time_range']: # in time_range_dict:
+    # google news has only one domain
+    lang_info['subdomain'] = 'news.google.com'
+
+    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+
+    # google news redirects en to en-US
+    if lang_info['hl'] == 'en':
+        lang_info['hl'] = 'en-US'
+
+    # Very special to google-news compared to other google engines, the time
+    # range is included in the search term.
+    if params['time_range']:
         query += ' ' + time_range_dict[params['time_range']]
 
-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': language,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
-        'ceid' : "%s:%s" % (country, language),
-        'gl' : country,
-    })
+        'gl': lang_info['country'],
+    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded
 
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-        )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
 
-    #params['google_subdomain'] = subdomain
+    # hl=en redirect to hl=en-US / en-CA ...
+    params['soft_max_redirects'] = 1
 
     return params
@@ -123,9 +127,6 @@ def response(resp):
 
     detect_google_sorry(resp)
 
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
 
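The inline comment on the news request explains why ceid is appended by hand: urlencode() would percent-encode the ':' inside the value. A two-line check (hypothetical values)::

    from urllib.parse import urlencode

    ceid = '%s:%s' % ('US', 'en')
    print(urlencode({'ceid': ceid}))  # ceid=US%3Aen -- ':' gets encoded
    print('&ceid=%s' % ceid)          # &ceid=US:en  -- appended verbatim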
searx/engines/google_videos.py

@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
-Google (Viedo)
+"""Google (Video)
 
 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.  Not all parameters can be appied.
@@ -22,20 +21,19 @@ Definitions`_.  Not all parameters can be appied.
 # pylint: disable=invalid-name, missing-function-docstring
 
 import re
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
 from lxml import html
 
 from searx import logger
-from searx.exceptions import SearxEngineCaptchaException
 from searx.utils import (
     eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
     extract_text,
 )
 
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     filter_mapping,
     results_xpath,
@@ -44,7 +42,8 @@ from searx.engines.google import (
     href_xpath,
     content_xpath,
     suggestion_xpath,
-    spelling_suggestion_xpath
+    spelling_suggestion_xpath,
+    detect_google_sorry,
 )
 
 # pylint: disable=unused-import
@@ -58,12 +57,10 @@ from searx.engines.google import (
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q219885',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
-    "template": 'video.html',
-    "parse": ('url', 'title', 'content', 'thumbnail')
 }
 
 logger = logger.getChild('google video')
@@ -90,7 +87,7 @@ def scrap_out_thumbs(dom):
     ret_val = dict()
     thumb_name = 'vidthumb'
 
-    for script in eval_xpath(dom, '//script[contains(., "_setImagesSrc")]'):
+    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
         _script = script.text
 
         # var s='data:image/jpeg;base64, ...'
@@ -104,7 +101,7 @@ def scrap_out_thumbs(dom):
             ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
 
     # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
-    for script in eval_xpath(dom, '//script[contains(., "google.ldi={")]'):
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
         _script = script.text
         for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
             match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
@@ -119,17 +116,16 @@ def scrap_out_thumbs(dom):
 def request(query, params):
     """Google-Video search request"""
 
-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')
 
-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q':   query,
         'tbm': "vid",
-        'hl':  lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
     })
@@ -139,18 +135,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
 
-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url
 
-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
         )
-    #params['google_subdomain'] = subdomain
     return params
 
 
@@ -158,16 +150,7 @@ def response(resp):
     """Get response from google's search request"""
     results = []
 
-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
-
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
+    detect_google_sorry(resp)
 
     # convert the text to dom
     dom = html.fromstring(resp.text)
@@ -181,19 +164,18 @@ def response(resp):
             logger.debug("ingoring <g-section-with-header>")
             continue
 
-        title = extract_text(eval_xpath(result, title_xpath)[0])
-        url = eval_xpath(result, href_xpath)[0]
-        c_node = eval_xpath(result, content_xpath)[0]
+        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
+        url = eval_xpath_getindex(result, href_xpath, 0)
+        c_node = eval_xpath_getindex(result, content_xpath, 0)
 
         # <img id="vidthumb1" ...>
-        img_id = eval_xpath(c_node, './div[1]//a/g-img/img/@id')
-        if not img_id:
+        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        if img_id is None:
             continue
-        img_id = img_id[0]
         img_src = vidthumb_imgdata.get(img_id, None)
         if not img_src:
             logger.error("no vidthumb imgdata for: %s" % img_id)
-            img_src = eval_xpath(c_node, './div[1]//a/g-img/img/@src')[0]
+            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)
 
         length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
         content = extract_text(eval_xpath(c_node, './/div[2]/span'))
@@ -210,11 +192,11 @@ def response(resp):
             })
 
     # parse suggestion
-    for suggestion in eval_xpath(dom, suggestion_xpath):
+    for suggestion in eval_xpath_list(dom, suggestion_xpath):
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
-    for correction in eval_xpath(dom, spelling_suggestion_xpath):
+    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
         results.append({'correction': extract_text(correction)})
 
     return results
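All four engines now share one CAPTCHA check. Reconstructed from the inline block removed in the last file, detect_google_sorry() amounts to roughly the following (the actual helper in searx.engines.google may differ in detail)::

    from urllib.parse import urlparse

    from searx.exceptions import SearxEngineCaptchaException

    def detect_google_sorry(resp):
        # Google redirects to sorry.google.com or to a /sorry/... path
        # when it suspects automated traffic
        resp_url = urlparse(resp.url)
        if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
            raise SearxEngineCaptchaException()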