[fix] normalize the language & region aspects of all google engines

BTW: make the engines ready for search.checker:

- replace eval_xpath by eval_xpath_getindex and eval_xpath_list
- google_images: remove outer try/except block

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Author: Markus Heiser, 2021-01-26 11:49:27 +01:00
Commit: b1fefec40d (parent: 923b490022)
4 changed files with 187 additions and 179 deletions
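
For readers not familiar with the helpers named in the commit message: the sketch below illustrates how eval_xpath_list and eval_xpath_getindex from searx.utils are used in this diff, compared to plain eval_xpath. The semantics are inferred from the call sites in the hunks below (notably the `default=` keyword); the HTML snippet is made up for illustration::

    from lxml import html
    from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex

    dom = html.fromstring('<html><body><p class="g">a</p><p class="g">b</p></body></html>')

    # eval_xpath: plain XPath evaluation, the result type depends on the expression
    nodes = eval_xpath(dom, '//p[@class="g"]')

    # eval_xpath_list: same evaluation, but the result is guaranteed to be a list,
    # so iterating it is safe for the checker
    for node in eval_xpath_list(dom, '//p[@class="g"]'):
        print(node.text)

    # eval_xpath_getindex: evaluate and pick one index; with `default` given it
    # returns that value instead of raising when the index does not exist -- this
    # is what replaces the broad try/except removed from google_images
    missing = eval_xpath_getindex(dom, '//title', 0, default=None)   # -> None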

searx/engines/google.py

@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Google (Web)
 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.
 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 """
 # pylint: disable=invalid-name, missing-function-docstring
@@ -16,7 +16,6 @@ from searx import logger
 from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
 from searx.exceptions import SearxEngineCaptchaException

 logger = logger.getChild('google engine')

 # about
@@ -56,7 +55,7 @@ google_domains = {
     'NZ': 'google.co.nz',   # New Zealand
     'PH': 'google.com.ph',  # Philippines
     'SG': 'google.com.sg',  # Singapore
-    # 'US': 'google.us',    # United States, redirect to .com
+    'US': 'google.com',     # United States (google.us) redirects to .com
     'ZA': 'google.co.za',   # South Africa
     'AR': 'google.com.ar',  # Argentina
     'CL': 'google.cl',      # Chile
@@ -87,7 +86,7 @@ google_domains = {
     'TH': 'google.co.th',   # Thailand
     'TR': 'google.com.tr',  # Turkey
     'UA': 'google.com.ua',  # Ukraine
-    # 'CN': 'google.cn',    # China, only from China ?
+    'CN': 'google.com.hk',  # There is no google.cn, we use .com.hk for zh-CN
     'HK': 'google.com.hk',  # Hong Kong
     'TW': 'google.com.tw'   # Taiwan
 }
@@ -134,26 +133,58 @@ suggestion_xpath = '//div[contains(@class, "card-section")]//a'
 spelling_suggestion_xpath = '//div[@class="med"]/p/a'

-def get_lang_country(params, lang_list, custom_aliases):
-    """Returns a tuple with *langauage* on its first and *country* on its second
-    position."""
-    language = params['language']
-    if language == 'all':
-        language = 'en-US'
-    language_array = language.split('-')
-
-    if len(language_array) == 2:
-        country = language_array[1]
-    else:
-        country = language_array[0].upper()
-
-    language = match_language(language, lang_list, custom_aliases)
-    lang_country = '%s-%s' % (language, country)
-    if lang_country == 'en-EN':
-        lang_country = 'en'
-
-    return language, country, lang_country
+def get_lang_info(params, lang_list, custom_aliases):
+    ret_val = {}
+
+    _lang = params['language']
+    if _lang.lower() == 'all':
+        _lang = 'en-US'
+
+    language = match_language(_lang, lang_list, custom_aliases)
+    ret_val['language'] = language
+
+    # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...)
+    _l = _lang.split('-')
+
+    # the country code (US, AT, CA)
+    if len(_l) == 2:
+        country = _l[1]
+    else:
+        country = _l[0].upper()
+        if country == 'EN':
+            country = 'US'
+
+    ret_val['country'] = country
+
+    # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR)
+    lang_country = '%s-%s' % (language, country)
+
+    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
+    ret_val['Accept-Language'] = ','.join([
+        lang_country,
+        language + ';q=0.8,',
+        'en;q=0.6',
+        '*;q=0.5',
+    ])
+
+    # subdomain
+    ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
+
+    # hl parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#hlsp The
+    # Interface Language:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
+    ret_val['hl'] = lang_list.get(lang_country, language)
+
+    # lr parameter:
+    #   https://developers.google.com/custom-search/docs/xml_results#lrsp
+    # Language Collection Values:
+    #   https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
+    ret_val['lr'] = "lang_" + lang_list.get(lang_country, language)
+
+    return ret_val

 def detect_google_sorry(resp):
     resp_url = urlparse(resp.url)
@@ -165,17 +196,17 @@ def request(query, params):
     """Google search request"""
     offset = (params['pageno'] - 1) * 10

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    # https://www.google.de/search?q=corona&hl=de-DE&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'start': offset,
@@ -186,19 +217,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain

     return params
@@ -209,8 +235,6 @@ def response(resp):
     detect_google_sorry(resp)
     results = []

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
     # convert the text to dom
     dom = html.fromstring(resp.text)
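
To make the new interface easier to follow: get_lang_info() above no longer returns a (language, country, lang_country) tuple but a dict that the request() functions of all google engines index directly. A rough usage sketch; the concrete values are illustrative assumptions and depend on the supported_languages and language_aliases loaded at runtime::

    # hypothetical call, e.g. for a request with params['language'] = 'en-NZ'
    lang_info = get_lang_info(params, supported_languages, language_aliases)

    # keys provided by the new helper (see the function body above):
    #   lang_info['language']         e.g. 'en'
    #   lang_info['country']          e.g. 'NZ'
    #   lang_info['subdomain']        e.g. 'www.google.co.nz' (looked up in google_domains)
    #   lang_info['hl']               interface language for the hl= URL parameter
    #   lang_info['lr']               language collection, e.g. 'lang_en'
    #   lang_info['Accept-Language']  ready-to-use HTTP header value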

searx/engines/google_images.py

@@ -10,35 +10,50 @@ Definitions`_.
 ``data:` scheme).::
     Header set Content-Security-Policy "img-src 'self' data: ;"

+.. _Query Parameter Definitions:
+   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
+.. _data URLs:
+   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
 """

 from urllib.parse import urlencode, unquote
 from lxml import html

 from searx import logger
-from searx.utils import extract_text, eval_xpath
-from searx.engines.google import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     detect_google_sorry,
 )
+
+# pylint: disable=unused-import
+from searx.engines.google import (
+    supported_languages_url
+    , _fetch_supported_languages
+)
+# pylint: enable=unused-import

 logger = logger.getChild('google images')

 # about
 about = {
-    "website": 'https://images.google.com/',
+    "website": 'https://images.google.com',
     "wikidata_id": 'Q521550',
-    "official_api_documentation": 'https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions',  # NOQA
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
 }

 # engine dependent config
 categories = ['images']
 paging = False
 language_support = True
@@ -84,17 +99,16 @@ def scrap_img_by_id(script, data_id):
 def request(query, params):
     """Google-Video search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
         'tbm': "isch",
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
         'num': 30,
@@ -105,17 +119,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    # params['google_subdomain'] = subdomain

     return params
@@ -125,13 +136,11 @@ def response(resp):
     detect_google_sorry(resp)

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
     img_bas64_map = scrap_out_thumbs(dom)
-    img_src_script = eval_xpath(dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text
+    img_src_script = eval_xpath_getindex(
+        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text

     # parse results
     #
@@ -156,55 +165,47 @@ def response(resp):
         return results
     root = root[0]

-    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):
-        try:
-            img_alt = eval_xpath(img_node, '@alt')[0]
-
-            img_base64_id = eval_xpath(img_node, '@data-iid')
-            if img_base64_id:
-                img_base64_id = img_base64_id[0]
-                thumbnail_src = img_bas64_map[img_base64_id]
-            else:
-                thumbnail_src = eval_xpath(img_node, '@src')
-                if not thumbnail_src:
-                    thumbnail_src = eval_xpath(img_node, '@data-src')
-                if thumbnail_src:
-                    thumbnail_src = thumbnail_src[0]
-                else:
-                    thumbnail_src = ''
-
-            link_node = eval_xpath(img_node, '../../../a[2]')[0]
-            url = eval_xpath(link_node, '@href')[0]
-
-            pub_nodes = eval_xpath(link_node, './div/div')
-            pub_descr = img_alt
-            pub_source = ''
-            if pub_nodes:
-                pub_descr = extract_text(pub_nodes[0])
-                pub_source = extract_text(pub_nodes[1])
-
-            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
-            src_url = scrap_img_by_id(img_src_script, img_src_id)
-            if not src_url:
-                src_url = thumbnail_src
-
-            results.append({
-                'url': url,
-                'title': img_alt,
-                'content': pub_descr,
-                'source': pub_source,
-                'img_src': src_url,
-                # 'img_format': img_format,
-                'thumbnail_src': thumbnail_src,
-                'template': 'images.html'
-            })
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error(e, exc_info=True)
-            # from lxml import etree
-            # logger.debug(etree.tostring(img_node, pretty_print=True))
-            # import pdb
-            # pdb.set_trace()
-            continue
+    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):
+        img_alt = eval_xpath_getindex(img_node, '@alt', 0)
+
+        img_base64_id = eval_xpath(img_node, '@data-iid')
+        if img_base64_id:
+            img_base64_id = img_base64_id[0]
+            thumbnail_src = img_bas64_map[img_base64_id]
+        else:
+            thumbnail_src = eval_xpath(img_node, '@src')
+            if not thumbnail_src:
+                thumbnail_src = eval_xpath(img_node, '@data-src')
+            if thumbnail_src:
+                thumbnail_src = thumbnail_src[0]
+            else:
+                thumbnail_src = ''
+
+        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
+        url = eval_xpath_getindex(link_node, '@href', 0)
+
+        pub_nodes = eval_xpath(link_node, './div/div')
+        pub_descr = img_alt
+        pub_source = ''
+        if pub_nodes:
+            pub_descr = extract_text(pub_nodes[0])
+            pub_source = extract_text(pub_nodes[1])
+
+        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
+        src_url = scrap_img_by_id(img_src_script, img_src_id)
+        if not src_url:
+            src_url = thumbnail_src
+
+        results.append({
+            'url': url,
+            'title': img_alt,
+            'content': pub_descr,
+            'source': pub_source,
+            'img_src': src_url,
+            # 'img_format': img_format,
+            'thumbnail_src': thumbnail_src,
+            'template': 'images.html'
+        })

     return results

searx/engines/google_news.py

@@ -2,13 +2,16 @@
 """Google (News)

 For detailed description of the *REST-full* API see: `Query Parameter
-Definitions`_.  Not all parameters can be appied, e.g. num_ (the number of
-search results to return) is ignored.
+Definitions`_.  Not all parameters can be appied:
+
+- num_ : the number of search results is ignored
+- save_ : is ignored / Google-News results are always *SafeSearch*

 .. _Query Parameter Definitions:
    https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
+.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp

 """
@@ -32,20 +35,19 @@ from searx.utils import (
 from searx.engines.google import (
     supported_languages_url,
     _fetch_supported_languages,
-    detect_google_sorry,
 )
 # pylint: enable=unused-import

 from searx.engines.google import (
-    get_lang_country,
-    filter_mapping,
+    get_lang_info,
+    detect_google_sorry,
 )

 # about
 about = {
     "website": 'https://news.google.com',
     "wikidata_id": 'Q12020',
-    "official_api_documentation": None,
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
@@ -69,51 +71,53 @@ paging = False
 language_support = True
 use_locale_domain = True
 time_range_support = True
-safesearch = True  # not really, but it is not generated by google
+
+# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
+# False here, otherwise checker will report safesearch-errors::
+#
+#  safesearch : results are identitical for safesearch=0 and safesearch=2
+safesearch = False

 def request(query, params):
     """Google-News search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'news.google.com'

-    if params['time_range']:  # in time_range_dict:
+    # google news has only one domain
+    lang_info['subdomain'] = 'news.google.com'
+
+    ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
+
+    # google news redirects en to en-US
+    if lang_info['hl'] == 'en':
+        lang_info['hl'] = 'en-US'
+
+    # Very special to google-news compared to other google engines, the time
+    # range is included in the search term.
+    if params['time_range']:
         query += ' ' + time_range_dict[params['time_range']]

-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
-        'hl': language,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
-        'ceid' : "%s:%s" % (country, language),
-        'gl' : country,
-    })
-
-    if params['safesearch']:
-        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
+        'gl': lang_info['country'],
+    }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
-    )
-    logger.debug("HTTP header Accept-Language --> %s",
-                 params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )

-    # hl=en redirect to hl=en-US / en-CA ...
-    params['soft_max_redirects'] = 1
-
-    #params['google_subdomain'] = subdomain
     return params
@@ -123,9 +127,6 @@ def response(resp):
     detect_google_sorry(resp)

-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
-
     # convert the text to dom
     dom = html.fromstring(resp.text)
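
A note on the ceid handling above: urlencode() percent-encodes the colon, and the comment in the diff states that ceid must not be urlencoded, which is why the parameter is appended by hand after the encoded query string. A minimal demonstration with the standard library only::

    from urllib.parse import urlencode

    ceid = "US:en"
    print(urlencode({'ceid': ceid}))   # -> ceid=US%3Aen  (':' gets quoted)
    print('&ceid=%s' % ceid)           # -> &ceid=US:en   (what the engine appends)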

searx/engines/google_videos.py

@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
-Google (Viedo)
+"""Google (Video)

 For detailed description of the *REST-full* API see: `Query Parameter
 Definitions`_.  Not all parameters can be appied.
@@ -22,20 +21,19 @@ Definitions`_. Not all parameters can be appied.
 # pylint: disable=invalid-name, missing-function-docstring

 import re
-from urllib.parse import urlencode, urlparse
+from urllib.parse import urlencode
 from lxml import html

 from searx import logger
-from searx.exceptions import SearxEngineCaptchaException
 from searx.utils import (
     eval_xpath,
     eval_xpath_list,
+    eval_xpath_getindex,
     extract_text,
 )
 from searx.engines.google import (
-    get_lang_country,
-    google_domains,
+    get_lang_info,
     time_range_dict,
     filter_mapping,
     results_xpath,
@@ -44,7 +42,8 @@ from searx.engines.google import (
     href_xpath,
     content_xpath,
     suggestion_xpath,
-    spelling_suggestion_xpath
+    spelling_suggestion_xpath,
+    detect_google_sorry,
 )

 # pylint: disable=unused-import
@@ -58,12 +57,10 @@ from searx.engines.google import (
 about = {
     "website": 'https://www.google.com',
     "wikidata_id": 'Q219885',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "official_api_documentation": 'https://developers.google.com/custom-search',
     "use_official_api": False,
     "require_api_key": False,
     "results": 'HTML',
-    "template": 'video.html',
-    "parse": ('url', 'title', 'content', 'thumbnail')
 }

 logger = logger.getChild('google video')
@@ -90,7 +87,7 @@ def scrap_out_thumbs(dom):
     ret_val = dict()
     thumb_name = 'vidthumb'

-    for script in eval_xpath(dom, '//script[contains(., "_setImagesSrc")]'):
+    for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
         _script = script.text
         # var s='data:image/jpeg;base64, ...'
@@ -104,7 +101,7 @@ def scrap_out_thumbs(dom):
         ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")

     # {google.ldidly=-1;google.ldi={"vidthumb8":"https://...
-    for script in eval_xpath(dom, '//script[contains(., "google.ldi={")]'):
+    for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
         _script = script.text
         for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) :
             match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val)
@@ -119,17 +116,16 @@ def scrap_out_thumbs(dom):
 def request(query, params):
     """Google-Video search request"""

-    language, country, lang_country = get_lang_country(
+    lang_info = get_lang_info(
         # pylint: disable=undefined-variable
         params, supported_languages, language_aliases
     )
-    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

-    query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({
+    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
         'q': query,
         'tbm': "vid",
-        'hl': lang_country,
-        'lr': "lang_" + language,
+        'hl': lang_info['hl'],
+        'lr': lang_info['lr'],
         'ie': "utf8",
         'oe': "utf8",
     })
@@ -139,18 +135,14 @@ def request(query, params):
     if params['safesearch']:
         query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

-    params['url'] = query_url
     logger.debug("query_url --> %s", query_url)
+    params['url'] = query_url

-    # en-US,en;q=0.8,en;q=0.5
-    params['headers']['Accept-Language'] = (
-        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
-    logger.debug(
-        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
+    logger.debug("HTTP header Accept-Language --> %s", lang_info['Accept-Language'])
+    params['headers']['Accept-Language'] = lang_info['Accept-Language']
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     )
-    #params['google_subdomain'] = subdomain

     return params
@@ -158,16 +150,7 @@
     """Get response from google's search request"""
     results = []

-    # detect google sorry
-    resp_url = urlparse(resp.url)
-    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
-        raise SearxEngineCaptchaException()
-
-    if resp_url.path.startswith('/sorry'):
-        raise SearxEngineCaptchaException()
-
-    # which subdomain ?
-    # subdomain = resp.search_params.get('google_subdomain')
+    detect_google_sorry(resp)

     # convert the text to dom
     dom = html.fromstring(resp.text)
@@ -181,19 +164,18 @@ def response(resp):
             logger.debug("ingoring <g-section-with-header>")
             continue

-        title = extract_text(eval_xpath(result, title_xpath)[0])
-        url = eval_xpath(result, href_xpath)[0]
-        c_node = eval_xpath(result, content_xpath)[0]
+        title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
+        url = eval_xpath_getindex(result, href_xpath, 0)
+        c_node = eval_xpath_getindex(result, content_xpath, 0)

         # <img id="vidthumb1" ...>
-        img_id = eval_xpath(c_node, './div[1]//a/g-img/img/@id')
-        if not img_id:
+        img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None)
+        if img_id is None:
             continue
-        img_id = img_id[0]
         img_src = vidthumb_imgdata.get(img_id, None)
         if not img_src:
             logger.error("no vidthumb imgdata for: %s" % img_id)
-            img_src = eval_xpath(c_node, './div[1]//a/g-img/img/@src')[0]
+            img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0)

         length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]'))
         content = extract_text(eval_xpath(c_node, './/div[2]/span'))
@@ -210,11 +192,11 @@
         })

     # parse suggestion
-    for suggestion in eval_xpath(dom, suggestion_xpath):
+    for suggestion in eval_xpath_list(dom, suggestion_xpath):
         # append suggestion
         results.append({'suggestion': extract_text(suggestion)})

-    for correction in eval_xpath(dom, spelling_suggestion_xpath):
+    for correction in eval_xpath_list(dom, spelling_suggestion_xpath):
         results.append({'correction': extract_text(correction)})

     return results