From 89b3050b5c406f795dd25d24f182cf173ad42774 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 22 Jan 2021 17:16:46 +0100 Subject: [PATCH 1/7] [fix] revise of the google-Video engine This revise is based on the methods developed in the revise of the google engine (see commit 410c2f9). Signed-off-by: Markus Heiser --- Makefile | 3 +- searx/engines/google_videos.py | 240 +++++++++++++++++++++++++-------- 2 files changed, 184 insertions(+), 59 deletions(-) diff --git a/Makefile b/Makefile index 350b2321b..55b744786 100644 --- a/Makefile +++ b/Makefile @@ -179,7 +179,8 @@ PYLINT_FILES=\ searx/engines/deviantart.py \ searx/engines/digg.py \ searx/engines/google.py \ - searx/engines/google_news.py + searx/engines/google_news.py \ + searx/engines/google_videos.py test.pylint: pyenvinstall $(call cmd,pylint,$(PYLINT_FILES)) diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 61e01ca7b..486ba7ccd 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,13 +1,58 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """ - Google (Videos) +Google (Video) + +For a detailed description of the *RESTful* API see: `Query Parameter +Definitions`_. Not all parameters can be applied. + +.. admonition:: Content-Security-Policy (CSP) + + This engine needs to allow images from the `data URLs`_ (prefixed with the + ``data:`` scheme).:: + + Header set Content-Security-Policy "img-src 'self' data: ;" + +.. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions +.. 
_data URLs: + https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs + """ -from datetime import date, timedelta -from urllib.parse import urlencode -from lxml import html -from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex +# pylint: disable=invalid-name, missing-function-docstring + import re +from urllib.parse import urlencode, urlparse +from lxml import html + +from searx import logger +from searx.exceptions import SearxEngineCaptchaException +from searx.utils import ( + eval_xpath, + eval_xpath_list, + extract_text, +) + +from searx.engines.google import ( + get_lang_country, + google_domains, + time_range_dict, + filter_mapping, + results_xpath, + g_section_with_header, + title_xpath, + href_xpath, + content_xpath, + suggestion_xpath, + spelling_suggestion_xpath +) + +# pylint: disable=unused-import +from searx.engines.google import ( + supported_languages_url + , _fetch_supported_languages +) +# pylint: enable=unused-import # about about = { @@ -17,83 +62,162 @@ about = { "use_official_api": False, "require_api_key": False, "results": 'HTML', + "template": 'video.html', + "parse": ('url', 'title', 'content', 'thumbnail') } +logger = logger.getChild('google video') + # engine dependent config + categories = ['videos'] -paging = True -safesearch = True +paging = False +language_support = True +use_locale_domain = True time_range_support = True -number_of_results = 10 +safesearch = True -search_url = 'https://www.google.com/search'\ - '?q={query}'\ - '&tbm=vid'\ - '&{search_options}' -time_range_attr = "qdr:{range}" -time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}" -time_range_dict = {'day': 'd', - 'week': 'w', - 'month': 'm'} +RE_CACHE = {} + +def _re(regexpr): + """returns compiled regular expression""" + RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) + return RE_CACHE[regexpr] + +def scrap_out_thumbs(dom): + """Scrap out thumbnail data from