From b184a77d087f3238a35733a11ccfc883651117b4 Mon Sep 17 00:00:00 2001
From: Alexandre FLAMENT
Date: Sat, 10 Sep 2022 10:04:17 +0000
Subject: [PATCH] google_internal_search: add images, parse result of query "time"

---
 searx/engines/google_internal_search.py | 232 ++++++++++++++++++++----
 searx/settings.yml                       |   6 +-
 2 files changed, 196 insertions(+), 42 deletions(-)

diff --git a/searx/engines/google_internal_search.py b/searx/engines/google_internal_search.py
index 7e51b7f38..ed5fde8ad 100644
--- a/searx/engines/google_internal_search.py
+++ b/searx/engines/google_internal_search.py
@@ -19,6 +19,10 @@ The implementation is shared by other engines:
 
 from urllib.parse import urlencode
 from json import loads, dumps
+from datetime import datetime
+from zoneinfo import ZoneInfo
+from babel.dates import format_datetime
+import babel
 
 from searx.utils import html_to_text
 # pylint: disable=unused-import
@@ -44,12 +48,54 @@ about = {
 }
 
 # engine dependent config
-categories = ['general', 'web']
+categories = None
 paging = True
 time_range_support = True
 safesearch = True
 send_accept_language_header = True
 
+# configuration
+include_image_results = True
+include_twitter_results = False
+
+
+def get_query_url_general(query, lang_info, query_params):
+    return (
+        'https://'
+        + lang_info['subdomain']
+        + '/search'
+        + "?"
+        + urlencode(
+            {
+                'q': query,
+                **query_params,
+            }
+        )
+    )
+
+
+def get_query_url_images(query, lang_info, query_params):
+    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
+    return (
+        'https://'
+        + lang_info['subdomain']
+        + '/search'
+        + "?"
+        + urlencode(
+            {
+                'q': query,
+                'tbm': "isch",
+                **query_params,
+            }
+        )
+    )
+
+
+CATEGORY_TO_GET_QUERY_URL = {
+    'general': get_query_url_general,
+    'images': get_query_url_images,
+}
+
 
 def request(query, params):
     """Google search request"""
@@ -58,25 +104,21 @@ def request(query, params):
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, True)
 
+    query_params = {
+        **lang_info['params'],
+        'ie': "utf8",
+        'oe': "utf8",
+        'num': 30,
+        'start': offset,
+        'filter': '0',
+        'asearch': 'arc',
+        'async': 'use_ac:true,_fmt:json',
+    }
+
+    get_query_url = CATEGORY_TO_GET_QUERY_URL[categories[0]]  # pylint: disable=unsubscriptable-object
+
     # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
-    query_url = (
-        'https://'
-        + lang_info['subdomain']
-        + '/search'
-        + "?"
-        + urlencode(
-            {
-                'q': query,
-                **lang_info['params'],
-                'ie': "utf8",
-                'oe': "utf8",
-                'start': offset,
-                'filter': '0',
-                'asearch': 'arc',
-                'async': 'use_ac:true,_fmt:json',
-            }
-        )
-    )
+    query_url = get_query_url(query, lang_info, query_params)
 
     if params['time_range'] in time_range_dict:
         query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
@@ -90,10 +132,46 @@ def request(query, params):
     return params
 
 
-class ParseItem:
-    """Parse one tier 1 result"""
+def parse_search_feature_proto(search_feature_proto):
+    result_index = search_feature_proto["feature_metadata"]["logging_tree_ref_feature_metadata_extension"][
+        "result_index"
+    ]
+    image_result_data = search_feature_proto["payload"]["image_result_data"]
+    title = html_to_text(image_result_data["page_title"])
+    content = html_to_text(image_result_data.get("snippet", ""))
+    url = image_result_data["coupled_url"]
+    img_src = image_result_data["url"]
+    thumbnail_src = "https://encrypted-tbn0.gstatic.com/images?q=tbn:" + image_result_data["encrypted_docid"]
+    img_format = f'{image_result_data["full_image_size"]["width"]} * {image_result_data["full_image_size"]["height"]}'
 
-    def __init__(self):
+    iptc = image_result_data.get("iptc_info", {}).get("iptc", {})
+    copyright_notice = iptc.get("copyright_notice")
+    creator = iptc.get("creator")
+    if isinstance(creator, list):
+        creator = ", ".join(creator)
+    if creator and copyright_notice and creator != copyright_notice:
+        author = f'{creator} ; {copyright_notice}'
+    else:
+        author = creator
+    return {
+        "template": "images.html",
+        "title": title,
+        "content": content,
+        "url": url,
+        "img_src": img_src,
+        "thumbnail_src": thumbnail_src,
+        'img_format': img_format,
+        "author": author,
+        "result_index": result_index,
+    }
+
+
+class ParseResultGroupItem:
+    """Parse result_group_search_feature_proto.search_feature_proto"""
+
+    def __init__(self, locale):
+        """Parse one tier 1 result"""
+        self.locale = locale
         self.item_types = {
             "EXPLORE_UNIVERSAL_BLOCK": self.explore_universal_block,
             "HOST_CLUSTER": self.host_cluster,
@@ -102,10 +180,9 @@ class ParseItem:
             "VIDEO_UNIVERSAL_GROUP": self.video_universal_group,
             "WEB_RESULT": self.web_result,
             "WEB_ANSWERS_CARD_BLOCK": self.web_answers_card_block,
+            "IMAGE_RESULT_GROUP": self.image_result_group,
+            "TWITTER_RESULT_GROUP": self.twitter_result_group,
             # WHOLEPAGE_PAGE_GROUP - found for keyword what is t in English language
-            # IMAGE_RESULT_GROUP
-            # EXPLORE_UNIVERSAL_BLOCK
-            # TWITTER_RESULT_GROUP
             # EXPLORE_UNIVERSAL_BLOCK
             # TRAVEL_ANSWERS_RESULT
             # TOP_STORIES : news.html template
@@ -117,8 +194,10 @@ class ParseItem:
         for item in item_to_parse["explore_universal_unit_sfp_interface"]:
             explore_unit = item["explore_block_extension"]["payload"]["explore_unit"]
             if "lookup_key" in explore_unit:
-                results.append({'suggestion': html_to_text(explore_unit["lookup_key"]["aquarium_query"]), 'result_index': -1})
-            elif "label":
+                results.append(
+                    {'suggestion': html_to_text(explore_unit["lookup_key"]["aquarium_query"]), 'result_index': -1}
+                )
+            elif "label" in explore_unit:
                 results.append({'suggestion': html_to_text(explore_unit["label"]["text"]), 'result_index': -1})
         return results
 
@@ -222,26 +301,89 @@ class ParseItem:
         content = html_to_text(payload["snippet_text"])
         return [{'url': url, 'title': title, 'content': content, 'result_index': result_index}]
-
+
     def web_answers_card_block(self, item_to_parse):
         results = []
         for item in item_to_parse["web_answers_card_block_elements"]:
             answer = None
             url = None
-            title = None
             for item_webanswers in item["webanswers_container"]["webanswers_container_elements"]:
-                if "web_answers_result" in item_webanswers and "text" in item_webanswers["web_answers_result"]["payload"]:
+                if (
+                    "web_answers_result" in item_webanswers
+                    and "text" in item_webanswers["web_answers_result"]["payload"]
+                ):
                     answer = html_to_text(item_webanswers["web_answers_result"]["payload"]["text"])
                 if "web_answers_standard_result" in item_webanswers:
-                    primary_link = item_webanswers["web_answers_standard_result"]["payload"]["standard_result"]["primary_link"]
+                    primary_link = item_webanswers["web_answers_standard_result"]["payload"]["standard_result"][
+                        "primary_link"
+                    ]
                     url = primary_link["url"]
 
             results.append({'answer': answer, 'url': url, 'result_index': -1})
 
-        return(results)
+        return results
 
 
-def parse_web_results_list(json_data):
+    def twitter_result_group(self, item_to_parse):
+        results = []
+        if not include_twitter_results:
+            return results
+
+        result_index = item_to_parse["twitter_carousel_header"]["feature_metadata"][
+            "logging_tree_ref_feature_metadata_extension"
+        ]["result_index"]
+        for item in item_to_parse["twitter_cards"]:
+            profile_payload = item["profile_link"]["payload"]["author"]
+            results.append(
+                {
+                    "title": profile_payload["display_name"],
+                    "url": profile_payload["profile_page_url"],
+                    "result_index": result_index,
+                }
+            )
+
+        return results
+
+    def image_result_group(self, item_to_parse):
+        results = []
+        if not include_image_results:
+            return results
+
+        for item in item_to_parse["image_result_group_element"]:
+            print(item)
+            results.append(parse_search_feature_proto(item["image_result"]))
+        return results
+
+
+class ParseResultItem:  # pylint: disable=too-few-public-methods
+    """Parse result_search_feature_proto.search_feature_proto"""
+
+    def __init__(self, locale):
+        self.locale = locale
+        self.item_types = {
+            "LOCAL_TIME": self.local_time,
+            "IMAGE_RESULT": self.image_result,
+        }
+
+    def local_time(self, item_to_parse):
+        """Query like 'time in auckland' or 'time'
+        Note: localized_location reveals the location of the server
+        """
+        seconds_utc = item_to_parse["payload"]["current_time"]["seconds_utc"]
+        timezones_0 = item_to_parse["payload"]["target_location"]["timezones"][0]
+        iana_timezone = timezones_0["iana_timezone"]
+        localized_location = timezones_0["localized_location"]
+        result_tz = ZoneInfo(iana_timezone)
+        result_dt = datetime.fromtimestamp(seconds_utc, tz=result_tz)
+        result_dt_str = format_datetime(result_dt, 'long', tzinfo=result_tz, locale=self.locale)
+        answer = f"{result_dt_str} ( {localized_location} )"
+        return [{'answer': answer, 'result_index': -1}]
+
+    def image_result(self, item_to_parse):
+        return [parse_search_feature_proto(item_to_parse)]
+
+
+def parse_web_results_list(json_data, locale):
     results = []
 
     tier_1_search_results = json_data["arcResponse"]["search_results"]["tier_1_search_results"]
 
@@ -257,7 +399,8 @@
         elif "full_page" in spell_suggestion:
             results.append({'correction': spell_suggestion["full_page"]["raw_query"], 'result_index': -1})
 
-    parse_item = ParseItem()
+    parseResultItem = ParseResultItem(locale)
+    parseResultGroupItem = ParseResultGroupItem(locale)
     for item in results_list:
         if "result_group" in item:
             result_item = item["result_group"]
@@ -266,11 +409,14 @@
             result_item = item["result"]
         result_item_extension = result_item["result_extension"]
         one_namespace_type = result_item_extension["one_namespace_type"]
-        if one_namespace_type in parse_item.item_types and "result_group_search_feature_proto" in result_item:
+        if one_namespace_type in parseResultGroupItem.item_types and "result_group_search_feature_proto" in result_item:
             search_feature_proto = result_item["result_group_search_feature_proto"]["search_feature_proto"]
-            results = results + parse_item.item_types[one_namespace_type](search_feature_proto)
-        elif "result_group_search_feature_proto" not in result_item:
-            print(dumps(json_data["arcResponse"]))
+            results = results + parseResultGroupItem.item_types[one_namespace_type](search_feature_proto)
+        elif one_namespace_type in parseResultItem.item_types and "result_search_feature_proto" in result_item:
+            search_feature_proto = result_item["result_search_feature_proto"]["search_feature_proto"]
+            results = results + parseResultItem.item_types[one_namespace_type](search_feature_proto)
+        elif "result_group_search_feature_proto" in result_item:
+            print(dumps(one_namespace_type))
 
     return sorted(results, key=lambda d: d['result_index'])
 
@@ -280,7 +426,15 @@ def response(resp):
 
     detect_google_sorry(resp)
 
+    language = resp.search_params["language"]
+    locale = 'en'
+    try:
+        locale = babel.Locale.parse(language, sep='-')
+    except babel.core.UnknownLocaleError:
+        pass
+
     # only the 2nd line has the JSON content
     response_2nd_line = resp.text.split("\n", 1)[1]
     json_data = loads(response_2nd_line)
-    return parse_web_results_list(json_data)
+
+    return parse_web_results_list(json_data, locale)
diff --git a/searx/settings.yml b/searx/settings.yml
index 3f5058130..f04cd0915 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -692,8 +692,7 @@ engines:
   - name: google
     engine: google_internal_search
     shortcut: go
-    # see https://docs.searxng.org/src/searx.engines.google.html#module-searx.engines.google
-    use_mobile_ui: false
+    categories: [general, web]
     # additional_tests:
     #   android: *test_android

@@ -709,8 +708,9 @@
     # use_mobile_ui: true

   - name: google images
-    engine: google_images
+    engine: google_internal_search
     shortcut: goi
+    categories: [images, web]
     # additional_tests:
     #   android: *test_android
     #   dali:
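
For reference, a minimal standalone sketch of the time-answer formatting that ParseResultItem.local_time
performs. It is not part of the diff above; the payload values and the "de-CH" locale are invented for
illustration, only the zoneinfo/babel calls match what the patch uses.

    from datetime import datetime
    from zoneinfo import ZoneInfo

    import babel
    from babel.dates import format_datetime

    # hypothetical shape of item_to_parse["payload"] for a query such as "time in auckland"
    payload = {
        "current_time": {"seconds_utc": 1662804257},
        "target_location": {
            "timezones": [{"iana_timezone": "Pacific/Auckland", "localized_location": "Auckland"}]
        },
    }

    locale = babel.Locale.parse("de-CH", sep='-')  # response() falls back to 'en' on UnknownLocaleError
    timezone_0 = payload["target_location"]["timezones"][0]
    result_tz = ZoneInfo(timezone_0["iana_timezone"])
    result_dt = datetime.fromtimestamp(payload["current_time"]["seconds_utc"], tz=result_tz)
    result_dt_str = format_datetime(result_dt, 'long', tzinfo=result_tz, locale=locale)
    print(f"{result_dt_str} ( {timezone_0['localized_location']} )")
    # prints something like: 10. September 2022 um 22:04:17 GMT+12 ( Auckland )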
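Likewise, a small sketch of how request() now picks the URL builder from the engine's first configured
category. lang_info and query_params are stubbed with invented values here; in the engine they come from
get_lang_info() and the full parameter dict added in the patch.

    from urllib.parse import urlencode

    def get_query_url_images(query, lang_info, query_params):
        # same construction as the helper added in the patch
        return 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({'q': query, 'tbm': "isch", **query_params})

    CATEGORY_TO_GET_QUERY_URL = {'images': get_query_url_images}

    categories = ['images', 'web']  # from settings.yml: categories: [images, web]
    lang_info = {'subdomain': 'www.google.com', 'params': {'hl': 'en'}}  # stubbed
    query_params = {**lang_info['params'], 'asearch': 'arc', 'async': 'use_ac:true,_fmt:json'}

    get_query_url = CATEGORY_TO_GET_QUERY_URL[categories[0]]
    print(get_query_url('searxng', lang_info, query_params))
    # https://www.google.com/search?q=searxng&tbm=isch&hl=en&asearch=arc&async=use_ac%3Atrue%2C_fmt%3Ajson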