From a174dcf83a61aca1dd1c6e96e86e4356e1e13333 Mon Sep 17 00:00:00 2001
From: Joseph Cheung
Date: Fri, 24 Feb 2023 08:25:15 +0800
Subject: [PATCH] Update google_internal_search.py

---
 searx/engines/google_internal_search.py | 107 ++----------------------
 1 file changed, 9 insertions(+), 98 deletions(-)

diff --git a/searx/engines/google_internal_search.py b/searx/engines/google_internal_search.py
index 2fa07e1ab..ed5fde8ad 100644
--- a/searx/engines/google_internal_search.py
+++ b/searx/engines/google_internal_search.py
@@ -19,8 +19,8 @@ The implementation is shared by other engines:
 
 from urllib.parse import urlencode
 from json import loads, dumps
-from datetime import datetime, timedelta
-from dateutil.tz import tzoffset
+from datetime import datetime
+from zoneinfo import ZoneInfo
 from babel.dates import format_datetime
 import babel
 from searx.utils import html_to_text
@@ -91,50 +91,25 @@ def get_query_url_images(query, lang_info, query_params):
     )
 
 
-def get_query_url_news(query, lang_info, query_params):
-    return (
-        'https://'
-        + lang_info['subdomain']
-        + '/search'
-        + "?"
-        + urlencode(
-            {
-                'q': query,
-                'tbm': "nws",
-                **query_params,
-            }
-        )
-    )
-
-
 CATEGORY_TO_GET_QUERY_URL = {
     'general': get_query_url_general,
     'images': get_query_url_images,
-    'news': get_query_url_news,
-}
-
-CATEGORY_RESULT_COUNT_PER_PAGE = {
-    'general': 10,
-    'images': 100,
-    'news': 10,
 }
 
 
 def request(query, params):
     """Google search request"""
 
-    result_count_per_page = CATEGORY_RESULT_COUNT_PER_PAGE[categories[0]]  # pylint: disable=unsubscriptable-object
-
-    offset = (params['pageno'] - 1) * result_count_per_page
+    offset = (params['pageno'] - 1) * 10
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, True)
 
     query_params = {
         **lang_info['params'],
-        'ie': 'utf8',
-        'oe': 'utf8',
+        'ie': "utf8",
+        'oe': "utf8",
+        'num': 30,
         'start': offset,
-        'num': result_count_per_page,
         'filter': '0',
         'asearch': 'arc',
         'async': 'use_ac:true,_fmt:json',
@@ -207,7 +182,6 @@ class ParseResultGroupItem:
             "WEB_ANSWERS_CARD_BLOCK": self.web_answers_card_block,
             "IMAGE_RESULT_GROUP": self.image_result_group,
             "TWITTER_RESULT_GROUP": self.twitter_result_group,
-            "NEWS_WHOLEPAGE": self.news_wholepage,
             # WHOLEPAGE_PAGE_GROUP - found for keyword what is t in English language
             # EXPLORE_UNIVERSAL_BLOCK
             # TRAVEL_ANSWERS_RESULT
@@ -376,68 +350,10 @@ class ParseResultGroupItem:
             return results
 
         for item in item_to_parse["image_result_group_element"]:
+            print(item)
             results.append(parse_search_feature_proto(item["image_result"]))
         return results
 
-    def news_wholepage(self, item_to_parse):
-        """Parse a news search result"""
-
-        def iter_snippets():
-            """Iterate over all the results, yield result_index, snippet to deal with nested structured"""
-            result_index = 0
-            for item in item_to_parse["element"]:
-                if "news_singleton_result_group" in item:
-                    payload = item["news_singleton_result_group"]["result"]["payload"]["liquid_item_data"]
-                    yield result_index, payload["article"]["stream_simplified_snippet"]
-                    result_index += 1
-                    continue
-
-                if "top_coverage" in item:
-                    for element in item["top_coverage"]["element"]:
-                        yield result_index, element["result"]["payload"]["liquid_item_data"]["article"][
-                            "stream_simplified_snippet"
-                        ]
-                        result_index += 1
-                    continue
-
-                if "news_sports_hub_result_group" in item:
-                    for element in item["news_sports_hub_result_group"]["element"]:
-                        yield result_index, element["result"]["payload"]["liquid_item_data"]["article"][
-                            "stream_simplified_snippet"
-                        ]
-                        result_index += 1
-                    continue
-
-                if "news_topic_hub_refinements_result_group" in item:
-                    for ref_list in item["news_topic_hub_refinements_result_group"]["refinements"]["refinement_list"]:
-                        for result in ref_list["results"]:
-                            yield result_index, result["payload"]["liquid_item_data"]["article"][
-                                "stream_simplified_snippet"
-                            ]
-                            result_index += 1
-                    continue
-
-                print("unknow news", item)
-
-        results = []
-        for result_index, snippet in iter_snippets():
-            publishedDate = snippet["date"]["timestamp"]
-            url = snippet["url"]["result_url"]
-            title = html_to_text(snippet["title"]["text"])
-            content = html_to_text(snippet["snippet"]["snippet"])
-            img_src = snippet.get("thumbnail_info", {}).get("sffe_50k_thumbnail_url")
-            results.append(
-                {
-                    'url': url,
-                    'title': title,
-                    'content': content,
-                    'img_src': img_src,
-                    'publishedDate': datetime.fromtimestamp(publishedDate),
-                    "result_index": result_index,
-                }
-            )
-        return results
-
 
 class ParseResultItem:  # pylint: disable=too-few-public-methods
     """Parse result_search_feature_proto.search_feature_proto"""
@@ -457,13 +373,7 @@ class ParseResultItem:  # pylint: disable=too-few-public-methods
         timezones_0 = item_to_parse["payload"]["target_location"]["timezones"][0]
         iana_timezone = timezones_0["iana_timezone"]
         localized_location = timezones_0["localized_location"]
-        # parse timezone_abbrev_specific to create result_tz
-        # timezone_abbrev_specific for India is "UTC+5:30" and for New York is "UTC−4"
-        # the values for offsets are respectively ["5", "30", "0"] and ["-4": "0"]
-        timezone_abbrev_specific = timezones_0["timezone_abbrev_specific"]
-        offsets = timezone_abbrev_specific.replace("UTC", "").replace("GMT", "").replace("−", "-").split(":")
-        offsets.append("0")
-        result_tz = tzoffset(iana_timezone, timedelta(hours=int(offsets[0]), minutes=int(offsets[1])))
+        result_tz = ZoneInfo(iana_timezone)
         result_dt = datetime.fromtimestamp(seconds_utc, tz=result_tz)
         result_dt_str = format_datetime(result_dt, 'long', tzinfo=result_tz, locale=self.locale)
         answer = f"{result_dt_str} ( {localized_location} )"
@@ -480,6 +390,7 @@ def parse_web_results_list(json_data, locale):
         results_list = tier_1_search_results["result_list"]["item"]
 
     if "spell_suggestion" in tier_1_search_results:
+        print(tier_1_search_results["spell_suggestion"])
        spell_suggestion = tier_1_search_results["spell_suggestion"]
         if "spell_column" in spell_suggestion:
             for spell_suggestion in tier_1_search_results["spell_suggestion"]["spell_column"]: