Update google_internal_search.py

Joseph Cheung 2023-02-24 08:26:49 +08:00
parent a174dcf83a
commit 4e51be0fda


@@ -19,8 +19,8 @@ The implementation is shared by other engines:
 from urllib.parse import urlencode
 from json import loads, dumps
-from datetime import datetime
-from zoneinfo import ZoneInfo
+from datetime import datetime, timedelta
+from dateutil.tz import tzoffset
 from babel.dates import format_datetime
 import babel
 from searx.utils import html_to_text
@@ -91,25 +91,50 @@ def get_query_url_images(query, lang_info, query_params):
     )


+def get_query_url_news(query, lang_info, query_params):
+    return (
+        'https://'
+        + lang_info['subdomain']
+        + '/search'
+        + "?"
+        + urlencode(
+            {
+                'q': query,
+                'tbm': "nws",
+                **query_params,
+            }
+        )
+    )
+
+
 CATEGORY_TO_GET_QUERY_URL = {
     'general': get_query_url_general,
     'images': get_query_url_images,
+    'news': get_query_url_news,
+}
+
+CATEGORY_RESULT_COUNT_PER_PAGE = {
+    'general': 10,
+    'images': 100,
+    'news': 10,
 }


 def request(query, params):
     """Google search request"""
-    offset = (params['pageno'] - 1) * 10
+    result_count_per_page = CATEGORY_RESULT_COUNT_PER_PAGE[categories[0]]  # pylint: disable=unsubscriptable-object
+    offset = (params['pageno'] - 1) * result_count_per_page

     lang_info = get_lang_info(params, supported_languages, language_aliases, True)

     query_params = {
         **lang_info['params'],
-        'ie': "utf8",
-        'oe': "utf8",
-        'num': 30,
+        'ie': 'utf8',
+        'oe': 'utf8',
         'start': offset,
+        'num': result_count_per_page,
         'filter': '0',
         'asearch': 'arc',
         'async': 'use_ac:true,_fmt:json',
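
As a side note on the new pagination plumbing: the standalone sketch below shows what the per-category page size and offset produce for page 2 of a news search. The lang_info values are made-up stand-ins for what get_lang_info() would return, not real output.

from urllib.parse import urlencode

# Hypothetical stand-ins for what get_lang_info() would return at request time.
lang_info = {'subdomain': 'www.google.com', 'params': {'hl': 'en'}}

result_count_per_page = 10  # CATEGORY_RESULT_COUNT_PER_PAGE['news']
pageno = 2
offset = (pageno - 1) * result_count_per_page  # 10: the second page starts at result 10

url = (
    'https://'
    + lang_info['subdomain']
    + '/search'
    + "?"
    + urlencode({'q': 'searx', 'tbm': "nws", **lang_info['params'], 'start': offset, 'num': result_count_per_page})
)
print(url)  # https://www.google.com/search?q=searx&tbm=nws&hl=en&start=10&num=10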
@@ -182,6 +207,7 @@ class ParseResultGroupItem:
             "WEB_ANSWERS_CARD_BLOCK": self.web_answers_card_block,
             "IMAGE_RESULT_GROUP": self.image_result_group,
             "TWITTER_RESULT_GROUP": self.twitter_result_group,
+            "NEWS_WHOLEPAGE": self.news_wholepage,
             # WHOLEPAGE_PAGE_GROUP - found for keyword what is t in English language
             # EXPLORE_UNIVERSAL_BLOCK
             # TRAVEL_ANSWERS_RESULT
@@ -350,10 +376,68 @@ class ParseResultGroupItem:
             return results
         for item in item_to_parse["image_result_group_element"]:
-            print(item)
             results.append(parse_search_feature_proto(item["image_result"]))
         return results

+    def news_wholepage(self, item_to_parse):
+        """Parse a news search result"""
+
+        def iter_snippets():
+            """Iterate over all the results; yield (result_index, snippet) to deal with the nested structures"""
+            result_index = 0
+            for item in item_to_parse["element"]:
+                if "news_singleton_result_group" in item:
+                    payload = item["news_singleton_result_group"]["result"]["payload"]["liquid_item_data"]
+                    yield result_index, payload["article"]["stream_simplified_snippet"]
+                    result_index += 1
+                    continue
+                if "top_coverage" in item:
+                    for element in item["top_coverage"]["element"]:
+                        yield result_index, element["result"]["payload"]["liquid_item_data"]["article"][
+                            "stream_simplified_snippet"
+                        ]
+                        result_index += 1
+                    continue
+                if "news_sports_hub_result_group" in item:
+                    for element in item["news_sports_hub_result_group"]["element"]:
+                        yield result_index, element["result"]["payload"]["liquid_item_data"]["article"][
+                            "stream_simplified_snippet"
+                        ]
+                        result_index += 1
+                    continue
+                if "news_topic_hub_refinements_result_group" in item:
+                    for ref_list in item["news_topic_hub_refinements_result_group"]["refinements"]["refinement_list"]:
+                        for result in ref_list["results"]:
+                            yield result_index, result["payload"]["liquid_item_data"]["article"][
+                                "stream_simplified_snippet"
+                            ]
+                            result_index += 1
+                    continue
+                print("unknown news", item)
+
+        results = []
+        for result_index, snippet in iter_snippets():
+            publishedDate = snippet["date"]["timestamp"]
+            url = snippet["url"]["result_url"]
+            title = html_to_text(snippet["title"]["text"])
+            content = html_to_text(snippet["snippet"]["snippet"])
+            img_src = snippet.get("thumbnail_info", {}).get("sffe_50k_thumbnail_url")
+            results.append(
+                {
+                    'url': url,
+                    'title': title,
+                    'content': content,
+                    'img_src': img_src,
+                    'publishedDate': datetime.fromtimestamp(publishedDate),
+                    "result_index": result_index,
+                }
+            )
+        return results
+
+
 class ParseResultItem:  # pylint: disable=too-few-public-methods
     """Parse result_search_feature_proto.search_feature_proto"""
@@ -373,7 +457,13 @@ class ParseResultItem:  # pylint: disable=too-few-public-methods
         timezones_0 = item_to_parse["payload"]["target_location"]["timezones"][0]
         iana_timezone = timezones_0["iana_timezone"]
         localized_location = timezones_0["localized_location"]
-        result_tz = ZoneInfo(iana_timezone)
+        # parse timezone_abbrev_specific to create result_tz
+        # timezone_abbrev_specific for India is "UTC+5:30" and for New York is "UTC−4"
+        # the values for offsets are respectively ["5", "30", "0"] and ["-4", "0"]
+        timezone_abbrev_specific = timezones_0["timezone_abbrev_specific"]
+        offsets = timezone_abbrev_specific.replace("UTC", "").replace("GMT", "").replace("−", "-").split(":")
+        offsets.append("0")
+        result_tz = tzoffset(iana_timezone, timedelta(hours=int(offsets[0]), minutes=int(offsets[1])))
         result_dt = datetime.fromtimestamp(seconds_utc, tz=result_tz)
         result_dt_str = format_datetime(result_dt, 'long', tzinfo=result_tz, locale=self.locale)
         answer = f"{result_dt_str} ( {localized_location} )"
@@ -390,7 +480,6 @@ def parse_web_results_list(json_data, locale):
     results_list = tier_1_search_results["result_list"]["item"]

     if "spell_suggestion" in tier_1_search_results:
-        print(tier_1_search_results["spell_suggestion"])
         spell_suggestion = tier_1_search_results["spell_suggestion"]
         if "spell_column" in spell_suggestion:
             for spell_suggestion in tier_1_search_results["spell_suggestion"]["spell_column"]: