mirror of https://github.com/searxng/searxng (synced 2024-01-01 19:24:07 +01:00)
Update google_internal_search.py
This commit is contained in:
parent
9dd5eed1b8
commit
a174dcf83a
1 changed file with 9 additions and 98 deletions

@@ -19,8 +19,8 @@ The implementation is shared by other engines:
 from urllib.parse import urlencode
 from json import loads, dumps
-from datetime import datetime, timedelta
-from dateutil.tz import tzoffset
+from datetime import datetime
+from zoneinfo import ZoneInfo
 from babel.dates import format_datetime
 import babel
 from searx.utils import html_to_text
@@ -91,50 +91,25 @@ def get_query_url_images(query, lang_info, query_params):
     )


-def get_query_url_news(query, lang_info, query_params):
-    return (
-        'https://'
-        + lang_info['subdomain']
-        + '/search'
-        + "?"
-        + urlencode(
-            {
-                'q': query,
-                'tbm': "nws",
-                **query_params,
-            }
-        )
-    )
-
-
 CATEGORY_TO_GET_QUERY_URL = {
     'general': get_query_url_general,
     'images': get_query_url_images,
-    'news': get_query_url_news,
-}
-
-
-CATEGORY_RESULT_COUNT_PER_PAGE = {
-    'general': 10,
-    'images': 100,
-    'news': 10,
 }


 def request(query, params):
     """Google search request"""

-    result_count_per_page = CATEGORY_RESULT_COUNT_PER_PAGE[categories[0]]  # pylint: disable=unsubscriptable-object
-
-    offset = (params['pageno'] - 1) * result_count_per_page
+    offset = (params['pageno'] - 1) * 10

     lang_info = get_lang_info(params, supported_languages, language_aliases, True)

     query_params = {
         **lang_info['params'],
-        'ie': 'utf8',
-        'oe': 'utf8',
+        'ie': "utf8",
+        'oe': "utf8",
+        'num': 30,
         'start': offset,
-        'num': result_count_per_page,
         'filter': '0',
         'asearch': 'arc',
         'async': 'use_ac:true,_fmt:json',
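
A note on the paging hunk above: the old code scaled the offset by a per-category page size (10 for general and news, 100 for images), while the new code always steps the offset by 10 and asks for 30 results per request, so consecutive requests overlap. A minimal sketch of the new arithmetic (illustrative only, not part of the commit):

    for pageno in (1, 2, 3):
        offset = (pageno - 1) * 10
        print(f"pageno={pageno} -> start={offset}, num=30")
    # pageno=1 -> start=0, num=30
    # pageno=2 -> start=10, num=30
    # pageno=3 -> start=20, num=30
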
@@ -207,7 +182,6 @@ class ParseResultGroupItem:
             "WEB_ANSWERS_CARD_BLOCK": self.web_answers_card_block,
             "IMAGE_RESULT_GROUP": self.image_result_group,
             "TWITTER_RESULT_GROUP": self.twitter_result_group,
-            "NEWS_WHOLEPAGE": self.news_wholepage,
             # WHOLEPAGE_PAGE_GROUP - found for keyword what is t in English language
             # EXPLORE_UNIVERSAL_BLOCK
             # TRAVEL_ANSWERS_RESULT
@@ -376,68 +350,10 @@ class ParseResultGroupItem:
             return results

         for item in item_to_parse["image_result_group_element"]:
+            print(item)
             results.append(parse_search_feature_proto(item["image_result"]))
         return results

-    def news_wholepage(self, item_to_parse):
-        """Parse a news search result"""
-
-        def iter_snippets():
-            """Iterate over all the results, yield result_index, snippet to deal with nested structured"""
-            result_index = 0
-            for item in item_to_parse["element"]:
-                if "news_singleton_result_group" in item:
-                    payload = item["news_singleton_result_group"]["result"]["payload"]["liquid_item_data"]
-                    yield result_index, payload["article"]["stream_simplified_snippet"]
-                    result_index += 1
-                    continue
-
-                if "top_coverage" in item:
-                    for element in item["top_coverage"]["element"]:
-                        yield result_index, element["result"]["payload"]["liquid_item_data"]["article"][
-                            "stream_simplified_snippet"
-                        ]
-                        result_index += 1
-                    continue
-
-                if "news_sports_hub_result_group" in item:
-                    for element in item["news_sports_hub_result_group"]["element"]:
-                        yield result_index, element["result"]["payload"]["liquid_item_data"]["article"][
-                            "stream_simplified_snippet"
-                        ]
-                        result_index += 1
-                    continue
-
-                if "news_topic_hub_refinements_result_group" in item:
-                    for ref_list in item["news_topic_hub_refinements_result_group"]["refinements"]["refinement_list"]:
-                        for result in ref_list["results"]:
-                            yield result_index, result["payload"]["liquid_item_data"]["article"][
-                                "stream_simplified_snippet"
-                            ]
-                            result_index += 1
-                    continue
-
-                print("unknow news", item)
-
-        results = []
-        for result_index, snippet in iter_snippets():
-            publishedDate = snippet["date"]["timestamp"]
-            url = snippet["url"]["result_url"]
-            title = html_to_text(snippet["title"]["text"])
-            content = html_to_text(snippet["snippet"]["snippet"])
-            img_src = snippet.get("thumbnail_info", {}).get("sffe_50k_thumbnail_url")
-            results.append(
-                {
-                    'url': url,
-                    'title': title,
-                    'content': content,
-                    'img_src': img_src,
-                    'publishedDate': datetime.fromtimestamp(publishedDate),
-                    "result_index": result_index,
-                }
-            )
-        return results
-
-
 class ParseResultItem:  # pylint: disable=too-few-public-methods
     """Parse result_search_feature_proto.search_feature_proto"""
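
Most of the deletions above remove the news_wholepage parser, whose iter_snippets generator flattened four differently nested result layouts into a single stream of (result_index, snippet) pairs so the consuming loop could stay flat. A stripped-down sketch of that pattern, using hypothetical data rather than Google's real response shape:

    def iter_items(groups):
        index = 0
        for group in groups:
            if "single" in group:      # one item directly in the group
                yield index, group["single"]
                index += 1
            elif "many" in group:      # a nested list of items
                for item in group["many"]:
                    yield index, item
                    index += 1

    print(list(iter_items([{"single": "a"}, {"many": ["b", "c"]}])))
    # [(0, 'a'), (1, 'b'), (2, 'c')]
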
@@ -457,13 +373,7 @@ class ParseResultItem:  # pylint: disable=too-few-public-methods
         timezones_0 = item_to_parse["payload"]["target_location"]["timezones"][0]
         iana_timezone = timezones_0["iana_timezone"]
         localized_location = timezones_0["localized_location"]
-        # parse timezone_abbrev_specific to create result_tz
-        # timezone_abbrev_specific for India is "UTC+5:30" and for New York is "UTC−4"
-        # the values for offsets are respectively ["5", "30", "0"] and ["-4": "0"]
-        timezone_abbrev_specific = timezones_0["timezone_abbrev_specific"]
-        offsets = timezone_abbrev_specific.replace("UTC", "").replace("GMT", "").replace("−", "-").split(":")
-        offsets.append("0")
-        result_tz = tzoffset(iana_timezone, timedelta(hours=int(offsets[0]), minutes=int(offsets[1])))
+        result_tz = ZoneInfo(iana_timezone)
         result_dt = datetime.fromtimestamp(seconds_utc, tz=result_tz)
         result_dt_str = format_datetime(result_dt, 'long', tzinfo=result_tz, locale=self.locale)
         answer = f"{result_dt_str} ( {localized_location} )"
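
This hunk is the core of the commit: seven lines that reverse-engineered a fixed UTC offset from the display string timezone_abbrev_specific are replaced by building the tzinfo straight from the IANA name. Besides being simpler, ZoneInfo tracks daylight-saving transitions, which a fixed offset cannot. A sketch of the behavioural difference (illustrative only; the stdlib timezone class stands in for dateutil's tzoffset):

    from datetime import datetime, timedelta, timezone
    from zoneinfo import ZoneInfo

    fixed = timezone(timedelta(hours=-4))   # what parsing "UTC−4" produced
    named = ZoneInfo("America/New_York")    # what the new code uses

    winter = datetime(2023, 1, 15, 12, 0, tzinfo=timezone.utc)
    print(winter.astimezone(fixed))  # 2023-01-15 08:00:00-04:00 (stale summer offset)
    print(winter.astimezone(named))  # 2023-01-15 07:00:00-05:00 (EST, correct)
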
@@ -480,6 +390,7 @@ def parse_web_results_list(json_data, locale):
     results_list = tier_1_search_results["result_list"]["item"]

     if "spell_suggestion" in tier_1_search_results:
+        print(tier_1_search_results["spell_suggestion"])
         spell_suggestion = tier_1_search_results["spell_suggestion"]
         if "spell_column" in spell_suggestion:
             for spell_suggestion in tier_1_search_results["spell_suggestion"]["spell_column"]: