mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
google_internal_search: add news
This commit is contained in:
parent
e3ebc39e59
commit
14e5f8ff46
2 changed files with 91 additions and 5 deletions
|
@ -91,25 +91,50 @@ def get_query_url_images(query, lang_info, query_params):
|
|||
)
|
||||
|
||||
|
||||
def get_query_url_news(query, lang_info, query_params):
    """Build the Google search URL for the news tab (``tbm=nws``).

    The host is taken from ``lang_info['subdomain']`` and ``query`` is sent as
    the ``q`` argument.  ``query_params`` is merged last, so on a key clash its
    entries replace ``q`` / ``tbm``.
    """
    host = lang_info['subdomain']
    arguments = {'q': query, 'tbm': "nws"}
    arguments.update(query_params)
    return ''.join(('https://', host, '/search', "?", urlencode(arguments)))
|
||||
|
||||
|
||||
# Maps a search category name to the function that builds its query URL.
CATEGORY_TO_GET_QUERY_URL = dict(
    general=get_query_url_general,
    images=get_query_url_images,
    news=get_query_url_news,
)
|
||||
|
||||
# Results requested per page for each category; request() sends this value as
# `num` and multiplies it by (pageno - 1) to compute the `start` offset.
CATEGORY_RESULT_COUNT_PER_PAGE = dict(general=10, images=100, news=10)
|
||||
|
||||
|
||||
def request(query, params):
|
||||
"""Google search request"""
|
||||
|
||||
offset = (params['pageno'] - 1) * 10
|
||||
result_count_per_page = CATEGORY_RESULT_COUNT_PER_PAGE[categories[0]] # pylint: disable=unsubscriptable-object
|
||||
|
||||
offset = (params['pageno'] - 1) * result_count_per_page
|
||||
|
||||
lang_info = get_lang_info(params, supported_languages, language_aliases, True)
|
||||
|
||||
query_params = {
|
||||
**lang_info['params'],
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
'num': 30,
|
||||
'ie': 'utf8',
|
||||
'oe': 'utf8',
|
||||
'start': offset,
|
||||
'num': result_count_per_page,
|
||||
'filter': '0',
|
||||
'asearch': 'arc',
|
||||
'async': 'use_ac:true,_fmt:json',
|
||||
|
@ -182,6 +207,7 @@ class ParseResultGroupItem:
|
|||
"WEB_ANSWERS_CARD_BLOCK": self.web_answers_card_block,
|
||||
"IMAGE_RESULT_GROUP": self.image_result_group,
|
||||
"TWITTER_RESULT_GROUP": self.twitter_result_group,
|
||||
"NEWS_WHOLEPAGE": self.news_wholepage,
|
||||
# WHOLEPAGE_PAGE_GROUP - found for keyword what is t in English language
|
||||
# EXPLORE_UNIVERSAL_BLOCK
|
||||
# TRAVEL_ANSWERS_RESULT
|
||||
|
@ -353,6 +379,65 @@ class ParseResultGroupItem:
|
|||
results.append(parse_search_feature_proto(item["image_result"]))
|
||||
return results
|
||||
|
||||
def news_wholepage(self, item_to_parse):
    """Parse a news search result

    Walks ``item_to_parse["element"]`` and returns a list of result dicts with
    the keys ``url``, ``title``, ``content``, ``img_src``, ``publishedDate``
    and ``result_index``, one per article snippet found.
    """

    def article_snippet(holder):
        """Return the simplified snippet of the article stored under holder["payload"]"""
        return holder["payload"]["liquid_item_data"]["article"]["stream_simplified_snippet"]

    def iter_snippets():
        """Yield every article snippet in order, flattening the nested result groups"""
        for item in item_to_parse["element"]:
            if "news_singleton_result_group" in item:
                # a single article directly under the group
                yield article_snippet(item["news_singleton_result_group"]["result"])
            elif "top_coverage" in item:
                for element in item["top_coverage"]["element"]:
                    yield article_snippet(element["result"])
            elif "news_sports_hub_result_group" in item:
                for element in item["news_sports_hub_result_group"]["element"]:
                    yield article_snippet(element["result"])
            elif "news_topic_hub_refinements_result_group" in item:
                refinements = item["news_topic_hub_refinements_result_group"]["refinements"]
                for ref_list in refinements["refinement_list"]:
                    for result in ref_list["results"]:
                        yield article_snippet(result)
            else:
                # none of the known group keys is present in this element
                print("unknow news", item)

    results = []
    # result_index counts the snippets in the order they are yielded, from 0
    for result_index, snippet in enumerate(iter_snippets()):
        timestamp = snippet["date"]["timestamp"]
        link = snippet["url"]["result_url"]
        headline = html_to_text(snippet["title"]["text"])
        summary = html_to_text(snippet["snippet"]["snippet"])
        # the thumbnail is optional: img_src is None when it is missing
        thumbnail = snippet.get("thumbnail_info", {}).get("sffe_50k_thumbnail_url")
        entry = {
            'url': link,
            'title': headline,
            'content': summary,
            'img_src': thumbnail,
            'publishedDate': datetime.fromtimestamp(timestamp),
            "result_index": result_index,
        }
        results.append(entry)
    return results
|
||||
|
||||
|
||||
class ParseResultItem: # pylint: disable=too-few-public-methods
|
||||
"""Parse result_search_feature_proto.search_feature_proto"""
|
||||
|
|
|
@ -721,8 +721,9 @@ engines:
|
|||
# - ['one_title_contains', 'Salvador']
|
||||
|
||||
- name: google news
|
||||
engine: google_news
|
||||
engine: google_internal_search
|
||||
shortcut: gon
|
||||
categories: [news, web]
|
||||
# additional_tests:
|
||||
# android: *test_android
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue