[fix] rewrite Yahoo-News engine

Many things have changed since the last review of this engine.  This patch fixes
the XPath selectors, implements suggestions, and is a complete review / rewrite of
the engine.

Signed-off-by: Markus Heiser <markus@darmarit.de>
This commit is contained in:
Markus Heiser 2021-03-08 09:41:32 +01:00
parent 0d8b369b5b
commit d2faea423a
2 changed files with 78 additions and 70 deletions

View File

@ -196,6 +196,7 @@ PYLINT_FILES=\
searx/engines/google_images.py \ searx/engines/google_images.py \
searx/engines/mediathekviewweb.py \ searx/engines/mediathekviewweb.py \
searx/engines/google_scholar.py \ searx/engines/google_scholar.py \
searx/engines/yahoo_news.py \
searx_extra/update/update_external_bangs.py searx_extra/update/update_external_bangs.py
test.pylint: pyenvinstall test.pylint: pyenvinstall

View File

@ -1,16 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" """Yahoo (News)
Yahoo (News)
Yahoo News is "English only" and does not offer localized or language-specific queries.
""" """
# pylint: disable=invalid-name, missing-function-docstring
import re import re
from datetime import datetime, timedelta
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from datetime import datetime, timedelta
from searx.engines.yahoo import parse_url, language_aliases
from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import
from dateutil import parser from dateutil import parser
from searx.utils import extract_text, extract_url, match_language from lxml import html
from searx import logger
from searx.utils import (
eval_xpath_list,
eval_xpath_getindex,
extract_text,
)
from searx.engines.yahoo import parse_url
# pylint: disable=unused-import
from searx.engines.yahoo import (
_fetch_supported_languages,
supported_languages_url,
)
# pylint: enable=unused-import
logger = logger.getChild('yahoo_news engine')
# about # about
about = { about = {
@ -22,90 +41,78 @@ about = {
"results": 'HTML', "results": 'HTML',
} }
# engine dependent config language_support = False
categories = ['news'] time_range_support = False
safesearch = False
paging = True paging = True
categories = ['news']
# search-url # search-url
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}' # noqa search_url = (
'https://news.search.yahoo.com/search'
'?{query}&b={offset}'
)
# specific xpath variables AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' AGO_TIMEDELTA = {
url_xpath = './/h3/a/@href' 'minute': timedelta(minutes=1),
title_xpath = './/h3/a' 'hour': timedelta(hours=1),
content_xpath = './/div[@class="compText"]' 'day': timedelta(days=1),
publishedDate_xpath = './/span[contains(@class,"tri")]' 'week': timedelta(days=7),
suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' 'month': timedelta(days=30),
'year': timedelta(days=365),
}
# do search-request
def request(query, params): def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1 offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all': params['url'] = search_url.format(
language = 'en' offset = offset,
else: query = urlencode({'p': query})
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] )
logger.debug("query_url --> %s", params['url'])
params['url'] = search_url.format(offset=offset,
query=urlencode({'p': query}),
lang=language)
# TODO required?
params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
.format(lang=language)
return params return params
def sanitize_url(url):
if ".yahoo.com/" in url:
return re.sub("\\;\\_ylt\\=.+$", "", url)
else:
return url
# get response from search-request
def response(resp): def response(resp):
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in dom.xpath(results_xpath): for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
urls = result.xpath(url_xpath)
if len(urls) != 1: url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
if url is None:
continue continue
url = sanitize_url(parse_url(extract_url(urls, search_url))) url = parse_url(url)
title = extract_text(result.xpath(title_xpath)[0]) title = extract_text(result.xpath('.//h4/a'))
content = extract_text(result.xpath(content_xpath)[0]) content = extract_text(result.xpath('.//p'))
img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
# parse publishedDate item = {
publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) 'url': url,
'title': title,
'content': content,
'img_src' : img_src
}
# still useful ? pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
if re.match("^[0-9]+ minute(s|) ago$", publishedDate): ago = AGO_RE.search(pub_date)
publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) if ago:
elif re.match("^[0-9]+ days? ago$", publishedDate): number = int(ago.group(1))
publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group())) delta = AGO_TIMEDELTA[ago.group(2)]
elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): pub_date = datetime.now() - delta * number
timeNumbers = re.findall(r'\d+', publishedDate)
publishedDate = datetime.now()\
- timedelta(hours=int(timeNumbers[0]))\
- timedelta(minutes=int(timeNumbers[1]))
else: else:
try: try:
publishedDate = parser.parse(publishedDate) pub_date = parser.parse(pub_date)
except: except parser.ParserError:
publishedDate = datetime.now() pub_date = None
if publishedDate.year == 1900: if pub_date is not None:
publishedDate = publishedDate.replace(year=datetime.now().year) item['publishedDate'] = pub_date
results.append(item)
# append result for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
results.append({'url': url, results.append({'suggestion': extract_text(suggestion)})
'title': title,
'content': content,
'publishedDate': publishedDate})
# return results
return results return results