forked from zaclys/searxng
		
	Merge pull request #2640 from return42/fix-yahoo-news
[fix] rewrite Yahoo-News engine
This commit is contained in:
		
						commit
						ccf5ac9801
					
				
					 2 changed files with 78 additions and 70 deletions
				
			
		
							
								
								
									
										1
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								Makefile
									
										
									
									
									
								
							|  | @ -196,6 +196,7 @@ PYLINT_FILES=\ | |||
| 	searx/engines/google_images.py \
 | ||||
| 	searx/engines/mediathekviewweb.py \
 | ||||
| 	searx/engines/google_scholar.py \
 | ||||
| 	searx/engines/yahoo_news.py \
 | ||||
| 	searx_extra/update/update_external_bangs.py | ||||
| 
 | ||||
| test.pylint: pyenvinstall | ||||
|  |  | |||
|  | @ -1,16 +1,35 @@ | |||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| """ | ||||
|  Yahoo (News) | ||||
| """Yahoo (News) | ||||
| 
 | ||||
| Yahoo News is "English only" and do not offer localized nor language queries. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| # pylint: disable=invalid-name, missing-function-docstring | ||||
| 
 | ||||
| import re | ||||
| from datetime import datetime, timedelta | ||||
| from urllib.parse import urlencode | ||||
| from lxml import html | ||||
| from searx.engines.yahoo import parse_url, language_aliases | ||||
| from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import | ||||
| from datetime import datetime, timedelta | ||||
| from dateutil import parser | ||||
| from searx.utils import extract_text, extract_url, match_language | ||||
| from lxml import html | ||||
| 
 | ||||
| from searx import logger | ||||
| from searx.utils import ( | ||||
|     eval_xpath_list, | ||||
|     eval_xpath_getindex, | ||||
|     extract_text, | ||||
| ) | ||||
| 
 | ||||
| from searx.engines.yahoo import parse_url | ||||
| 
 | ||||
| # pylint: disable=unused-import | ||||
| from searx.engines.yahoo import ( | ||||
|     _fetch_supported_languages, | ||||
|     supported_languages_url, | ||||
| ) | ||||
| # pylint: enable=unused-import | ||||
| 
 | ||||
| logger = logger.getChild('yahoo_news engine') | ||||
| 
 | ||||
| # about | ||||
| about = { | ||||
|  | @ -22,90 +41,78 @@ about = { | |||
|     "results": 'HTML', | ||||
| } | ||||
| 
 | ||||
| # engine dependent config | ||||
| categories = ['news'] | ||||
| language_support = False | ||||
| time_range_support = False | ||||
| safesearch = False | ||||
| paging = True | ||||
| categories = ['news'] | ||||
| 
 | ||||
| # search-url | ||||
| search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa | ||||
| search_url = ( | ||||
|     'https://news.search.yahoo.com/search' | ||||
|     '?{query}&b={offset}' | ||||
|     ) | ||||
| 
 | ||||
| # specific xpath variables | ||||
| results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' | ||||
| url_xpath = './/h3/a/@href' | ||||
| title_xpath = './/h3/a' | ||||
| content_xpath = './/div[@class="compText"]' | ||||
| publishedDate_xpath = './/span[contains(@class,"tri")]' | ||||
| suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' | ||||
| AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)') | ||||
| AGO_TIMEDELTA = { | ||||
|   'minute': timedelta(minutes=1), | ||||
|   'hour': timedelta(hours=1), | ||||
|   'day': timedelta(days=1), | ||||
|   'week': timedelta(days=7), | ||||
|   'month': timedelta(days=30), | ||||
|   'year': timedelta(days=365), | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| # do search-request | ||||
| def request(query, params): | ||||
|     offset = (params['pageno'] - 1) * 10 + 1 | ||||
| 
 | ||||
|     if params['language'] == 'all': | ||||
|         language = 'en' | ||||
|     else: | ||||
|         language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] | ||||
| 
 | ||||
|     params['url'] = search_url.format(offset=offset, | ||||
|                                       query=urlencode({'p': query}), | ||||
|                                       lang=language) | ||||
| 
 | ||||
|     # TODO required? | ||||
|     params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\ | ||||
|         .format(lang=language) | ||||
|     params['url'] = search_url.format( | ||||
|         offset = offset, | ||||
|         query = urlencode({'p': query}) | ||||
|     ) | ||||
|     logger.debug("query_url --> %s", params['url']) | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| def sanitize_url(url): | ||||
|     if ".yahoo.com/" in url: | ||||
|         return re.sub("\\;\\_ylt\\=.+$", "", url) | ||||
|     else: | ||||
|         return url | ||||
| 
 | ||||
| 
 | ||||
| # get response from search-request | ||||
| def response(resp): | ||||
|     results = [] | ||||
| 
 | ||||
|     dom = html.fromstring(resp.text) | ||||
| 
 | ||||
| 
 | ||||
|     # parse results | ||||
|     for result in dom.xpath(results_xpath): | ||||
|         urls = result.xpath(url_xpath) | ||||
|         if len(urls) != 1: | ||||
|     for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'): | ||||
| 
 | ||||
|         url = eval_xpath_getindex(result, './/h4/a/@href', 0, None) | ||||
|         if url is None: | ||||
|             continue | ||||
|         url = sanitize_url(parse_url(extract_url(urls, search_url))) | ||||
|         title = extract_text(result.xpath(title_xpath)[0]) | ||||
|         content = extract_text(result.xpath(content_xpath)[0]) | ||||
|         url = parse_url(url) | ||||
|         title = extract_text(result.xpath('.//h4/a')) | ||||
|         content = extract_text(result.xpath('.//p')) | ||||
|         img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None) | ||||
| 
 | ||||
|         # parse publishedDate | ||||
|         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) | ||||
|         item = { | ||||
|             'url': url, | ||||
|             'title': title, | ||||
|             'content': content, | ||||
|             'img_src' : img_src | ||||
|         } | ||||
| 
 | ||||
|         # still useful ? | ||||
|         if re.match("^[0-9]+ minute(s|) ago$", publishedDate): | ||||
|             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) | ||||
|         elif re.match("^[0-9]+ days? ago$", publishedDate): | ||||
|             publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group())) | ||||
|         elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): | ||||
|             timeNumbers = re.findall(r'\d+', publishedDate) | ||||
|             publishedDate = datetime.now()\ | ||||
|                 - timedelta(hours=int(timeNumbers[0]))\ | ||||
|                 - timedelta(minutes=int(timeNumbers[1])) | ||||
|         pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]')) | ||||
|         ago = AGO_RE.search(pub_date) | ||||
|         if ago: | ||||
|             number = int(ago.group(1)) | ||||
|             delta = AGO_TIMEDELTA[ago.group(2)] | ||||
|             pub_date = datetime.now() - delta * number | ||||
|         else: | ||||
|             try: | ||||
|                 publishedDate = parser.parse(publishedDate) | ||||
|             except: | ||||
|                 publishedDate = datetime.now() | ||||
|                 pub_date = parser.parse(pub_date) | ||||
|             except parser.ParserError: | ||||
|                 pub_date = None | ||||
| 
 | ||||
|         if publishedDate.year == 1900: | ||||
|             publishedDate = publishedDate.replace(year=datetime.now().year) | ||||
|         if pub_date is not None: | ||||
|             item['publishedDate'] = pub_date | ||||
|         results.append(item) | ||||
| 
 | ||||
|         # append result | ||||
|         results.append({'url': url, | ||||
|                         'title': title, | ||||
|                         'content': content, | ||||
|                         'publishedDate': publishedDate}) | ||||
|         for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'): | ||||
|             results.append({'suggestion': extract_text(suggestion)}) | ||||
| 
 | ||||
|     # return results | ||||
|     return results | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Alexandre Flament
						Alexandre Flament