forked from zaclys/searxng

commit 4b1e0423a0 (parent 55dfb305a0)
Author: Thomas Pointhuber

    update bing engines and fix bing_news

 2 changed files with 92 additions and 23 deletions
searx/engines/bing.py
@@ -1,48 +1,81 @@
## Bing (Web)
#
# @website     https://www.bing.com
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
#              max. 5000 query/month
#
# @using-api   no (because of query limit)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        publishedDate

from urllib import urlencode
from cgi import escape
from lxml import html

# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sa_cc"]'):
        link = result.xpath('.//h3/a')[0]
        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//p//text()')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results if something is found
    if results:
        return results

    # parse results again if nothing is found yet
    for result in dom.xpath('//li[@class="b_algo"]'):
        link = result.xpath('.//h2/a')[0]
        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//p//text()')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results
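
As a quick illustration (not part of the commit), the engine above follows
searx's request/response interface: the core pre-fills params, request()
adds the URL and cookies, and response() parses the fetched page. The
import path, the driver shape, and the params values below are assumptions
for this sketch, not code from the repository.

    # hypothetical driver, not from the commit
    import requests  # stand-in for searx's own HTTP layer

    from searx.engines import bing  # assumed module path

    # only the keys request() actually reads are shown here
    params = {'pageno': 2, 'language': 'de_DE', 'cookies': {}}
    params = bing.request('test query', params)

    print(params['url'])
    # e.g. https://www.bing.com/search?q=test+query&setmkt=de-DE&first=11
    # (Python 2 dict ordering may swap q/setmkt in urlencode's output)

    resp = requests.get(params['url'], cookies=params['cookies'])
    for r in bing.response(resp):
        print(r['url'], r['title'])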
searx/engines/bing_news.py
@@ -1,50 +1,86 @@
## Bing (News)
#
# @website     https://www.bing.com/news
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
#              max. 5000 query/month
#
# @using-api   no (because of query limit)
# @results     HTML (using search portal)
# @stable      no (HTML can change)
# @parse       url, title, content, publishedDate

from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re

# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}'


# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    search_path = search_string.format(
        query=urlencode({'q': query, 'setmkt': language}),
        offset=offset)

    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    return params


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]
        url = link.attrib.get('href')
        title = ' '.join(link.xpath('.//text()'))
        content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')))

        # parse publishedDate
        publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()')))

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(minutes=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            publishedDate = parser.parse(publishedDate)

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
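
Among the bing_news changes, the publishedDate parsing is new: Bing News
renders relative times like "3 hours ago" that dateutil's parser cannot
handle, so the commit special-cases them before falling back to
parser.parse(). Below is a standalone re-run of those branches; the helper
name and the sample strings are invented for illustration.

    # illustration only; mirrors the branches in response() above
    import re
    from datetime import datetime, timedelta
    from dateutil import parser

    def parse_bing_date(text):
        numbers = re.findall(r'\d+', text)
        if re.match("^[0-9]+ minute(s|) ago$", text):
            return datetime.now() - timedelta(minutes=int(numbers[0]))
        if re.match("^[0-9]+ hour(s|) ago$", text):
            return datetime.now() - timedelta(hours=int(numbers[0]))
        if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", text):
            return datetime.now() - timedelta(hours=int(numbers[0]),
                                              minutes=int(numbers[1]))
        return parser.parse(text)  # absolute dates fall through to dateutil

    for sample in ('1 minute ago', '3 hours ago', '1 hour, 23 minutes ago'):
        print(parse_bing_date(sample))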