forked from zaclys/searxng
		
	Add INA search engine
This commit is contained in:
		
							parent
							
								
									fcc3264f80
								
							
						
					
					
						commit
						b538de568a
					
				
					 3 changed files with 153 additions and 0 deletions
				
			
		
							
								
								
									
										83
									
								
								searx/engines/ina.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										83
									
								
								searx/engines/ina.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,83 @@ | ||||||
|  | #  INA (Videos) | ||||||
|  | # | ||||||
|  | # @website     https://www.ina.fr/ | ||||||
|  | # @provide-api no | ||||||
|  | # | ||||||
|  | # @using-api   no | ||||||
|  | # @results     HTML (using search portal) | ||||||
|  | # @stable      no (HTML can change) | ||||||
|  | # @parse       url, title, content, publishedDate, thumbnail | ||||||
|  | # | ||||||
|  | # @todo        set content-parameter with correct data | ||||||
|  | # @todo        embedded (needs some md5 from video page) | ||||||
|  | 
 | ||||||
|  | from json import loads | ||||||
|  | from urllib import urlencode | ||||||
|  | from lxml import html | ||||||
|  | from HTMLParser import HTMLParser | ||||||
|  | from searx.engines.xpath import extract_text | ||||||
|  | from dateutil import parser | ||||||
|  | 
 | ||||||
# Engine settings: results go in the "videos" category and paging is enabled.
categories = ['videos']
paging = True
page_size = 48  # sent as the ``hf`` parameter of the search URL

# Search endpoint.  ``{ps}``, ``{start}`` and ``{query}`` are filled in by
# request(); ``{query}`` receives an already url-encoded ``q=...`` pair.
base_url = 'https://www.ina.fr'
search_url = (
    base_url
    + '/layout/set/ajax/recherche/result'
    + '?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}'
)

# XPath selectors.  ``results_xpath`` picks the result nodes out of the whole
# document; the others are evaluated relative to one result node.
results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
content_xpath = './/p[@class="media-body__summary"]'
thumbnail_xpath = './/img/@src'
publishedDate_xpath = './/span[@class="broadcast"]'
|  | 
 | ||||||
|  | 
 | ||||||
|  | # do search-request | ||||||
def request(query, params):
    """Fill ``params['url']`` with the INA search URL for ``query``.

    :param query: search terms; url-encoded here as the ``q`` parameter.
    :param params: request parameters; ``params['pageno']`` is read and
        ``params['url']`` is written.
    :returns: the same ``params`` object, with ``'url'`` set.
    """
    # NOTE(review): the offset is pageno * page_size, so pageno=1 yields
    # b=48.  Confirm whether the caller numbers pages from 0 or 1 and
    # whether INA's ``b`` parameter is a zero-based result offset.
    offset = params['pageno'] * page_size
    encoded_query = urlencode({'q': query})

    params['url'] = search_url.format(ps=page_size, start=offset, query=encoded_query)
    return params
|  | 
 | ||||||
|  | 
 | ||||||
|  | # get response from search-request | ||||||
def response(resp):
    """Turn INA's JSON-wrapped HTML answer into a list of video results.

    The body of ``resp`` is JSON whose ``"content"`` entry holds the HTML of
    the result list.  Each result node yields one dict with the keys
    ``url``, ``title``, ``content``, ``template`` and ``thumbnail``, plus
    ``publishedDate`` when a ``DD/MM/YYYY`` broadcast date could be parsed.

    :param resp: response object; only its ``text`` attribute is read.
    :returns: list of result dicts; empty when the JSON is not an object,
        has no (or empty) ``"content"``, or contains no result nodes.
    :raises AttributeError: if ``resp`` has no ``text`` attribute.
    """
    results = []

    # we get html in a JSON container...
    payload = loads(resp.text)
    if not isinstance(payload, dict) or "content" not in payload:
        return []
    markup = payload["content"]
    if not markup:
        # an empty document cannot be parsed, and has no results anyway
        return []
    dom = html.fromstring(markup)
    unescaper = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(url_xpath)
        if not links:
            # an entry without a link cannot be presented; skip it rather
            # than abort the whole page
            continue
        url = base_url + links[0]
        title = unescaper.unescape(extract_text(result.xpath(title_xpath)))
        content = extract_text(result.xpath(content_xpath))

        thumbnails = result.xpath(thumbnail_xpath)
        thumbnail = extract_text(thumbnails[0]) if thumbnails else ''
        # site-relative image paths need the host prepended
        if thumbnail.startswith('/'):
            thumbnail = base_url + thumbnail

        item = {'url': url,
                'title': title,
                'content': content,
                'template': 'videos.html',
                'thumbnail': thumbnail}

        # The site prints dates as DD/MM/YYYY; rebuild them as ISO
        # YYYY-MM-DD so the parser cannot swap day and month.
        dates = result.xpath(publishedDate_xpath)
        if dates:
            date_parts = extract_text(dates[0]).split('/')
            if len(date_parts) == 3:
                iso_date = "%s-%s-%s" % (date_parts[2], date_parts[1], date_parts[0])
                try:
                    item['publishedDate'] = parser.parse(iso_date)
                except (ValueError, OverflowError):
                    # unparsable date: keep the result, just without a date
                    pass

        # append result
        results.append(item)

    # return results
    return results
|  | @ -254,6 +254,12 @@ engines: | ||||||
|     disabled : True |     disabled : True | ||||||
|     shortcut : habr |     shortcut : habr | ||||||
| 
 | 
 | ||||||
|  |   - name : ina | ||||||
|  |     engine : ina | ||||||
|  |     shortcut : in | ||||||
|  |     timeout : 6.0 | ||||||
|  |     disabled : True | ||||||
|  | 
 | ||||||
|   - name : mixcloud |   - name : mixcloud | ||||||
|     engine : mixcloud |     engine : mixcloud | ||||||
|     shortcut : mc |     shortcut : mc | ||||||
|  |  | ||||||
							
								
								
									
										64
									
								
								tests/unit/engines/test_ina.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										64
									
								
								tests/unit/engines/test_ina.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,64 @@ | ||||||
|  | from collections import defaultdict | ||||||
|  | import mock | ||||||
|  | from searx.engines import ina | ||||||
|  | from searx.testing import SearxTestCase | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TestInaEngine(SearxTestCase):
    """Unit tests for the INA engine's request builder and response parser."""

    def test_request(self):
        """The built URL points at ina.fr and contains the search terms."""
        query = 'test_query'
        request_params = defaultdict(dict)
        request_params['pageno'] = 0

        built = ina.request(query, request_params)

        self.assertIn('url', built)
        self.assertIn(query, built['url'])
        self.assertIn('ina.fr', built['url'])

    def test_response(self):
        """Invalid inputs raise, empty payloads give [], a real hit is parsed."""
        # objects without a ``text`` attribute are rejected
        for not_a_response in (None, [], '', '[]'):
            self.assertRaises(AttributeError, ina.response, not_a_response)

        # JSON without a "content" entry produces no results
        for empty_payload in ('{}', '{"data": []}'):
            self.assertEqual(ina.response(mock.Mock(text=empty_payload)), [])

        # one complete result as delivered by the site
        payload = """
        {"content":"\\t<div class=\\"container\\">\\n\\t\\n\
        <!-- DEBUT CONTENU PRINCIPAL -->\\n<div class=\\"row\\">\\n\
        <div class=\\"search-results--list\\"><div class=\\"media\\">\\n\
        \\t\\t\\t\\t<a class=\\"media-left  media-video  premium    xiti_click_action\\" \
        data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
        href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\\n\
        <img src=\\"https:\/\/www.ina.fr\/images_v2\/140x105\/CAF89035682.jpeg\\" \
        alt=\\"Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle \\">\\n\
        \\t\\t\\t\\t\\t<\/a>\\n\
        \\t\\t\\t\\t\\t<div class=\\"media-body\\">\\n\\t\\t\\t\\t\\t\\t<h3 class=\\"h3--title media-heading\\">\\n\
        \\t\\t\\t\\t\\t\\t\\t<a class=\\"xiti_click_action\\" \
        data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
        href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\
        Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle <\/a>\\n\
        <\/h3>\\n\
        <div class=\\"media-body__info\\">\\n<span class=\\"broadcast\\">27\/11\/1967<\/span>\\n\
        <span class=\\"views\\">29321 vues<\/span>\\n\
        <span class=\\"duration\\">01h 33m 07s<\/span>\\n\
        <\/div>\\n\
        <p class=\\"media-body__summary\\">VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE . \
              - PA le Pr\\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...<\/p>\\n\
        <\/div>\\n<\/div><!-- \/.media -->\\n"
        }
        """
        results = ina.response(mock.Mock(text=payload))

        self.assertIs(type(results), list)
        self.assertEqual(len(results), 1)

        first = results[0]
        self.assertEqual(first['title'], u'Conf\xe9rence de presse du G\xe9n\xe9ral de Gaulle')
        self.assertEqual(first['url'],
                         'https://www.ina.fr/video/CAF89035682/conference-de-presse-du-general-de-gaulle-video.html')
        self.assertEqual(first['content'],
                         u"VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE ."
                         u" - PA le Pr\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...")
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 François Revol
						François Revol