From 1dba6dcbac6891390653170f44cd7ba9de636cd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Revol?= Date: Sat, 9 Jul 2016 20:41:57 +0200 Subject: [PATCH] Add ScanR structures search engine In theory ScanR should also search for projects but the API is different, so we'd need another engine. --- searx/engines/scanr_structures.py | 78 +++++++++ searx/settings.yml | 5 + tests/unit/engines/test_scanr_structures.py | 175 ++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 searx/engines/scanr_structures.py create mode 100644 tests/unit/engines/test_scanr_structures.py diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py new file mode 100644 index 000000000..ad78155ac --- /dev/null +++ b/searx/engines/scanr_structures.py @@ -0,0 +1,78 @@ +""" + ScanR Structures (Science) + + @website https://scanr.enseignementsup-recherche.gouv.fr + @provide-api yes (https://scanr.enseignementsup-recherche.gouv.fr/api/swagger-ui.html) + + @using-api yes + @results JSON + @stable yes + @parse url, title, content, img_src +""" + +from urllib import urlencode +from json import loads, dumps +from dateutil import parser +from searx.utils import html_to_text + +# engine dependent config +categories = ['science'] +paging = True +page_size = 20 + +# search-url +url = 'https://scanr.enseignementsup-recherche.gouv.fr/' +search_url = url + 'api/structures/search' + + +# do search-request +def request(query, params): + + params['url'] = search_url + params['method'] = 'POST' + params['headers']['Content-type'] = "application/json" + params['data'] = dumps({"query": query, + "searchField": "ALL", + "sortDirection": "ASC", + "sortOrder": "RELEVANCY", + "page": params['pageno'], + "pageSize": page_size}) + + return params + + +# get response from search-request +def response(resp): + results = [] + + search_res = loads(resp.text) + + # return empty array if there are no results + if search_res.get('total') < 1: + return [] + + # parse results + for result in search_res['results']: + if 'id' not in result: + continue + + # is it thumbnail or img_src?? + thumbnail = None + if 'logo' in result: + thumbnail = result['logo'] + if thumbnail[0] == '/': + thumbnail = url + thumbnail + + content = None + if 'highlights' in result: + content = result['highlights'][0]['value'] + + # append result + results.append({'url': url + 'structure/' + result['id'], + 'title': result['label'], + # 'thumbnail': thumbnail, + 'img_src': thumbnail, + 'content': html_to_text(content)}) + + # return results + return results diff --git a/searx/settings.yml b/searx/settings.yml index 558898886..d64b73a17 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -314,6 +314,11 @@ engines: engine : kickass shortcut : ka + - name : scanr_structures + shortcut: scs + engine : scanr_structures + disabled : True + - name : soundcloud engine : soundcloud shortcut : sc diff --git a/tests/unit/engines/test_scanr_structures.py b/tests/unit/engines/test_scanr_structures.py new file mode 100644 index 000000000..a7b9e9185 --- /dev/null +++ b/tests/unit/engines/test_scanr_structures.py @@ -0,0 +1,175 @@ +from collections import defaultdict +import mock +from searx.engines import scanr_structures +from searx.testing import SearxTestCase + + +class TestScanrStructuresEngine(SearxTestCase): + + def test_request(self): + query = 'test_query' + dicto = defaultdict(dict) + dicto['pageno'] = 1 + params = scanr_structures.request(query, dicto) + self.assertIn('url', params) + self.assertIn(query, params['data']) + self.assertIn('scanr.enseignementsup-recherche.gouv.fr', params['url']) + + def test_response(self): + self.assertRaises(AttributeError, scanr_structures.response, None) + self.assertRaises(AttributeError, scanr_structures.response, []) + self.assertRaises(AttributeError, scanr_structures.response, '') + self.assertRaises(AttributeError, scanr_structures.response, '[]') + + response = mock.Mock(text='{}') + self.assertEqual(scanr_structures.response(response), []) + + response = mock.Mock(text='{"data": []}') + self.assertEqual(scanr_structures.response(response), []) + + json = u""" + { + "request": + { + "query":"test_query", + "page":1, + "pageSize":20, + "sortOrder":"RELEVANCY", + "sortDirection":"ASC", + "searchField":"ALL", + "from":0 + }, + "total":2471, + "results":[ + { + "id":"200711886U", + "label":"Laboratoire d'Informatique de Grenoble", + "kind":"RNSR", + "publicEntity":true, + "address":{"city":"Grenoble","departement":"38"}, + "logo":"/static/logos/200711886U.png", + "acronym":"LIG", + "type":{"code":"UR","label":"Unit\xe9 de recherche"}, + "level":2, + "institutions":[ + { + "id":"193819125", + "label":"Grenoble INP", + "acronym":"IPG", + "code":"UMR 5217" + }, + { + "id":"130021397", + "label":"Universit\xe9 de Grenoble Alpes", + "acronym":"UGA", + "code":"UMR 5217" + }, + { + "id":"180089013", + "label":"Centre national de la recherche scientifique", + "acronym":"CNRS", + "code":"UMR 5217" + }, + { + "id":"180089047", + "label":"Institut national de recherche en informatique et en automatique", + "acronym":"Inria", + "code":"UMR 5217" + } + ], + "highlights":[ + { + "type":"projects", + "value":"linguicielles d\xe9velopp\xe9s jusqu'ici par le GETALP\ + du LIG en tant que prototypes op\xe9rationnels.\ +\\r\\nDans le contexte" + }, + { + "type":"acronym", + "value":"LIG" + }, + { + "type":"websiteContents", + "value":"S\xe9lection\\nListe structures\\nD\xe9tail\\n\ + Accueil\\n200711886U : LIG\ + Laboratoire d'Informatique de Grenoble Unit\xe9 de recherche"}, + { + "type":"publications", + "value":"de noms. Nous avons d'abord d\xe9velopp\xe9 LOOV \ + (pour Lig Overlaid OCR in Vid\xe9o), \ + un outil d'extraction des" + } + ] + }, + { + "id":"199511665F", + "label":"Laboratoire Bordelais de Recherche en Informatique", + "kind":"RNSR", + "publicEntity":true, + "address":{"city":"Talence","departement":"33"}, + "logo":"/static/logos/199511665F.png", + "acronym":"LaBRI", + "type":{"code":"UR","label":"Unit\xe9 de recherche"}, + "level":2, + "institutions":[ + { + "id":"130006356", + "label":"Institut polytechnique de Bordeaux", + "acronym":"IPB", + "code":"UMR 5800" + }, + { + "id":"130018351", + "label":"Universit\xe9 de Bordeaux", + "acronym":null, + "code":"UMR 5800" + }, + { + "id":"180089013", + "label":"Centre national de la recherche scientifique", + "acronym":"CNRS", + "code":"UMR 5800" + }, + { + "id":"180089047", + "label":"Institut national de recherche en informatique et en automatique", + "acronym":"Inria", + "code":"UMR 5800" + } + ], + "highlights":[ + { + "type":"websiteContents", + "value":"Samia Kerdjoudj\\n2016-07-05\\nDouble-exponential\ + and triple-exponential bounds for\ + choosability problems parameterized" + }, + { + "type":"publications", + "value":"de cam\xe9ras install\xe9es dans les lieux publiques \ + a tripl\xe9 en 2009, passant de 20 000 \ + \xe0 60 000. Malgr\xe9 le" + } + ] + } + ] + } + """ + response = mock.Mock(text=json) + results = scanr_structures.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], u"Laboratoire d'Informatique de Grenoble") + self.assertEqual(results[0]['url'], 'https://scanr.enseignementsup-recherche.gouv.fr/structure/200711886U') + self.assertEqual(results[0]['content'], + u"linguicielles d\xe9velopp\xe9s jusqu'ici par le GETALP " + u"du LIG en tant que prototypes " + u"op\xe9rationnels. Dans le contexte") + self.assertEqual(results[1]['img_src'], + 'https://scanr.enseignementsup-recherche.gouv.fr//static/logos/199511665F.png') + self.assertEqual(results[1]['content'], + "Samia Kerdjoudj 2016-07-05 Double-exponential and" + " triple-exponential bounds for " + "choosability problems parameterized") + self.assertEqual(results[1]['url'], 'https://scanr.enseignementsup-recherche.gouv.fr/structure/199511665F') + self.assertEqual(results[1]['title'], u"Laboratoire Bordelais de Recherche en Informatique")