From df0d915806b6e4488099130cd1d7fb1775fe475c Mon Sep 17 00:00:00 2001
From: jibe-b
Date: Fri, 22 Sep 2017 22:09:33 +0200
Subject: [PATCH] [add] pubmed engine

---
 searx/engines/pubmed.py      | 101 +++++++++++++++++++++++++++++++++++
 searx/settings.yml           |   6 +++
 searx/url_utils.py           |   2 +
 tests/unit/engines/pubmed.py |  37 +++++++++++++
 4 files changed, 146 insertions(+)
 create mode 100644 searx/engines/pubmed.py
 create mode 100644 tests/unit/engines/pubmed.py

diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
new file mode 100644
index 000000000..abb59d2ed
--- /dev/null
+++ b/searx/engines/pubmed.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+"""
+ PubMed (Scholar publications)
+ @website     https://www.ncbi.nlm.nih.gov/pubmed/
+ @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/)
+ @using-api   yes
+ @results     XML
+ @stable      yes
+ @parse       url, title, publishedDate, content
+ More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/
+"""
+
+from lxml import etree
+from datetime import datetime
+from searx.url_utils import urlencode, urlopen
+
+
+categories = ['science']
+
+base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'\
+           + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
+
+# engine dependent config
+number_of_results = 10
+pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
+
+
+def request(query, params):
+    # basic search
+    offset = (params['pageno'] - 1) * number_of_results
+
+    string_args = dict(query=urlencode({'term': query}),
+                       offset=offset,
+                       hits=number_of_results)
+
+    params['url'] = base_url.format(**string_args)
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    # First retrieve notice of each result
+    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
+                              + 'db=pubmed&retmode=xml&id={pmids_string}'
+
+    # handle Python2 vs Python3 management of bytes and strings
+    try:
+        pmids_results = etree.XML(resp.text.encode('utf-8'))
+    except AttributeError:
+        pmids_results = etree.XML(resp.text)
+
+    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
+    pmids_string = ''
+
+    for item in pmids:
+        pmids_string += item.text + ','
+
+    retrieve_notice_args = dict(pmids_string=pmids_string)
+
+    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
+
+    search_results_xml = urlopen(retrieve_url_encoded).read()
+    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')
+
+    for entry in search_results:
+        title = entry.xpath('.//Article/ArticleTitle')[0].text
+
+        pmid = entry.xpath('.//PMID')[0].text
+        url = pubmed_url + pmid
+
+        try:
+            content = entry.xpath('.//Abstract/AbstractText')[0].text
+        except IndexError:
+            content = 'No abstract is available for this publication.'
+
+        # If a doi is available, add it to the snippet
+        try:
+            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
+            content = 'DOI: ' + doi + ' Abstract: ' + content
+        except IndexError:
+            pass
+
+        if len(content) > 300:
+            content = content[0:300] + "..."
+        # TODO: center snippet on query term
+
+        publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
+                                          + '-' + entry.xpath('.//DateCreated/Month')[0].text
+                                          + '-' + entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d')
+
+        res_dict = {'url': url,
+                    'title': title,
+                    'publishedDate': publishedDate,
+                    'content': content}
+
+        results.append(res_dict)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index ec3e6b469..8ec5173f5 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -460,6 +460,12 @@ engines:
     url: https://pirateproxy.red/
     timeout : 3.0
 
+  - name : pubmed
+    engine : pubmed
+    shortcut : pub
+    categories: science
+    oa_first : false
+
   - name : qwant
     engine : qwant
     shortcut : qw
diff --git a/searx/url_utils.py b/searx/url_utils.py
index dcafc3ba8..5e9e29190 100644
--- a/searx/url_utils.py
+++ b/searx/url_utils.py
@@ -3,6 +3,7 @@ from sys import version_info
 if version_info[0] == 2:
     from urllib import quote, quote_plus, unquote, urlencode
     from urlparse import parse_qs, parse_qsl, urljoin, urlparse, urlunparse, ParseResult
+    from urllib2 import urlopen
 else:
     from urllib.parse import (
         parse_qs,
@@ -16,6 +17,7 @@ else:
         urlunparse,
         ParseResult
     )
+    from urllib.request import urlopen
 
 
 __export__ = (parse_qs,
diff --git a/tests/unit/engines/pubmed.py b/tests/unit/engines/pubmed.py
new file mode 100644
index 000000000..370efe067
--- /dev/null
+++ b/tests/unit/engines/pubmed.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import pubmed
+from searx.testing import SearxTestCase
+
+
+class TestPubmedEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['pageno'] = 1
+        params = pubmed.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn('eutils.ncbi.nlm.nih.gov/', params['url'])
+        self.assertIn('term', params['url'])
+
+    def test_response(self):
+        self.assertRaises(AttributeError, pubmed.response, None)
+        self.assertRaises(AttributeError, pubmed.response, [])
+        self.assertRaises(AttributeError, pubmed.response, '')
+        self.assertRaises(AttributeError, pubmed.response, '[]')
+
+        response = mock.Mock(text='<xml></xml>')
+        self.assertEqual(pubmed.response(response), [])
+
+        xml_mock = """<?xml version="1.0" encoding="UTF-8"?><eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
+<Id>1</Id>
+</IdList></eSearchResult>
+"""
+
+        response = mock.Mock(text=xml_mock.encode('utf-8'))
+        results = pubmed.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['content'], 'No abstract is available for this publication.')
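
P.S. for reviewers: the two-step ESearch -> EFetch flow the engine implements can
be exercised outside searx. Below is a minimal standalone sketch reusing the same
endpoints and XPath expressions as the patch; the query term 'aspirin' and
retmax=5 are arbitrary example values, not anything the engine itself uses.

    #!/usr/bin/env python
    # Standalone sketch of the ESearch -> EFetch flow from searx/engines/pubmed.py.
    # 'aspirin' and retmax=5 are arbitrary example values.
    from lxml import etree
    try:
        from urllib.request import urlopen   # Python 3
        from urllib.parse import urlencode
    except ImportError:
        from urllib2 import urlopen          # Python 2
        from urllib import urlencode

    eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

    # Step 1: ESearch maps the query string to a list of PMIDs.
    search_xml = urlopen(eutils + 'esearch.fcgi?db=pubmed&retmax=5&'
                         + urlencode({'term': 'aspirin'})).read()
    pmids = [i.text for i in etree.XML(search_xml).xpath('//eSearchResult/IdList/Id')]

    # Step 2: EFetch resolves the PMIDs into full MEDLINE records.
    fetch_xml = urlopen(eutils + 'efetch.fcgi?db=pubmed&retmode=xml&id='
                        + ','.join(pmids)).read()
    for cit in etree.XML(fetch_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation'):
        print(cit.xpath('.//PMID')[0].text + ': ' + cit.xpath('.//Article/ArticleTitle')[0].text)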