From e36f85b8365e5d6a9263dd78242a10a305a9000c Mon Sep 17 00:00:00 2001 From: Alexandre FLAMENT Date: Fri, 26 Aug 2022 16:10:12 +0000 Subject: [PATCH] Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py --- searx/engines/arxiv.py | 74 ++++++++++++++++++------ searx/engines/crossref.py | 59 +++++++++++++++++++ searx/engines/google_scholar.py | 85 +++++++++++++++++++++++---- searx/engines/pubmed.py | 95 ++++++++++++++++++------------- searx/engines/semantic_scholar.py | 57 +++++++++++++------ searx/engines/springer.py | 38 ++++++------- searx/searxng.msg | 1 + searx/settings.yml | 26 ++------- 8 files changed, 309 insertions(+), 126 deletions(-) create mode 100644 searx/engines/crossref.py diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index a1a58172d..a4811ebd5 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -3,9 +3,10 @@ ArXiV (Scientific preprints) """ -from lxml import html +from lxml import etree +from lxml.etree import XPath from datetime import datetime -from searx.utils import eval_xpath_list, eval_xpath_getindex +from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex # about about = { @@ -17,7 +18,7 @@ about = { "results": 'XML-RSS', } -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True base_url = ( @@ -27,6 +28,23 @@ base_url = ( # engine dependent config number_of_results = 10 +# xpaths +arxiv_namespaces = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", +} +xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) +xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) +xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) +xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) +xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) +xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) +xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) +xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) +xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) +xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) +xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) + def request(query, params): # basic search @@ -41,30 +59,50 @@ def request(query, params): def response(resp): results = [] + dom = etree.fromstring(resp.content) + for entry in eval_xpath_list(dom, xpath_entry): + title = eval_xpath_getindex(entry, xpath_title, 0).text - dom = html.fromstring(resp.content) + url = eval_xpath_getindex(entry, xpath_id, 0).text + abstract = eval_xpath_getindex(entry, xpath_summary, 0).text - for entry in eval_xpath_list(dom, '//entry'): - title = eval_xpath_getindex(entry, './/title', 0).text + authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] - url = eval_xpath_getindex(entry, './/id', 0).text + # doi + doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) + doi = None if doi_element is None else doi_element.text - content_string = '{doi_content}{abstract_content}' + # pdf + pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) + pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') - abstract = eval_xpath_getindex(entry, './/summary', 0).text + # journal + journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) + journal = 
None if journal_element is None else journal_element.text
 
-        # If a doi is available, add it to the snipppet
-        doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None)
-        doi_content = doi_element.text if doi_element is not None else ''
-        content = content_string.format(doi_content=doi_content, abstract_content=abstract)
+        # tags
+        tag_elements = eval_xpath(entry, xpath_category)
+        tags = [str(tag) for tag in tag_elements]
 
-        if len(content) > 300:
-            content = content[0:300] + "..."
-        # TODO: center snippet on query term
+        # comments
+        comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
+        comments = None if comments_elements is None else comments_elements.text
 
-        publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ')
+        publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
 
-        res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
+        res_dict = {
+            'template': 'paper.html',
+            'url': url,
+            'title': title,
+            'publishedDate': publishedDate,
+            'content': abstract,
+            'doi': doi,
+            'authors': authors,
+            'journal': journal,
+            'tags': tags,
+            'comments': comments,
+            'pdf_url': pdf_url,
+        }
 
         results.append(res_dict)
 
diff --git a/searx/engines/crossref.py b/searx/engines/crossref.py
new file mode 100644
index 000000000..d61318146
--- /dev/null
+++ b/searx/engines/crossref.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""CrossRef (Science)
+"""
+
+from urllib.parse import urlencode
+from searx.utils import html_to_text
+
+about = {
+    "website": 'https://www.crossref.org/',
+    "wikidata_id": 'Q5188229',
+    "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+categories = ['science', 'scientific publications']
+paging = True
+search_url = 'https://api.crossref.org/works'
+
+
+def request(query, params):
+    params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
+    return params
+
+
+def response(resp):
+    res = resp.json()
+    results = []
+    for record in res['message']['items']:
+        record_type = record['type']
+        if record_type == 'book-chapter':
+            title = record['container-title'][0]
+            if record['title'][0].lower().strip() != title.lower().strip():
+                title = title + ' (' + record['title'][0] + ')'
+            journal = None
+        else:
+            title = record['title'][0]
+            journal = record.get('container-title', [None])[0]
+        url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
+        authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
+        isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
+        results.append(
+            {
+                'template': 'paper.html',
+                'url': url,
+                'title': title,
+                'journal': journal,
+                'volume': record.get('volume'),
+                'type': record['type'],
+                'content': html_to_text(record.get('abstract', '')),
+                'publisher': record.get('publisher'),
+                'authors': authors,
+                'doi': record['DOI'],
+                'isbn': isbn,
+            }
+        )
+    return results
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 41c62886b..c07cd4cea 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -13,10 +13,12 @@ Definitions`_.
 
from urllib.parse import urlencode
 from datetime import datetime
+from typing import Optional
 from lxml import html
 
 from searx.utils import (
     eval_xpath,
+    eval_xpath_getindex,
     eval_xpath_list,
     extract_text,
 )
@@ -46,7 +48,7 @@ about = {
 }
 
 # engine dependent config
-categories = ['science']
+categories = ['science', 'scientific publications']
 paging = True
 language_support = True
 use_locale_domain = True
@@ -99,7 +101,43 @@ def request(query, params):
     return params
 
 
-def response(resp):
+def parse_gs_a(text: Optional[str]):
+    """Parse the text written in green.
+
+    Possible formats:
+    * "{authors} - {journal}, {year} - {publisher}"
+    * "{authors} - {year} - {publisher}"
+    * "{authors} - {publisher}"
+    """
+    if text is None or text == "":
+        return None, None, None, None
+
+    s_text = text.split(' - ')
+    authors = s_text[0].split(', ')
+    publisher = s_text[-1]
+    if len(s_text) != 3:
+        return authors, None, publisher, None
+
+    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
+    # get journal and year
+    journal_year = s_text[1].split(', ')
+    # journal is optional and may contain commas
+    if len(journal_year) > 1:
+        journal = ', '.join(journal_year[0:-1])
+        if journal == '…':
+            journal = None
+    else:
+        journal = None
+    # year
+    year = journal_year[-1]
+    try:
+        publishedDate = datetime.strptime(year.strip(), '%Y')
+    except ValueError:
+        publishedDate = None
+    return authors, journal, publisher, publishedDate
+
+
+def response(resp):  # pylint: disable=too-many-locals
     """Get response from google's search request"""
     results = []
 
@@ -112,30 +150,53 @@ def response(resp):
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'):
+    for result in eval_xpath_list(dom, '//div[@data-cid]'):
 
-        title = extract_text(eval_xpath(result, './h3[1]//a'))
+        title = extract_text(eval_xpath(result, './/h3[1]//a'))
         if not title:
             # this is a [ZITATION] block
             continue
 
-        url = eval_xpath(result, './h3[1]//a/@href')[0]
-        content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
-
-        pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
-        if pub_info:
-            content += "[%s]" % pub_info
-
         pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
         if pub_type:
-            title = title + " " + pub_type
+            pub_type = pub_type[1:-1].lower()
+
+        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
+        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
+        authors, journal, publisher, publishedDate = parse_gs_a(
+            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
+        )
+        if publisher and publisher in url:
+            publisher = None
+
+        # cited by
+        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
+
+        # link to the html or pdf document
+        html_url = None
+        pdf_url = None
+        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
+        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
+        if doc_type == "[PDF]":
+            pdf_url = doc_url
+        else:
+            html_url = doc_url
 
         results.append(
             {
+                'template': 'paper.html',
+                'type': pub_type,
                 'url': url,
                 'title': title,
+                'authors': authors,
+                'publisher': publisher,
+                'journal': journal,
+                'publishedDate': publishedDate,
                 'content': content,
+                'comments': comments,
+                'html_url': html_url,
+                'pdf_url': pdf_url,
             }
         )
 
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index 27444ae24..02e282d5f 100644
--- a/searx/engines/pubmed.py
+++ 
b/searx/engines/pubmed.py @@ -3,11 +3,15 @@ PubMed (Scholar publications) """ -from flask_babel import gettext from lxml import etree from datetime import datetime from urllib.parse import urlencode from searx.network import get +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) # about about = { @@ -22,7 +26,7 @@ about = { "results": 'XML', } -categories = ['science'] +categories = ['science', 'scientific publications'] base_url = ( 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' @@ -63,46 +67,61 @@ def response(resp): retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) - search_results_xml = get(retrieve_url_encoded).content - search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') + search_results_response = get(retrieve_url_encoded).content + search_results = etree.XML(search_results_response) + for entry in eval_xpath_list(search_results, '//PubmedArticle'): + medline = eval_xpath_getindex(entry, './MedlineCitation', 0) - for entry in search_results: - title = entry.xpath('.//Article/ArticleTitle')[0].text - - pmid = entry.xpath('.//PMID')[0].text + title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text + pmid = eval_xpath_getindex(medline, './/PMID', 0).text url = pubmed_url + pmid + content = extract_text( + eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True + ) + doi = extract_text( + eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True + ) + journal = extract_text( + eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True + ) + issn = extract_text( + eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True + ) + authors = [] + for author in eval_xpath_list(medline, './Article/AuthorList/Author'): + f = eval_xpath_getindex(author, './ForeName', 0, default=None) + l = eval_xpath_getindex(author, './LastName', 0, default=None) + f = '' if f is None else f.text + l = '' if l is None else l.text + authors.append((f + ' ' + l).strip()) - try: - content = entry.xpath('.//Abstract/AbstractText')[0].text - except: - content = gettext('No abstract is available for this publication.') + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'content': content, + 'journal': journal, + 'issn': [issn], + 'authors': authors, + 'doi': doi, + } - # If a doi is available, add it to the snipppet - try: - doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text - content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) - except: - pass - - if len(content) > 300: - content = content[0:300] + "..." 
- # TODO: center snippet on query term - - res_dict = {'url': url, 'title': title, 'content': content} - - try: - publishedDate = datetime.strptime( - entry.xpath('.//DateCreated/Year')[0].text - + '-' - + entry.xpath('.//DateCreated/Month')[0].text - + '-' - + entry.xpath('.//DateCreated/Day')[0].text, - '%Y-%m-%d', - ) - res_dict['publishedDate'] = publishedDate - except: - pass + accepted_date = eval_xpath_getindex( + entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None + ) + if accepted_date is not None: + year = eval_xpath_getindex(accepted_date, './Year', 0) + month = eval_xpath_getindex(accepted_date, './Month', 0) + day = eval_xpath_getindex(accepted_date, './Day', 0) + try: + publishedDate = datetime.strptime( + year.text + '-' + month.text + '-' + day.text, + '%Y-%m-%d', + ) + res_dict['publishedDate'] = publishedDate + except Exception as e: + print(e) results.append(res_dict) - return results + return results diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py index bda731047..b2701c333 100644 --- a/searx/engines/semantic_scholar.py +++ b/searx/engines/semantic_scholar.py @@ -6,6 +6,8 @@ from json import dumps, loads from datetime import datetime +from flask_babel import gettext + about = { "website": 'https://www.semanticscholar.org/', "wikidata_id": 'Q22908627', @@ -15,6 +17,7 @@ about = { "results": 'JSON', } +categories = ['science', 'scientific publications'] paging = True search_url = 'https://www.semanticscholar.org/api/1/search' paper_url = 'https://www.semanticscholar.org/paper' @@ -47,9 +50,6 @@ def response(resp): results = [] for result in res['results']: - item = {} - metadata = [] - url = result.get('primaryPaperLink', {}).get('url') if not url and result.get('links'): url = result.get('links')[0] @@ -60,22 +60,47 @@ def response(resp): if not url: url = paper_url + '/%s' % result['id'] - item['url'] = url + # publishedDate + if 'pubDate' in result: + publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d") + else: + publishedDate = None - item['title'] = result['title']['text'] - item['content'] = result['paperAbstract']['text'] + # authors + authors = [author[0]['name'] for author in result.get('authors', [])] - metadata = result.get('fieldsOfStudy') or [] - venue = result.get('venue', {}).get('text') - if venue: - metadata.append(venue) - if metadata: - item['metadata'] = ', '.join(metadata) + # pick for the first alternate link, but not from the crawler + pdf_url = None + for doc in result.get('alternatePaperLinks', []): + if doc['linkType'] != 'crawler': + pdf_url = doc['url'] + break - pubDate = result.get('pubDate') - if pubDate: - item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") + # comments + comments = None + if 'citationStats' in result: + comments = gettext( + '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}' + ).format( + numCitations=result['citationStats']['numCitations'], + firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'], + lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'], + ) - results.append(item) + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': result['title']['text'], + 'content': result['paperAbstract']['text'], + 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), + 'doi': result.get('doiInfo', {}).get('doi'), + 'tags': result.get('fieldsOfStudy'), + 'authors': authors, + 'pdf_url': pdf_url, 
+                'publishedDate': publishedDate,
+                'comments': comments,
+            }
+        )
 
     return results
diff --git a/searx/engines/springer.py b/searx/engines/springer.py
index 512d71e5e..2711fa807 100644
--- a/searx/engines/springer.py
+++ b/searx/engines/springer.py
@@ -19,7 +19,7 @@ about = {
     "results": 'JSON',
 }
 
-categories = ['science']
+categories = ['science', 'scientific publications']
 paging = True
 nb_per_page = 10
 api_key = 'unset'
@@ -41,32 +41,30 @@ def response(resp):
     json_data = loads(resp.text)
 
     for record in json_data['records']:
-        content = record['abstract'][0:500]
-        if len(record['abstract']) > len(content):
-            content += "..."
+        content = record['abstract']
         published = datetime.strptime(record['publicationDate'], '%Y-%m-%d')
-
-        metadata = [
-            record[x]
-            for x in [
-                'publicationName',
-                'identifier',
-                'contentType',
-            ]
-            if record.get(x) is not None
-        ]
-
-        metadata = ' / '.join(metadata)
-        if record.get('startingPage') and record.get('endingPage') is not None:
-            metadata += " (%(startingPage)s-%(endingPage)s)" % record
-
+        authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
+        tags = record.get('genre')
+        if isinstance(tags, str):
+            tags = [tags]
         results.append(
             {
+                'template': 'paper.html',
                 'title': record['title'],
                 'url': record['url'][0]['value'].replace('http://', 'https://', 1),
+                'type': record.get('contentType'),
                 'content': content,
                 'publishedDate': published,
-                'metadata': metadata,
+                'authors': authors,
+                'doi': record.get('doi'),
+                'journal': record.get('publicationName'),
+                'start_page': record.get('startingPage'),
+                'end_page': record.get('endingPage'),
+                'tags': tags,
+                'issn': [record.get('issn')],
+                'isbn': [record.get('isbn')],
+                'volume': record.get('volume') or None,
+                'number': record.get('number') or None,
             }
         )
     return results
diff --git a/searx/searxng.msg b/searx/searxng.msg
index 3b876f96d..c37240f83 100644
--- a/searx/searxng.msg
+++ b/searx/searxng.msg
@@ -43,6 +43,7 @@ CATEGORY_GROUPS = {
     'REPOS': 'repos',
     'SOFTWARE_WIKIS': 'software wikis',
     'WEB': 'web',
+    'SCIENTIFIC PUBLICATIONS': 'scientific publications',
 }
 
 STYLE_NAMES = {
diff --git a/searx/settings.yml b/searx/settings.yml
index 3f07bb2dd..ba38e694a 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -319,7 +319,6 @@ engines:
   - name: arxiv
     engine: arxiv
     shortcut: arx
-    categories: science
     timeout: 4.0
 
   # tmp suspended:  dh key too small
@@ -411,23 +410,9 @@ engines:
 #    api_key: 'unset'
 
   - name: crossref
-    engine: json_engine
-    paging: true
-    search_url: https://search.crossref.org/dois?q={query}&page={pageno}
-    url_query: doi
-    title_query: title
-    title_html_to_text: true
-    content_query: fullCitation
-    content_html_to_text: true
-    categories: science
+    engine: crossref
     shortcut: cr
-    about:
-      website: https://www.crossref.org/
-      wikidata_id: Q5188229
-      official_api_documentation: https://github.com/CrossRef/rest-api-doc
-      use_official_api: false
-      require_api_key: false
-      results: JSON
+    timeout: 10
 
   - name: yep
     engine: json_engine
@@ -1068,7 +1053,7 @@ engines:
     title_query: metadata/oaf:entity/oaf:result/title/$
     content_query: metadata/oaf:entity/oaf:result/description/$
    content_html_to_text: true
-    categories: science
+    categories: "science"
     shortcut: oad
     timeout: 5.0
     about:
@@ -1198,7 +1183,6 @@ engines:
   - name: pubmed
     engine: pubmed
     shortcut: pub
-    categories: science
     timeout: 3.0
 
   - name: pypi
@@ -1346,7 +1330,6 @@ engines:
     engine: semantic_scholar
     disabled: true
     shortcut: se
-    categories: science
 
   # Spotify needs API credentials
 # - name: spotify
@@ -1372,8 +1355,7 @@ engines:
 #    # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601"
 #    api_key: 'unset'
 #    shortcut: springer
-#    categories: science
-#    timeout: 6.0
+#    timeout: 15.0
 
   - name: startpage
     engine: startpage
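
For reference, the sketch below is not part of the patch: it only illustrates the general shape of a result dictionary built for the paper.html template, as the updated engines above now return it. The field names are taken from the diff; the values and the variable name example_paper_result are made up for illustration.

# Illustrative sketch only, not part of the patch above: the shape of a
# 'paper.html' result as produced by the updated engines. All values are
# invented; the field names come from the engines changed in this commit.
from datetime import datetime

example_paper_result = {
    'template': 'paper.html',            # selects the paper result template
    'url': 'https://example.org/paper',  # hypothetical landing page
    'title': 'An Example Paper',
    'content': 'Abstract text used as the result snippet.',
    'authors': ['A. Author', 'B. Author'],
    'journal': 'Journal of Examples',
    'doi': '10.0000/example-doi',        # hypothetical DOI
    'tags': ['example-field'],
    'pdf_url': 'https://example.org/paper.pdf',
    'publishedDate': datetime(2022, 8, 26),
    'comments': 'Optional free-text note, e.g. citation information',
}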