Science category: update the engines

* use the paper.html template
* fetch more data from the engines
* add crossref.py
This commit is contained in:
Alexandre FLAMENT 2022-08-26 16:10:12 +00:00 committed by Alexandre Flament
parent 593026ad9c
commit e36f85b836
8 changed files with 309 additions and 126 deletions

View File

@ -3,9 +3,10 @@
ArXiV (Scientific preprints) ArXiV (Scientific preprints)
""" """
from lxml import html from lxml import etree
from lxml.etree import XPath
from datetime import datetime from datetime import datetime
from searx.utils import eval_xpath_list, eval_xpath_getindex from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about # about
about = { about = {
@ -17,7 +18,7 @@ about = {
"results": 'XML-RSS', "results": 'XML-RSS',
} }
categories = ['science'] categories = ['science', 'scientific publications']
paging = True paging = True
base_url = ( base_url = (
@ -27,6 +28,23 @@ base_url = (
# engine dependent config # engine dependent config
number_of_results = 10 number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params): def request(query, params):
# basic search # basic search
@ -41,30 +59,50 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
dom = etree.fromstring(resp.content)
for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
dom = html.fromstring(resp.content) url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
for entry in eval_xpath_list(dom, '//entry'): authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
title = eval_xpath_getindex(entry, './/title', 0).text
url = eval_xpath_getindex(entry, './/id', 0).text # doi
doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
content_string = '{doi_content}{abstract_content}' # pdf
pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
abstract = eval_xpath_getindex(entry, './/summary', 0).text # journal
journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
journal = None if journal_element is None else journal_element.text
# If a doi is available, add it to the snipppet # tags
doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None) tag_elements = eval_xpath(entry, xpath_category)
doi_content = doi_element.text if doi_element is not None else '' tags = [str(tag) for tag in tag_elements]
content = content_string.format(doi_content=doi_content, abstract_content=abstract)
if len(content) > 300: # comments
content = content[0:300] + "..." comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
# TODO: center snippet on query term comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ') publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict) results.append(res_dict)

59
searx/engines/crossref.py Normal file
View File

@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Crossref (Science)
"""
from urllib.parse import urlencode
from searx.utils import html_to_text
about = {
"website": 'https://www.crossref.org/',
"wikidata_id": 'Q5188229',
"official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
categories = ['science', 'scientific publications']
paging = True
search_url = 'https://api.crossref.org/works'
def request(query, params):
    """Fill ``params['url']`` with the Crossref ``/works`` search URL.

    The query string carries the search terms (``query``) and a zero-based
    ``offset`` derived from the 1-based ``params['pageno']``.  The mutated
    ``params`` dict is returned.
    """
    # NOTE(review): 20 is presumably the API's default page size -- confirm.
    page_index = params['pageno'] - 1
    query_string = urlencode({'query': query, 'offset': 20 * page_index})
    params['url'] = search_url + '?' + query_string
    return params
def response(resp):
    """Convert a Crossref ``/works`` JSON reply into ``paper.html`` results.

    For every item in ``message.items`` one result dict is produced:

    * book chapters are titled after their container (the book), with the
      chapter title appended in parentheses when it differs;
    * every other type uses its own title, and the container title is
      reported as the journal.

    Records without any usable title are skipped instead of raising, and
    missing or empty ``title`` / ``container-title`` lists are tolerated.
    """
    results = []
    for record in resp.json()['message']['items']:
        record_type = record['type']

        # Both fields are lists that may be absent or empty; ``or [None]``
        # covers the two cases with a single lookup.
        work_title = (record.get('title') or [None])[0]
        container_title = (record.get('container-title') or [None])[0]

        if record_type == 'book-chapter' and container_title:
            title = container_title
            if work_title and work_title.lower().strip() != title.lower().strip():
                title = title + ' (' + work_title + ')'
            journal = None
        else:
            # Fall back to the container title when the work itself has none;
            # in that case it is not repeated as the journal.
            title = work_title or container_title
            journal = container_title if work_title else None

        if not title:
            # nothing meaningful to display for this record
            continue

        url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']

        authors = []
        for author in record.get('author', []):
            # Join only the name parts that exist so no stray spaces appear;
            # authors without given/family names fall back to ``name``.
            name_parts = [part for part in (author.get('given'), author.get('family')) if part]
            full_name = ' '.join(name_parts) or author.get('name')
            if full_name:
                authors.append(full_name)

        isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]

        results.append(
            {
                'template': 'paper.html',
                'url': url,
                'title': title,
                'journal': journal,
                'volume': record.get('volume'),
                'type': record_type,
                'content': html_to_text(record.get('abstract', '')),
                'publisher': record.get('publisher'),
                'authors': authors,
                'doi': record['DOI'],
                'isbn': isbn,
            }
        )
    return results

View File

@ -13,10 +13,12 @@ Definitions`_.
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from typing import Optional
from lxml import html from lxml import html
from searx.utils import ( from searx.utils import (
eval_xpath, eval_xpath,
eval_xpath_getindex,
eval_xpath_list, eval_xpath_list,
extract_text, extract_text,
) )
@ -46,7 +48,7 @@ about = {
} }
# engine dependent config # engine dependent config
categories = ['science'] categories = ['science', 'scientific publications']
paging = True paging = True
language_support = True language_support = True
use_locale_domain = True use_locale_domain = True
@ -99,7 +101,43 @@ def request(query, params):
return params return params
def parse_gs_a(text: Optional[str]):
    """Parse the text written in green.

    Possible formats:
    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"

    The first `` - `` separated segment is the author list and the last one
    the publisher.  Everything in between is treated as "{journal}, {year}",
    so a journal name that itself contains `` - `` is kept intact.

    Returns a tuple ``(authors, journal, publisher, publishedDate)``.  All
    four are ``None`` for empty input; ``journal`` and ``publishedDate`` are
    ``None`` whenever they cannot be extracted.
    """
    if not text:
        return None, None, None, None

    segments = text.split(' - ')
    authors = segments[0].split(', ')
    publisher = segments[-1]
    if len(segments) < 3:
        # "{authors} - {publisher}" (or a single segment): no journal / year
        return authors, None, publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}"
    # or "{authors} - {year} - {publisher}"
    journal_year = ' - '.join(segments[1:-1]).split(', ')

    # journal is optional and may contain commas: everything before the
    # last ", " belongs to it
    journal = ', '.join(journal_year[:-1]) or None

    # year is the last comma separated item; anything that is not a plain
    # year leaves the date unset
    try:
        publishedDate = datetime.strptime(journal_year[-1].strip(), '%Y')
    except ValueError:
        publishedDate = None
    return authors, journal, publisher, publishedDate
def response(resp): # pylint: disable=too-many-locals
"""Get response from google's search request""" """Get response from google's search request"""
results = [] results = []
@ -112,30 +150,53 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): for result in eval_xpath_list(dom, '//div[@data-cid]'):
title = extract_text(eval_xpath(result, './h3[1]//a')) title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title: if not title:
# this is a [ZITATION] block # this is a [ZITATION] block
continue continue
url = eval_xpath(result, './h3[1]//a/@href')[0]
content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
if pub_info:
content += "[%s]" % pub_info
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
if pub_type: if pub_type:
title = title + " " + pub_type pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append( results.append(
{ {
'template': 'paper.html',
'type': pub_type,
'url': url, 'url': url,
'title': title, 'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content, 'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
} }
) )

View File

@ -3,11 +3,15 @@
PubMed (Scholar publications) PubMed (Scholar publications)
""" """
from flask_babel import gettext
from lxml import etree from lxml import etree
from datetime import datetime from datetime import datetime
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.network import get from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about # about
about = { about = {
@ -22,7 +26,7 @@ about = {
"results": 'XML', "results": 'XML',
} }
categories = ['science'] categories = ['science', 'scientific publications']
base_url = ( base_url = (
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
@ -63,46 +67,61 @@ def response(resp):
retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
search_results_xml = get(retrieve_url_encoded).content search_results_response = get(retrieve_url_encoded).content
search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') search_results = etree.XML(search_results_response)
for entry in eval_xpath_list(search_results, '//PubmedArticle'):
medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
for entry in search_results: title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
title = entry.xpath('.//Article/ArticleTitle')[0].text pmid = eval_xpath_getindex(medline, './/PMID', 0).text
pmid = entry.xpath('.//PMID')[0].text
url = pubmed_url + pmid url = pubmed_url + pmid
content = extract_text(
eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
)
doi = extract_text(
eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
)
journal = extract_text(
eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
)
issn = extract_text(
eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
)
authors = []
for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
f = eval_xpath_getindex(author, './ForeName', 0, default=None)
l = eval_xpath_getindex(author, './LastName', 0, default=None)
f = '' if f is None else f.text
l = '' if l is None else l.text
authors.append((f + ' ' + l).strip())
try: res_dict = {
content = entry.xpath('.//Abstract/AbstractText')[0].text 'template': 'paper.html',
except: 'url': url,
content = gettext('No abstract is available for this publication.') 'title': title,
'content': content,
'journal': journal,
'issn': [issn],
'authors': authors,
'doi': doi,
}
# If a doi is available, add it to the snipppet accepted_date = eval_xpath_getindex(
try: entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text )
content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) if accepted_date is not None:
except: year = eval_xpath_getindex(accepted_date, './Year', 0)
pass month = eval_xpath_getindex(accepted_date, './Month', 0)
day = eval_xpath_getindex(accepted_date, './Day', 0)
if len(content) > 300: try:
content = content[0:300] + "..." publishedDate = datetime.strptime(
# TODO: center snippet on query term year.text + '-' + month.text + '-' + day.text,
'%Y-%m-%d',
res_dict = {'url': url, 'title': title, 'content': content} )
res_dict['publishedDate'] = publishedDate
try: except Exception as e:
publishedDate = datetime.strptime( print(e)
entry.xpath('.//DateCreated/Year')[0].text
+ '-'
+ entry.xpath('.//DateCreated/Month')[0].text
+ '-'
+ entry.xpath('.//DateCreated/Day')[0].text,
'%Y-%m-%d',
)
res_dict['publishedDate'] = publishedDate
except:
pass
results.append(res_dict) results.append(res_dict)
return results return results

View File

@ -6,6 +6,8 @@
from json import dumps, loads from json import dumps, loads
from datetime import datetime from datetime import datetime
from flask_babel import gettext
about = { about = {
"website": 'https://www.semanticscholar.org/', "website": 'https://www.semanticscholar.org/',
"wikidata_id": 'Q22908627', "wikidata_id": 'Q22908627',
@ -15,6 +17,7 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
categories = ['science', 'scientific publications']
paging = True paging = True
search_url = 'https://www.semanticscholar.org/api/1/search' search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper' paper_url = 'https://www.semanticscholar.org/paper'
@ -47,9 +50,6 @@ def response(resp):
results = [] results = []
for result in res['results']: for result in res['results']:
item = {}
metadata = []
url = result.get('primaryPaperLink', {}).get('url') url = result.get('primaryPaperLink', {}).get('url')
if not url and result.get('links'): if not url and result.get('links'):
url = result.get('links')[0] url = result.get('links')[0]
@ -60,22 +60,47 @@ def response(resp):
if not url: if not url:
url = paper_url + '/%s' % result['id'] url = paper_url + '/%s' % result['id']
item['url'] = url # publishedDate
if 'pubDate' in result:
publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
else:
publishedDate = None
item['title'] = result['title']['text'] # authors
item['content'] = result['paperAbstract']['text'] authors = [author[0]['name'] for author in result.get('authors', [])]
metadata = result.get('fieldsOfStudy') or [] # pick for the first alternate link, but not from the crawler
venue = result.get('venue', {}).get('text') pdf_url = None
if venue: for doc in result.get('alternatePaperLinks', []):
metadata.append(venue) if doc['linkType'] != 'crawler':
if metadata: pdf_url = doc['url']
item['metadata'] = ', '.join(metadata) break
pubDate = result.get('pubDate') # comments
if pubDate: comments = None
item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") if 'citationStats' in result:
comments = gettext(
'{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
).format(
numCitations=result['citationStats']['numCitations'],
firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
)
results.append(item) results.append(
{
'template': 'paper.html',
'url': url,
'title': result['title']['text'],
'content': result['paperAbstract']['text'],
'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
'doi': result.get('doiInfo', {}).get('doi'),
'tags': result.get('fieldsOfStudy'),
'authors': authors,
'pdf_url': pdf_url,
'publishedDate': publishedDate,
'comments': comments,
}
)
return results return results

View File

@ -19,7 +19,7 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
categories = ['science'] categories = ['science', 'scientific publications']
paging = True paging = True
nb_per_page = 10 nb_per_page = 10
api_key = 'unset' api_key = 'unset'
@ -41,32 +41,30 @@ def response(resp):
json_data = loads(resp.text) json_data = loads(resp.text)
for record in json_data['records']: for record in json_data['records']:
content = record['abstract'][0:500] content = record['abstract']
if len(record['abstract']) > len(content):
content += "..."
published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') published = datetime.strptime(record['publicationDate'], '%Y-%m-%d')
authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
metadata = [ tags = record.get('genre')
record[x] if isinstance(tags, str):
for x in [ tags = [tags]
'publicationName',
'identifier',
'contentType',
]
if record.get(x) is not None
]
metadata = ' / '.join(metadata)
if record.get('startingPage') and record.get('endingPage') is not None:
metadata += " (%(startingPage)s-%(endingPage)s)" % record
results.append( results.append(
{ {
'template': 'paper.html',
'title': record['title'], 'title': record['title'],
'url': record['url'][0]['value'].replace('http://', 'https://', 1), 'url': record['url'][0]['value'].replace('http://', 'https://', 1),
'type': record.get('contentType'),
'content': content, 'content': content,
'publishedDate': published, 'publishedDate': published,
'metadata': metadata, 'authors': authors,
'doi': record.get('doi'),
'journal': record.get('publicationName'),
'start_page': record.get('start_page'),
'end_page': record.get('end_page'),
'tags': tags,
'issn': [record.get('issn')],
'isbn': [record.get('isbn')],
'volume': record.get('volume') or None,
'number': record.get('number') or None,
} }
) )
return results return results

View File

@ -43,6 +43,7 @@ CATEGORY_GROUPS = {
'REPOS': 'repos', 'REPOS': 'repos',
'SOFTWARE_WIKIS': 'software wikis', 'SOFTWARE_WIKIS': 'software wikis',
'WEB': 'web', 'WEB': 'web',
'SCIENTIFIC PUBLICATIONS': 'scientific publications',
} }
STYLE_NAMES = { STYLE_NAMES = {

View File

@ -319,7 +319,6 @@ engines:
- name: arxiv - name: arxiv
engine: arxiv engine: arxiv
shortcut: arx shortcut: arx
categories: science
timeout: 4.0 timeout: 4.0
# tmp suspended: dh key too small # tmp suspended: dh key too small
@ -411,23 +410,9 @@ engines:
# api_key: 'unset' # api_key: 'unset'
- name: crossref - name: crossref
engine: json_engine engine: crossref
paging: true
search_url: https://search.crossref.org/dois?q={query}&page={pageno}
url_query: doi
title_query: title
title_html_to_text: true
content_query: fullCitation
content_html_to_text: true
categories: science
shortcut: cr shortcut: cr
about: timeout: 10
website: https://www.crossref.org/
wikidata_id: Q5188229
official_api_documentation: https://github.com/CrossRef/rest-api-doc
use_official_api: false
require_api_key: false
results: JSON
- name: yep - name: yep
engine: json_engine engine: json_engine
@ -1068,7 +1053,7 @@ engines:
title_query: metadata/oaf:entity/oaf:result/title/$ title_query: metadata/oaf:entity/oaf:result/title/$
content_query: metadata/oaf:entity/oaf:result/description/$ content_query: metadata/oaf:entity/oaf:result/description/$
content_html_to_text: true content_html_to_text: true
categories: science categories: "science"
shortcut: oad shortcut: oad
timeout: 5.0 timeout: 5.0
about: about:
@ -1198,7 +1183,6 @@ engines:
- name: pubmed - name: pubmed
engine: pubmed engine: pubmed
shortcut: pub shortcut: pub
categories: science
timeout: 3.0 timeout: 3.0
- name: pypi - name: pypi
@ -1346,7 +1330,6 @@ engines:
engine: semantic_scholar engine: semantic_scholar
disabled: true disabled: true
shortcut: se shortcut: se
categories: science
# Spotify needs API credentials # Spotify needs API credentials
# - name: spotify # - name: spotify
@ -1372,8 +1355,7 @@ engines:
# # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" # # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601"
# api_key: 'unset' # api_key: 'unset'
# shortcut: springer # shortcut: springer
# categories: science # timeout: 15.0
# timeout: 6.0
- name: startpage - name: startpage
engine: startpage engine: startpage