From 5c402923f8082272192d3e44f1214c9ffd7f0539 Mon Sep 17 00:00:00 2001
From: Bnyro
Date: Fri, 8 Sep 2023 18:11:06 +0200
Subject: [PATCH] [feat] engine: implementation of internet archive scholar

---
 searx/engines/internet_archive_scholar.py | 101 +++++++++++++++++++++++
 searx/settings.yml                        |   5 ++
 2 files changed, 106 insertions(+)
 create mode 100644 searx/engines/internet_archive_scholar.py

diff --git a/searx/engines/internet_archive_scholar.py b/searx/engines/internet_archive_scholar.py
new file mode 100644
index 000000000..fdbc10026
--- /dev/null
+++ b/searx/engines/internet_archive_scholar.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Internet Archive scholar(science)
+"""
+
+from datetime import datetime
+from urllib.parse import urlencode
+from searx.utils import html_to_text
+
+about = {
+    "website": "https://scholar.archive.org/",
+    "wikidata_id": "Q115667709",
+    "official_api_documentation": "https://scholar.archive.org/api/redoc",
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": "JSON",
+}
+categories = ['science', 'scientific publications']
+paging = True
+
+base_url = "https://scholar.archive.org"
+results_per_page = 15
+
+
+def request(query, params):
+    """Build the search request for result page ``params["pageno"]``.
+
+    Sets ``params["url"]`` and asks for a JSON reply through the ``Accept``
+    header; the modified ``params`` dict is returned.
+    """
+    args = {
+        "q": query,
+        "limit": results_per_page,
+        # page 1 starts at offset 0
+        "offset": (params["pageno"] - 1) * results_per_page,
+    }
+    params["url"] = f"{base_url}/search?{urlencode(args)}"
+    params["headers"]["Accept"] = "application/json"
+    return params
+
+
+def response(resp):
+    """Convert the JSON reply into a list of ``paper.html`` results.
+
+    Optional parts of a hit (full text, abstracts, highlights, releases, tags,
+    release date) are read defensively, so one incomplete hit does not make
+    the whole page fail.  Hits without an access URL are skipped because the
+    URL is used as the link of the result.
+    """
+    results = []
+
+    for result in resp.json()["results"]:
+        biblio = result['biblio']
+
+        # NOTE(review): assumes 'fulltext' may be missing or null in a hit -- confirm against the API schema
+        fulltext = result.get('fulltext') or {}
+        access_url = fulltext.get('access_url')
+        if not access_url:
+            continue
+
+        published_date = None
+        release_date = biblio.get('release_date')
+        if release_date:
+            try:
+                published_date = datetime.strptime(release_date, "%Y-%m-%d")
+            except ValueError:
+                # a date in another format only costs the date, not the result
+                published_date = None
+
+        # prefer the first abstract, fall back to the first search highlight
+        abstracts = result.get('abstracts') or []
+        highlights = result.get('_highlights') or []
+        if abstracts:
+            content = abstracts[0].get('body')
+        elif highlights:
+            content = highlights[0]
+        else:
+            content = ''
+
+        releases = result.get('releases') or []
+        doi = releases[0].get('doi') if releases else None
+
+        results.append(
+            {
+                'template': 'paper.html',
+                'url': access_url,
+                'title': biblio['title'],
+                # html_to_text() is handed a string even when the abstract has no body
+                'content': html_to_text(content or ''),
+                'publisher': biblio.get('publisher'),
+                'doi': doi,
+                'journal': biblio.get('container_name'),
+                'authors': biblio.get('contrib_names'),
+                'tags': result.get('tags', []),
+                'publishedDate': published_date,
+                'issns': biblio.get('issns'),
+                'pdf_url': access_url,
+            }
+        )
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index 77a6a72f9..bd2cc1bfb 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -1348,6 +1348,11 @@ engines:
     api_site: 'askubuntu'
     categories: [it, q&a]
 
+  - name: internetarchivescholar
+    engine: internet_archive_scholar
+    shortcut: ias
+    timeout: 5.0
+
   - name: superuser
     engine: stackexchange
     shortcut: su