From 3ff2ad939daf5d60f2f8efd1219b33e2fe4572bc Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sat, 24 Sep 2022 11:54:12 +0200
Subject: [PATCH 1/3] [fix] ERROR searx.engines.core.ac.uk: list index out of range

Some result items from core.ac.uk do not have a URL::

    Traceback (most recent call last):
      File "searx/search/processors/online.py", line 154, in search
        search_results = self._search_basic(query, params)
      File "searx/search/processors/online.py", line 142, in _search_basic
        return self.engine.response(response)
      File "SearXNG/searx/engines/core.py", line 73, in response
        'url': source['urls'][0].replace('http://', 'https://', 1),

Signed-off-by: Markus Heiser
---
 searx/engines/core.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/searx/engines/core.py b/searx/engines/core.py
index 1fcb68f1f..96543308a 100644
--- a/searx/engines/core.py
+++ b/searx/engines/core.py
@@ -53,6 +53,9 @@ def response(resp):
     for result in json_data['data']:
 
         source = result['_source']
+        if not source['urls']:
+            continue
+
         time = source['publishedDate'] or source['depositedDate']
         if time:
             date = datetime.fromtimestamp(time / 1000)

From c76830d8a878a69924bfda54825c4bd09b6287db Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sat, 24 Sep 2022 13:17:01 +0200
Subject: [PATCH 2/3] [mod] core.ac.uk: use paper.html template

Signed-off-by: Markus Heiser
---
 searx/engines/core.py | 44 ++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/searx/engines/core.py b/searx/engines/core.py
index 96543308a..c95fa1d28 100644
--- a/searx/engines/core.py
+++ b/searx/engines/core.py
@@ -4,7 +4,6 @@
 
 """
 
-from json import loads
 from datetime import datetime
 from urllib.parse import urlencode
 
@@ -48,36 +47,47 @@ def request(query, params):
 
 
 def response(resp):
     results = []
-    json_data = loads(resp.text)
+    json_data = resp.json()
 
     for result in json_data['data']:
-
         source = result['_source']
         if not source['urls']:
             continue
 
         time = source['publishedDate'] or source['depositedDate']
         if time:
-            date = datetime.fromtimestamp(time / 1000)
-        else:
-            date = None
+            publishedDate = datetime.fromtimestamp(time / 1000)
 
-        metadata = []
-        if source['publisher'] and len(source['publisher']) > 3:
-            metadata.append(source['publisher'])
-        if source['topics']:
-            metadata.append(source['topics'][0])
-        if source['doi']:
-            metadata.append(source['doi'])
-        metadata = ' / '.join(metadata)
+        journals = []
+        if source['journals']:
+            for j in source['journals']:
+                journals.append(j['title'])
+
+        publisher = source['publisher']
+        if publisher:
+            publisher = source['publisher'].strip("'")
 
         results.append(
             {
-                'url': source['urls'][0].replace('http://', 'https://', 1),
+                'template': 'paper.html',
                 'title': source['title'],
+                'url': source['urls'][0].replace('http://', 'https://', 1),
                 'content': source['description'],
-                'publishedDate': date,
-                'metadata': metadata,
+                # 'comments': '',
+                'tags': source['topics'],
+                'publishedDate': publishedDate,
+                'type': (source['types'] or [None])[0],
+                'authors': source['authors'],
+                'editor': ', '.join(source['contributors'] or []),
+                'publisher': publisher,
+                'journal': ', '.join(journals),
+                # 'volume': '',
+                # 'pages' : '',
+                # 'number': '',
+                'doi': source['doi'],
+                'issn': source['issn'],
+                'isbn': source.get('isbn'),  # exists in the rawRecordXml
+                'pdf_url': source.get('repositoryDocument', {}).get('pdfOrigin'),
            }
        )
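Review note on PATCH 1/3 and 2/3: the added guard skips CORE records whose
'urls' list is empty, and the paper.html mapping reads the '_source' fields
directly. One caveat in PATCH 2/3: the 'else: date = None' branch is dropped,
so 'publishedDate' can be unbound when neither date is set (the first such
record raises UnboundLocalError). A minimal, runnable sketch of the mapping
with that case handled; the 'build_paper_result' helper and the 'sample'
record are hypothetical, only the '_source' keys come from the patches::

    from datetime import datetime

    def build_paper_result(source):
        """Map a CORE-like '_source' dict onto paper.html fields (sketch)."""
        if not source['urls']:  # PATCH 1/3: some records carry no URL at all
            return None
        time = source['publishedDate'] or source['depositedDate']
        # CORE timestamps are milliseconds; guard against both dates missing
        published = datetime.fromtimestamp(time / 1000) if time else None
        journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
        return {
            'template': 'paper.html',
            'title': source['title'],
            'url': source['urls'][0].replace('http://', 'https://', 1),
            'publishedDate': published,
            'journal': ', '.join(journals),
        }

    sample = {
        'urls': ['http://example.org/paper'],
        'publishedDate': 1663977600000,  # milliseconds since the epoch
        'depositedDate': None,
        'title': 'A paper',
        'journals': [{'title': 'A journal'}, {'title': None}],
    }
    print(build_paper_result(sample))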
From 16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4 Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 24 Sep 2022 14:26:07 +0200
Subject: [PATCH 3/3] [mod] core.ac.uk: try multiple ways to get url

If the url is not found, use, in this order:

* the DOI
* the downloadUrl
* the ARK id
---
 searx/engines/core.py | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/searx/engines/core.py b/searx/engines/core.py
index c95fa1d28..a997343f2 100644
--- a/searx/engines/core.py
+++ b/searx/engines/core.py
@@ -41,7 +41,6 @@ def request(query, params):
     )
 
     params['url'] = base_url + search_path
-    logger.debug("query_url --> %s", params['url'])
 
     return params
 
@@ -51,17 +50,39 @@ def response(resp):
     for result in json_data['data']:
         source = result['_source']
-        if not source['urls']:
+        url = None
+        if source.get('urls'):
+            url = source['urls'][0].replace('http://', 'https://', 1)
+
+        if url is None and source.get('doi'):
+            # use the DOI reference
+            url = 'https://doi.org/' + source['doi']
+
+        if url is None and source.get('downloadUrl'):
+            # use the downloadUrl
+            url = source['downloadUrl']
+
+        if url is None and source.get('identifiers'):
+            # try to find an ark id, see
+            # https://www.wikidata.org/wiki/Property:P8091
+            # and https://en.wikipedia.org/wiki/Archival_Resource_Key
+            arkids = [
+                identifier[5:]  # 5 is the length of "ark:/"
+                for identifier in source.get('identifiers')
+                if isinstance(identifier, str) and identifier.startswith('ark:/')
+            ]
+            if len(arkids) > 0:
+                url = 'https://n2t.net/' + arkids[0]
+
+        if url is None:
             continue
 
         time = source['publishedDate'] or source['depositedDate']
         if time:
             publishedDate = datetime.fromtimestamp(time / 1000)
 
-        journals = []
-        if source['journals']:
-            for j in source['journals']:
-                journals.append(j['title'])
+        # sometimes the 'title' is None / filter None values
+        journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
 
         publisher = source['publisher']
         if publisher:
             publisher = source['publisher'].strip("'")
@@ -71,8 +92,8 @@
             {
                 'template': 'paper.html',
                 'title': source['title'],
-                'url': source['urls'][0].replace('http://', 'https://', 1),
-                'content': source['description'],
+                'url': url,
+                'content': source['description'] or '',
                 # 'comments': '',
                 'tags': source['topics'],
                 'publishedDate': publishedDate,
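Review note on PATCH 3/3: the url resolution is a first-match fallback chain
(explicit URL, then DOI, then downloadUrl, then ARK identifier via n2t.net).
The same chain as a standalone, runnable sketch; the 'resolve_paper_url' name
is hypothetical, while the key names and resolver URLs follow the patch::

    def resolve_paper_url(source):
        """Return the best URL for a CORE-like '_source' dict, or None."""
        if source.get('urls'):
            # prefer the explicit URL list, upgraded to https
            return source['urls'][0].replace('http://', 'https://', 1)
        if source.get('doi'):
            # fall back to the DOI resolver
            return 'https://doi.org/' + source['doi']
        if source.get('downloadUrl'):
            return source['downloadUrl']
        # last resort: an Archival Resource Key, resolvable via n2t.net
        for identifier in source.get('identifiers') or []:
            if isinstance(identifier, str) and identifier.startswith('ark:/'):
                return 'https://n2t.net/' + identifier[len('ark:/'):]
        return None

    assert resolve_paper_url({'doi': '10.1000/182'}) == 'https://doi.org/10.1000/182'
    assert resolve_paper_url({'identifiers': ['ark:/12345/x6']}) == 'https://n2t.net/12345/x6'
    assert resolve_paper_url({}) is None

The early returns replace the patch's repeated 'url is None and ...' tests;
for the same input the two forms pick the same URL.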