Merge [feat] engine: implementation of Anna's Archive

Anna's Archive [1] is a free non-profit online shadow library metasearch engine
providing access to a variety of book resources (also via IPFS), created by a
team of anonymous archivists [2].

[1] https://annas-archive.org/
[2] https://annas-software.org/AnnaArchivist/annas-archive
This commit is contained in:
Markus Heiser 2023-06-29 13:56:19 +02:00 committed by GitHub
commit 749b04ac1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 346 additions and 1 deletions

View File

@ -0,0 +1,2 @@
.. automodule:: searx.engines.annas_archive
:members:

View File

@ -1,4 +1,134 @@
{
"annas archive": {
"all_locale": "",
"custom": {
"content": [
"",
"journal_article",
"book_any",
"book_fiction",
"book_unknown",
"book_nonfiction",
"book_comic",
"magazine",
"standards_document"
],
"ext": [
"",
"pdf",
"epub",
"cbr",
"fb2",
"mobi",
"cbz",
"djvu",
"azw3",
"fb2.zip",
"txt",
"rar",
"zip",
"doc",
"lit",
"rtf",
"htm",
"html",
"lrf",
"mht",
"docx"
],
"sort": [
"",
"newest",
"oldest",
"largest",
"smallest"
]
},
"data_type": "traits_v1",
"languages": {
"af": "af",
"ar": "ar",
"az": "az",
"be": "be",
"bg": "bg",
"bn": "bn",
"bo": "bo",
"bs": "bs",
"ca": "ca",
"cs": "cs",
"da": "da",
"de": "de",
"el": "el",
"en": "en",
"eo": "eo",
"es": "es",
"et": "et",
"eu": "eu",
"fa": "fa",
"fi": "fi",
"fil": "tl",
"fr": "fr",
"gl": "gl",
"gu": "gu",
"he": "he",
"hi": "hi",
"hr": "hr",
"hu": "hu",
"hy": "hy",
"id": "id",
"is": "is",
"it": "it",
"ja": "ja",
"ka": "ka",
"kk": "kk",
"kn": "kn",
"ko": "ko",
"ku": "ku",
"ky": "ky",
"lo": "lo",
"lt": "lt",
"lv": "lv",
"mk": "mk",
"ml": "ml",
"mn": "mn",
"mr": "mr",
"ms": "ms",
"my": "my",
"nb": "nb",
"ne": "ne",
"nl": "nl",
"no": "no",
"pa": "pa",
"pl": "pl",
"ps": "ps",
"pt": "pt",
"ro": "ro",
"ru": "ru",
"sa": "sa",
"sd": "sd",
"si": "si",
"sk": "sk",
"sl": "sl",
"so": "so",
"sq": "sq",
"sr": "sr",
"sv": "sv",
"sw": "sw",
"ta": "ta",
"te": "te",
"tg": "tg",
"tr": "tr",
"tt": "tt",
"ug": "ug",
"uk": "uk",
"ur": "ur",
"uz": "uz",
"vi": "vi",
"yi": "yi",
"zh": "zh"
},
"regions": {}
},
"arch linux wiki": {
"all_locale": null,
"custom": {
@ -4127,4 +4257,4 @@
},
"regions": {}
}
}
}

View File

@ -0,0 +1,200 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""".. _annas_archive engine:
==============
Anna's Archive
==============
.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive
`Anna's Archive`_ is a free non-profit online shadow library metasearch engine
providing access to a variety of book resources (also via IPFS), created by a
team of anonymous archivists (AnnaArchivist_).
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
Configuration
=============
The engine has the following additional settings:
- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`
With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive.  For example, an engine to search
for the *newest* articles and journals (PDF) / by shortcut ``!aaa <search-term>``.
.. code:: yaml
- name: annas articles
engine: annas_archive
shortcut: aaa
aa_content: 'journal_article'
aa_ext: 'pdf'
aa_sort: 'newest'
Implementations
===============
"""
from typing import List, Dict, Any, Optional
from urllib.parse import quote
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
# engine metadata
about: Dict[str, Any] = {
    "website": "https://annas-archive.org/",
    "wikidata_id": "Q115288326",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# engine dependent config
categories: List[str] = ["files"]
paging: bool = False

# search-url
base_url: str = "https://annas-archive.org"

aa_content: str = ""
"""Anna's search form field **Content** / possible values::

    journal_article, book_any, book_fiction, book_unknown, book_nonfiction,
    book_comic, magazine, standards_document

Use an empty string (default) to disable this filter.
"""

aa_sort: str = ""
"""Sort order of Anna's results / possible values::

    newest, oldest, largest, smallest

Use an empty string (default) to sort by *most relevant*.
"""

aa_ext: str = ""
"""Filter Anna's results by file extension.  Common filters are, for example,
``pdf`` and ``epub``.

.. note::

   Anna's Archive is a beta release: filtering results by file extension does
   not really work on Anna's Archive.
"""
def init(engine_settings=None):  # pylint: disable=unused-argument
    """Validate the engine's ``aa_*`` settings against the stored traits.

    The allowed values are read from the ``custom`` section of the
    ``'annas archive'`` entry in :py:obj:`ENGINE_TRAITS`.  An empty setting
    means "no filter" and is always accepted.

    :raises ValueError: if ``aa_content``, ``aa_sort`` or ``aa_ext`` is set to
        a value that is not listed in the corresponding traits entry.
    """
    traits = EngineTraits(**ENGINE_TRAITS['annas archive'])

    # (configured value, key in traits.custom, error message) -- checked in
    # this order, the first invalid setting raises.
    checks = (
        (aa_content, 'content', f'invalid setting content: {aa_content}'),
        (aa_sort, 'sort', f'invalid setting sort: {aa_sort}'),
        (aa_ext, 'ext', f'invalid setting ext: {aa_ext}'),
    )
    for value, key, message in checks:
        if value and value not in traits.custom[key]:
            raise ValueError(message)
def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
    """Build the search URL for *query* and store it in ``params["url"]``.

    The language argument is resolved from ``params["language"]`` via
    ``traits.get_language``; the remaining URL arguments come from the module
    settings ``aa_content``, ``aa_ext`` and ``aa_sort``.

    :param query: the search term, URL-quoted before it is put into the URL.
    :param params: request parameters; must contain the key ``"language"``.
    :returns: the same *params* dict, with ``"url"`` set.
    """
    # NOTE(review): ``traits`` is not defined in the visible part of this
    # module -- presumably it is provided as a module global from elsewhere;
    # confirm.
    q = quote(query)
    lang = traits.get_language(params["language"], traits.all_locale)  # type: ignore

    search_path = f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}"
    params["url"] = base_url + search_path
    return params
def response(resp) -> List[Dict[str, Optional[str]]]:
    """Parse Anna's result page into a list of result dicts.

    :param resp: the HTTP response; only ``resp.text`` (the HTML) is read.
    :returns: one dict per result entry, built by :py:func:`_get_result`.
    """
    results: List[Dict[str, Optional[str]]] = []
    dom = html.fromstring(resp.text)

    # entries that are rendered as regular HTML
    for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
        results.append(_get_result(item))

    # The rendering of the WEB page is very strange; except the first position
    # all other positions of Anna's result page are enclosed in SGML comments.
    # These comments are *uncommented* by some JS code, see query of class
    # '.js-scroll-hidden' in Anna's HTML template:
    #   https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
    for hidden in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
        comments = hidden.xpath('./comment()')
        # Skip placeholders without a (non-empty) comment instead of failing
        # the whole response: indexing an empty list raises IndexError and
        # parsing an empty document raises a parser error.
        if not comments or not (comments[0].text or '').strip():
            continue
        results.append(_get_result(html.fromstring(comments[0].text)))

    return results
def _get_result(item):
    """Build one ``paper.html`` result dict from a result's ``<a>`` element.

    :param item: lxml element of a single result entry; its ``href`` attribute
        is appended to :py:obj:`base_url` to form the result URL.
    :returns: dict with the keys ``template``, ``url``, ``title``,
        ``publisher``, ``authors``, ``content`` and ``img_src``.  ``img_src``
        is ``None`` when the entry has no ``<img>`` element.
    :raises IndexError: if *item* has no ``href`` attribute.
    """
    # Not every entry comes with a cover image; don't let a missing <img>
    # raise and thereby discard all results of the response.
    img_sources = item.xpath('.//img/@src')
    img_src = img_sources[0] if img_sources else None

    return {
        'template': 'paper.html',
        'url': base_url + item.xpath('./@href')[0],
        'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
        'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
        'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
        'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
        'img_src': img_src,
    }
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and other search arguments from Anna's search form.

    Requests ``base_url + '/search'`` and reads the ``<option>`` values of the
    form's ``lang``, ``content``, ``ext`` and ``sort`` select boxes into
    ``engine_traits.languages`` and ``engine_traits.custom``.

    :raises RuntimeError: if the response of the search page is not OK.
    """
    # pylint: disable=import-outside-toplevel
    import babel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.locales import language_tag

    # custom trait name --> XPath of the matching <option> elements
    option_xpaths = {
        'content': "//form//select[@name='content']//option",
        'ext': "//form//select[@name='ext']//option",
        'sort': "//form//select[@name='sort']//option",
    }

    engine_traits.all_locale = ''
    for trait_name in option_xpaths:
        engine_traits.custom[trait_name] = []

    resp = get(base_url + '/search')
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Anna's search page is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    # supported language codes
    lang_map = {}  # engine code --> code known by babel (no mappings so far)
    ignored_codes = ('', '_empty', 'nl-BE', 'und')

    for option in eval_xpath_list(dom, "//form//select[@name='lang']//option"):
        eng_lang = option.get("value")
        if eng_lang in ignored_codes:
            continue

        try:
            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
        except babel.UnknownLocaleError:
            # languages unknown to babel are silently ignored
            continue

        sxng_lang = language_tag(locale)
        known = engine_traits.languages.get(sxng_lang)
        if not known:
            engine_traits.languages[sxng_lang] = eng_lang
        elif known != eng_lang:
            # first mapping wins, only report the clash
            print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, known, eng_lang))

    for trait_name, xpath in option_xpaths.items():
        options = eval_xpath_list(dom, xpath)
        engine_traits.custom[trait_name].extend(opt.get("value") for opt in options)

View File

@ -297,6 +297,19 @@ engines:
shortcut: 9g
disabled: true
- name: annas archive
engine: annas_archive
disabled: true
shortcut: aa
# - name: annas articles
# engine: annas_archive
# shortcut: aaa
# # https://docs.searxng.org/src/searx.engines.annas_archive.html
# aa_content: 'journal_article' # book_any .. magazine, standards_document
# aa_ext: 'pdf' # pdf, epub, ..
# aa_sort: 'newest' # newest, oldest, largest, smallest
- name: apk mirror
engine: apkmirror
timeout: 4.0