mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	Merge [feat] engine: implementation of Anna's Archive
Anna's Archive [1] is a free non-profit online shadow library metasearch engine providing access to a variety of book resources (also via IPFS), created by a team of anonymous archivists [2]. [1] https://annas-archive.org/ [2] https://annas-software.org/AnnaArchivist/annas-archive
This commit is contained in:
		
						commit
						749b04ac1a
					
				
					 4 changed files with 346 additions and 1 deletions
				
			
		
							
								
								
									
										2
									
								
								docs/src/searx.engines.annas_archive.rst
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								docs/src/searx.engines.annas_archive.rst
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,2 @@ | |||
| .. automodule:: searx.engines.annas_archive | ||||
|    :members: | ||||
|  | @ -1,4 +1,134 @@ | |||
| { | ||||
|   "annas archive": { | ||||
|     "all_locale": "", | ||||
|     "custom": { | ||||
|       "content": [ | ||||
|         "", | ||||
|         "journal_article", | ||||
|         "book_any", | ||||
|         "book_fiction", | ||||
|         "book_unknown", | ||||
|         "book_nonfiction", | ||||
|         "book_comic", | ||||
|         "magazine", | ||||
|         "standards_document" | ||||
|       ], | ||||
|       "ext": [ | ||||
|         "", | ||||
|         "pdf", | ||||
|         "epub", | ||||
|         "cbr", | ||||
|         "fb2", | ||||
|         "mobi", | ||||
|         "cbz", | ||||
|         "djvu", | ||||
|         "azw3", | ||||
|         "fb2.zip", | ||||
|         "txt", | ||||
|         "rar", | ||||
|         "zip", | ||||
|         "doc", | ||||
|         "lit", | ||||
|         "rtf", | ||||
|         "htm", | ||||
|         "html", | ||||
|         "lrf", | ||||
|         "mht", | ||||
|         "docx" | ||||
|       ], | ||||
|       "sort": [ | ||||
|         "", | ||||
|         "newest", | ||||
|         "oldest", | ||||
|         "largest", | ||||
|         "smallest" | ||||
|       ] | ||||
|     }, | ||||
|     "data_type": "traits_v1", | ||||
|     "languages": { | ||||
|       "af": "af", | ||||
|       "ar": "ar", | ||||
|       "az": "az", | ||||
|       "be": "be", | ||||
|       "bg": "bg", | ||||
|       "bn": "bn", | ||||
|       "bo": "bo", | ||||
|       "bs": "bs", | ||||
|       "ca": "ca", | ||||
|       "cs": "cs", | ||||
|       "da": "da", | ||||
|       "de": "de", | ||||
|       "el": "el", | ||||
|       "en": "en", | ||||
|       "eo": "eo", | ||||
|       "es": "es", | ||||
|       "et": "et", | ||||
|       "eu": "eu", | ||||
|       "fa": "fa", | ||||
|       "fi": "fi", | ||||
|       "fil": "tl", | ||||
|       "fr": "fr", | ||||
|       "gl": "gl", | ||||
|       "gu": "gu", | ||||
|       "he": "he", | ||||
|       "hi": "hi", | ||||
|       "hr": "hr", | ||||
|       "hu": "hu", | ||||
|       "hy": "hy", | ||||
|       "id": "id", | ||||
|       "is": "is", | ||||
|       "it": "it", | ||||
|       "ja": "ja", | ||||
|       "ka": "ka", | ||||
|       "kk": "kk", | ||||
|       "kn": "kn", | ||||
|       "ko": "ko", | ||||
|       "ku": "ku", | ||||
|       "ky": "ky", | ||||
|       "lo": "lo", | ||||
|       "lt": "lt", | ||||
|       "lv": "lv", | ||||
|       "mk": "mk", | ||||
|       "ml": "ml", | ||||
|       "mn": "mn", | ||||
|       "mr": "mr", | ||||
|       "ms": "ms", | ||||
|       "my": "my", | ||||
|       "nb": "nb", | ||||
|       "ne": "ne", | ||||
|       "nl": "nl", | ||||
|       "no": "no", | ||||
|       "pa": "pa", | ||||
|       "pl": "pl", | ||||
|       "ps": "ps", | ||||
|       "pt": "pt", | ||||
|       "ro": "ro", | ||||
|       "ru": "ru", | ||||
|       "sa": "sa", | ||||
|       "sd": "sd", | ||||
|       "si": "si", | ||||
|       "sk": "sk", | ||||
|       "sl": "sl", | ||||
|       "so": "so", | ||||
|       "sq": "sq", | ||||
|       "sr": "sr", | ||||
|       "sv": "sv", | ||||
|       "sw": "sw", | ||||
|       "ta": "ta", | ||||
|       "te": "te", | ||||
|       "tg": "tg", | ||||
|       "tr": "tr", | ||||
|       "tt": "tt", | ||||
|       "ug": "ug", | ||||
|       "uk": "uk", | ||||
|       "ur": "ur", | ||||
|       "uz": "uz", | ||||
|       "vi": "vi", | ||||
|       "yi": "yi", | ||||
|       "zh": "zh" | ||||
|     }, | ||||
|     "regions": {} | ||||
|   }, | ||||
|   "arch linux wiki": { | ||||
|     "all_locale": null, | ||||
|     "custom": { | ||||
|  | @ -4127,4 +4257,4 @@ | |||
|     }, | ||||
|     "regions": {} | ||||
|   } | ||||
| } | ||||
| } | ||||
|  |  | |||
							
								
								
									
										200
									
								
								searx/engines/annas_archive.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										200
									
								
								searx/engines/annas_archive.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,200 @@ | |||
| # SPDX-License-Identifier: AGPL-3.0-or-later | ||||
| # lint: pylint | ||||
| """.. _annas_archive engine: | ||||
| 
 | ||||
| ============== | ||||
| Anna's Archive | ||||
| ============== | ||||
| 
 | ||||
| .. _Anna's Archive: https://annas-archive.org/ | ||||
| .. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive | ||||
| 
 | ||||
| `Anna's Archive`_ is a free non-profit online shadow library metasearch engine | ||||
| providing access to a variety of book resources (also via IPFS), created by a | ||||
| team of anonymous archivists (AnnaArchivist_). | ||||
| 
 | ||||
| .. contents:: Contents | ||||
|    :depth: 2 | ||||
|    :local: | ||||
|    :backlinks: entry | ||||
| 
 | ||||
| 
 | ||||
| Configuration | ||||
| ============= | ||||
| 
 | ||||
| The engine has the following additional settings: | ||||
| 
 | ||||
| - :py:obj:`aa_content` | ||||
| - :py:obj:`aa_ext` | ||||
| - :py:obj:`aa_sort` | ||||
| 
 | ||||
| With this options a SearXNG maintainer is able to configure **additional** | ||||
| engines for specific searches in Anna's Archive.  For example a engine to search | ||||
| for *newest* articles and journals (PDF) / by shortcut ``!aaa <search-term>``. | ||||
| 
 | ||||
| .. code:: yaml | ||||
| 
 | ||||
|   - name: annas articles | ||||
|     engine: annas_archive | ||||
|     shortcut: aaa | ||||
|     aa_content: 'journal_article' | ||||
|     aa_ext: 'pdf' | ||||
|     aa_sort: 'newest' | ||||
| 
 | ||||
| 
 | ||||
| Implementations | ||||
| =============== | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| from typing import List, Dict, Any, Optional | ||||
| from urllib.parse import quote | ||||
| from lxml import html | ||||
| 
 | ||||
| from searx.utils import extract_text, eval_xpath, eval_xpath_list | ||||
| from searx.enginelib.traits import EngineTraits | ||||
| from searx.data import ENGINE_TRAITS | ||||
| 
 | ||||
| # about | ||||
| about: Dict[str, Any] = { | ||||
|     "website": "https://annas-archive.org/", | ||||
|     "wikidata_id": "Q115288326", | ||||
|     "official_api_documentation": None, | ||||
|     "use_official_api": False, | ||||
|     "require_api_key": False, | ||||
|     "results": "HTML", | ||||
| } | ||||
| 
 | ||||
| # engine dependent config | ||||
| categories: List[str] = ["files"] | ||||
| paging: bool = False | ||||
| 
 | ||||
| # search-url | ||||
| base_url: str = "https://annas-archive.org" | ||||
| aa_content: str = "" | ||||
| """Anan's search form field **Content** / possible values:: | ||||
| 
 | ||||
|     journal_article, book_any, book_fiction, book_unknown, book_nonfiction, | ||||
|     book_comic, magazine, standards_document | ||||
| 
 | ||||
| To not filter use an empty string (default). | ||||
| """ | ||||
| aa_sort: str = '' | ||||
| """Sort Anna's results, possible values:: | ||||
| 
 | ||||
|     newest, oldest, largest, smallest | ||||
| 
 | ||||
| To sort by *most relevant* use an empty string (default).""" | ||||
| 
 | ||||
| aa_ext: str = '' | ||||
| """Filter Anna's results by a file ending.  Common filters for example are | ||||
| ``pdf`` and ``epub``. | ||||
| 
 | ||||
| .. note:: | ||||
| 
 | ||||
|    Anna's Archive is a beta release: Filter results by file extension does not | ||||
|    really work on Anna's Archive. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| def init(engine_settings=None):  # pylint: disable=unused-argument | ||||
|     """Check of engine's settings.""" | ||||
|     traits = EngineTraits(**ENGINE_TRAITS['annas archive']) | ||||
| 
 | ||||
|     if aa_content and aa_content not in traits.custom['content']: | ||||
|         raise ValueError(f'invalid setting content: {aa_content}') | ||||
| 
 | ||||
|     if aa_sort and aa_sort not in traits.custom['sort']: | ||||
|         raise ValueError(f'invalid setting sort: {aa_sort}') | ||||
| 
 | ||||
|     if aa_ext and aa_ext not in traits.custom['ext']: | ||||
|         raise ValueError(f'invalid setting ext: {aa_ext}') | ||||
| 
 | ||||
| 
 | ||||
| def request(query, params: Dict[str, Any]) -> Dict[str, Any]: | ||||
|     q = quote(query) | ||||
|     lang = traits.get_language(params["language"], traits.all_locale)  # type: ignore | ||||
|     params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}" | ||||
|     return params | ||||
| 
 | ||||
| 
 | ||||
| def response(resp) -> List[Dict[str, Optional[str]]]: | ||||
|     results: List[Dict[str, Optional[str]]] = [] | ||||
|     dom = html.fromstring(resp.text) | ||||
| 
 | ||||
|     for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'): | ||||
|         results.append(_get_result(item)) | ||||
| 
 | ||||
|     # The rendering of the WEB page is very strange; except the first position | ||||
|     # all other positions of Anna's result page are enclosed in SGML comments. | ||||
|     # These comments are *uncommented* by some JS code, see query of class | ||||
|     # '.js-scroll-hidden' in Anna's HTML template: | ||||
|     #   https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html | ||||
| 
 | ||||
|     for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'): | ||||
|         item = html.fromstring(item.xpath('./comment()')[0].text) | ||||
|         results.append(_get_result(item)) | ||||
| 
 | ||||
|     return results | ||||
| 
 | ||||
| 
 | ||||
| def _get_result(item): | ||||
|     return { | ||||
|         'template': 'paper.html', | ||||
|         'url': base_url + item.xpath('./@href')[0], | ||||
|         'title': extract_text(eval_xpath(item, './/h3/text()[1]')), | ||||
|         'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')), | ||||
|         'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))], | ||||
|         'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')), | ||||
|         'img_src': item.xpath('.//img/@src')[0], | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| def fetch_traits(engine_traits: EngineTraits): | ||||
|     """Fetch languages and other search arguments from Anna's search form.""" | ||||
|     # pylint: disable=import-outside-toplevel | ||||
| 
 | ||||
|     import babel | ||||
|     from searx.network import get  # see https://github.com/searxng/searxng/issues/762 | ||||
|     from searx.locales import language_tag | ||||
| 
 | ||||
|     engine_traits.all_locale = '' | ||||
|     engine_traits.custom['content'] = [] | ||||
|     engine_traits.custom['ext'] = [] | ||||
|     engine_traits.custom['sort'] = [] | ||||
| 
 | ||||
|     resp = get(base_url + '/search') | ||||
|     if not resp.ok:  # type: ignore | ||||
|         raise RuntimeError("Response from Anna's search page is not OK.") | ||||
|     dom = html.fromstring(resp.text)  # type: ignore | ||||
| 
 | ||||
|     # supported language codes | ||||
| 
 | ||||
|     lang_map = {} | ||||
|     for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"): | ||||
|         eng_lang = x.get("value") | ||||
|         if eng_lang in ('', '_empty', 'nl-BE', 'und'): | ||||
|             continue | ||||
|         try: | ||||
|             locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-') | ||||
|         except babel.UnknownLocaleError: | ||||
|             # silently ignore unknown languages | ||||
|             # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang)) | ||||
|             continue | ||||
|         sxng_lang = language_tag(locale) | ||||
|         conflict = engine_traits.languages.get(sxng_lang) | ||||
|         if conflict: | ||||
|             if conflict != eng_lang: | ||||
|                 print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) | ||||
|             continue | ||||
|         engine_traits.languages[sxng_lang] = eng_lang | ||||
| 
 | ||||
|     for x in eval_xpath_list(dom, "//form//select[@name='content']//option"): | ||||
|         engine_traits.custom['content'].append(x.get("value")) | ||||
| 
 | ||||
|     for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"): | ||||
|         engine_traits.custom['ext'].append(x.get("value")) | ||||
| 
 | ||||
|     for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"): | ||||
|         engine_traits.custom['sort'].append(x.get("value")) | ||||
|  | @ -297,6 +297,19 @@ engines: | |||
|     shortcut: 9g | ||||
|     disabled: true | ||||
| 
 | ||||
|   - name: annas archive | ||||
|     engine: annas_archive | ||||
|     disabled: true | ||||
|     shortcut: aa | ||||
| 
 | ||||
|   # - name: annas articles | ||||
|   #   engine: annas_archive | ||||
|   #   shortcut: aaa | ||||
|   #   # https://docs.searxng.org/src/searx.engines.annas_archive.html | ||||
|   #   aa_content: 'journal_article' # book_any .. magazine, standards_document | ||||
|   #   aa_ext: 'pdf'  # pdf, epub, .. | ||||
|   #   aa_sort: 'newest'  # newest, oldest, largest, smallest | ||||
| 
 | ||||
|   - name: apk mirror | ||||
|     engine: apkmirror | ||||
|     timeout: 4.0 | ||||
|  |  | |||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Markus Heiser
						Markus Heiser