[feat] engine: re-enables z-library (zlibrary-global.se)

- re-enables z-library, as the new domain zlibrary-global.se is now available from the open web. The announcement of the domain: https://www.reddit.com/r/zlibrary/comments/13whe08/mod_note_zlibraryglobalse_domain_is_officially/ It is an official domain; logging in to the "personal" subdomain is required only to download files, but the search works. - changes the result template of zlibrary to paper.html, filling the appropriate fields - implements language filtering for zlibrary - implements zlibrary custom filters (engine traits) - refactors and documents the zlibrary engine
2024-01-01 19:24:07 +01:00 · 2023-06-25 18:32:15 +02:00 · 2023-06-25 18:32:15 +02:00 · cada89ee36
commit cada89ee36
parent cb92767f19
5 changed files with 808 additions and 78 deletions
--- a/docs/dev/engines/online/zlibrary.rst
+++ b/docs/dev/engines/online/zlibrary.rst
@ -0,0 +1,13 @@
 .. _zlibrary engine:
 =========
 Z-Library
 =========
 .. contents:: Contents
   :depth: 2
   :local:
   :backlinks: entry
 .. automodule:: searx.engines.zlibrary
  :members:
--- a/searx/data/engine_traits.json
+++ b/searx/data/engine_traits.json
@ -4256,5 +4256,602 @@
      "zh_Hant": "zh_cht"
    },
    "regions": {}
  },
  "z-library": {
    "all_locale": "",
    "custom": {
      "ext": [
        "",
        "TXT",
        "PDF",
        "FB2",
        "EPUB",
        "LIT",
        "MOBI",
        "RTF",
        "DJV",
        "DJVU",
        "AZW",
        "AZW3"
      ],
      "year_from": [
        "",
        "2023",
        "2022",
        "2021",
        "2020",
        "2019",
        "2018",
        "2017",
        "2016",
        "2015",
        "2014",
        "2013",
        "2012",
        "2011",
        "2010",
        "2009",
        "2008",
        "2007",
        "2006",
        "2005",
        "2004",
        "2003",
        "2002",
        "2001",
        "2000",
        "1999",
        "1998",
        "1997",
        "1996",
        "1995",
        "1994",
        "1993",
        "1992",
        "1991",
        "1990",
        "1989",
        "1988",
        "1987",
        "1986",
        "1985",
        "1984",
        "1983",
        "1982",
        "1981",
        "1980",
        "1979",
        "1978",
        "1977",
        "1976",
        "1975",
        "1974",
        "1973",
        "1972",
        "1971",
        "1970",
        "1969",
        "1968",
        "1967",
        "1966",
        "1965",
        "1964",
        "1963",
        "1962",
        "1961",
        "1960",
        "1959",
        "1958",
        "1957",
        "1956",
        "1955",
        "1954",
        "1953",
        "1952",
        "1951",
        "1950",
        "1949",
        "1948",
        "1947",
        "1946",
        "1945",
        "1944",
        "1943",
        "1942",
        "1941",
        "1940",
        "1939",
        "1938",
        "1937",
        "1936",
        "1935",
        "1934",
        "1933",
        "1932",
        "1931",
        "1930",
        "1929",
        "1928",
        "1927",
        "1926",
        "1925",
        "1924",
        "1923",
        "1922",
        "1921",
        "1920",
        "1919",
        "1918",
        "1917",
        "1916",
        "1915",
        "1914",
        "1913",
        "1912",
        "1911",
        "1910",
        "1909",
        "1908",
        "1907",
        "1906",
        "1905",
        "1904",
        "1903",
        "1902",
        "1901",
        "1900",
        "1899",
        "1898",
        "1897",
        "1896",
        "1895",
        "1894",
        "1893",
        "1892",
        "1891",
        "1890",
        "1889",
        "1888",
        "1887",
        "1886",
        "1885",
        "1884",
        "1883",
        "1882",
        "1881",
        "1880",
        "1879",
        "1878",
        "1877",
        "1876",
        "1875",
        "1874",
        "1873",
        "1872",
        "1871",
        "1870",
        "1869",
        "1868",
        "1867",
        "1866",
        "1865",
        "1864",
        "1863",
        "1862",
        "1861",
        "1860",
        "1859",
        "1858",
        "1857",
        "1856",
        "1855",
        "1854",
        "1853",
        "1852",
        "1851",
        "1850",
        "1849",
        "1848",
        "1847",
        "1846",
        "1845",
        "1844",
        "1843",
        "1842",
        "1841",
        "1840",
        "1839",
        "1838",
        "1837",
        "1836",
        "1835",
        "1834",
        "1833",
        "1832",
        "1831",
        "1830",
        "1829",
        "1828",
        "1827",
        "1826",
        "1825",
        "1824",
        "1823",
        "1822",
        "1821",
        "1820",
        "1819",
        "1818",
        "1817",
        "1816",
        "1815",
        "1814",
        "1813",
        "1812",
        "1811",
        "1810",
        "1809",
        "1808",
        "1807",
        "1806",
        "1805",
        "1804",
        "1803",
        "1802",
        "1801",
        "1800"
      ],
      "year_to": [
        "",
        "2023",
        "2022",
        "2021",
        "2020",
        "2019",
        "2018",
        "2017",
        "2016",
        "2015",
        "2014",
        "2013",
        "2012",
        "2011",
        "2010",
        "2009",
        "2008",
        "2007",
        "2006",
        "2005",
        "2004",
        "2003",
        "2002",
        "2001",
        "2000",
        "1999",
        "1998",
        "1997",
        "1996",
        "1995",
        "1994",
        "1993",
        "1992",
        "1991",
        "1990",
        "1989",
        "1988",
        "1987",
        "1986",
        "1985",
        "1984",
        "1983",
        "1982",
        "1981",
        "1980",
        "1979",
        "1978",
        "1977",
        "1976",
        "1975",
        "1974",
        "1973",
        "1972",
        "1971",
        "1970",
        "1969",
        "1968",
        "1967",
        "1966",
        "1965",
        "1964",
        "1963",
        "1962",
        "1961",
        "1960",
        "1959",
        "1958",
        "1957",
        "1956",
        "1955",
        "1954",
        "1953",
        "1952",
        "1951",
        "1950",
        "1949",
        "1948",
        "1947",
        "1946",
        "1945",
        "1944",
        "1943",
        "1942",
        "1941",
        "1940",
        "1939",
        "1938",
        "1937",
        "1936",
        "1935",
        "1934",
        "1933",
        "1932",
        "1931",
        "1930",
        "1929",
        "1928",
        "1927",
        "1926",
        "1925",
        "1924",
        "1923",
        "1922",
        "1921",
        "1920",
        "1919",
        "1918",
        "1917",
        "1916",
        "1915",
        "1914",
        "1913",
        "1912",
        "1911",
        "1910",
        "1909",
        "1908",
        "1907",
        "1906",
        "1905",
        "1904",
        "1903",
        "1902",
        "1901",
        "1900",
        "1899",
        "1898",
        "1897",
        "1896",
        "1895",
        "1894",
        "1893",
        "1892",
        "1891",
        "1890",
        "1889",
        "1888",
        "1887",
        "1886",
        "1885",
        "1884",
        "1883",
        "1882",
        "1881",
        "1880",
        "1879",
        "1878",
        "1877",
        "1876",
        "1875",
        "1874",
        "1873",
        "1872",
        "1871",
        "1870",
        "1869",
        "1868",
        "1867",
        "1866",
        "1865",
        "1864",
        "1863",
        "1862",
        "1861",
        "1860",
        "1859",
        "1858",
        "1857",
        "1856",
        "1855",
        "1854",
        "1853",
        "1852",
        "1851",
        "1850",
        "1849",
        "1848",
        "1847",
        "1846",
        "1845",
        "1844",
        "1843",
        "1842",
        "1841",
        "1840",
        "1839",
        "1838",
        "1837",
        "1836",
        "1835",
        "1834",
        "1833",
        "1832",
        "1831",
        "1830",
        "1829",
        "1828",
        "1827",
        "1826",
        "1825",
        "1824",
        "1823",
        "1822",
        "1821",
        "1820",
        "1819",
        "1818",
        "1817",
        "1816",
        "1815",
        "1814",
        "1813",
        "1812",
        "1811",
        "1810",
        "1809",
        "1808",
        "1807",
        "1806",
        "1805",
        "1804",
        "1803",
        "1802",
        "1801",
        "1800"
      ]
    },
    "data_type": "traits_v1",
    "languages": {
      "af": "afrikaans",
      "ak": "akan",
      "am": "amharic",
      "ar": "arabic",
      "as": "assamese",
      "az": "azerbaijani",
      "be": "belarusian",
      "bg": "bulgarian",
      "bm": "bambara",
      "bo": "tibetan",
      "br": "breton",
      "bs": "bosnian",
      "ca": "catalan",
      "ce": "chechen",
      "cs": "czech",
      "cv": "chuvash",
      "cy": "welsh",
      "da": "danish",
      "de": "german",
      "dz": "dzongkha",
      "ee": "ewe",
      "el": "greek",
      "en": "english",
      "eo": "esperanto",
      "es": "spanish",
      "et": "estonian",
      "eu": "basque",
      "fa": "persian",
      "fi": "finnish",
      "fo": "faroese",
      "fr": "french",
      "ga": "irish",
      "gl": "galician",
      "gu": "gujarati",
      "gv": "manx",
      "ha": "hausa",
      "he": "hebrew",
      "hi": "hindi",
      "hr": "croatian",
      "hu": "hungarian",
      "hy": "armenian",
      "ia": "interlingua",
      "id": "indonesian",
      "ig": "igbo",
      "is": "icelandic",
      "it": "italian",
      "ja": "japanese",
      "jv": "javanese",
      "ka": "georgian",
      "ki": "kikuyu",
      "kk": "kazakh",
      "kl": "kalaallisut",
      "kn": "kannada",
      "ko": "korean",
      "ks": "kashmiri",
      "ku": "kurdish",
      "kw": "cornish",
      "ky": "kyrgyz",
      "lb": "luxembourgish",
      "lg": "ganda",
      "ln": "lingala",
      "lo": "lao",
      "lt": "lithuanian",
      "lu": "luba-katanga",
      "lv": "latvian",
      "mg": "malagasy",
      "mk": "macedonian",
      "ml": "malayalam",
      "mn": "mongolian",
      "mr": "marathi",
      "mt": "maltese",
      "my": "burmese",
      "ne": "nepali",
      "nl": "dutch",
      "no": "norwegian",
      "oc": "occitan",
      "om": "oromo",
      "or": "odia",
      "pa": "punjabi",
      "pl": "polish",
      "ps": "pashto",
      "pt": "portuguese",
      "qu": "quechua",
      "rm": "romansh",
      "rn": "rundi",
      "ro": "romanian",
      "ru": "russian",
      "rw": "kinyarwanda",
      "sa": "sanskrit",
      "sc": "sardinian",
      "sd": "sindhi",
      "sg": "sango",
      "si": "sinhala",
      "sk": "slovak",
      "sl": "slovenian",
      "sn": "shona",
      "so": "somali",
      "sq": "albanian",
      "sr": "serbian",
      "su": "sundanese",
      "sv": "swedish",
      "sw": "swahili",
      "ta": "tamil",
      "te": "telugu",
      "tg": "tajik",
      "th": "thai",
      "ti": "tigrinya",
      "tk": "turkmen",
      "tr": "turkish",
      "tt": "tatar",
      "uk": "ukrainian",
      "ur": "urdu",
      "uz": "uzbek",
      "vi": "vietnamese",
      "wo": "wolof",
      "xh": "xhosa",
      "yi": "yiddish",
      "yo": "yoruba",
      "zh": "chinese",
      "zu": "zulu"
    },
    "regions": {}
  }
 }
--- a/searx/engines/zlibrary.py
+++ b/searx/engines/zlibrary.py
@ -1,94 +1,221 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Z-Library
+"""`Z-Library`_ (abbreviated as z-lib, formerly BookFinder) is a shadow library
 project for file-sharing access to scholarly journal articles, academic texts
 and general-interest books.  It began as a mirror of Library Genesis, from which
 most of its books originate.
-Z-Library uses regional domains (see https://z-lib.org). Known ``base_url:``
+.. _Z-Library: https://zlibrary-global.se/
- base_url: https://b-ok.cc
+Configuration
- base_url: https://de1lib.org
+=============
- base_url: https://booksc.eu does not have cover preview
+
- base_url: https://booksc.org does not have cover preview
+The engine has the following additional settings:
 - :py:obj:`zlib_year_from`
 - :py:obj:`zlib_year_to`
 - :py:obj:`zlib_ext`
 With these options a SearXNG maintainer is able to configure **additional**
 engines for specific searches in Z-Library.  For example, an engine to search
 only for EPUB from 2010 to 2020.
 .. code:: yaml
   - name: z-library 2010s epub
     engine: zlibrary
     shortcut: zlib2010s
     zlib_year_from: '2010'
     zlib_year_to: '2020'
     zlib_ext: 'EPUB'
 Implementations
 ===============
 """
-
+from __future__ import annotations
 from typing import TYPE_CHECKING
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 from urllib.parse import quote
 from lxml import html
 from flask_babel import gettext
-from searx.utils import extract_text, eval_xpath
+from searx.utils import extract_text, eval_xpath, eval_xpath_list
-from searx.network import get as http_get
+from searx.enginelib.traits import EngineTraits
 from searx.data import ENGINE_TRAITS
 if TYPE_CHECKING:
    import httpx
    import logging
    logger: logging.Logger
 # about
-about = {
+about: Dict[str, Any] = {
-    "website": "https://z-lib.org",
+    "website": "https://zlibrary-global.se",
    "wikidata_id": "Q104863992",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
-    "results": 'HTML',
+    "results": "HTML",
 }
-categories = ['files']
+categories: List[str] = ["files"]
-paging = True
+paging: bool = True
-base_url = ''
+base_url: str = "https://zlibrary-global.se"
 zlib_year_from: str = ""
 """Filter z-library's results by year from. E.g. '2010'.
 """
 zlib_year_to: str = ""
 """Filter z-library's results by year to. E.g. '2010'.
 """
 zlib_ext: str = ""
 """Filter z-library's results by a file ending. Common filters for example are
 ``PDF`` and ``EPUB``.
 """
-def init(engine_settings=None):
+def init(engine_settings=None) -> None:  # pylint: disable=unused-argument
-    global base_url  # pylint: disable=global-statement
+    """Check of engine's settings."""
    traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"])
-    if "base_url" not in engine_settings:
+    if zlib_ext and zlib_ext not in traits.custom["ext"]:
-        resp = http_get('https://z-lib.org', timeout=5.0)
+        raise ValueError(f"invalid setting ext: {zlib_ext}")
-        if resp.ok:
+    if zlib_year_from and zlib_year_from not in traits.custom["year_from"]:
-            dom = html.fromstring(resp.text)
+        raise ValueError(f"invalid setting year_from: {zlib_year_from}")
-            base_url = extract_text(
+    if zlib_year_to and zlib_year_to not in traits.custom["year_to"]:
-                eval_xpath(dom, './/a[contains(@class, "domain-check-link") and @data-mode="books"]/@href')
+        raise ValueError(f"invalid setting year_to: {zlib_year_to}")
            )
    logger.debug("using base_url: %s" % base_url)
-def request(query, params):
+def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]:
-    search_url = base_url + '/s/{search_query}/?page={pageno}'
+    lang: str = traits.get_language(params["language"], traits.all_locale)  # type: ignore
-    params['url'] = search_url.format(search_query=quote(query), pageno=params['pageno'])
+    search_url: str = (
        base_url
        + "/s/{search_query}/?page={pageno}"
        + "&yearFrom={zlib_year_from}"
        + "&yearTo={zlib_year_to}"
        + "&languages[]={lang}"
        + "&extensions[]={zlib_ext}"
    )
    params["url"] = search_url.format(
        search_query=quote(query),
        pageno=params["pageno"],
        lang=lang,
        zlib_year_from=zlib_year_from,
        zlib_year_to=zlib_year_to,
        zlib_ext=zlib_ext,
    )
    return params
-def response(resp):
+def response(resp: httpx.Response) -> List[Dict[str, Any]]:
-    results = []
+    results: List[Dict[str, Any]] = []
    dom = html.fromstring(resp.text)
    for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'):
-        result = {}
+        results.append(_parse_result(item))
        result["url"] = base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0]
        result["title"] = extract_text(eval_xpath(item, './/*[@itemprop="name"]'))
        year = extract_text(
            eval_xpath(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]')
        )
        if year:
            year = '(%s) ' % year
        result[
            "content"
        ] = "{year}{authors}. {publisher}. Language: {language}. {file_type}. \
            Book rating: {book_rating}, book quality: {book_quality}".format(
            year=year,
            authors=extract_text(eval_xpath(item, './/div[@class="authors"]')),
            publisher=extract_text(eval_xpath(item, './/div[@title="Publisher"]')),
            file_type=extract_text(
                eval_xpath(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]')
            ),
            language=extract_text(
                eval_xpath(
                    item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]'
                )
            ),
            book_rating=extract_text(eval_xpath(item, './/span[contains(@class, "book-rating-interest-score")]')),
            book_quality=extract_text(eval_xpath(item, './/span[contains(@class, "book-rating-quality-score")]')),
        )
        result["img_src"] = extract_text(eval_xpath(item, './/img[contains(@class, "cover")]/@data-src'))
        results.append(result)
    return results
 def _text(item, selector: str) -> str | None:
    """Evaluate *selector* on *item* with ``eval_xpath`` and return whatever
    ``extract_text`` makes of the matches."""
    matches = eval_xpath(item, selector)
    return extract_text(matches)
 # Labels run through flask_babel's gettext once, at module level; _parse_result
 # uses them as the field names inside a result's "content" string.
 i18n_language = gettext("Language")
 i18n_book_rating = gettext("Book rating")
 i18n_file_quality = gettext("File quality")
 def _parse_result(item) -> Dict[str, Any]:
    """Build a ``paper.html`` result dict from one search-result element.

    :param item: element of a single result box; it is queried with the XPath
        expressions below.
    :returns: dict with ``template``, ``url``, ``title``, ``authors``,
        ``publisher``, ``type``, ``img_src`` and ``content``; ``publishedDate``
        is added only when the year text parses as a four-digit year.
    :raises IndexError: when the element contains no ``/book/`` link (the first
        match is taken unconditionally).

    Year and score texts that are not numeric are skipped instead of raising
    ``ValueError``, so one odd item does not abort parsing of the whole page.
    """
    author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]')
    result: Dict[str, Any] = {
        "template": "paper.html",
        "url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0],
        "title": _text(item, './/*[@itemprop="name"]'),
        "authors": [extract_text(author) for author in author_elements],
        "publisher": _text(item, './/a[@title="Publisher"]'),
        "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'),
        "img_src": _text(item, './/img[contains(@class, "cover")]/@data-src'),
    }
    year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]')
    if year:
        try:
            result["publishedDate"] = datetime.strptime(year, '%Y')
        except ValueError:
            # The year text is not a plain four-digit year: leave the date out
            # rather than fail the whole result.
            pass
    content = []
    language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]')
    if language:
        content.append(f"{i18n_language}: {language.capitalize()}")
    # Scores are shown only when they are numeric and different from zero.
    book_rating = _text(item, './/span[contains(@class, "book-rating-interest-score")]')
    if _is_nonzero_number(book_rating):
        content.append(f"{i18n_book_rating}: {book_rating}")
    file_quality = _text(item, './/span[contains(@class, "book-rating-quality-score")]')
    if _is_nonzero_number(file_quality):
        content.append(f"{i18n_file_quality}: {file_quality}")
    result["content"] = " | ".join(content)
    return result
 def _is_nonzero_number(text: str | None) -> bool:
    """Return True when *text* parses as a float that is not zero."""
    if not text:
        return False
    try:
        return bool(float(text))
    except ValueError:
        return False
 def fetch_traits(engine_traits: EngineTraits) -> None:
    """Fill *engine_traits* from the option lists of zlibrary's search form.

    The page at ``base_url`` is fetched and the ``<option>`` values of the
    year-from, year-to, extension and language selects are read.  Years and
    extensions are stored in ``engine_traits.custom``; languages are stored in
    ``engine_traits.languages``, keyed by the tag that ``language_tag`` returns
    for the babel locale whose English name equals the option value.

    :raises RuntimeError: when the response for ``base_url`` is not OK.
    """
    # pylint: disable=import-outside-toplevel
    import babel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.locales import language_tag
    engine_traits.all_locale = ""
    for custom_key in ("ext", "year_from", "year_to"):
        engine_traits.custom[custom_key] = []
    resp = get(base_url)
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from zlibrary's search page is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore
    # Year options are copied as they are.
    year_selects = (
        ("year_from", "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"),
        ("year_to", "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"),
    )
    for custom_key, options_xpath in year_selects:
        for option in eval_xpath_list(dom, options_xpath):
            engine_traits.custom[custom_key].append(option.get("value"))
    # An extension option without a value attribute is stored as "".
    for option in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"):
        ext_value: Optional[str] = option.get("value")
        engine_traits.custom["ext"].append("" if ext_value is None else ext_value)
    # Z-library's language options are English language names, so build a lookup
    # from the lower-cased English name of every babel locale to that locale.
    name_to_locale: Dict[str, babel.Locale] = {}
    for identifier in babel.core.localedata.locale_identifiers():  # type: ignore
        parsed = babel.Locale.parse(identifier)
        name_to_locale[parsed.english_name.lower()] = parsed  # type: ignore
    for option in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"):
        zlib_name = option.get("value")
        if zlib_name is None:
            continue
        matched_locale = name_to_locale.get(zlib_name.lower())
        if matched_locale is None:
            # names babel does not know are skipped without a message
            continue
        tag = language_tag(matched_locale)
        existing = engine_traits.languages.get(tag)
        if not existing:
            engine_traits.languages[tag] = zlib_name
        elif existing != zlib_name:
            # the first name mapped to a tag wins; a differing one is only reported
            print("CONFLICT: babel %s --> %s, %s" % (tag, existing, zlib_name))
--- a/searx/settings.yml
+++ b/searx/settings.yml
@ -909,19 +909,11 @@ engines:
      require_api_key: false
      results: HTML
-  # Disabling zlibrary due to z-lib.org domain seizure
+  - name: z-library
-  # https://github.com/searxng/searxng/pull/1937
+    engine: zlibrary
-  #
+    shortcut: zlib
-  # - name: z-library
+    categories: files
-  #   engine: zlibrary
+    timeout: 7.0
  #   shortcut: zlib
  #   categories: files
  #   timeout: 3.0
  #   # choose base_url, otherwise engine will do it at initialization time
  #   # base_url: https://b-ok.cc
  #   # base_url: https://de1lib.org
  #   # base_url: https://booksc.eu   # does not have cover preview
  #   # base_url: https://booksc.org  # does not have cover preview
  - name: library of congress
    engine: loc
--- a/searx/sxng_locales.py
+++ b/searx/sxng_locales.py
@ -41,6 +41,7 @@ sxng_locales = (
    ('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'),
    ('et', 'Eesti', '', 'Estonian', '\U0001f310'),
    ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
    ('fa', 'فارسی', '', 'Persian', '\U0001f310'),
    ('fi', 'Suomi', '', 'Finnish', '\U0001f310'),
    ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
    ('fr', 'Français', '', 'French', '\U0001f310'),