From cada89ee3648de6ca5b458aeacafe6c10d5230a2 Mon Sep 17 00:00:00 2001 From: Paolo Basso <12545838+paolobasso99@users.noreply.github.com> Date: Sun, 25 Jun 2023 18:32:15 +0200 Subject: [PATCH] [feat] engine: re-enables z-library (zlibrary-global.se) - re-enables z-library as the new domain zlibrary-global.se is now available from the open web. The announcement of the domain: https://www.reddit.com/r/zlibrary/comments/13whe08/mod_note_zlibraryglobalse_domain_is_officially/ It is an official domain, it requires to log in to the "personal" subdomain only to download files, but the search works. - changes the result template of zlibrary to paper.html, filling the appropriate fields - implements language filtering for zlibrary - implement zlibrary custom filters (engine traits) - refactor and document the zlibrary engine --- docs/dev/engines/online/zlibrary.rst | 13 + searx/data/engine_traits.json | 599 ++++++++++++++++++++++++++- searx/engines/zlibrary.py | 255 +++++++++--- searx/settings.yml | 18 +- searx/sxng_locales.py | 1 + 5 files changed, 808 insertions(+), 78 deletions(-) create mode 100644 docs/dev/engines/online/zlibrary.rst diff --git a/docs/dev/engines/online/zlibrary.rst b/docs/dev/engines/online/zlibrary.rst new file mode 100644 index 000000000..fb197abff --- /dev/null +++ b/docs/dev/engines/online/zlibrary.rst @@ -0,0 +1,13 @@ +.. _zlibrary engine: + +========= +Z-Library +========= + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + +.. 
automodule:: searx.engines.zlibrary + :members: diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json index 072c9a5c4..e6774d696 100644 --- a/searx/data/engine_traits.json +++ b/searx/data/engine_traits.json @@ -4256,5 +4256,602 @@ "zh_Hant": "zh_cht" }, "regions": {} + }, + "z-library": { + "all_locale": "", + "custom": { + "ext": [ + "", + "TXT", + "PDF", + "FB2", + "EPUB", + "LIT", + "MOBI", + "RTF", + "DJV", + "DJVU", + "AZW", + "AZW3" + ], + "year_from": [ + "", + "2023", + "2022", + "2021", + "2020", + "2019", + "2018", + "2017", + "2016", + "2015", + "2014", + "2013", + "2012", + "2011", + "2010", + "2009", + "2008", + "2007", + "2006", + "2005", + "2004", + "2003", + "2002", + "2001", + "2000", + "1999", + "1998", + "1997", + "1996", + "1995", + "1994", + "1993", + "1992", + "1991", + "1990", + "1989", + "1988", + "1987", + "1986", + "1985", + "1984", + "1983", + "1982", + "1981", + "1980", + "1979", + "1978", + "1977", + "1976", + "1975", + "1974", + "1973", + "1972", + "1971", + "1970", + "1969", + "1968", + "1967", + "1966", + "1965", + "1964", + "1963", + "1962", + "1961", + "1960", + "1959", + "1958", + "1957", + "1956", + "1955", + "1954", + "1953", + "1952", + "1951", + "1950", + "1949", + "1948", + "1947", + "1946", + "1945", + "1944", + "1943", + "1942", + "1941", + "1940", + "1939", + "1938", + "1937", + "1936", + "1935", + "1934", + "1933", + "1932", + "1931", + "1930", + "1929", + "1928", + "1927", + "1926", + "1925", + "1924", + "1923", + "1922", + "1921", + "1920", + "1919", + "1918", + "1917", + "1916", + "1915", + "1914", + "1913", + "1912", + "1911", + "1910", + "1909", + "1908", + "1907", + "1906", + "1905", + "1904", + "1903", + "1902", + "1901", + "1900", + "1899", + "1898", + "1897", + "1896", + "1895", + "1894", + "1893", + "1892", + "1891", + "1890", + "1889", + "1888", + "1887", + "1886", + "1885", + "1884", + "1883", + "1882", + "1881", + "1880", + "1879", + "1878", + "1877", + "1876", + "1875", + "1874", + 
"1873", + "1872", + "1871", + "1870", + "1869", + "1868", + "1867", + "1866", + "1865", + "1864", + "1863", + "1862", + "1861", + "1860", + "1859", + "1858", + "1857", + "1856", + "1855", + "1854", + "1853", + "1852", + "1851", + "1850", + "1849", + "1848", + "1847", + "1846", + "1845", + "1844", + "1843", + "1842", + "1841", + "1840", + "1839", + "1838", + "1837", + "1836", + "1835", + "1834", + "1833", + "1832", + "1831", + "1830", + "1829", + "1828", + "1827", + "1826", + "1825", + "1824", + "1823", + "1822", + "1821", + "1820", + "1819", + "1818", + "1817", + "1816", + "1815", + "1814", + "1813", + "1812", + "1811", + "1810", + "1809", + "1808", + "1807", + "1806", + "1805", + "1804", + "1803", + "1802", + "1801", + "1800" + ], + "year_to": [ + "", + "2023", + "2022", + "2021", + "2020", + "2019", + "2018", + "2017", + "2016", + "2015", + "2014", + "2013", + "2012", + "2011", + "2010", + "2009", + "2008", + "2007", + "2006", + "2005", + "2004", + "2003", + "2002", + "2001", + "2000", + "1999", + "1998", + "1997", + "1996", + "1995", + "1994", + "1993", + "1992", + "1991", + "1990", + "1989", + "1988", + "1987", + "1986", + "1985", + "1984", + "1983", + "1982", + "1981", + "1980", + "1979", + "1978", + "1977", + "1976", + "1975", + "1974", + "1973", + "1972", + "1971", + "1970", + "1969", + "1968", + "1967", + "1966", + "1965", + "1964", + "1963", + "1962", + "1961", + "1960", + "1959", + "1958", + "1957", + "1956", + "1955", + "1954", + "1953", + "1952", + "1951", + "1950", + "1949", + "1948", + "1947", + "1946", + "1945", + "1944", + "1943", + "1942", + "1941", + "1940", + "1939", + "1938", + "1937", + "1936", + "1935", + "1934", + "1933", + "1932", + "1931", + "1930", + "1929", + "1928", + "1927", + "1926", + "1925", + "1924", + "1923", + "1922", + "1921", + "1920", + "1919", + "1918", + "1917", + "1916", + "1915", + "1914", + "1913", + "1912", + "1911", + "1910", + "1909", + "1908", + "1907", + "1906", + "1905", + "1904", + "1903", + "1902", + "1901", + 
"1900", + "1899", + "1898", + "1897", + "1896", + "1895", + "1894", + "1893", + "1892", + "1891", + "1890", + "1889", + "1888", + "1887", + "1886", + "1885", + "1884", + "1883", + "1882", + "1881", + "1880", + "1879", + "1878", + "1877", + "1876", + "1875", + "1874", + "1873", + "1872", + "1871", + "1870", + "1869", + "1868", + "1867", + "1866", + "1865", + "1864", + "1863", + "1862", + "1861", + "1860", + "1859", + "1858", + "1857", + "1856", + "1855", + "1854", + "1853", + "1852", + "1851", + "1850", + "1849", + "1848", + "1847", + "1846", + "1845", + "1844", + "1843", + "1842", + "1841", + "1840", + "1839", + "1838", + "1837", + "1836", + "1835", + "1834", + "1833", + "1832", + "1831", + "1830", + "1829", + "1828", + "1827", + "1826", + "1825", + "1824", + "1823", + "1822", + "1821", + "1820", + "1819", + "1818", + "1817", + "1816", + "1815", + "1814", + "1813", + "1812", + "1811", + "1810", + "1809", + "1808", + "1807", + "1806", + "1805", + "1804", + "1803", + "1802", + "1801", + "1800" + ] + }, + "data_type": "traits_v1", + "languages": { + "af": "afrikaans", + "ak": "akan", + "am": "amharic", + "ar": "arabic", + "as": "assamese", + "az": "azerbaijani", + "be": "belarusian", + "bg": "bulgarian", + "bm": "bambara", + "bo": "tibetan", + "br": "breton", + "bs": "bosnian", + "ca": "catalan", + "ce": "chechen", + "cs": "czech", + "cv": "chuvash", + "cy": "welsh", + "da": "danish", + "de": "german", + "dz": "dzongkha", + "ee": "ewe", + "el": "greek", + "en": "english", + "eo": "esperanto", + "es": "spanish", + "et": "estonian", + "eu": "basque", + "fa": "persian", + "fi": "finnish", + "fo": "faroese", + "fr": "french", + "ga": "irish", + "gl": "galician", + "gu": "gujarati", + "gv": "manx", + "ha": "hausa", + "he": "hebrew", + "hi": "hindi", + "hr": "croatian", + "hu": "hungarian", + "hy": "armenian", + "ia": "interlingua", + "id": "indonesian", + "ig": "igbo", + "is": "icelandic", + "it": "italian", + "ja": "japanese", + "jv": "javanese", + "ka": "georgian", + 
"ki": "kikuyu", + "kk": "kazakh", + "kl": "kalaallisut", + "kn": "kannada", + "ko": "korean", + "ks": "kashmiri", + "ku": "kurdish", + "kw": "cornish", + "ky": "kyrgyz", + "lb": "luxembourgish", + "lg": "ganda", + "ln": "lingala", + "lo": "lao", + "lt": "lithuanian", + "lu": "luba-katanga", + "lv": "latvian", + "mg": "malagasy", + "mk": "macedonian", + "ml": "malayalam", + "mn": "mongolian", + "mr": "marathi", + "mt": "maltese", + "my": "burmese", + "ne": "nepali", + "nl": "dutch", + "no": "norwegian", + "oc": "occitan", + "om": "oromo", + "or": "odia", + "pa": "punjabi", + "pl": "polish", + "ps": "pashto", + "pt": "portuguese", + "qu": "quechua", + "rm": "romansh", + "rn": "rundi", + "ro": "romanian", + "ru": "russian", + "rw": "kinyarwanda", + "sa": "sanskrit", + "sc": "sardinian", + "sd": "sindhi", + "sg": "sango", + "si": "sinhala", + "sk": "slovak", + "sl": "slovenian", + "sn": "shona", + "so": "somali", + "sq": "albanian", + "sr": "serbian", + "su": "sundanese", + "sv": "swedish", + "sw": "swahili", + "ta": "tamil", + "te": "telugu", + "tg": "tajik", + "th": "thai", + "ti": "tigrinya", + "tk": "turkmen", + "tr": "turkish", + "tt": "tatar", + "uk": "ukrainian", + "ur": "urdu", + "uz": "uzbek", + "vi": "vietnamese", + "wo": "wolof", + "xh": "xhosa", + "yi": "yiddish", + "yo": "yoruba", + "zh": "chinese", + "zu": "zulu" + }, + "regions": {} } -} +} \ No newline at end of file diff --git a/searx/engines/zlibrary.py b/searx/engines/zlibrary.py index 7778f69b6..813d52f64 100644 --- a/searx/engines/zlibrary.py +++ b/searx/engines/zlibrary.py @@ -1,94 +1,221 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Z-Library +"""`Z-Library`_ (abbreviated as z-lib, formerly BookFinder) is a shadow library +project for file-sharing access to scholarly journal articles, academic texts +and general-interest books. It began as a mirror of Library Genesis, from which +most of its books originate. -Z-Library uses regional domains (see https://z-lib.org). 
Known ``base_url:`` +.. _Z-Library: https://zlibrary-global.se/ -- base_url: https://b-ok.cc -- base_url: https://de1lib.org -- base_url: https://booksc.eu does not have cover preview -- base_url: https://booksc.org does not have cover preview +Configuration +============= + +The engine has the following additional settings: + +- :py:obj:`zlib_year_from` +- :py:obj:`zlib_year_to` +- :py:obj:`zlib_ext` + +With these options a SearXNG maintainer is able to configure **additional** +engines for specific searches in Z-Library. For example an engine to search +only for EPUB from 2010 to 2020. + +.. code:: yaml + + - name: z-library 2010s epub + engine: zlibrary + shortcut: zlib2010s + zlib_year_from: '2010' + zlib_year_to: '2020' + zlib_ext: 'EPUB' + +Implementations +=============== """ - +from __future__ import annotations +from typing import TYPE_CHECKING +from typing import List, Dict, Any, Optional +from datetime import datetime from urllib.parse import quote from lxml import html +from flask_babel import gettext -from searx.utils import extract_text, eval_xpath -from searx.network import get as http_get +from searx.utils import extract_text, eval_xpath, eval_xpath_list +from searx.enginelib.traits import EngineTraits +from searx.data import ENGINE_TRAITS + +if TYPE_CHECKING: + import httpx + import logging + + logger: logging.Logger # about -about = { - "website": "https://z-lib.org", +about: Dict[str, Any] = { + "website": "https://zlibrary-global.se", "wikidata_id": "Q104863992", "official_api_documentation": None, "use_official_api": False, "require_api_key": False, - "results": 'HTML', + "results": "HTML", } -categories = ['files'] -paging = True -base_url = '' +categories: List[str] = ["files"] +paging: bool = True +base_url: str = "https://zlibrary-global.se" + +zlib_year_from: str = "" +"""Filter z-library's results by year from. E.g. '2010'. +""" + +zlib_year_to: str = "" +"""Filter z-library's results by year to. E.g. '2010'. 
+""" + +zlib_ext: str = "" +"""Filter z-library's results by a file ending. Common filters for example are +``PDF`` and ``EPUB``. +""" -def init(engine_settings=None): - global base_url # pylint: disable=global-statement +def init(engine_settings=None) -> None: # pylint: disable=unused-argument + """Check of engine's settings.""" + traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"]) - if "base_url" not in engine_settings: - resp = http_get('https://z-lib.org', timeout=5.0) - if resp.ok: - dom = html.fromstring(resp.text) - base_url = extract_text( - eval_xpath(dom, './/a[contains(@class, "domain-check-link") and @data-mode="books"]/@href') - ) - logger.debug("using base_url: %s" % base_url) + if zlib_ext and zlib_ext not in traits.custom["ext"]: + raise ValueError(f"invalid setting ext: {zlib_ext}") + if zlib_year_from and zlib_year_from not in traits.custom["year_from"]: + raise ValueError(f"invalid setting year_from: {zlib_year_from}") + if zlib_year_to and zlib_year_to not in traits.custom["year_to"]: + raise ValueError(f"invalid setting year_to: {zlib_year_to}") -def request(query, params): - search_url = base_url + '/s/{search_query}/?page={pageno}' - params['url'] = search_url.format(search_query=quote(query), pageno=params['pageno']) +def request(query: str, params: Dict[str, Any]) -> Dict[str, Any]: + lang: str = traits.get_language(params["language"], traits.all_locale) # type: ignore + search_url: str = ( + base_url + + "/s/{search_query}/?page={pageno}" + + "&yearFrom={zlib_year_from}" + + "&yearTo={zlib_year_to}" + + "&languages[]={lang}" + + "&extensions[]={zlib_ext}" + ) + params["url"] = search_url.format( + search_query=quote(query), + pageno=params["pageno"], + lang=lang, + zlib_year_from=zlib_year_from, + zlib_year_to=zlib_year_to, + zlib_ext=zlib_ext, + ) return params -def response(resp): - results = [] +def response(resp: httpx.Response) -> List[Dict[str, Any]]: + results: List[Dict[str, Any]] = [] dom = html.fromstring(resp.text) 
for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'): - result = {} - - result["url"] = base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0] - - result["title"] = extract_text(eval_xpath(item, './/*[@itemprop="name"]')) - - year = extract_text( - eval_xpath(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]') - ) - if year: - year = '(%s) ' % year - - result[ - "content" - ] = "{year}{authors}. {publisher}. Language: {language}. {file_type}. \ - Book rating: {book_rating}, book quality: {book_quality}".format( - year=year, - authors=extract_text(eval_xpath(item, './/div[@class="authors"]')), - publisher=extract_text(eval_xpath(item, './/div[@title="Publisher"]')), - file_type=extract_text( - eval_xpath(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]') - ), - language=extract_text( - eval_xpath( - item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]' - ) - ), - book_rating=extract_text(eval_xpath(item, './/span[contains(@class, "book-rating-interest-score")]')), - book_quality=extract_text(eval_xpath(item, './/span[contains(@class, "book-rating-quality-score")]')), - ) - - result["img_src"] = extract_text(eval_xpath(item, './/img[contains(@class, "cover")]/@data-src')) - - results.append(result) + results.append(_parse_result(item)) return results + + +def _text(item, selector: str) -> str | None: + return extract_text(eval_xpath(item, selector)) + + +i18n_language = gettext("Language") +i18n_book_rating = gettext("Book rating") +i18n_file_quality = gettext("File quality") + + +def _parse_result(item) -> Dict[str, Any]: + + author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]') + + result = { + "template": "paper.html", + "url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0], + "title": _text(item, './/*[@itemprop="name"]'), 
+ "authors": [extract_text(author) for author in author_elements], + "publisher": _text(item, './/a[@title="Publisher"]'), + "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'), + "img_src": _text(item, './/img[contains(@class, "cover")]/@data-src'), + } + + year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]') + if year: + result["publishedDate"] = datetime.strptime(year, '%Y') + + content = [] + language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]') + if language: + content.append(f"{i18n_language}: {language.capitalize()}") + book_rating = _text(item, './/span[contains(@class, "book-rating-interest-score")]') + if book_rating and float(book_rating): + content.append(f"{i18n_book_rating}: {book_rating}") + file_quality = _text(item, './/span[contains(@class, "book-rating-quality-score")]') + if file_quality and float(file_quality): + content.append(f"{i18n_file_quality}: {file_quality}") + result["content"] = " | ".join(content) + + return result + + +def fetch_traits(engine_traits: EngineTraits) -> None: + """Fetch languages and other search arguments from zlibrary's search form.""" + # pylint: disable=import-outside-toplevel + + import babel + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + from searx.locales import language_tag + + engine_traits.all_locale = "" + engine_traits.custom["ext"] = [] + engine_traits.custom["year_from"] = [] + engine_traits.custom["year_to"] = [] + + resp = get(base_url) + if not resp.ok: # type: ignore + raise RuntimeError("Response from zlibrary's search page is not OK.") + dom = html.fromstring(resp.text) # type: ignore + + for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"): + engine_traits.custom["year_from"].append(year.get("value")) + + for year in eval_xpath_list(dom, 
"//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"): + engine_traits.custom["year_to"].append(year.get("value")) + + for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"): + value: Optional[str] = ext.get("value") + if value is None: + value = "" + engine_traits.custom["ext"].append(value) + + # Handle languages + # Z-library uses English names for languages, so we need to map them to their respective locales + language_name_locale_map: Dict[str, babel.Locale] = {} + for locale in babel.core.localedata.locale_identifiers(): # type: ignore + # Create a Locale object for the current locale + loc = babel.Locale.parse(locale) + language_name_locale_map[loc.english_name.lower()] = loc # type: ignore + + for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"): + eng_lang = x.get("value") + if eng_lang is None: + continue + try: + locale = language_name_locale_map[eng_lang.lower()] + except KeyError: + # silently ignore unknown languages + # print("ERROR: %s is unknown by babel" % (eng_lang)) + continue + sxng_lang = language_tag(locale) + conflict = engine_traits.languages.get(sxng_lang) + if conflict: + if conflict != eng_lang: + print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang)) + continue + engine_traits.languages[sxng_lang] = eng_lang diff --git a/searx/settings.yml b/searx/settings.yml index 10ef4369e..db1bac659 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -909,19 +909,11 @@ engines: require_api_key: false results: HTML - # Disabling zlibrary due to z-lib.org domain seizure - # https://github.com/searxng/searxng/pull/1937 - # - # - name: z-library - # engine: zlibrary - # shortcut: zlib - # categories: files - # timeout: 3.0 - # # choose base_url, otherwise engine will do it at initialization time - # # base_url: https://b-ok.cc - # # base_url: https://de1lib.org - # # base_url: https://booksc.eu # does not have cover preview - # # 
base_url: https://booksc.org # does not have cover preview + - name: z-library + engine: zlibrary + shortcut: zlib + categories: files + timeout: 7.0 - name: library of congress engine: loc diff --git a/searx/sxng_locales.py b/searx/sxng_locales.py index 2cbf2fcc2..399d029ee 100644 --- a/searx/sxng_locales.py +++ b/searx/sxng_locales.py @@ -41,6 +41,7 @@ sxng_locales = ( ('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'), ('et', 'Eesti', '', 'Estonian', '\U0001f310'), ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), + ('fa', 'فارسی', '', 'Persian', '\U0001f310'), ('fi', 'Suomi', '', 'Finnish', '\U0001f310'), ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), ('fr', 'Français', '', 'French', '\U0001f310'),