From e5637fe7b98d7fb06cbbe0e0f24deb12a33187ba Mon Sep 17 00:00:00 2001
From: Paolo Basso <12545838+paolobasso99@users.noreply.github.com>
Date: Sun, 25 Jun 2023 17:12:17 +0200
Subject: [PATCH] [feat] engine: implementation of Anna's Archive

Anna's Archive [1] is a free non-profit online shadow library metasearch engine
providing access to a variety of book resources (also via IPFS), created by a
team of anonymous archivists [2].

[1] https://annas-archive.org/
[2] https://annas-software.org/AnnaArchivist/annas-archive
---
 searx/engines/annas-archive.py | 75 ++++++++++++++++++++++++++++++++++
 searx/settings.yml             |  7 ++++
 2 files changed, 82 insertions(+)
 create mode 100644 searx/engines/annas-archive.py

diff --git a/searx/engines/annas-archive.py b/searx/engines/annas-archive.py
new file mode 100644
index 000000000..56d1ca77a
--- /dev/null
+++ b/searx/engines/annas-archive.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Anna's Archive
+
+Scrapes the HTML search result page of https://annas-archive.org/ and turns
+every ``/md5...`` link found inside ``<main>`` into one result.
+"""
+from typing import List, Dict, Any, Optional
+from urllib.parse import quote
+from lxml import html
+
+from searx.utils import extract_text, eval_xpath
+
+# about
+about: Dict[str, Any] = {
+    "website": "https://annas-archive.org/",
+    "wikidata_id": "Q115288326",
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+# engine dependent config
+categories: List[str] = ["files"]
+paging: bool = False
+
+# search-url
+base_url: str = "https://annas-archive.org"
+
+# xpath queries
+xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
+xpath_url: str = ".//@href"
+xpath_title: str = ".//h3/text()[1]"
+xpath_authors: str = './/div[contains(@class, "italic")]'
+xpath_publisher: str = './/div[contains(@class, "text-sm")]'
+xpath_file_info: str = './/div[contains(@class, "text-xs")]'
+
+
+def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
+    """Set ``params["url"]`` to the search URL for ``query`` and return ``params``."""
+    search_url: str = base_url + "/search?q={search_query}"
+    params["url"] = search_url.format(search_query=quote(query))
+    return params
+
+
+def response(resp) -> List[Dict[str, Optional[str]]]:
+    """Parse ``resp.text`` as HTML and return one result dict per matched link.
+
+    Each result has the keys ``url``, ``title`` and ``content``.  ``content`` is
+    built from the publisher, authors and file info texts, in that order; parts
+    that are empty are left out so the text has no dangling ``". "`` separators.
+    """
+    results: List[Dict[str, Optional[str]]] = []
+    dom = html.fromstring(resp.text)
+
+    for item in dom.xpath(xpath_results):
+        result: Dict[str, Optional[str]] = {}
+
+        # xpath_results only matches anchors that carry a href attribute
+        result["url"] = base_url + item.xpath(xpath_url)[0]
+
+        result["title"] = extract_text(eval_xpath(item, xpath_title))
+
+        parts = [
+            extract_text(eval_xpath(item, xpath_publisher)),
+            extract_text(eval_xpath(item, xpath_authors)),
+            extract_text(eval_xpath(item, xpath_file_info)),
+        ]
+        # skip missing fields instead of emitting ". . " fragments
+        result["content"] = ". ".join(part for part in parts if part)
+
+        results.append(result)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index b6bb0a0e3..561ec41a9 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -297,6 +297,13 @@ engines:
     shortcut: 9g
     disabled: true
 
+  - name: anna's archive
+    engine: annas-archive
+    paging: False
+    categories: files
+    disabled: true
+    shortcut: aa
+
   - name: apk mirror
     engine: apkmirror
     timeout: 4.0