searxng/searx/engines/btdigg.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 BTDigg (Videos, Music, Files)
"""

from lxml import html
from urllib.parse import quote, urljoin
from searx.utils import extract_text, get_torrent_size

# about: static metadata describing this engine (site, wikidata entry,
# API usage and the format of the results that get parsed)
about = dict(
    website='https://btdig.com',
    wikidata_id='Q4836698',
    official_api_documentation=dict(url='https://btdig.com/contacts', comment='on demand'),
    use_official_api=False,
    require_api_key=False,
    results='HTML',
)

# engine dependent config
paging = True
categories = ['files']

# search-url: base address of the site and the URL template that request()
# fills in with ``search_term`` and ``pageno``
url = 'https://btdig.com'
search_url = '{base}/search?q={{search_term}}&p={{pageno}}'.format(base=url)


# do search-request
def request(query, params):
    """Store the search URL for ``query`` in ``params['url']`` and return ``params``.

    The query is percent-encoded with ``quote`` and ``params['pageno']``
    minus one is used as the ``p`` value of the URL.
    """
    encoded_query = quote(query)
    page_index = params['pageno'] - 1
    params['url'] = search_url.format(search_term=encoded_query, pageno=page_index)
    return params


# get response from search-request
def response(resp):
    """Parse a BTDigg result page into a list of torrent result dicts.

    ``resp.text`` is parsed as HTML and every ``<div class="one_result">``
    element becomes one dict using the ``torrent.html`` template.  Results
    are returned in the order they appear on the page; nothing is sorted
    here.

    A result that lacks a title link or a magnet link is skipped, so a single
    malformed entry does not abort parsing of the whole page.  A missing
    excerpt gives an empty ``content``; a missing size, or a size without a
    unit, gives ``filesize = None``.  ``files`` is ``None`` when the file
    count is not an integer and defaults to 1 when the page shows no count.

    Returns an empty list when the page contains no results.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="one_result"]'):
        links = result.xpath('.//div[@class="torrent_name"]//a')
        magnets = result.xpath('.//div[@class="torrent_magnet"]//a')
        if not links or not magnets:
            # without a title link or a magnet link the entry is unusable
            continue

        magnetlink = magnets[0].attrib.get('href')
        if not magnetlink:
            continue

        link = links[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)

        content = ''
        excerpts = result.xpath('.//div[@class="torrent_excerpt"]')
        if excerpts:
            content = html.tostring(excerpts[0], encoding='unicode', method='text', with_tail=False)
            # it is better to emit <br/> instead of |, but html tags are verboten
            content = content.strip().replace('\n', ' | ')
            content = ' '.join(content.split())

        # the size text is expected to look like "<number> <unit>";
        # evaluate the xpath only once and convert to byte if possible
        size_texts = result.xpath('.//span[@class="torrent_size"]/text()')
        size_parts = size_texts[0].split() if size_texts else []
        if len(size_parts) >= 2:
            filesize = get_torrent_size(size_parts[0], size_parts[1])
        else:
            filesize = None

        # convert files to int if possible
        files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]
        try:
            files = int(files)
        except ValueError:
            files = None

        # append result
        results.append(
            {
                'url': href,
                'title': title,
                'content': content,
                'filesize': filesize,
                'files': files,
                'magnetlink': magnetlink,
                'template': 'torrent.html',
            }
        )

    # results keep the order of the page
    return results
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00			`"""`
			`BTDigg (Videos, Music, Files)`
			`"""`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`from lxml import html`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`from urllib.parse import quote, urljoin`
[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00			`from searx.utils import extract_text, get_torrent_size`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# about`
			`about = {`
			`"website": 'https://btdig.com',`
			`"wikidata_id": 'Q4836698',`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`"official_api_documentation": {'url': 'https://btdig.com/contacts', 'comment': 'on demand'},`
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`"use_official_api": False,`
			`"require_api_key": False,`
			`"results": 'HTML',`
			`}`

BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00			`# engine dependent config`
[mod] the bittorent search engines are available only in the files category related to #101 2021-05-29 16:14:19 +02:00			`categories = ['files']`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00			`paging = True`

			`# search-url`
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`url = 'https://btdig.com'`
[fix] btdigg 2015-01-25 10:21:44 +01:00			`search_url = url + '/search?q={search_term}&p={pageno}'`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00

			`# do search-request`
			`def request(query, params):`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`params['url'] = search_url.format(search_term=quote(query), pageno=params['pageno'] - 1)`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`return params`


			`# get response from search-request`
			`def response(resp):`
			`results = []`

[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`dom = html.fromstring(resp.text)`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`search_res = dom.xpath('//div[@class="one_result"]')`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`# return empty array if nothing is found`
			`if not search_res:`
			`return []`

			`# parse results`
			`for result in search_res:`
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`link = result.xpath('.//div[@class="torrent_name"]//a')[0]`
BTDigg's unit test 2015-01-30 19:52:44 +01:00			`href = urljoin(url, link.attrib.get('href'))`
[mod] do not escape html content in engines 2016-12-09 11:44:24 +01:00			`title = extract_text(link)`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`excerpt = result.xpath('.//div[@class="torrent_excerpt"]')[0]`
			`content = html.tostring(excerpt, encoding='unicode', method='text', with_tail=False)`
			`# it is better to emit <br/> instead of \|, but html tags are verboten`
			`content = content.strip().replace('\n', ' \| ')`
			`content = ' '.join(content.split())`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`filesize = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[0]`
			`filesize_multiplier = result.xpath('.//span[@class="torrent_size"]/text()')[0].split()[1]`
			`files = (result.xpath('.//span[@class="torrent_files"]/text()') or ['1'])[0]`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`# convert filesize to byte if possible`
add digbt engine Unfortunately, it is quite slow so it is disabled. Furthermore, the display of number of files is wrong on digbt.org, so it is not displayed on searx. 2016-08-13 14:55:47 +02:00			`filesize = get_torrent_size(filesize, filesize_multiplier)`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`# convert files to int if possible`
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`try:`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00			`files = int(files)`
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`except:`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00			`files = None`

[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`magnetlink = result.xpath('.//div[@class="torrent_magnet"]//a')[0].attrib['href']`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`# append result`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`results.append(`
			`{`
			`'url': href,`
			`'title': title,`
			`'content': content,`
			`'filesize': filesize,`
			`'files': files,`
			`'magnetlink': magnetlink,`
			`'template': 'torrent.html',`
			`}`
			`)`
BTDigg and Mixcloud engines 2015-01-21 18:02:29 +01:00
			`# return results sorted by seeder`
[mod] restore btdigg engine as btdig.com (#1515) 2019-07-25 08:40:48 +02:00			`return results`