searxng/searx/engines/xpath.py

from lxml import html
from urllib.parse import urlencode
from searx.utils import extract_text, extract_url, eval_xpath

# URL template of the search request; request() fills in {query} and, when
# paging is enabled and the placeholder is present, {pageno}.  It is also
# handed to extract_url() together with every extracted link.
search_url = None

# XPath expressions selecting the url, title and content of a result
url_xpath = None
title_xpath = None
content_xpath = None

# when set, every node selected by results_xpath is one result and the other
# expressions are evaluated relative to it; otherwise they are evaluated
# against the whole document
results_xpath = ''

# optional XPath expressions: suggestions, thumbnail ('img_src') and the text
# appended to cached_url to build 'cached_url'
suggestion_xpath = ''
thumbnail_xpath = False
cached_xpath = ''
cached_url = ''

# results are flagged with 'is_onion' when 'onions' is listed here
categories = []

# parameters for engines with paging support
paging = False
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1


def request(query, params):
    """Fill ``params`` with the request URL built from ``search_url``.

    The query is URL-encoded and substituted for ``{query}`` in
    ``search_url``.  When ``paging`` is enabled and ``search_url`` contains a
    ``{pageno}`` placeholder, that placeholder receives
    ``(params['pageno'] - 1) * page_size + first_page_num``.

    ``params['url']`` is set to the formatted URL and ``params['query']`` to
    the encoded query; the same ``params`` object is returned.
    """
    # urlencode() yields 'q=<encoded value>'; strip the leading 'q=' so only
    # the encoded value remains
    encoded_query = urlencode({'q': query})[len('q='):]
    format_args = {'query': encoded_query}

    wants_page_number = paging and search_url.find('{pageno}') != -1
    if wants_page_number:
        zero_based_page = params['pageno'] - 1
        format_args['pageno'] = zero_based_page * page_size + first_page_num

    request_url = search_url.format(**format_args)
    params['url'] = request_url
    params['query'] = encoded_query
    return params


def response(resp):
    """Extract results and suggestions from the HTML in ``resp.text``.

    With ``results_xpath`` set, every node it selects is one result and the
    url/title/content (plus the optional thumbnail and cached) expressions
    are evaluated relative to that node.  Without it, the expressions are
    evaluated against the whole document and their matches are combined
    position by position.

    Returns a list of result dicts, followed by one ``{'suggestion': ...}``
    dict per match of ``suggestion_xpath`` when that is configured.
    """
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories
    results = []

    if results_xpath:
        for node in eval_xpath(dom, results_xpath):
            entry = {
                'url': extract_url(eval_xpath(node, url_xpath), search_url),
                'title': extract_text(eval_xpath(node, title_xpath)),
                'content': extract_text(eval_xpath(node, content_xpath)),
            }

            # optional thumbnail; left out when the expression matches nothing
            if thumbnail_xpath:
                thumbnail_match = eval_xpath(node, thumbnail_xpath)
                if len(thumbnail_match) > 0:
                    entry['img_src'] = extract_url(thumbnail_match, search_url)

            # optional alternative (cached) url
            if cached_xpath:
                entry['cached_url'] = cached_url + extract_text(node.xpath(cached_xpath))

            # in this mode the key is only present for onion engines
            if is_onion:
                entry['is_onion'] = True

            results.append(entry)
    else:
        has_cached = bool(cached_xpath)

        # one lazily evaluated column per field, in the order url, title,
        # content and (optionally) cached
        columns = [
            (extract_url(match, search_url) for match in dom.xpath(url_xpath)),
            map(extract_text, dom.xpath(title_xpath)),
            map(extract_text, dom.xpath(content_xpath)),
        ]
        if has_cached:
            columns.append(map(extract_text, dom.xpath(cached_xpath)))

        # zip() stops at the shortest column, surplus matches are dropped
        for row in zip(*columns):
            entry = {'url': row[0], 'title': row[1], 'content': row[2]}
            if has_cached:
                entry['cached_url'] = cached_url + row[3]
            # in this mode the key is always present, True or False
            entry['is_onion'] = is_onion
            results.append(entry)

    if suggestion_xpath:
        results.extend(
            {'suggestion': extract_text(match)}
            for match in eval_xpath(dom, suggestion_xpath)
        )
    return results
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`from lxml import html`
[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00			`from urllib.parse import urlencode`
			`from searx.utils import extract_text, extract_url, eval_xpath`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`search_url = None`
			`url_xpath = None`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`content_xpath = None`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`title_xpath = None`
[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00			`thumbnail_xpath = False`
[mod] pylint: minor code change to allow pylint globally This commit is only a step, it doesn't fix all the issues reported by pylint 2020-11-03 11:35:53 +01:00			`categories = []`
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`paging = False`
[enh] suggestion support for xpath engine 2013-11-13 19:33:09 +01:00			`suggestion_xpath = ''`
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00			`results_xpath = ''`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`cached_xpath = ''`
			`cached_url = ''`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00			`# parameters for engines with paging support`
			`#`
			`# number of results on each page`
			`# (only needed if the site requires not a page number, but an offset)`
			`page_size = 1`
			`# number of the first page (usually 0 or 1)`
			`first_page_num = 1`

[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`def request(query, params):`
			`query = urlencode({'q': query})[2:]`
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00
			`fp = {'query': query}`
			`if paging and search_url.find('{pageno}') >= 0:`
[fix] behaviour for page_size>1 and first_page_num>0 eg. pageno=1,21,41,... instead of 20,40,60,... 2016-08-14 13:46:54 +02:00			`fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num`
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00
			`params['url'] = search_url.format(**fp)`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`params['query'] = query`
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`return params`


			`def response(resp):`
			`results = []`
			`dom = html.fromstring(resp.text)`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`is_onion = True if 'onions' in categories else False`

[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00			`if results_xpath:`
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`for result in eval_xpath(dom, results_xpath):`
			`url = extract_url(eval_xpath(result, url_xpath), search_url)`
			`title = extract_text(eval_xpath(result, title_xpath))`
			`content = extract_text(eval_xpath(result, content_xpath))`
[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00			`tmp_result = {'url': url, 'title': title, 'content': content}`

			`# add thumbnail if available`
			`if thumbnail_xpath:`
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)`
[fix] fixes google play engines (#1651) update commit 87baa74a863ac74ae4c86bbfcb04148ba7f70696 2019-07-25 09:31:47 +02:00			`if len(thumbnail_xpath_result) > 0:`
			`tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)`
[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`# add alternative cached url if available`
			`if cached_xpath:`
			`tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))`

			`if is_onion:`
			`tmp_result['is_onion'] = True`

[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00			`results.append(tmp_result)`
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00			`else:`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`if cached_xpath:`
			`for url, title, content, cached in zip(`
			`(extract_url(x, search_url) for`
			`x in dom.xpath(url_xpath)),`
			`map(extract_text, dom.xpath(title_xpath)),`
			`map(extract_text, dom.xpath(content_xpath)),`
			`map(extract_text, dom.xpath(cached_xpath))`
			`):`
			`results.append({'url': url, 'title': title, 'content': content,`
			`'cached_url': cached_url + cached, 'is_onion': is_onion})`
			`else:`
			`for url, title, content in zip(`
			`(extract_url(x, search_url) for`
			`x in dom.xpath(url_xpath)),`
			`map(extract_text, dom.xpath(title_xpath)),`
			`map(extract_text, dom.xpath(content_xpath))`
			`):`
			`results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})`
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00
[enh] suggestion support for xpath engine 2013-11-13 19:33:09 +01:00			`if not suggestion_xpath:`
			`return results`
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`for suggestion in eval_xpath(dom, suggestion_xpath):`
[mod][fix] xpath engine simplified, yahoo engine never returns truncated urls 2014-01-05 14:06:52 +01:00			`results.append({'suggestion': extract_text(suggestion)})`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`return results`