searxng/searx/engines/xpath.py

# SPDX-License-Identifier: AGPL-3.0-or-later

from lxml import html
from urllib.parse import urlencode
from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list

# Engine settings read by request() and response() below.
# NOTE(review): the defaults here are placeholders; concrete values are
# presumably assigned per engine by whatever loads this module -- confirm
# against the engine loader.

# URL template handed to str.format() in request(); it is given `query` and,
# when paging applies, `pageno`.  Also passed as the second argument to
# extract_url() in response().
search_url = None
# XPath producing the URL of a result
url_xpath = None
# XPath producing the text stored under the 'content' key of a result
content_xpath = None
# XPath producing the text stored under the 'title' key of a result
title_xpath = None
# optional XPath of a thumbnail; only evaluated when `results_xpath` is set,
# and stored under 'img_src' when it matches something
thumbnail_xpath = False
# when true and `search_url` contains '{pageno}', request() fills in a page value
paging = False
# optional XPath; every match is appended to the results as a {'suggestion': ...} entry
suggestion_xpath = ''
# optional XPath selecting one node per result.  When set, the other XPaths are
# evaluated on each of these nodes; when empty, they are evaluated on the whole
# document and their matches are combined positionally.
results_xpath = ''
# optional XPath whose extracted text is appended to `cached_url` to build the
# 'cached_url' key of a result
cached_xpath = ''
# prefix put in front of the text extracted with `cached_xpath`
cached_url = ''

# parameters for engines with paging support
#
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
# request() computes: (pageno - 1) * page_size + first_page_num
page_size = 1
# number of the first page (usually 0 or 1)
first_page_num = 1


def request(query, params):
    """Fill in ``params['url']`` and ``params['query']`` for a search.

    The query is URL-encoded and substituted into ``search_url`` as
    ``{query}``.  When ``paging`` is enabled and ``search_url`` contains a
    ``{pageno}`` placeholder, the placeholder receives
    ``(params['pageno'] - 1) * page_size + first_page_num``.

    :param query: search terms as entered by the user
    :param params: request parameters; ``params['pageno']`` is read only when
        a page value has to be computed
    :returns: the same ``params`` dict, updated in place
    """
    # urlencode() yields 'q=<encoded>'; slicing off the leading 'q=' leaves
    # just the encoded search terms
    encoded_query = urlencode({'q': query})[2:]

    url_fields = {'query': encoded_query}
    if paging and '{pageno}' in search_url:
        offset = (params['pageno'] - 1) * page_size
        url_fields['pageno'] = offset + first_page_num

    params['url'] = search_url.format(**url_fields)
    params['query'] = encoded_query
    return params


def response(resp):
    """Turn the HTML in ``resp.text`` into a list of result dicts.

    Extraction runs in one of two modes, selected by ``results_xpath``:

    * set -- ``results_xpath`` selects one node per result; the url, title,
      content (and, if configured, cached) XPaths are evaluated on that node
      with ``min_len=1``, and the thumbnail XPath is evaluated without it.
    * empty -- the url, title, content (and, if configured, cached) XPaths
      are evaluated on the whole document and combined positionally with
      ``zip``, so the shortest match list bounds the number of results.

    When ``suggestion_xpath`` is set, one ``{'suggestion': ...}`` entry per
    match is appended after the regular results.

    :param resp: response object; only its ``text`` attribute is read
    :returns: list of result / suggestion dicts
    """
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable
    has_cache = bool(cached_xpath)
    results = []

    if results_xpath:
        # per-node mode: every XPath is evaluated relative to a result node
        for node in eval_xpath_list(dom, results_xpath):
            entry = {
                'url': extract_url(eval_xpath_list(node, url_xpath, min_len=1), search_url),
                'title': extract_text(eval_xpath_list(node, title_xpath, min_len=1)),
                'content': extract_text(eval_xpath_list(node, content_xpath, min_len=1)),
            }

            # thumbnail is optional: no key is added when nothing matches
            if thumbnail_xpath:
                thumbs = eval_xpath_list(node, thumbnail_xpath)
                if len(thumbs) > 0:
                    entry['img_src'] = extract_url(thumbs, search_url)

            # alternative cached url, built from the configured prefix
            if has_cache:
                cached_text = extract_text(eval_xpath_list(node, cached_xpath, min_len=1))
                entry['cached_url'] = cached_url + cached_text

            # in this mode the flag is only present when it is true
            if is_onion:
                entry['is_onion'] = True

            results.append(entry)
    else:
        # document-wide mode: one lazily extracted column per field, paired
        # up by position
        columns = [
            (extract_url(node, search_url) for node in eval_xpath_list(dom, url_xpath)),
            map(extract_text, eval_xpath_list(dom, title_xpath)),
            map(extract_text, eval_xpath_list(dom, content_xpath)),
        ]
        if has_cache:
            columns.append(map(extract_text, eval_xpath_list(dom, cached_xpath)))

        for row in zip(*columns):
            entry = {'url': row[0], 'title': row[1], 'content': row[2]}
            if has_cache:
                entry['cached_url'] = cached_url + row[3]
            # in this mode the flag is always stored, true or false
            entry['is_onion'] = is_onion
            results.append(entry)

    if suggestion_xpath:
        for hit in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(hit)})
    return results
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`

[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`from lxml import html`
[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00			`from urllib.parse import urlencode`
[mod] xpath, 1337x, acgsou, apkmirror, archlinux, arxiv: use eval_xpath_* functions 2020-11-26 15:49:33 +01:00			`from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`search_url = None`
			`url_xpath = None`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`content_xpath = None`
[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00			`title_xpath = None`
[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00			`thumbnail_xpath = False`
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`paging = False`
[enh] suggestion support for xpath engine 2013-11-13 19:33:09 +01:00			`suggestion_xpath = ''`
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00			`results_xpath = ''`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`cached_xpath = ''`
			`cached_url = ''`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00			`# parameters for engines with paging support`
			`#`
			`# number of results on each page`
			`# (only needed if the site requires not a page number, but an offset)`
			`page_size = 1`
			`# number of the first page (usually 0 or 1)`
			`first_page_num = 1`

[fix] pep/flake8 compatibility 2014-01-20 02:31:20 +01:00
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`def request(query, params):`
			`query = urlencode({'q': query})[2:]`
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00
			`fp = {'query': query}`
			`if paging and search_url.find('{pageno}') >= 0:`
[fix] behaviour for page_size>1 and first_page_num>0 eg. pageno=1,21,41,... instead of 20,40,60,... 2016-08-14 13:46:54 +02:00			`fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num`
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00
			`params['url'] = search_url.format(**fp)`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`params['query'] = query`
Add paging support to XPath & Erowid engines 2016-03-28 15:15:03 +02:00
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`return params`


			`def response(resp):`
			`results = []`
			`dom = html.fromstring(resp.text)`
[fix] xpath, mojeek: fix commit 58d72f26925d56e22330c54be03c3dcbee0c4135 before commit 58d72f2, category was not set in xpath.py, so searx/engines/__init__py was setting the category to ['general'] the commit 58d72f2 set the category to [] which is not replaced by searx/engines/__init__.py consequence: the mojeek engine is hidden in the preferences. this commit revert the xpath.py change. close #2368 2020-12-10 10:40:45 +01:00			`is_onion = True if 'onions' in categories else False # pylint: disable=undefined-variable`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00			`if results_xpath:`
[mod] xpath, 1337x, acgsou, apkmirror, archlinux, arxiv: use eval_xpath_* functions 2020-11-26 15:49:33 +01:00			`for result in eval_xpath_list(dom, results_xpath):`
			`url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)`
			`title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))`
			`content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))`
[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00			`tmp_result = {'url': url, 'title': title, 'content': content}`

			`# add thumbnail if available`
			`if thumbnail_xpath:`
[mod] xpath, 1337x, acgsou, apkmirror, archlinux, arxiv: use eval_xpath_* functions 2020-11-26 15:49:33 +01:00			`thumbnail_xpath_result = eval_xpath_list(result, thumbnail_xpath)`
[fix] fixes google play engines (#1651) update commit 87baa74a863ac74ae4c86bbfcb04148ba7f70696 2019-07-25 09:31:47 +02:00			`if len(thumbnail_xpath_result) > 0:`
			`tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)`
[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`# add alternative cached url if available`
			`if cached_xpath:`
[mod] xpath, 1337x, acgsou, apkmirror, archlinux, arxiv: use eval_xpath_* functions 2020-11-26 15:49:33 +01:00			`tmp_result['cached_url'] = cached_url\`
			`+ extract_text(eval_xpath_list(result, cached_xpath, min_len=1))`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00
			`if is_onion:`
			`tmp_result['is_onion'] = True`

[fix] fixes google play engines and adds thumbnails to their results (#1612) fix google play apps, google play apps, google play music engines xpath engine: thumbnail_xpath can define an optional thumbnail 2019-07-25 07:46:41 +02:00			`results.append(tmp_result)`
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00			`else:`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`if cached_xpath:`
			`for url, title, content, cached in zip(`
			`(extract_url(x, search_url) for`
[mod] xpath, 1337x, acgsou, apkmirror, archlinux, arxiv: use eval_xpath_* functions 2020-11-26 15:49:33 +01:00			`x in eval_xpath_list(dom, url_xpath)),`
			`map(extract_text, eval_xpath_list(dom, title_xpath)),`
			`map(extract_text, eval_xpath_list(dom, content_xpath)),`
			`map(extract_text, eval_xpath_list(dom, cached_xpath))`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`):`
			`results.append({'url': url, 'title': title, 'content': content,`
			`'cached_url': cached_url + cached, 'is_onion': is_onion})`
			`else:`
			`for url, title, content in zip(`
			`(extract_url(x, search_url) for`
[mod] xpath, 1337x, acgsou, apkmirror, archlinux, arxiv: use eval_xpath_* functions 2020-11-26 15:49:33 +01:00			`x in eval_xpath_list(dom, url_xpath)),`
			`map(extract_text, eval_xpath_list(dom, title_xpath)),`
			`map(extract_text, eval_xpath_list(dom, content_xpath))`
[enh] Add onions category with Ahmia, Not Evil and Torch Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines migth have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time. 2016-05-19 07:38:43 +02:00			`):`
			`results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})`
[enh] xpath engine absolute xpath support 2013-10-26 13:45:43 +02:00
[enh] suggestion support for xpath engine 2013-11-13 19:33:09 +01:00			`if not suggestion_xpath:`
			`return results`
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`for suggestion in eval_xpath(dom, suggestion_xpath):`
[mod][fix] xpath engine simplified, yahoo engine never returns truncated urls 2014-01-05 14:06:52 +01:00			`results.append({'suggestion': extract_text(suggestion)})`
[enh] xpath engine added 2013-10-26 02:22:20 +02:00			`return results`