searxng/searx/engines/ask.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Ask.com"""

from urllib.parse import urlencode

import dateutil
import dateutil.parser
from lxml import html

from searx import utils

# Metadata describing the upstream service
about = {
    "website": "https://www.ask.com/",
    "wikidata_id": "Q847564",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Engine configuration: general web search, paging enabled, max_page set to 5
categories = ["general"]
paging = True
max_page = 5

# Search endpoint; request() appends the URL-encoded query string to it
base_url = "https://www.ask.com/web"


def request(query, params):
    """Set the request URL for an Ask.com web search.

    The search terms are sent as ``q`` and the page number taken from
    ``params["pageno"]`` as ``page``; both are URL-encoded and appended to
    ``base_url``.

    :param query: search terms.
    :param params: request parameters; ``pageno`` is read, ``url`` is written.
    :returns: the same ``params`` dict, updated in place.
    """
    query_string = urlencode({"q": query, "page": params["pageno"]})
    params["url"] = base_url + "?" + query_string
    return params


def response(resp):
    """Parse an Ask.com result page into a list of result dicts.

    Ask.com embeds the result list in a JavaScript assignment inside a
    ``<script>`` element::

        window.MESON.initialState = {"siteConfig": ... ...}};

    The object literal is cut out of the script text, converted with
    ``utils.js_variable_to_python`` and every entry of
    ``['search']['webResults']['results']`` is mapped to one result.

    :param resp: HTTP response; only ``resp.text`` (the HTML page) is read.
    :returns: list of dicts with the keys ``url``, ``title``, ``content``,
        ``publishedDate`` and ``metadata``.
    :raises ValueError: if no ``<script>`` element contains the
        ``window.MESON.initialState`` assignment, or the end marker ``}};``
        is missing after it.
    """
    start_tag = 'window.MESON.initialState = {'
    end_tag = '}};'

    dom = html.fromstring(resp.text)

    # Pick the script by its content instead of its position: any other
    # <script> placed before the state blob would otherwise break the engine.
    script_node = utils.eval_xpath_getindex(dom, f'//script[contains(text(), "{start_tag}")]', 0, default=None)
    if script_node is None:
        raise ValueError(f"ask.com: no <script> element contains {start_tag!r}")
    script = script_node.text

    # Keep the opening "{" of the object literal ...
    begin = script.index(start_tag) + len(start_tag) - 1
    # ... and cut right after the closing "}}", dropping the trailing ";".
    end = script.index(end_tag, begin) + len(end_tag) - 1

    json_resp = utils.js_variable_to_python(script[begin:end])

    results = []

    for item in json_resp['search']['webResults']['results']:

        published_date = None
        pubdate_original = item.get('pubdate_original')
        if pubdate_original:
            try:
                published_date = dateutil.parser.parse(pubdate_original)
            except (ValueError, OverflowError):
                # A single malformed date must not discard the whole page;
                # the result is simply returned without a date.
                published_date = None

        metadata = [item[field] for field in ('category_l1', 'catsy') if item.get(field)]

        # NOTE: item['image_url'] is deliberately not exposed as "thumbnail":
        # these images are not thumbnails, they are too large.
        results.append(
            {
                # everything from '&ueid' on is a tracking parameter
                "url": item['url'].split('&ueid')[0],
                "title": item['title'],
                "content": item['abstract'],
                "publishedDate": published_date,
                "metadata": ' | '.join(metadata),
            }
        )

    return results
[feat] engine: implementation of ask.com 2024-02-07 23:18:13 +01:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
			`"""Ask.com"""`

			`from urllib.parse import urlencode`
[mod] engine ask.com - parse JS result to JSON Parse the result list from ask.com given in the variable named window.MESON.initialState:: <script nonce=".."> window.MESON = window.MESON \|\| {}; window.MESON.initialState = {"siteConfig": ... ...}}; window.MESON.loadedLang = "en"; </script> The result list is in field:: json_resp['search']['webResults']['results'] Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-02-16 18:17:03 +01:00			`import dateutil`
[feat] engine: implementation of ask.com 2024-02-07 23:18:13 +01:00			`from lxml import html`
[mod] engine ask.com - parse JS result to JSON Parse the result list from ask.com given in the variable named window.MESON.initialState:: <script nonce=".."> window.MESON = window.MESON \|\| {}; window.MESON.initialState = {"siteConfig": ... ...}}; window.MESON.loadedLang = "en"; </script> The result list is in field:: json_resp['search']['webResults']['results'] Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-02-16 18:17:03 +01:00			`from searx import utils`
[feat] engine: implementation of ask.com 2024-02-07 23:18:13 +01:00
			`# Metadata`
			`about = {`
			`"website": "https://www.ask.com/",`
			`"wikidata_id": 'Q847564',`
			`"official_api_documentation": None,`
			`"use_official_api": False,`
			`"require_api_key": False,`
			`"results": "HTML",`
			`}`

			`# Engine Configuration`
			`categories = ['general']`
			`paging = True`
[mod] Ask engine: remove tracking paramaters and set max page to 5 2024-03-29 09:42:51 +01:00			`max_page = 5`
[feat] engine: implementation of ask.com 2024-02-07 23:18:13 +01:00
			`# Base URL`
			`base_url = "https://www.ask.com/web"`


			`def request(query, params):`

			`query_params = {`
			`"q": query,`
			`"page": params["pageno"],`
			`}`

			`params["url"] = f"{base_url}?{urlencode(query_params)}"`
			`return params`


			`def response(resp):`

[mod] engine ask.com - parse JS result to JSON Parse the result list from ask.com given in the variable named window.MESON.initialState:: <script nonce=".."> window.MESON = window.MESON \|\| {}; window.MESON.initialState = {"siteConfig": ... ...}}; window.MESON.loadedLang = "en"; </script> The result list is in field:: json_resp['search']['webResults']['results'] Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-02-16 18:17:03 +01:00			`start_tag = 'window.MESON.initialState = {'`
			`end_tag = '}};'`

			`dom = html.fromstring(resp.text)`
			`script = utils.eval_xpath_getindex(dom, '//script', 0, default=None).text`

			`pos = script.index(start_tag) + len(start_tag) - 1`
			`script = script[pos:]`
			`pos = script.index(end_tag) + len(end_tag) - 1`
			`script = script[:pos]`

			`json_resp = utils.js_variable_to_python(script)`

			`results = []`

			`for item in json_resp['search']['webResults']['results']:`

			`pubdate_original = item.get('pubdate_original')`
			`if pubdate_original:`
			`pubdate_original = dateutil.parser.parse(pubdate_original)`
			`metadata = [item.get(field) for field in ['category_l1', 'catsy'] if item.get(field)]`

			`results.append(`
			`{`
[mod] Ask engine: remove tracking paramaters and set max page to 5 2024-03-29 09:42:51 +01:00			`"url": item['url'].split('&ueid')[0],`
[mod] engine ask.com - parse JS result to JSON Parse the result list from ask.com given in the variable named window.MESON.initialState:: <script nonce=".."> window.MESON = window.MESON \|\| {}; window.MESON.initialState = {"siteConfig": ... ...}}; window.MESON.loadedLang = "en"; </script> The result list is in field:: json_resp['search']['webResults']['results'] Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-02-16 18:17:03 +01:00			`"title": item['title'],`
			`"content": item['abstract'],`
			`"publishedDate": pubdate_original,`
[mod] simple theme: drop img_src from default results The use of img_src AND thumbnail in the default results makes no sense (only a thumbnail is needed). In the current state this is rather confusing, because img_src is displayed like a thumbnail (small) and thumbnail is displayed like an image (large). Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-05-12 17:52:52 +02:00			`# "thumbnail": item.get('image_url') or None, # these are not thumbs / to large`
[mod] engine ask.com - parse JS result to JSON Parse the result list from ask.com given in the variable named window.MESON.initialState:: <script nonce=".."> window.MESON = window.MESON \|\| {}; window.MESON.initialState = {"siteConfig": ... ...}}; window.MESON.loadedLang = "en"; </script> The result list is in field:: json_resp['search']['webResults']['results'] Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-02-16 18:17:03 +01:00			`"metadata": ' \| '.join(metadata),`
			`}`
			`)`
[feat] engine: implementation of ask.com 2024-02-07 23:18:13 +01:00
			`return results`