searxng/searx/engines/startpage.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
 Startpage (Web)
"""

from lxml import html
from dateutil import parser
from datetime import datetime, timedelta
import re
from unicodedata import normalize, combining
from babel import Locale
from babel.localedata import locale_identifiers
from searx.utils import extract_text, eval_xpath, match_language

# about
# Engine metadata: results are scraped from HTML, no official API and no API key are used.
about = {
    "website": 'https://startpage.com',
    "wikidata_id": 'Q2333295',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
# there is a mechanism to block "bot" searches
# (probably the parameter qid), which requires
# storing the qid's between multiple search calls

paging = True
# page whose language <select> is parsed by _fetch_supported_languages()
supported_languages_url = 'https://www.startpage.com/do/settings'

# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'do/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
# NOTE(review): the two notes above mention div[@class="result"], while the
# expressions below select the "w-gl__*" classes -- the notes may be outdated; confirm.
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]'


# do search-request
def request(query, params):
    """Fill ``params`` with the POST request for a Startpage web search.

    Sets ``url``, ``method`` and the form ``data`` (query, page number and the
    fixed search options).  When a concrete language is requested and it can
    be matched against ``supported_languages``, the Startpage alias of that
    language is sent as both ``language`` and ``lui``.

    Returns the same ``params`` dict that was passed in.
    """
    params['url'] = search_url
    params['method'] = 'POST'

    form_data = dict(
        query=query,
        page=params['pageno'],
        cat='web',
        cmd='process_search',
        engine0='v1all',
    )
    params['data'] = form_data

    # 'all' means: no language restriction, nothing more to add
    if params['language'] == 'all':
        return params

    matched_code = match_language(params['language'], supported_languages, fallback=None)
    if matched_code:
        startpage_alias = supported_languages[matched_code]['alias']
        for field in ('language', 'lui'):
            form_data[field] = startpage_alias

    return params


# URL patterns of results that must be dropped (compiled once, used for every result)
_GOOGLE_AD_URL = re.compile(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$")
_STARTPAGE_SEARCH_URL = re.compile(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$")

# description prefix like "2 Sep 2014 ... " (group 1 is the date itself)
_ABSOLUTE_DATE_PREFIX = re.compile(r"^(([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4}) \.\.\. ")
# description prefix like "5 days ago ... " (group 1 is the number of days)
_RELATIVE_DATE_PREFIX = re.compile(r"^([0-9]+) days? ago \.\.\. ")


def _split_published_date(content):
    """Split a leading date marker off a result description.

    Returns a tuple ``(published_date, content)``.  ``published_date`` is a
    ``datetime`` or ``None`` when there is no date prefix or it cannot be
    converted.  A recognised prefix is always removed from the returned
    ``content``, even when the conversion fails.
    """
    absolute = _ABSOLUTE_DATE_PREFIX.match(content)
    if absolute:
        try:
            published_date = parser.parse(absolute.group(1), dayfirst=True)
        except (ValueError, OverflowError):
            # e.g. month abbreviations dateutil does not know ("24 Ene 2013")
            published_date = None
        return published_date, content[absolute.end() :]

    relative = _RELATIVE_DATE_PREFIX.match(content)
    if relative:
        try:
            published_date = datetime.now() - timedelta(days=int(relative.group(1)))
        except OverflowError:
            # absurdly large day counts cannot be represented as a date
            published_date = None
        return published_date, content[relative.end() :]

    return None, content


# get response from search-request
def response(resp):
    """Parse the HTML of a Startpage result page into a list of result dicts.

    Every result has the keys ``url``, ``title`` and ``content``; the key
    ``publishedDate`` is added when the description starts with a date marker
    that could be converted.  Results without a link target, Google ad links
    and links back to the Startpage search are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in eval_xpath(dom, results_xpath):
        links = eval_xpath(result, link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # an anchor without href is no usable result (and re.match would raise on None)
        if not url:
            continue

        # block google-ad url's
        if _GOOGLE_AD_URL.match(url):
            continue

        # block startpage search url's
        if _STARTPAGE_SEARCH_URL.match(url):
            continue

        title = extract_text(link)

        # evaluate the description xpath only once per result
        content_nodes = eval_xpath(result, content_xpath)
        content = extract_text(content_nodes) if content_nodes else ''

        published_date, content = _split_published_date(content)

        # append result
        item = {'url': url, 'title': title, 'content': content}
        if published_date is not None:
            item['publishedDate'] = published_date
        results.append(item)

    # return results
    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Build the mapping of language codes to Startpage language aliases.

    ``resp`` is the response of the Startpage settings page.  Each option of
    its language selector is looked up (by option value first, then by the
    displayed text) in a table of known language names.  The result maps a
    language code to ``{'alias': <option value>}``; options that cannot be
    identified are reported with ``print``.
    """
    # startpage's language selector is a mess:
    # each option has a displayed name and a value, either of which may be the language name
    # in the native script, the language name in English, an English transliteration of the
    # native name, the English name of the writing script, or occasionally something else entirely.

    # these cases are so special they need to be hardcoded, a couple of them are misspellings
    language_names = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su',
    }

    # English name of every language known by babel
    for code, english_name in Locale('en')._data['languages'].items():
        language_names[english_name.lower()] = code

    # native name of every language known by babel (territory variants like "xx_YY" are ignored)
    for code in locale_identifiers():
        if '_' in code:
            continue
        native = Locale(code).get_language_name().lower()
        # native name exactly as it is
        language_names[native] = code

        # "normalized" name without accents (français -> francais, español -> espanol)
        stripped = ''.join(char for char in normalize('NFKD', native) if not combining(char))
        # keep it only if the result is pure ascii (otherwise the "normalization" didn't work)
        if len(stripped.encode()) == len(stripped):
            language_names[stripped] = code

    dom = html.fromstring(resp.text)
    options = dom.xpath('//form[@id="settings-form"]//select[@name="language"]/option')
    sp_lang_names = [(option.get('value'), extract_text(option).lower()) for option in options]

    supported_languages = {}
    for alias, label in sp_lang_names:
        found = language_names.get(alias) or language_names.get(label)
        # a single code is handled like a list of one code
        if isinstance(found, str):
            found = [found]
        if isinstance(found, list):
            for code in found:
                supported_languages[code] = {'alias': alias}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(alias, label))

    return supported_languages
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
			`"""`
			`Startpage (Web)`
			`"""`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00
[enh] startpage engine added 2013-10-19 18:29:39 +02:00			`from lxml import html`
[enh] fix content fetching, parse published date from description 2015-10-24 16:15:30 +02:00			`from dateutil import parser`
			`from datetime import datetime, timedelta`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`import re`
fetch supported languages for startpage engine 2020-09-14 09:06:58 +02:00			`from unicodedata import normalize, combining`
			`from babel import Locale`
			`from babel.localedata import locale_identifiers`
[mod] move extract_text, extract_url to searx.utils 2020-10-02 18:13:56 +02:00			`from searx.utils import extract_text, eval_xpath, match_language`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# about`
			`about = {`
			`"website": 'https://startpage.com',`
			`"wikidata_id": 'Q2333295',`
			`"official_api_documentation": None,`
			`"use_official_api": False,`
			`"require_api_key": False,`
			`"results": 'HTML',`
			`}`

fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# engine dependent config`
[enh] add more categories 2021-12-22 16:58:52 +01:00			`categories = ['general', 'web']`
[fix] startpage engine compatibility 2014-11-17 10:19:23 +01:00			`# there is a mechanism to block "bot" search`
			`# (probably the parameter qid), require`
			`# storing of qid's between mulitble search-calls`

[fix] update startpage engine - closes #1601 2019-10-14 14:18:41 +02:00			`paging = True`
fetch supported languages for startpage engine 2020-09-14 09:06:58 +02:00			`supported_languages_url = 'https://www.startpage.com/do/settings'`
[enh] startpage engine added 2013-10-19 18:29:39 +02:00
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# search-url`
			`base_url = 'https://startpage.com/'`
			`search_url = base_url + 'do/search'`
[enh] startpage engine added 2013-10-19 18:29:39 +02:00
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# specific xpath variables`
			`# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]`
			`# not ads: div[@class="result"] are the direct childs of div[@id="results"]`
[Fix] Startpage 2020-12-13 15:43:50 +01:00			`results_xpath = '//div[@class="w-gl__result__main"]'`
Fix the StartPage result title is showing the url Fix the issue 2395 where StartPage result title is showing the url. https://github.com/searx/searx/issues/2395 2020-12-16 22:54:14 +01:00			`link_xpath = './/a[@class="w-gl__result-title result-link"]'`
[fix] update startpage engine - closes #1601 2019-10-14 14:18:41 +02:00			`content_xpath = './/p[@class="w-gl__description"]'`
[enh] startpage paging init 2014-01-30 02:10:32 +01:00
[fix] pep8 2014-01-24 09:35:27 +01:00
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# do search-request`
[enh] startpage engine added 2013-10-19 18:29:39 +02:00			`def request(query, params):`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00
[enh] startpage engine added 2013-10-19 18:29:39 +02:00			`params['url'] = search_url`
			`params['method'] = 'POST'`
[fix] update startpage engine - closes #1601 2019-10-14 14:18:41 +02:00			`params['data'] = {`
			`'query': query,`
			`'page': params['pageno'],`
			`'cat': 'web',`
			`'cmd': 'process_search',`
			`'engine0': 'v1all',`
			`}`

Revert "remove 'all' option from search languages" This reverts commit 4d1770398a6af8902e75c0bd885781584d39e796. 2019-01-06 15:27:46 +01:00			`# set language if specified`
			`if params['language'] != 'all':`
fetch supported languages for startpage engine 2020-09-14 09:06:58 +02:00			`lang_code = match_language(params['language'], supported_languages, fallback=None)`
			`if lang_code:`
			`language_name = supported_languages[lang_code]['alias']`
			`params['data']['language'] = language_name`
			`params['data']['lui'] = language_name`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00
[enh] startpage engine added 2013-10-19 18:29:39 +02:00			`return params`


fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# get response from search-request`
[enh] startpage engine added 2013-10-19 18:29:39 +02:00			`def response(resp):`
			`results = []`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00
[enh] py3 compatibility 2016-11-30 18:43:03 +01:00			`dom = html.fromstring(resp.text)`
[fix] startpage engine compatibility 2014-11-17 10:19:23 +01:00
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# parse results`
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`for result in eval_xpath(dom, results_xpath):`
			`links = eval_xpath(result, link_xpath)`
[fix] startpage engine compatibility 2014-11-17 10:19:23 +01:00			`if not links:`
			`continue`
			`link = links[0]`
[fix] urljoin removed 2013-10-24 23:43:39 +02:00			`url = link.attrib.get('href')`
[ehn] added ixquick engine, using startpage engine 2014-01-19 21:20:07 +01:00
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# block google-ad url's`
Fix anomalous backslash in string 2016-07-11 15:29:47 +02:00			`if re.match(r"^http(s\|)://(www\.)?google\.[a-z]+/aclk.*$", url):`
[fix] improve result handling of startpage engine 2015-08-24 11:28:55 +02:00			`continue`

			`# block startpage search url's`
Fix anomalous backslash in string 2016-07-11 15:29:47 +02:00			`if re.match(r"^http(s\|)://(www\.)?startpage\.com/do/search\?.*$", url):`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`continue`

[mod] do not escape html content in engines 2016-12-09 11:44:24 +01:00			`title = extract_text(link)`
Startpage's unit test 2015-02-06 17:31:10 +01:00
[mod] speed optimization compile XPath only once avoid redundant call to urlparse get_locale(webapp.py): avoid useless call to request.accept_languages.best_match 2019-11-15 09:31:37 +01:00			`if eval_xpath(result, content_xpath):`
			`content = extract_text(eval_xpath(result, content_xpath))`
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`else:`
			`content = ''`
[fix] pep8 2014-01-24 09:35:27 +01:00
[enh] fix content fetching, parse published date from description 2015-10-24 16:15:30 +02:00			`published_date = None`

			`# check if search result starts with something like: "2 Sep 2014 ... "`
Fix anomalous backslash in string 2016-07-11 15:29:47 +02:00			`if re.match(r"^([1-9]\|[1-2][0-9]\|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):`
[fix] pep8 compatibilty 2016-01-18 12:47:31 +01:00			`date_pos = content.find('...') + 4`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`date_string = content[0 : date_pos - 5]`
[enh] fix content fetching, parse published date from description 2015-10-24 16:15:30 +02:00			`# fix content string`
			`content = content[date_pos:]`

[Fix] Startpage ValueError on Spanish date format datetime.parser.parse() does not know the Spanish date format which leads to a ValueError. Fixes #1870 Traceback (most recent call last): File "/usr/local/searx/searx/search.py", line 160, in search_one_http_request_safe search_results = search_one_http_request(engine, query, request_params) File "/usr/local/searx/searx/search.py", line 97, in search_one_http_request return engine.response(response) File "/usr/local/searx/searx/engines/startpage.py", line 102, in response published_date = parser.parse(date_string, dayfirst=True) File "/usr/local/searx/searx-ve/lib/python3.6/site-packages/dateutil/parser/_parser.py", line 1358, in parse return DEFAULTPARSER.parse(timestr, **kwargs) File "/usr/local/searx/searx-ve/lib/python3.6/site-packages/dateutil/parser/_parser.py", line 649, in parse raise ValueError("Unknown string format:", timestr) ValueError: ('Unknown string format:', '24 Ene 2013') 2020-03-02 18:55:48 +01:00			`try:`
			`published_date = parser.parse(date_string, dayfirst=True)`
			`except ValueError:`
			`pass`

[enh] fix content fetching, parse published date from description 2015-10-24 16:15:30 +02:00			`# check if search result starts with something like: "5 days ago ... "`
Fix anomalous backslash in string 2016-07-11 15:29:47 +02:00			`elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):`
[fix] pep8 compatibilty 2016-01-18 12:47:31 +01:00			`date_pos = content.find('...') + 4`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`date_string = content[0 : date_pos - 5]`
[enh] fix content fetching, parse published date from description 2015-10-24 16:15:30 +02:00
			`# calculate datetime`
			`published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))`

			`# fix content string`
			`content = content[date_pos:]`

			`if published_date:`
			`# append result`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})`
[enh] fix content fetching, parse published date from description 2015-10-24 16:15:30 +02:00			`else:`
			`# append result`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`results.append({'url': url, 'title': title, 'content': content})`
[fix] pep8 2014-01-24 09:35:27 +01:00
fix startpage engine and add comments * add language support * remove not required code * improve google-ad detection (no false detection anymore, I hope) * other improvements 2014-09-02 19:57:01 +02:00			`# return results`
[enh] startpage engine added 2013-10-19 18:29:39 +02:00			`return results`
fetch supported languages for startpage engine 2020-09-14 09:06:58 +02:00

			`# get supported languages from their site`
			`def _fetch_supported_languages(resp):`
			`# startpage's language selector is a mess`
			`# each option has a displayed name and a value, either of which may represent the language name`
			`# in the native script, the language name in English, an English transliteration of the native name,`
			`# the English name of the writing script used by the language, or occasionally something else entirely.`

			`# this cases are so special they need to be hardcoded, a couple of them are mispellings`
			`language_names = {`
			`'english_uk': 'en-GB',`
			`'fantizhengwen': ['zh-TW', 'zh-HK'],`
			`'hangul': 'ko',`
			`'malayam': 'ml',`
			`'norsk': 'nb',`
			`'sinhalese': 'si',`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`'sudanese': 'su',`
fetch supported languages for startpage engine 2020-09-14 09:06:58 +02:00			`}`

			`# get the English name of every language known by babel`
			`language_names.update({name.lower(): lang_code for lang_code, name in Locale('en')._data['languages'].items()})`

			`# get the native name of every language known by babel`
			`for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):`
			`native_name = Locale(lang_code).get_language_name().lower()`
			`# add native name exactly as it is`
			`language_names[native_name] = lang_code`

			`# add "normalized" language name (i.e. français becomes francais and español becomes espanol)`
			`unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))`
			`if len(unaccented_name) == len(unaccented_name.encode()):`
			`# add only if result is ascii (otherwise "normalization" didn't work)`
			`language_names[unaccented_name] = lang_code`

			`dom = html.fromstring(resp.text)`
			`sp_lang_names = []`
			`for option in dom.xpath('//form[@id="settings-form"]//select[@name="language"]/option'):`
			`sp_lang_names.append((option.get('value'), extract_text(option).lower()))`

			`supported_languages = {}`
			`for sp_option_value, sp_option_text in sp_lang_names:`
			`lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)`
			`if isinstance(lang_code, str):`
			`supported_languages[lang_code] = {'alias': sp_option_value}`
			`elif isinstance(lang_code, list):`
			`for lc in lang_code:`
			`supported_languages[lc] = {'alias': sp_option_value}`
			`else:`
			`print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))`

			`return supported_languages`