searxng/searx/engines/brave.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images).  The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).

Configured ``brave`` engines:

.. code:: yaml

  - name: brave
    engine: brave
    ...
    brave_category: search
    time_range_support: true
    paging: true

  - name: brave.images
    engine: brave
    ...
    brave_category: images

  - name: brave.videos
    engine: brave
    ...
    brave_category: videos

  - name: brave.news
    engine: brave
    ...
    brave_category: news


.. _brave regions:

Brave regions
=============

Brave uses two-letter tags for the regions like ``ca`` while SearXNG deals with
locales.  To get a mapping, all *official de-facto* languages of the Brave
region are mapped to regions in SearXNG (see :py:obj:`babel
<babel.languages.get_official_languages>`):

.. code:: python

    "regions": {
      ..
      "en-CA": "ca",
      "fr-CA": "ca",
      ..
     }


.. note::

   The language (aka region) support of Brave's index is limited to very basic
   languages.  The search results for languages like Chinese or Arabic are of
   low quality.


.. _brave languages:

Brave languages
===============

Brave's language support is limited to the UI (menus, area local notations,
etc).  Brave's index only seems to support a locale, but it does not seem to
support any languages in its index.  The choice of available languages is very
small (and it's not clear to me where the difference in UI is when switching
from en-us to en-ca or en-gb).

In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
UI languages are stored in a custom field named ``ui_lang``:

.. code:: python

    "custom": {
      "ui_lang": {
        "ca": "ca",
        "de-DE": "de-de",
        "en-CA": "en-ca",
        "en-GB": "en-gb",
        "en-US": "en-us",
        "es": "es",
        "fr-CA": "fr-ca",
        "fr-FR": "fr-fr",
        "ja-JP": "ja-jp",
        "pt-BR": "pt-br",
        "sq-AL": "sq-al"
      }
    },

Implementations
===============

"""

from typing import TYPE_CHECKING

import re
from urllib.parse import (
    urlencode,
    urlparse,
    parse_qs,
)

from lxml import html

from searx import locales
from searx.utils import (
    extract_text,
    eval_xpath_list,
    eval_xpath_getindex,
    js_variable_to_python,
)
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# Engine metadata: the results are taken from Brave's web pages, neither an
# official API nor an API key is used.
about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# URL prefix, request() appends the brave_category and the query string to it
base_url = "https://search.brave.com/"
# NOTE(review): empty here, presumably set per engine in the settings -- confirm
categories = []
brave_category = 'search'
"""Brave supports common web-search, video, image and news search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
"""

# When true, request() adds ``spellcheck=1`` to the URL arguments
brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""

safesearch = True
# SearXNG's safesearch level --> value of the ``safesearch`` cookie set in
# request(), levels missing in the map fall back to 'off'
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All)."""

# SearXNG's time range --> value of the ``tf`` URL argument set in request()
time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}


def request(query, params):
    """Assemble URL, headers and cookies of the HTTP request sent to Brave.

    The search term and the optional spell checking go into the URL; page
    offset and time filter are added only in the ``search`` category.  Safe
    search, region and UI language are passed to Brave by cookies.
    """

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    url_args = {'q': query}
    if brave_spellcheck:
        url_args['spellcheck'] = '1'

    if brave_category == 'search':
        # the offset is the page number minus one, page one is requested
        # without an ``offset`` argument
        page_offset = params.get('pageno', 1) - 1
        if page_offset:
            url_args['offset'] = page_offset

        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            url_args['tf'] = time_filter

    params["url"] = f"{base_url}{brave_category}?{urlencode(url_args)}"

    # set properties in the cookies
    cookies = params['cookies']
    cookies['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    cookies['useLocation'] = '0'
    cookies['summarizer'] = '0'

    # only the part after the last '-' of the region tag is sent as country
    engine_region = traits.get_region(params['searxng_locale'], 'all')
    cookies['country'] = engine_region.rpartition('-')[2].lower()  # type: ignore

    cookies['ui_lang'] = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')

    logger.debug("cookies %s", params['cookies'])


def response(resp):
    """Dispatch the HTTP response to the parser of the :py:obj:`brave_category`.

    The ``search`` category is parsed from the HTML.  For all other categories
    the results are read from the JavaScript ``const data = ..`` assignment
    embedded in the page.

    :raises ValueError: if :py:obj:`brave_category` is none of ``search``,
        ``news``, ``images`` or ``videos``.
    """

    if brave_category == 'search':
        return _parse_search(resp)

    # Take the first line with the assignment, drop the assignment itself and
    # the last character of the stripped line.
    marker = "const data = "
    js_data = next(
        (line.replace(marker, "").strip()[:-1] for line in resp.text.split("\n") if marker in line),
        "",
    )

    payload = js_variable_to_python(js_data)[1]['data']['body']['response']

    if brave_category == 'news':
        return _parse_news(payload['news'])
    if brave_category == 'images':
        return _parse_images(payload)
    if brave_category == 'videos':
        return _parse_videos(payload)

    raise ValueError(f"Unsupported brave category: {brave_category}")


def _parse_search(resp):
    """Parse the HTML page of Brave's WEB search (category ``search``).

    :param resp: HTTP response from Brave, ``resp.text`` is the HTML document.
    :return: list of result items.  If the page has an answer box, the first
        item is an ``answer`` item.  Snippets without an absolute URL or
        without a title are skipped.
    """

    result_list = []
    dom = html.fromstring(resp.text)

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # Compare with None: the truth value of a lxml element depends on the
    # number of its child elements, an answer <div> holding only text is falsy
    # and would be dropped by a plain truth test.
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        result_list.append({'answer': extract_text(answer_tag), 'url': url})

    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(result, './/div[contains(@class, "title")]', 0, default=None)
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content_tag = eval_xpath_getindex(result, './/div[@class="snippet-description"]', 0, default='')
        img_src = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            video_img = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
            iframe_src = _get_iframe_src(url)
            if iframe_src:
                # embeddable video: switch to the video template
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = video_img
            else:
                # not embeddable: keep the default template, the image of the
                # video snippet replaces the thumbnail found above
                item['img_src'] = video_img

        result_list.append(item)

    return result_list


def _get_iframe_src(url):
    """Return the ``youtube-nocookie.com`` embed URL for a ``/watch?v=<id>`` link.

    ``None`` is returned when the path of ``url`` is not ``/watch`` or when the
    query string has no (non-empty) ``v`` argument.  If ``v`` is given more
    than once, the first value is used.
    """
    target = urlparse(url)
    if target.path != '/watch' or not target.query:
        return None

    video_ids = parse_qs(target.query).get('v')
    if not video_ids:
        return None
    return 'https://www.youtube-nocookie.com/embed/' + video_ids[0]


def _parse_news(json_resp):
    """Build the result items of the ``news`` category.

    Each entry of ``json_resp["results"]`` gives one item with ``url``,
    ``title`` and ``content``.  ``img_src`` is set only when the entry has a
    thumbnail (its value is not ``None``).
    """
    news_items = []

    for entry in json_resp["results"]:
        news = {
            'url': entry['url'],
            'title': entry['title'],
            'content': entry['description'],
        }
        thumbnail = entry['thumbnail']
        if thumbnail is not None:
            news['img_src'] = thumbnail['src']
        news_items.append(news)

    return news_items


def _parse_images(json_resp):
    """Build the result items of the ``images`` category.

    Each entry of ``json_resp["results"]`` gives one item for the
    ``images.html`` template.  Format and URL of the image are taken from the
    ``properties`` of the entry.
    """
    return [
        {
            'url': image['url'],
            'title': image['title'],
            'content': image['description'],
            'template': 'images.html',
            'img_format': image['properties']['format'],
            'source': image['source'],
            'img_src': image['properties']['url'],
        }
        for image in json_resp["results"]
    ]


def _parse_videos(json_resp):
    """Build the result items of the ``videos`` category.

    Each entry of ``json_resp["results"]`` gives one item for the
    ``videos.html`` template.  The duration of the video is stored in both
    fields, ``length`` and ``duration``.  ``thumbnail`` and ``iframe_src`` are
    set only when available.
    """
    videos = []

    for entry in json_resp["results"]:

        video_url = entry['url']
        video = {
            'url': video_url,
            'title': entry['title'],
            'content': entry['description'],
            'template': 'videos.html',
            'length': entry['video']['duration'],
        }
        video['duration'] = video['length']

        thumbnail = entry['thumbnail']
        if thumbnail is not None:
            video['thumbnail'] = thumbnail['src']

        embed_url = _get_iframe_src(video_url)
        if embed_url:
            video['iframe_src'] = embed_url

        videos.append(video)

    return videos


def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    Both are read from Brave's settings page: the options of the language
    select box fill ``engine_traits.custom["ui_lang"]`` and the countries
    listed in the sidebar fill ``engine_traits.regions``.  Problems are
    reported by ``print``.
    """

    # pylint: disable=import-outside-toplevel

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    engine_traits.custom["ui_lang"] = {}
    ui_languages = engine_traits.custom["ui_lang"]

    # language tags from babel that are replaced before the locale is built
    lang_map = {'no': 'nb'}  # norway

    resp = get('https://search.brave.com/settings', headers={'Accept-Encoding': 'gzip, deflate'})

    # a failed request is only reported, the page is parsed nevertheless
    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    # languages (UI)

    for option in dom.xpath('//div[@id="language-select"]//option'):

        brave_lang = option.get('value')
        try:
            if '-' in brave_lang:
                babel_locale = babel.Locale.parse(brave_lang, sep='-')
                sxng_tag = region_tag(babel_locale)
            else:
                babel_locale = babel.Locale.parse(brave_lang)
                sxng_tag = language_tag(babel_locale)

        except babel.UnknownLocaleError:
            print("ERROR: can't determine babel locale of Brave's (UI) language %s" % brave_lang)
            continue

        # the first mapping of a SearXNG tag wins, a different second one is
        # reported as a conflict
        known = ui_languages.get(sxng_tag)
        if not known:
            ui_languages[sxng_tag] = brave_lang
        elif known != brave_lang:
            print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, known, brave_lang))

    # search regions of brave

    engine_traits.all_locale = 'all'

    for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'):

        # the country tag is part of a CSS class name of the flag (flag-<tag>)
        flag = country.xpath('./span[contains(@class, "flag")]')[0]
        flag_classes = flag.xpath('./@class')[0]
        country_tag = re.search(r'flag-([^\s]*)\s', flag_classes).group(1)  # type: ignore

        # add official languages of the country ..
        for official_lang in babel.languages.get_official_languages(country_tag, de_facto=True):
            official_lang = lang_map.get(official_lang, official_lang)
            sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (official_lang, country_tag.upper())))

            known = engine_traits.regions.get(sxng_tag)
            if known and known != country_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, known, country_tag))
                continue
            engine_traits.regions[sxng_tag] = country_tag
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`# lint: pylint`
			"""Brave supports the categories listed in :py:obj:`brave_category` (General,
			news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
			<time_range_support>` is limited (see remarks).

			Configured ``brave`` engines:

			`.. code:: yaml`

			`- name: brave`
			`engine: brave`
			`...`
			`brave_category: search`
			`time_range_support: true`
			`paging: true`

			`- name: brave.images`
			`engine: brave`
			`...`
			`brave_category: images`

			`- name: brave.videos`
			`engine: brave`
			`...`
			`brave_category: videos`

			`- name: brave.news`
			`engine: brave`
			`...`
			`brave_category: news`


[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`.. _brave regions:`

			`Brave regions`
			`=============`

			Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
[fix] spelling 2023-09-15 09:53:03 +02:00			`locales. To get a mapping, all officiat de-facto languages of the Brave`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			region are mapped to regions in SearXNG (see :py:obj:`babel
			<babel.languages.get_official_languages>`):

			`.. code:: python`

			`"regions": {`
			`..`
			`"en-CA": "ca",`
			`"fr-CA": "ca",`
			`..`
			`}`


			`.. note::`

			`The language (aka region) support of Brave's index is limited to very basic`
			`languages. The search results for languages like Chinese or Arabic are of`
			`low quality.`


			`.. _brave languages:`

			`Brave languages`
			`===============`

[fix] spelling 2023-09-15 09:53:03 +02:00			`Brave's language support is limited to the UI (menus, area local notations,`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`etc). Brave's index only seems to support a locale, but it does not seem to`
			`support any languages in its index. The choice of available languages is very`
[fix] spelling 2023-09-15 09:53:03 +02:00			`small (and its not clear to me where the difference in UI is when switching`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`from en-us to en-ca or en-gb).`

			In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
			UI languages are stored in a custom field named ``ui_lang``:

			`.. code:: python`

			`"custom": {`
			`"ui_lang": {`
			`"ca": "ca",`
			`"de-DE": "de-de",`
			`"en-CA": "en-ca",`
			`"en-GB": "en-gb",`
			`"en-US": "en-us",`
			`"es": "es",`
			`"fr-CA": "fr-ca",`
			`"fr-FR": "fr-fr",`
			`"ja-JP": "ja-jp",`
			`"pt-BR": "pt-br",`
			`"sq-AL": "sq-al"`
			`}`
			`},`

[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`Implementations`
			`===============`

[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00			`"""`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`from typing import TYPE_CHECKING`

			`import re`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`from urllib.parse import (`
			`urlencode,`
			`urlparse,`
			`parse_qs,`
			`)`
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`from lxml import html`

[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`from searx import locales`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`from searx.utils import (`
			`extract_text,`
			`eval_xpath_list,`
			`eval_xpath_getindex,`
Replace chompjs with pure Python code The new implementation is good enough for the current usage (brave) 2023-09-09 12:18:39 +02:00			`js_variable_to_python,`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`)`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`from searx.enginelib.traits import EngineTraits`

			`if TYPE_CHECKING:`
			`import logging`

			`logger: logging.Logger`

			`traits: EngineTraits`
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00
			`about = {`
			`"website": 'https://search.brave.com/',`
			`"wikidata_id": 'Q22906900',`
			`"official_api_documentation": None,`
			`"use_official_api": False,`
			`"require_api_key": False,`
			`"results": 'HTML',`
			`}`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00			`base_url = "https://search.brave.com/"`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`categories = []`
			`brave_category = 'search'`
			`"""Brave supports common web-search, video search, image and video search.`

			- ``search``: Common WEB search
			- ``videos``: search for videos
			- ``images``: search for images
			- ``news``: search for news
			`"""`

			`brave_spellcheck = False`
			`"""Brave supports some kind of spell checking. When activated, Brave tries to`
			fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
			`the UI of Brave the user gets warned about this, since we can not warn the user`
			`in SearXNG, the spellchecking is disabled by default.`
			`"""`

			`send_accept_language_header = True`
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00			`paging = False`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
			`category All)."""`

			`safesearch = True`
			`safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off`

			`time_range_support = False`
			"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
			`category All)."""`

			`time_range_map = {`
			`'day': 'pd',`
			`'week': 'pw',`
			`'month': 'pm',`
			`'year': 'py',`
			`}`
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00
			`def request(query, params):`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787`
			`params['headers']['Accept-Encoding'] = 'gzip, deflate'`

[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00			`args = {`
			`'q': query,`
			`}`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`if brave_spellcheck:`
			`args['spellcheck'] = '1'`

			`if brave_category == 'search':`
			`if params.get('pageno', 1) - 1:`
			`args['offset'] = params.get('pageno', 1) - 1`
			`if time_range_map.get(params['time_range']):`
			`args['tf'] = time_range_map.get(params['time_range'])`

			`params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"`

[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`# set properties in the cookies`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')`
			`# the useLocation is IP based, we use cookie 'country' for the region`
			`params['cookies']['useLocation'] = '0'`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`params['cookies']['summarizer'] = '0'`
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`engine_region = traits.get_region(params['searxng_locale'], 'all')`
			`params['cookies']['country'] = engine_region.split('-')[-1].lower() # type: ignore`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')`
			`params['cookies']['ui_lang'] = ui_lang`

			`logger.debug("cookies %s", params['cookies'])`
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00			`def response(resp):`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`if brave_category == 'search':`
			`return _parse_search(resp)`
[feat] engine: brave - support for images 2023-08-05 19:46:04 +02:00
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00			`datastr = ""`
			`for line in resp.text.split("\n"):`
			`if "const data = " in line:`
			`datastr = line.replace("const data = ", "").strip()[:-1]`
			`break`

Replace chompjs with pure Python code The new implementation is good enough for the current usage (brave) 2023-09-09 12:18:39 +02:00			`json_data = js_variable_to_python(datastr)`
[feat] engine: brave - support for news 2023-08-05 20:35:04 +02:00			`json_resp = json_data[1]['data']['body']['response']`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`if brave_category == 'news':`
[fix] brave.news 2023-09-15 20:57:03 +02:00			`return _parse_news(json_resp['news'])`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`if brave_category == 'images':`
			`return _parse_images(json_resp)`
			`if brave_category == 'videos':`
			`return _parse_videos(json_resp)`

[mod] engine brave: raise error on unsupported category 2023-08-10 12:19:03 +02:00			`raise ValueError(f"Unsupported brave category: {brave_category}")`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00

			`def _parse_search(resp):`

			`result_list = []`
			`dom = html.fromstring(resp.text)`

			`answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)`
			`if answer_tag:`
[mod] brave: support for showing the answer source url 2023-08-08 18:18:28 +02:00			`url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)`
			`result_list.append({'answer': extract_text(answer_tag), 'url': url})`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'`
[fix] engine - brave 2023-09-12 09:23:24 +02:00			`xpath_results = '//div[contains(@class, "snippet ")]'`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`for result in eval_xpath_list(dom, xpath_results):`

[fix] engine - brave 2023-09-12 09:23:24 +02:00			`url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)`
			`title_tag = eval_xpath_getindex(result, './/div[contains(@class, "title")]', 0, default=None)`
[fix] engine - brave don't show ads 2023-09-21 15:30:00 +02:00			`if url is None or title_tag is None or not urlparse(url).netloc: # partial url likely means it's an ad`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`continue`

[fix] engine - brave 2023-09-12 09:23:24 +02:00			`content_tag = eval_xpath_getindex(result, './/div[@class="snippet-description"]', 0, default='')`
			`img_src = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
			`item = {`
			`'url': url,`
			`'title': extract_text(title_tag),`
			`'content': extract_text(content_tag),`
			`'img_src': img_src,`
			`}`

			`video_tag = eval_xpath_getindex(`
			`result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None`
			`)`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`if video_tag is not None:`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00
[fix] spelling 2023-09-15 09:53:03 +02:00			`# In my tests a video tag in the WEB search was most often not a`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`# video, except the ones from youtube ..`

			`iframe_src = _get_iframe_src(url)`
			`if iframe_src:`
			`item['iframe_src'] = iframe_src`
			`item['template'] = 'videos.html'`
			`item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')`
			`else:`
			`item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')`

			`result_list.append(item)`

			`return result_list`


			`def _get_iframe_src(url):`
			`parsed_url = urlparse(url)`
			`if parsed_url.path == '/watch' and parsed_url.query:`
			`video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore`
			`if video_id:`
			`return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore`
			`return None`


			`def _parse_news(json_resp):`
			`result_list = []`
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00
[feat] engine: brave - support for news 2023-08-05 20:35:04 +02:00			`for result in json_resp["results"]:`
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00			`item = {`
			`'url': result['url'],`
			`'title': result['title'],`
			`'content': result['description'],`
			`}`
[fix] brave.news 2023-09-15 20:57:03 +02:00			`if result['thumbnail'] is not None:`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`item['img_src'] = result['thumbnail']['src']`
			`result_list.append(item)`

			`return result_list`


			`def _parse_images(json_resp):`
			`result_list = []`

			`for result in json_resp["results"]:`
			`item = {`
			`'url': result['url'],`
			`'title': result['title'],`
			`'content': result['description'],`
			`'template': 'images.html',`
			`'img_format': result['properties']['format'],`
			`'source': result['source'],`
			`'img_src': result['properties']['url'],`
			`}`
			`result_list.append(item)`

			`return result_list`


			`def _parse_videos(json_resp):`
			`result_list = []`

			`for result in json_resp["results"]:`

			`url = result['url']`
			`item = {`
			`'url': url,`
			`'title': result['title'],`
			`'content': result['description'],`
			`'template': 'videos.html',`
			`'length': result['video']['duration'],`
			`'duration': result['video']['duration'],`
			`}`

[fix] brave.videos 2023-09-15 21:33:23 +02:00			`if result['thumbnail'] is not None:`
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00			`item['thumbnail'] = result['thumbnail']['src']`

[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`iframe_src = _get_iframe_src(url)`
			`if iframe_src:`
			`item['iframe_src'] = iframe_src`
[feat] engine: brave - support for news 2023-08-05 20:35:04 +02:00
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`result_list.append(item)`
[feat] engine: brave - support for videos 2023-08-05 20:25:10 +02:00
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 19:35:56 +02:00			`return result_list`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00

			`def fetch_traits(engine_traits: EngineTraits):`
			"""Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
			regions>` from Brave."""

			`# pylint: disable=import-outside-toplevel`

			`import babel.languages`
			`from searx.locales import region_tag, language_tag`
			`from searx.network import get # see https://github.com/searxng/searxng/issues/762`

			`engine_traits.custom["ui_lang"] = {}`

			`headers = {`
			`'Accept-Encoding': 'gzip, deflate',`
			`}`
			`lang_map = {'no': 'nb'} # norway`

			`# languages (UI)`

			`resp = get('https://search.brave.com/settings', headers=headers)`

			`if not resp.ok: # type: ignore`
			`print("ERROR: response from Brave is not OK.")`
			`dom = html.fromstring(resp.text) # type: ignore`

			`for option in dom.xpath('//div[@id="language-select"]//option'):`

			`ui_lang = option.get('value')`
			`try:`
			`if '-' in ui_lang:`
			`sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))`
			`else:`
			`sxng_tag = language_tag(babel.Locale.parse(ui_lang))`

			`except babel.UnknownLocaleError:`
			`print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)`
			`continue`

			`conflict = engine_traits.custom["ui_lang"].get(sxng_tag)`
			`if conflict:`
			`if conflict != ui_lang:`
			`print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))`
			`continue`
			`engine_traits.custom["ui_lang"][sxng_tag] = ui_lang`

			`# search regions of brave`

			`engine_traits.all_locale = 'all'`

			`for country in dom.xpath('//div[@id="sidebar"]//ul/li/div[contains(@class, "country")]'):`

			`flag = country.xpath('./span[contains(@class, "flag")]')[0]`
			`# country_name = extract_text(flag.xpath('./following-sibling::*')[0])`
			`country_tag = re.search(r'flag-([^\s]*)\s', flag.xpath('./@class')[0]).group(1) # type: ignore`

[fix] spelling 2023-09-15 09:53:03 +02:00			`# add official languages of the country ..`
[mod] brave engines: add fetch_traits() / improve language support Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-08 11:20:10 +02:00			`for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):`
			`lang_tag = lang_map.get(lang_tag, lang_tag)`
			`sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))`
			`# print("%-20s: %s <-- %s" % (country_name, country_tag, sxng_tag))`

			`conflict = engine_traits.regions.get(sxng_tag)`
			`if conflict:`
			`if conflict != country_tag:`
			`print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))`
			`continue`
			`engine_traits.regions[sxng_tag] = country_tag`