searxng/tests/unit/test_utils.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, invalid-name

import random
import string
import lxml.etree
from lxml import html
from parameterized.parameterized import parameterized

from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import utils
from tests import SearxTestCase


def random_string(length, choices=string.ascii_letters):
    """Return ``length`` characters, each drawn at random from ``choices``.

    ``choices`` defaults to the ASCII letters; a ``length`` of zero yields the
    empty string.
    """
    picked = []
    for _ in range(length):
        picked.append(random.choice(choices))
    return ''.join(picked)


class TestUtils(SearxTestCase):
    """Unit tests for the general-purpose helpers exposed by ``searx.utils``."""

    def test_gen_useragent(self):
        # The generated user agent is a string that starts with 'Mozilla'.
        self.assertIsInstance(utils.gen_useragent(), str)
        self.assertIsNotNone(utils.gen_useragent())
        self.assertTrue(utils.gen_useragent().startswith('Mozilla'))

    def test_searx_useragent(self):
        # The instance's own user agent is a string that starts with 'searx'.
        self.assertIsInstance(utils.searx_useragent(), str)
        self.assertIsNotNone(utils.searx_useragent())
        self.assertTrue(utils.searx_useragent().startswith('searx'))

    def test_html_to_text(self):
        # Only the visible text is expected back: the <style> and <script>
        # contents, the attributes and the surrounding whitespace are dropped.
        html_str = """
        <a href="/testlink" class="link_access_account">
            <style>
                .toto {
                    color: red;
                }
            </style>
            <span class="toto">
                <span>
                    <img src="test.jpg" />
                </span>
            </span>
            <span class="titi">
                            Test text
            </span>
            <script>value='dummy';</script>
        </a>
        """
        self.assertIsInstance(utils.html_to_text(html_str), str)
        self.assertIsNotNone(utils.html_to_text(html_str))
        self.assertEqual(utils.html_to_text(html_str), "Test text")
        # Text that merely looks like the start of a tag ("<!") must come back unchanged.
        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")

    def test_extract_text(self):
        # extract_text() is fed every kind of value an lxml XPath query returns
        # here: an element, lists of elements / text nodes / attribute values,
        # a float, a boolean and an empty list.
        html_str = """
        <a href="/testlink" class="link_access_account">
            <span class="toto">
                <span>
                    <img src="test.jpg" />
                </span>
            </span>
            <span class="titi">
                            Test text
            </span>
        </a>
        """
        dom = html.fromstring(html_str)
        self.assertEqual(utils.extract_text(dom), 'Test text')
        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
        self.assertEqual(utils.extract_text(dom.xpath('//span/text()')), 'Test text')
        self.assertEqual(utils.extract_text(dom.xpath('count(//span)')), '3.0')
        self.assertEqual(utils.extract_text(dom.xpath('boolean(//span)')), 'True')
        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')

    def test_extract_text_allow_none(self):
        # With allow_none=True a None input is passed through instead of raising.
        self.assertIsNone(utils.extract_text(None, allow_none=True))

    def test_extract_text_error_none(self):
        # Without allow_none, a None input is rejected.
        with self.assertRaises(ValueError):
            utils.extract_text(None)

    def test_extract_text_error_empty(self):
        # An unsupported input type (here: a dict) is rejected as well.
        with self.assertRaises(ValueError):
            utils.extract_text({})

    def test_extract_url(self):
        def f(html_str, search_url):
            # Parse the snippet first, so extract_url() receives an lxml element.
            return utils.extract_url(html.fromstring(html_str), search_url)

        # Absolute URLs keep their own scheme; the result carries a trailing slash.
        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
        # Protocol-relative URLs take the scheme of the search URL.
        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
        # Host-relative paths are resolved against the search URL.
        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
        # An empty document already fails while lxml parses it (inside f).
        with self.assertRaises(lxml.etree.ParserError):
            f('', 'https://example.com')
        # NOTE(review): assertRaises(Exception) is very broad; narrow it to the
        # concrete exception extract_url() raises for an empty result list.
        with self.assertRaises(Exception):
            utils.extract_url([], 'https://example.com')

    def test_html_to_text_invalid(self):
        # The mismatched closing tag (</i> for <b>) makes the markup invalid;
        # html_to_text() must not raise and returns the text before the bad tag.
        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")

    # NOTE(review): the method name misspells "unescape"; it is kept unchanged
    # so that the test ID stays stable.
    def test_ecma_unscape(self):
        # %XX and %uXXXX escapes are decoded; a '%' that does not introduce a
        # valid escape (as in '%xx' and '%u:') is left as is.
        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')


class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
    # Exercises the parser callbacks of the private ``utils._HTMLTextExtractor``.

    def setUp(self):
        # Every test gets its own extractor, so ``result`` starts out empty.
        extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
        self.html_text_extractor = extractor

    def test__init__(self):
        extractor = self.html_text_extractor
        self.assertEqual(extractor.result, [])

    @parameterized.expand(
        [
            # (character reference, character expected in the result)
            ('xF', '\x0f'),
            ('XF', '\x0f'),
            ('97', 'a'),
        ]
    )
    def test_handle_charref(self, charref: str, expected: str):
        extractor = self.html_text_extractor
        extractor.handle_charref(charref)
        collected = extractor.result
        self.assertIn(expected, collected)

    def test_handle_entityref(self):
        # 'test' is passed as the entity name and must show up in the result as is.
        extractor = self.html_text_extractor
        extractor.handle_entityref('test')
        self.assertIn('test', extractor.result)

    def test_invalid_html(self):
        # Feeding markup with a mismatched closing tag must raise the
        # extractor's own exception type.
        malformed = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
        expected_error = utils._HTMLTextExtractorException  # pylint: disable=protected-access
        with self.assertRaises(expected_error):
            self.html_text_extractor.feed(malformed)


class TestXPathUtils(SearxTestCase):
    """Unit tests for the XPath helpers of ``searx.utils``.

    Covers ``get_xpath``, ``eval_xpath``, ``eval_xpath_list`` and
    ``eval_xpath_getindex``; ``test_detect_language`` is hosted here as well.
    """

    # Small document shared by the eval_xpath* tests: exactly one <i> element
    # and no <p> / <a> elements.
    TEST_DOC = """<ul>
        <li>Text in <b>bold</b> and <i>italic</i> </li>
        <li>Another <b>text</b> <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs="></li>
        </ul>"""

    def test_get_xpath_cache(self):
        # The same XPath string must yield the very same object, a different
        # string a different object.
        xp1 = utils.get_xpath('//a')
        xp2 = utils.get_xpath('//div')
        xp3 = utils.get_xpath('//a')

        self.assertIs(xp1, xp3)
        self.assertIsNot(xp1, xp2)

    def test_get_xpath_type(self):
        # An already compiled XPath object is accepted (must not raise) ...
        utils.get_xpath(lxml.etree.XPath('//a'))

        # ... while any other type is rejected.
        with self.assertRaises(TypeError):
            utils.get_xpath([])

    def test_get_xpath_invalid(self):
        # A syntactically invalid expression is reported together with the
        # offending XPath string.
        invalid_xpath = '//a[0].text'
        with self.assertRaises(SearxXPathSyntaxException) as context:
            utils.get_xpath(invalid_xpath)

        self.assertEqual(context.exception.message, 'Invalid expression')
        self.assertEqual(context.exception.xpath_str, invalid_xpath)

    def test_eval_xpath_unregistered_function(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # 'int' is not a registered XPath function, so the evaluation fails.
        invalid_function_xpath = 'int(//a)'
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath(doc, invalid_function_xpath)

        self.assertEqual(context.exception.message, 'Unregistered function')
        self.assertEqual(context.exception.xpath_str, invalid_function_xpath)

    def test_eval_xpath(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # no match, text nodes and a numeric XPath result
        self.assertEqual(utils.eval_xpath(doc, '//p'), [])
        self.assertEqual(utils.eval_xpath(doc, '//i/text()'), ['italic'])
        self.assertEqual(utils.eval_xpath(doc, 'count(//i)'), 1.0)

    def test_eval_xpath_list(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # check a non-empty list
        self.assertEqual(utils.eval_xpath_list(doc, '//i/text()'), ['italic'])

        # check the min_len parameter: '//p' matches nothing, so min_len=1 fails
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_list(doc, '//p', min_len=1)
        self.assertEqual(context.exception.message, 'len(xpath_str) < 1')
        self.assertEqual(context.exception.xpath_str, '//p')

    def test_eval_xpath_getindex(self):
        doc = html.fromstring(TestXPathUtils.TEST_DOC)

        # check index 0
        self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 0), 'italic')

        # index out of range: the default 'something' is returned
        self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 1, default='something'), 'something')

        # index out of range: an explicit default of None is returned as well
        self.assertIsNone(utils.eval_xpath_getindex(doc, '//i/text()', 1, default=None))

        # index not found and no default given
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, '//i/text()', 1)
        self.assertEqual(context.exception.message, 'index 1 not found')

        # not a list: count() evaluates to a number
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, 'count(//i)', 1)
        self.assertEqual(context.exception.message, 'the result is not a list')

    # NOTE(review): this test is unrelated to XPath; it is kept in this class
    # so that the test ID stays stable.
    def test_detect_language(self):
        # Make sure new lines are not an issue:
        # fasttext.predict('') does not accept new lines.
        lang = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
        self.assertEqual(lang, 'en')

        lang = utils.detect_language(
            'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす'
        )
        self.assertEqual(lang, 'ja')

        lang = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
        self.assertEqual(lang, 'tr')

        # empty text --> None
        lang = utils.detect_language('')
        self.assertIsNone(lang)

        # mix languages --> None
        lang = utils.detect_language('The いろはにほへと Pijamalı')
        self.assertIsNone(lang)

        # anything but a string is rejected
        with self.assertRaises(ValueError):
            utils.detect_language(None)  # type: ignore
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								# SPDX-License-Identifier: AGPL-3.0-or-later
 								# pylint: disable=missing-module-docstring, invalid-name
-												[refactor] unit tests (continued) - plugins

This commit includes some refactoring in unit tests.  As we test more plugins,
it seems unwieldy to include every test class in the test_plugins.py file.  This
patch splits apart all of the test plugins into their own respective files,
including the new test_plugin_calculator.py file.

											
										
										
											2024-10-05 16:10:21 +02:00
+								import random
 								import string
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
+								import lxml.etree
 								from lxml import html
-												[refactor] unit tests to utilize paramaterized and break down monolithic tests

- for tests which perform the same arrange/act/assert pattern but with different
  data, the data portion has been moved to the ``parameterized.expand`` fields

- for monolithic tests which performed multiple arrange/act/asserts,
  they have been broken up into different unit tests.

- when possible, change generic assert statements to more concise
  asserts (i.e. ``assertIsNone``)

This work ultimately is focused on creating smaller and more concise tests.
While parameterized may make adding new configurations for existing tests
easier, that is just a beneficial side effect.  The main benefit is that smaller
tests are easier to reason about, meaning they are easier to debug when they
start failing.  This improves the developer experience in debugging what went
wrong when refactoring the project.

Total number of tests went from 192 -> 259; or, broke apart larger tests into 69
more concise ones.

											
										
										
											2024-09-24 05:37:30 +02:00
+								from parameterized.parameterized import parameterized
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
+								from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
+								from searx import utils
-												[mod] move searx/testing.py to the tests directory

move robot tests to tests.robot
manage calls "python -m tests.robot"

											
										
										
											2021-09-02 16:01:34 +02:00
+								from tests import SearxTestCase
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
-												[refactor] unit tests (continued) - plugins

This commit includes some refactoring in unit tests.  As we test more plugins,
it seems unwieldy to include every test class in the test_plugins.py file.  This
patch splits apart all of the test plugins into their own respective files,
including the new test_plugin_calculator.py file.

											
										
										
											2024-10-05 16:10:21 +02:00
+								def random_string(length, choices=string.ascii_letters):
 								    return ''.join(random.choice(choices) for _ in range(length))
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								class TestUtils(SearxTestCase):  # pylint: disable=missing-class-docstring
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
+								    def test_gen_useragent(self):
 								        self.assertIsInstance(utils.gen_useragent(), str)
 								        self.assertIsNotNone(utils.gen_useragent())
 								        self.assertTrue(utils.gen_useragent().startswith('Mozilla'))
-												A bit of utils unit tests

											
										
										
											2015-01-27 20:03:33 +01:00
+								    def test_searx_useragent(self):
 								        self.assertIsInstance(utils.searx_useragent(), str)
 								        self.assertIsNotNone(utils.searx_useragent())
 								        self.assertTrue(utils.searx_useragent().startswith('searx'))
 								    def test_html_to_text(self):
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
+								        html_str = """
 								        <a href="/testlink" class="link_access_account">
 								            <style>
 								                .toto {
 								                    color: red;
 								                }
 								            </style>
 								            <span class="toto">
 								                <span>
 								                    <img src="test.jpg" />
 								                </span>
 								            </span>
 								            <span class="titi">
 								                            Test text
 								            </span>
 								            <script>value='dummy';</script>
 								        </a>
 								        """
 								        self.assertIsInstance(utils.html_to_text(html_str), str)
 								        self.assertIsNotNone(utils.html_to_text(html_str))
 								        self.assertEqual(utils.html_to_text(html_str), "Test text")
-												[fix] HTMLParser: undocumented not implemented method

In python versions <py3.10 there is an issue with an undocumented method
HTMLParser.error() [1][2] that was deprecated in Python 3.4 and removed
in Python 3.5.

To be compatible to higher versions (>=py3.10) an error method is implemented
which throws an AssertionError exception like the higher Python versions do [3].

[1] https://github.com/python/cpython/issues/76025
[2] https://bugs.python.org/issue31844
[3] https://github.com/python/cpython/pull/8562

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2023-10-18 14:34:18 +02:00
+								        self.assertEqual(utils.html_to_text(r"regexp: (?<![a-zA-Z]"), "regexp: (?<![a-zA-Z]")
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
 								    def test_extract_text(self):
 								        html_str = """
-												A bit of utils unit tests

											
										
										
											2015-01-27 20:03:33 +01:00
+								        <a href="/testlink" class="link_access_account">
 								            <span class="toto">
 								                <span>
 								                    <img src="test.jpg" />
 								                </span>
 								            </span>
 								            <span class="titi">
 								                            Test text
 								            </span>
 								        </a>
 								        """
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
+								        dom = html.fromstring(html_str)
 								        self.assertEqual(utils.extract_text(dom), 'Test text')
 								        self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
+								        self.assertEqual(utils.extract_text(dom.xpath('//span/text()')), 'Test text')
 								        self.assertEqual(utils.extract_text(dom.xpath('count(//span)')), '3.0')
 								        self.assertEqual(utils.extract_text(dom.xpath('boolean(//span)')), 'True')
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
+								        self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
 								        self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
-												[refactor] unit tests to utilize paramaterized and break down monolithic tests

- for tests which perform the same arrange/act/assert pattern but with different
  data, the data portion has been moved to the ``parameterized.expand`` fields

- for monolithic tests which performed multiple arrange/act/asserts,
  they have been broken up into different unit tests.

- when possible, change generic assert statements to more concise
  asserts (i.e. ``assertIsNone``)

This work ultimately is focused on creating smaller and more concise tests.
While parameterized may make adding new configurations for existing tests
easier, that is just a beneficial side effect.  The main benefit is that smaller
tests are easier to reason about, meaning they are easier to debug when they
start failing.  This improves the developer experience in debugging what went
wrong when refactoring the project.

Total number of tests went from 192 -> 259; or, broke apart larger tests into 69
more concise ones.

											
										
										
											2024-09-24 05:37:30 +02:00
 								    def test_extract_text_allow_none(self):
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
+								        self.assertEqual(utils.extract_text(None, allow_none=True), None)
-												[refactor] unit tests to utilize paramaterized and break down monolithic tests

- for tests which perform the same arrange/act/assert pattern but with different
  data, the data portion has been moved to the ``parameterized.expand`` fields

- for monolithic tests which performed multiple arrange/act/asserts,
  they have been broken up into different unit tests.

- when possible, change generic assert statements to more concise
  asserts (i.e. ``assertIsNone``)

This work ultimately is focused on creating smaller and more concise tests.
While parameterized may make adding new configurations for existing tests
easier, that is just a beneficial side effect.  The main benefit is that smaller
tests are easier to reason about, meaning they are easier to debug when they
start failing.  This improves the developer experience in debugging what went
wrong when refactoring the project.

Total number of tests went from 192 -> 259; or, broke apart larger tests into 69
more concise ones.

											
										
										
											2024-09-24 05:37:30 +02:00
 								    def test_extract_text_error_none(self):
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
+								        with self.assertRaises(ValueError):
 								            utils.extract_text(None)
-												[refactor] unit tests to utilize paramaterized and break down monolithic tests

- for tests which perform the same arrange/act/assert pattern but with different
  data, the data portion has been moved to the ``parameterized.expand`` fields

- for monolithic tests which performed multiple arrange/act/asserts,
  they have been broken up into different unit tests.

- when possible, change generic assert statements to more concise
  asserts (i.e. ``assertIsNone``)

This work ultimately is focused on creating smaller and more concise tests.
While parameterized may make adding new configurations for existing tests
easier, that is just a beneficial side effect.  The main benefit is that smaller
tests are easier to reason about, meaning they are easier to debug when they
start failing.  This improves the developer experience in debugging what went
wrong when refactoring the project.

Total number of tests went from 192 -> 259; or, broke apart larger tests into 69
more concise ones.

											
										
										
											2024-09-24 05:37:30 +02:00
 								    def test_extract_text_error_empty(self):
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
+								        with self.assertRaises(ValueError):
 								            utils.extract_text({})
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
 								    def test_extract_url(self):
 								        def f(html_str, search_url):
 								            return utils.extract_url(html.fromstring(html_str), search_url)
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
-												[mod] move extract_text, extract_url to searx.utils

											
										
										
											2020-10-02 18:13:56 +02:00
+								        self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
 								        self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
 								        self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
 								        self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
 								        self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
 								        with self.assertRaises(lxml.etree.ParserError):
 								            f('', 'https://example.com')
 								        with self.assertRaises(Exception):
 								            utils.extract_url([], 'https://example.com')
-												A bit of utils unit tests

											
										
										
											2015-01-27 20:03:33 +01:00
-												[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception

Close #2188

											
										
										
											2020-09-11 10:23:56 +02:00
+								    def test_html_to_text_invalid(self):
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								        _html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
 								        self.assertEqual(utils.html_to_text(_html), "Lorem ipsum")
-												[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception

Close #2188

											
										
										
											2020-09-11 10:23:56 +02:00
-												[fix] fix flickr_noapi decoding (#1655)

Characters that were not ASCII were incorrectly decoded.
Add a helper function: searx.utils.ecma_unescape (Python implementation of the JavaScript unescape function).

											
										
										
											2019-08-02 13:37:13 +02:00
+								    def test_ecma_unscape(self):
 								        self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 09:26:22 +01:00
+								        self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')
 								        self.assertEqual(utils.ecma_unescape('text using %u: %u5409, %u4E16%u754c'), 'text using %u: 吉, 世界')
-												[fix] fix flickr_noapi decoding (#1655)

Characters that were not ASCII were incorrectly decoded.
Add a helper function: searx.utils.ecma_unescape (Python implementation of the JavaScript unescape function).

											
										
										
											2019-08-02 13:37:13 +02:00
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								class TestHTMLTextExtractor(SearxTestCase):  # pylint: disable=missing-class-docstring
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
+								    def setUp(self):
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								        self.html_text_extractor = utils._HTMLTextExtractor()  # pylint: disable=protected-access
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
 								    def test__init__(self):
 								        self.assertEqual(self.html_text_extractor.result, [])
-												[refactor] unit tests to utilize paramaterized and break down monolithic tests

- for tests which perform the same arrange/act/assert pattern but with different
  data, the data portion has been moved to the ``parameterized.expand`` fields

- for monolithic tests which performed multiple arrange/act/asserts,
  they have been broken up into different unit tests.

- when possible, change generic assert statements to more concise
  asserts (i.e. ``assertIsNone``)

This work ultimately is focused on creating smaller and more concise tests.
While parameterized may make adding new configurations for existing tests
easier, that is just a beneficial side effect.  The main benefit is that smaller
tests are easier to reason about, meaning they are easier to debug when they
start failing.  This improves the developer experience in debugging what went
wrong when refactoring the project.

Total number of tests went from 192 -> 259; or, broke apart larger tests into 69
more concise ones.

											
										
										
											2024-09-24 05:37:30 +02:00
+								    @parameterized.expand(
 								        [
 								            ('xF', '\x0f'),
 								            ('XF', '\x0f'),
 								            ('97', 'a'),
 								        ]
 								    )
 								    def test_handle_charref(self, charref: str, expected: str):
 								        self.html_text_extractor.handle_charref(charref)
 								        self.assertIn(expected, self.html_text_extractor.result)
-												Cover searx.utils

											
										
										
											2014-07-12 15:46:55 +02:00
 								    def test_handle_entityref(self):
 								        entity = 'test'
 								        self.html_text_extractor.handle_entityref(entity)
 								        self.assertIn(entity, self.html_text_extractor.result)
-												[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception

Close #2188

											
										
										
											2020-09-11 10:23:56 +02:00
+								    def test_invalid_html(self):
 								        text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								        with self.assertRaises(utils._HTMLTextExtractorException):  # pylint: disable=protected-access
-												[fix] searx.utils.HTMLTextExtractor: invalid HTML don't raise an Exception

Close #2188

											
										
										
											2020-09-11 10:23:56 +02:00
+								            self.html_text_extractor.feed(text)
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
-												[mod] pylint all files with one profile / drop PYLINT_SEARXNG_DISABLE_OPTION

In the past, some files were tested with the standard profile, others with a
profile in which most of the messages were switched off ... some files were not
checked at all.

- ``PYLINT_SEARXNG_DISABLE_OPTION`` has been abolished
- the distinction ``# lint: pylint`` is no longer necessary
- the pylint tasks have been reduced from three to two

  1. ./searx/engines -> lint engines with additional builtins
  2. ./searx ./searxng_extra ./tests -> lint all other python files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-11 14:06:26 +01:00
+								class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
 								    # HTML fragment used as the document under test by the XPath tests of this class:
 								    # a <ul> with two <li> items containing <b>, <i> and <img> elements (and no <p>).
 								    TEST_DOC = """<ul>
 								        <li>Text in <b>bold</b> and <i>italic</i> </li>
 								        <li>Another <b>text</b> <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs="></li>
 								        </ul>"""
 								    def test_get_xpath_cache(self):
 								        """``utils.get_xpath`` must return the very same object when asked twice for one expression.

 								        A different expression must yield a different object.
 								        """
 								        first_a = utils.get_xpath('//a')
 								        div = utils.get_xpath('//div')
 								        second_a = utils.get_xpath('//a')
 								        # Identity (not equality) is what is being checked here, so use the
 								        # dedicated identity assertions instead of comparing id() values.
 								        self.assertIs(first_a, second_a)
 								        self.assertIsNot(first_a, div)
 								    def test_get_xpath_type(self):
 								        """A compiled ``lxml.etree.XPath`` is accepted by ``utils.get_xpath``; a list raises ``TypeError``."""
 								        compiled = lxml.etree.XPath('//a')
 								        utils.get_xpath(compiled)
 								        self.assertRaises(TypeError, utils.get_xpath, [])
 								    def test_get_xpath_invalid(self):
 								        """A malformed expression must raise ``SearxXPathSyntaxException`` carrying message and expression."""
 								        bad_expr = '//a[0].text'
 								        with self.assertRaises(SearxXPathSyntaxException) as raised:
 								            utils.get_xpath(bad_expr)
 								        error = raised.exception
 								        self.assertEqual(error.message, 'Invalid expression')
 								        self.assertEqual(error.xpath_str, bad_expr)
 								    def test_eval_xpath_unregistered_function(self):
 								        """Evaluating an expression that calls an unknown function must raise ``SearxEngineXPathException``."""
 								        unknown_func_expr = 'int(//a)'
 								        tree = html.fromstring(TestXPathUtils.TEST_DOC)
 								        with self.assertRaises(SearxEngineXPathException) as raised:
 								            utils.eval_xpath(tree, unknown_func_expr)
 								        error = raised.exception
 								        self.assertEqual(error.message, 'Unregistered function')
 								        self.assertEqual(error.xpath_str, unknown_func_expr)
 								    def test_eval_xpath(self):
 								        """``utils.eval_xpath`` results for a missing node, a text node and a ``count()`` call."""
 								        tree = html.fromstring(TestXPathUtils.TEST_DOC)
 								        expectations = (
 								            ('//p', []),
 								            ('//i/text()', ['italic']),
 								            ('count(//i)', 1.0),
 								        )
 								        # Checked in order; the first mismatch fails the test.
 								        for xpath_str, expected in expectations:
 								            self.assertEqual(utils.eval_xpath(tree, xpath_str), expected)
 								    def test_eval_xpath_list(self):
 								        """``utils.eval_xpath_list`` returns the matches and enforces the ``min_len`` argument."""
 								        tree = html.fromstring(TestXPathUtils.TEST_DOC)

 								        # a non-empty result is returned as a list
 								        matches = utils.eval_xpath_list(tree, '//i/text()')
 								        self.assertEqual(matches, ['italic'])

 								        # fewer results than min_len must raise
 								        with self.assertRaises(SearxEngineXPathException) as raised:
 								            utils.eval_xpath_list(tree, '//p', min_len=1)
 								        error = raised.exception
 								        self.assertEqual(error.message, 'len(xpath_str) < 1')
 								        self.assertEqual(error.xpath_str, '//p')
 								    def test_eval_xpath_getindex(self):
 								        doc = html.fromstring(TestXPathUtils.TEST_DOC)
 								        # check index 0
 								        self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 0), 'italic')
 								        # default is 'something'
 								        self.assertEqual(utils.eval_xpath_getindex(doc, '//i/text()', 1, default='something'), 'something')
 								        # default is None
-												[refactor] unit tests to utilize parameterized and break down monolithic tests

- for tests which perform the same arrange/act/assert pattern but with different
  data, the data portion has been moved to the ``parameterized.expand`` fields

- for monolithic tests which performed multiple arrange/act/asserts,
  they have been broken up into different unit tests.

- when possible, change generic assert statements to more concise
  asserts (i.e. ``assertIsNone``)

This work ultimately is focused on creating smaller and more concise tests.
While parameterized may make adding new configurations for existing tests
easier, that is just a beneficial side effect.  The main benefit is that smaller
tests are easier to reason about, meaning they are easier to debug when they
start failing.  This improves the developer experience in debugging what went
wrong when refactoring the project.

Total number of tests went from 192 -> 259; or, broke apart larger tests into 69
more concise ones.

											
										
										
											2024-09-24 05:37:30 +02:00
+								        self.assertIsNone(utils.eval_xpath_getindex(doc, '//i/text()', 1, default=None))
-												[enh] record details exception per engine

add a new API /stats/errors

											
										
										
											2020-11-26 15:12:11 +01:00
 								        # index not found
 								        with self.assertRaises(SearxEngineXPathException) as context:
 								            utils.eval_xpath_getindex(doc, '//i/text()', 1)
 								        self.assertEqual(context.exception.message, 'index 1 not found')
 								        # not a list
 								        with self.assertRaises(SearxEngineXPathException) as context:
 								            utils.eval_xpath_getindex(doc, 'count(//i)', 1)
 								        self.assertEqual(context.exception.message, 'the result is not a list')
-												Replace langdetect with fasttext

											
										
										
											2022-12-11 16:45:47 +01:00
 								    def test_detect_language(self):
 								        # make sure new line are not an issue
 								        # fasttext.predict('') does not accept new line.
 								        l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
 								        self.assertEqual(l, 'en')
-												[black] upgrade black 22.12.0 --> 24.2.0

The issue discussed in [1] has been solved since [2] has been merged into black
/ now we can upgrade without touching 69 files as it was needed with black
23.1.0 [3].

[1] https://github.com/searxng/searxng/pull/2159#issuecomment-1425723977
[2] https://github.com/psf/black/pull/4060
[3] https://github.com/searxng/searxng/pull/2159/files

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-03-08 18:22:31 +01:00
+								        l = utils.detect_language(
 								            'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす'
 								        )
-												Replace langdetect with fasttext

											
										
										
											2022-12-11 16:45:47 +01:00
+								        self.assertEqual(l, 'ja')
 								        l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
 								        self.assertEqual(l, 'tr')
 								        l = utils.detect_language('')
 								        self.assertIsNone(l)
 								        # mix languages --> None
 								        l = utils.detect_language('The いろはにほへと Pijamalı')
 								        self.assertIsNone(l)
 								        with self.assertRaises(ValueError):
-												[refactor] unit tests (continued) - plugins

This commit includes some refactoring in unit tests.  As we test more plugins,
it seems unwieldy to include every test class in the test_plugins.py file.  This
patch split apart all of the test plugins to their own respective files,
including the new test_plugin_calculator.py file.

											
										
										
											2024-10-05 16:10:21 +02:00
+								            utils.detect_language(None)  # type: ignore