From 050451347b021d05d26c7a0797c790bbd83442e4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 19 Oct 2024 14:19:27 +0200 Subject: [PATCH] [fix] engine: duckduckgo - CAPTCHA detection The previous implementation could not distinguish a CAPTCHA response from an ordinary result list: a CAPTCHA response was treated as a result list containing no items. DDG does not block IPs. Instead, it places a CAPTCHA wall in front of requests it considers dubious. Signed-off-by: Markus Heiser --- searx/engines/duckduckgo.py | 11 +++++++++++ searx/exceptions.py | 12 +++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 27171778d..2a917ed7a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -25,6 +25,7 @@ from searx.network import get # see https://github.com/searxng/searxng/issues/7 from searx import redisdb from searx.enginelib.traits import EngineTraits from searx.utils import extr +from searx.exceptions import SearxEngineCaptchaException if TYPE_CHECKING: import logging @@ -292,6 +293,15 @@ def request(query, params): return params +def detect_ddg_captcha(dom): + """In case of CAPTCHA ddg open its own *not a Robot* dialog and is + not redirected to CAPTCHA page. + """ + if eval_xpath(dom, "//form[@id='challenge-form']"): + # set suspend time to zero is OK --> ddg does not block the IP + raise SearxEngineCaptchaException(suspended_time=0) + + def response(resp): if resp.status_code == 303: @@ -299,6 +309,7 @@ def response(resp): results = [] doc = lxml.html.fromstring(resp.text) + detect_ddg_captcha(doc) result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') diff --git a/searx/exceptions.py b/searx/exceptions.py index 77c3f998d..3d720467f 100644 --- a/searx/exceptions.py +++ b/searx/exceptions.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """Exception types raised by SearXNG modules. 
""" +from __future__ import annotations from typing import Optional, Union @@ -61,7 +62,7 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException): """This settings contains the default suspended time (default 86400 sec / 1 day).""" - def __init__(self, suspended_time: int = None, message: str = 'Access denied'): + def __init__(self, suspended_time: int | None = None, message: str = 'Access denied'): """Generic exception to raise when an engine denies access to the results. :param suspended_time: How long the engine is going to be suspended in @@ -70,12 +71,13 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException): :param message: Internal message. Defaults to ``Access denied`` :type message: str """ - suspended_time = suspended_time or self._get_default_suspended_time() + if suspended_time is None: + suspended_time = self._get_default_suspended_time() super().__init__(message + ', suspended_time=' + str(suspended_time)) self.suspended_time = suspended_time self.message = message - def _get_default_suspended_time(self): + def _get_default_suspended_time(self) -> int: from searx import get_setting # pylint: disable=C0415 return get_setting(self.SUSPEND_TIME_SETTING) @@ -88,7 +90,7 @@ class SearxEngineCaptchaException(SearxEngineAccessDeniedException): """This settings contains the default suspended time (default 86400 sec / 1 day).""" - def __init__(self, suspended_time=None, message='CAPTCHA'): + def __init__(self, suspended_time: int | None = None, message='CAPTCHA'): super().__init__(message=message, suspended_time=suspended_time) @@ -102,7 +104,7 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException): """This settings contains the default suspended time (default 3660 sec / 1 hour).""" - def __init__(self, suspended_time=None, message='Too many request'): + def __init__(self, suspended_time: int | None = None, message='Too many request'): super().__init__(message=message, suspended_time=suspended_time)