2023-05-23 18:16:37 +02:00
|
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
# lint: pylint
|
|
|
|
"""
|
|
|
|
Method ``http_user_agent``
|
|
|
|
--------------------------
|
|
|
|
|
|
|
|
The ``http_user_agent`` method evaluates a request as the request of a bot if
|
|
|
|
the User-Agent_ header is unset or matches the regular expression
|
|
|
|
:py:obj:`USER_AGENT`.
|
|
|
|
|
|
|
|
.. _User-Agent:
|
|
|
|
https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent
|
|
|
|
|
|
|
|
"""
|
2023-05-26 17:24:43 +02:00
|
|
|
# pylint: disable=unused-argument
|
2023-05-23 18:16:37 +02:00
|
|
|
|
2023-06-01 15:41:48 +02:00
|
|
|
from __future__ import annotations
|
2023-05-23 18:16:37 +02:00
|
|
|
import re
|
2023-06-01 15:41:48 +02:00
|
|
|
from ipaddress import (
|
|
|
|
IPv4Network,
|
|
|
|
IPv6Network,
|
|
|
|
)
|
|
|
|
|
2023-05-23 18:16:37 +02:00
|
|
|
import flask
|
2023-05-28 18:58:31 +02:00
|
|
|
import werkzeug
|
2023-05-23 18:16:37 +02:00
|
|
|
|
2023-05-26 17:24:43 +02:00
|
|
|
from searx.tools import config
|
2023-05-28 18:58:31 +02:00
|
|
|
from ._helpers import too_many_requests
|
2023-05-26 17:24:43 +02:00
|
|
|
|
|
|
|
|
2023-05-23 18:16:37 +02:00
|
|
|
# NOTE: ``filter_request`` applies the compiled pattern with ``.match()``, so
# every alternative is anchored at the start of the User-Agent value.  A token
# that may appear anywhere in the header needs a leading ``.*`` (see PetalBot).
USER_AGENT = (
    r'('
    + r'unknown'
    + r'|[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp'
    + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy'
    # the dot in ``archive.org_bot`` is escaped so that it only matches a literal dot
    + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive\.org_bot|msnbot'
    + r'|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot'
    + r'|ZmEu|BLEXBot|bitlybot'
    # unmaintained Farside instances
    + r'|'
    + re.escape(r'Mozilla/5.0 (compatible; Farside/0.1.0; +https://farside.link)')
    # other bots and clients to block
    + r'|.*PetalBot.*'
    + r')'
)
"""Regular expression that matches the User-Agent_ of known *bots*"""
|
|
|
|
|
|
|
|
# Module-level cache for the compiled USER_AGENT pattern; filled on first use.
_regexp = None


def regexp_user_agent():
    """Return the compiled :py:obj:`USER_AGENT` regular expression.

    The pattern is compiled on the first call and stored in the module-level
    ``_regexp``; later calls return that stored object.
    """
    global _regexp  # pylint: disable=global-statement
    if _regexp:
        return _regexp
    _regexp = re.compile(USER_AGENT)
    return _regexp
|
|
|
|
|
|
|
|
|
2023-06-01 15:41:48 +02:00
|
|
|
def filter_request(
    network: IPv4Network | IPv6Network,
    request: flask.Request,
    cfg: config.Config,
) -> werkzeug.Response | None:
    """Reject the request when its User-Agent header looks like a bot.

    A missing ``User-Agent`` header is treated as the literal value
    ``unknown``.  The value is tested from its start against the pattern
    returned by :py:obj:`regexp_user_agent`.

    :param network: passed through to ``too_many_requests`` on a match.
    :param request: the incoming request whose headers are inspected.
    :param cfg: not used by this method.
    :return: the result of ``too_many_requests`` when the header matches,
        otherwise ``None``.
    """
    user_agent = request.headers.get('User-Agent', 'unknown')
    if regexp_user_agent().match(user_agent) is None:
        # no bot signature at the start of the header value
        return None
    return too_many_requests(network, f"bot detected, HTTP header User-Agent: {user_agent}")
|