From f79b0fce06936878d334a3927e2075f940a9ac46 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sun, 19 Dec 2021 11:01:50 +0100 Subject: [PATCH] [enh] limiter plugin can replace filtron: * rate limite the number of request per IP and per (IP, User-Agent) * block some bots use Redis data stored in Redis never contains the IP addresses, only HMAC using the secret_key Co-authored-by: Markus Heiser --- docs/src/searx.plugins.limiter.rst | 9 ++ searx/plugins/limiter.py | 129 +++++++++++++++++++++++++++++ searx/settings.yml | 1 + searx/settings_defaults.py | 1 + 4 files changed, 140 insertions(+) create mode 100644 docs/src/searx.plugins.limiter.rst create mode 100644 searx/plugins/limiter.py diff --git a/docs/src/searx.plugins.limiter.rst b/docs/src/searx.plugins.limiter.rst new file mode 100644 index 000000000..4984cd37a --- /dev/null +++ b/docs/src/searx.plugins.limiter.rst @@ -0,0 +1,9 @@ +.. _limiter plugin: + +============== +Limiter Plugin +============== + +.. automodule:: searx.plugins.limiter + :members: + diff --git a/searx/plugins/limiter.py b/searx/plugins/limiter.py new file mode 100644 index 000000000..eb0704c9c --- /dev/null +++ b/searx/plugins/limiter.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +# pyright: basic +"""Some bot protection / rate limitation + +Enable the plugin in ``settings.yml``: + +- ``server.limiter: true`` +- ``redis.url: ...`` check the value, see :ref:`settings redis` +""" + +import hmac +import re +from flask import request + +from searx.shared import redisdb + +name = "Request limiter" +description = "Limit the number of request" +default_on = False +preference_section = 'service' + + +re_bot = re.compile( + r'(' + + r'[Cc][Uu][Rr][Ll]|[wW]get|Scrapy|splash|JavaFX|FeedFetcher|python-requests|Go-http-client|Java|Jakarta|okhttp' + + r'|HttpClient|Jersey|Python|libwww-perl|Ruby|SynHttpClient|UniversalFeedParser|Googlebot|GoogleImageProxy' + + r'|bingbot|Baiduspider|yacybot|YandexMobileBot|YandexBot|Yahoo! Slurp|MJ12bot|AhrefsBot|archive.org_bot|msnbot' + + r'|MJ12bot|SeznamBot|linkdexbot|Netvibes|SMTBot|zgrab|James BOT|Sogou|Abonti|Pixray|Spinn3r|SemrushBot|Exabot' + + r'|ZmEu|BLEXBot|bitlybot' + + r')' +) + + +def is_accepted_request(inc_get_counter) -> bool: + # pylint: disable=too-many-return-statements + user_agent = request.headers.get('User-Agent', '') + x_forwarded_for = request.headers.get('X-Forwarded-For', '') + + if request.path == '/image_proxy': + if re_bot.match(user_agent): + return False + return True + + c = inc_get_counter(interval=25, keys=[b'IP limit, all paths', x_forwarded_for]) + if c > 30: + return False + + c = inc_get_counter(interval=60, keys=[b'useragent limit, all paths', x_forwarded_for, user_agent]) + if c > 30: + return False + + if request.path in ('/', '/search') and ('q' in request.args or 'q' in request.form): + if re_bot.match(user_agent): + return False + + if 'Accept-Language' not in request.headers: + return False + + if request.headers.get('Connection') == 'close': + return False + + accept_encoding_list = [l.strip() for l in request.headers.get('Accept-Encoding', '').split(',')] + if 'gzip' not in accept_encoding_list or 'deflate' not in accept_encoding_list: + return False + + if 'text/html' not in request.accept_mimetypes: + return False + + if request.args.get('format', 'html') != 'html': + c = inc_get_counter(interval=3600, keys=[b'API limit', x_forwarded_for]) + if c > 4: + return False + return True + + +def create_inc_get_counter(redis_client, secret_key_bytes): + lua_script = """ + local slidingWindow = KEYS[1] + local key = KEYS[2] + local now = tonumber(redis.call('TIME')[1]) + local id = redis.call('INCR', 'counter') + if (id > 2^46) + then + redis.call('SET', 'count', 0) + end + redis.call('ZREMRANGEBYSCORE', key, 0, now - slidingWindow) + redis.call('ZADD', key, now, id) + local result = redis.call('ZCOUNT', key, 0, now+1) + redis.call('EXPIRE', key, slidingWindow) + return result + """ + script_sha = redis_client.script_load(lua_script) + + def inc_get_counter(interval, keys): + m = hmac.new(secret_key_bytes, digestmod='sha256') + for k in keys: + m.update(bytes(str(k), encoding='utf-8') or b'') + m.update(b"\0") + key = m.digest() + return redis_client.evalsha(script_sha, 2, interval, key) + + return inc_get_counter + + +def create_pre_request(get_aggregation_count): + def pre_request(): + if not is_accepted_request(get_aggregation_count): + return '', 429 + return None + + return pre_request + + +def init(app, settings): + + if not settings['server']['limiter']: + return False + + logger.debug("init limiter DB") # pylint: disable=undefined-variable + if not redisdb.init(): + logger.error("init limiter DB failed!!!") # pylint: disable=undefined-variable + return False + + redis_client = redisdb.client() + secret_key_bytes = bytes(settings['server']['secret_key'], encoding='utf-8') + inc_get_counter = create_inc_get_counter(redis_client, secret_key_bytes) + app.before_request(create_pre_request(inc_get_counter)) + return True diff --git a/searx/settings.yml b/searx/settings.yml index be068a10e..9d91e5329 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -43,6 +43,7 @@ server: port: 8888 bind_address: "127.0.0.1" base_url: false # Possible values: false or "https://example.org/location". + limiter: false # rate limit the number of request on the instance, block some bots # If your instance owns a /etc/searxng/settings.yml file, then set the following # values there. diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py index ff556e3bb..15b4524c6 100644 --- a/searx/settings_defaults.py +++ b/searx/settings_defaults.py @@ -163,6 +163,7 @@ SCHEMA = { 'server': { 'port': SettingsValue((int, str), 8888, 'SEARXNG_PORT'), 'bind_address': SettingsValue(str, '127.0.0.1', 'SEARXNG_BIND_ADDRESS'), + 'limiter': SettingsValue(bool, False), 'secret_key': SettingsValue(str, environ_name='SEARXNG_SECRET'), 'base_url': SettingsValue((False, str), False), 'image_proxy': SettingsValue(bool, False),