impr(CLDSRV-796): Handle redis unavailability during rate limit token refill

tmacro · tmacro · commit c0ee91a84b7e · 2025-12-16T10:34:23.000-08:00
diff --git a/.github/actions/setup-ci/action.yaml b/.github/actions/setup-ci/action.yaml
@@ -50,5 +50,5 @@ runs:
     - name: enable rate limiting in config and set the lisa account as the service user
       shell: bash
       run: |
-        cat config.json | jq -r '.rateLimiting = {enabled: true, serviceUserArn: "arn:aws:iam::123456789013:root"}' > config.json.new
+        cat config.json | jq -r '.rateLimiting = {enabled: true, nodes: 6, serviceUserArn: "arn:aws:iam::123456789013:root"}' > config.json.new
         mv config.json.new config.json
diff --git a/lib/api/apiUtils/rateLimit/client.js b/lib/api/apiUtils/rateLimit/client.js
@@ -65,6 +65,14 @@ class RateLimitClient {
             }
         );
     }
+
+    /**
+     * Returns whether the client is ready for operations
+     * @returns {boolean}
+     */
+    isReady() {
+        return this.redis.status === 'ready' || this.redis.status === 'wait';
+    }
 }
 
 let instance;
diff --git a/lib/api/apiUtils/rateLimit/tokenBucket.js b/lib/api/apiUtils/rateLimit/tokenBucket.js
@@ -107,13 +107,24 @@ class WorkerTokenBucket {
                 config.rateLimiting.bucket?.defaultConfig?.requestsPerSecond?.burstCapacity || 1;
             const burstCapacity = burstCapacitySeconds * 1000;
 
-            // Request tokens from Redis (atomic GCRA enforcement)
-            const granted = await util.promisify(redisClient.grantTokens.bind(redisClient))(
-                this.bucketName,
-                requested,
-                interval,
-                burstCapacity,
-            );
+            let granted = requested;
+            if (redisClient.isReady()) {
+                // Request tokens from Redis (atomic GCRA enforcement)
+                granted = await util.promisify(redisClient.grantTokens.bind(redisClient))(
+                    this.bucketName,
+                    requested,
+                    interval,
+                    burstCapacity,
+                );
+            } else {
+                // Connection to redis has failed in some way.
+                // Client will be reconnecting in the background.
+                // We grant the requested amount of tokens anyway to avoid degrading service availability.
+                this.log.warn(
+                    'rate limit redis client not connected. granting tokens anyway to avoid service degradation',
+                    { bucketName: this.bucketName },
+                );
+            }
 
             // Add granted tokens to buffer
             this.tokens += granted;
diff --git a/lib/api/bucketPutRateLimit.js b/lib/api/bucketPutRateLimit.js
@@ -32,9 +32,9 @@ function validateRateLimitConfig(requestConfig, callback) {
         return callback(errorInstances.InvalidArgument
             .customizeDescription(
                 `RequestsPerSecond (${limit}) must be >= ` +
-                `(nodes × workers = ${nodes} × ${workers} = ${minLimit}) or 0 (unlimited). ` +
+                `(nodes x workers = ${nodes} x ${workers} = ${minLimit}) or 0 (unlimited). ` +
                 'Each worker enforces limit/nodes/workers locally. ' +
-                `With limit < ${minLimit}, per-worker rate would be < 1 req/s, ` +
+                `With limit less than ${minLimit}, per-worker rate would be less than 1 req/s, ` +
                 'effectively blocking traffic.'
             ));
     }
diff --git a/lib/server.js b/lib/server.js
@@ -376,9 +376,9 @@ class S3Server {
 
             // Start rate limit background job
             if (this.config.rateLimiting?.enabled) {
-                startCleanupJob(log);
+                startCleanupJob(logger);
                 // Start token refill job for token reservation system
-                startRefillJob(log);
+                startRefillJob(logger);
             }
 
             // Start API server(s)
diff --git a/tests/functional/aws-node-sdk/test/bucket/putBucketRateLimit.js b/tests/functional/aws-node-sdk/test/bucket/putBucketRateLimit.js
@@ -113,26 +113,30 @@ skipIfRateLimitDisabled('Test put bucket rate limit', () => {
     });
 
     describe('validation against node and worker count', () => {
-        it.skip('should reject limits less than (nodes × workers)', async () => {
-            const nodes = config.rateLimiting?.nodes || 1;
-            const workers = config.clusters || 1;
-            const minLimit = nodes * workers;
 
+        const nodes = config.rateLimiting?.nodes || 1;
+        const workers = config.clusters || 1;
+        const minLimit = nodes * workers;
+        const skipIfSingleNode = nodes === 1 ? it.skip : it;
+
+        // Test requires multiple nodes to pass
+        // With only 1 node and 1 worker the minLimit is 1.
+        // This leaves no invalid values to test as 0 is also a valid setting (unlimited)
+        skipIfSingleNode('should reject limits less than (nodes x workers)', async () => {
+            let error;
             try {
                 const invalidConfig = { RequestsPerSecond: minLimit - 1 };
                 await sendRateLimitRequest('PUT', '127.0.0.1:8000',
                     `/${bucket}/?rate-limit`, JSON.stringify(invalidConfig));
-                assert.fail('Should have thrown an error');
             } catch (err) {
-                assert.strictEqual(err.Error.Code[0], 'InvalidArgument');
+                error = err;
+            } finally {
+                assert(error !== undefined, 'error expected');
+                assert.strictEqual(error.Error.Code[0], 'InvalidArgument');
             }
         });
 
-        it.skip('should accept limits equal to (nodes × workers)', async () => {
-            const nodes = config.rateLimiting?.nodes || 1;
-            const workers = config.clusters || 1;
-            const minLimit = nodes * workers;
-
+        it('should accept limits equal to (nodes x workers)', async () => {
             try {
                 const validConfig = { RequestsPerSecond: minLimit };
                 await sendRateLimitRequest('PUT', '127.0.0.1:8000',
@@ -146,11 +150,7 @@ skipIfRateLimitDisabled('Test put bucket rate limit', () => {
             }
         });
 
-        it('should accept limits greater than (nodes × workers)', async () => {
-            const nodes = config.rateLimiting?.nodes || 1;
-            const workers = config.clusters || 1;
-            const minLimit = nodes * workers;
-
+        it('should accept limits greater than (nodes x workers)', async () => {
             try {
                 const validConfig = { RequestsPerSecond: minLimit + 1000 };
                 await sendRateLimitRequest('PUT', '127.0.0.1:8000',
diff --git a/tests/functional/aws-node-sdk/test/rateLimit/client.js b/tests/functional/aws-node-sdk/test/rateLimit/client.js
@@ -0,0 +1,103 @@
+const assert = require('assert');
+const { RateLimitClient } = require('../../../../../lib/api/apiUtils/rateLimit/client');
+const { config } = require('../../../../../lib/Config');
+const { skipIfRateLimitDisabled } = require('./tooling');
+
+const testBucket = 'rate-limit-test-bucket';
+
+skipIfRateLimitDisabled('RateLimitClient', () => {
+    let client;
+
+    before(async () => {
+        // Create a test client using the same config as the application
+        client = new RateLimitClient(config.localCache);
+
+        // Connect the client (since lazyConnect is true)
+        await client.redis.connect();
+    });
+
+    after(async () => client.redis.quit().catch(() => {}));
+
+    beforeEach(async () => {
+        const keys = await client.redis.keys('ratelimit:bucket:*');
+        if (keys.length > 0) {
+            await client.redis.del(...keys);
+        }
+    });
+
+    describe('isReady', () => {
+        it('should return true when client is connected to redis', () => {
+            assert.strictEqual(client.isReady(), true);
+        });
+
+       it('should return true when the client is waiting to connect or the first time', () => {
+            const client = new RateLimitClient(config.localCache);
+            assert.strictEqual(client.isReady(), true);
+        });
+    });
+
+    describe('grantTokens', () => {
+        it('should grant requested tokens when quota is available', done => {
+            const requested = 5;
+            const interval = 100; // 100ms per request = 10 req/s
+            const burstCapacity = 1000; // 1000ms burst capacity
+
+            client.grantTokens(testBucket, requested, interval, burstCapacity, (err, granted) => {
+                assert.ifError(err);
+                assert.strictEqual(granted, requested);
+                done();
+            });
+        });
+
+        it('should grant tokens multiple times within burst capacity', done => {
+            const requested = 2;
+            const interval = 100; // 100ms per request
+            const burstCapacity = 1000; // 1000ms burst capacity
+
+            // First request
+            client.grantTokens(testBucket, requested, interval, burstCapacity, (err, granted1) => {
+                assert.ifError(err);
+                assert.strictEqual(granted1, requested);
+
+                // Second request immediately after
+                client.grantTokens(testBucket, requested, interval, burstCapacity, (err, granted2) => {
+                    assert.ifError(err);
+                    assert.strictEqual(granted2, requested);
+                    done();
+                });
+            });
+        });
+
+        it('should grant partial tokens when request exceeds available capacity', done => {
+            const interval = 100; // 100ms per request
+            const burstCapacity = 500; // 500ms burst capacity = max 5 tokens
+
+            // Request more tokens than available in burst
+            client.grantTokens(testBucket, 10, interval, burstCapacity, (err, granted) => {
+                assert.ifError(err);
+                // Should grant partial tokens (5 tokens max with 500ms burst)
+                assert(granted > 0, 'Should grant at least some tokens');
+                assert(granted <= 5, 'Should not grant more than burst capacity allows');
+                done();
+            });
+        });
+
+        it('should deny tokens (return 0) when quota is exhausted', done => {
+            const interval = 100; // 100ms per request
+            const burstCapacity = 100; // 100ms burst capacity = max 1 token
+
+            // First request consumes the burst capacity
+            client.grantTokens(testBucket, 1, interval, burstCapacity, (err, granted1) => {
+                assert.ifError(err);
+                assert.strictEqual(granted1, 1);
+
+                // Second request immediately after should be denied
+                client.grantTokens(testBucket, 1, interval, burstCapacity, (err, granted2) => {
+                    assert.ifError(err);
+                    assert.strictEqual(granted2, 0, 'Should deny tokens when quota exhausted');
+                    done();
+                });
+            });
+        });
+    });
+});

Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,14 @@ class RateLimitClient {`
`65`	`65`	`}`
`66`	`66`	`);`
`67`	`67`	`}`
	`68`	`+`
	`69`	`+ /**`
	`70`	`+ * Returns whether the client is ready for operations`
	`71`	`+ * @returns {boolean}`
	`72`	`+ */`
	`73`	`+ isReady() {`
	`74`	`+ return this.redis.status === 'ready' \|\| this.redis.status === 'wait';`
	`75`	`+ }`
`68`	`76`	`}`
`69`	`77`
`70`	`78`	`let instance;`