Skip to content

Commit 0aed7c1

Browse files
vkuttypclaude
andcommitted
fix(MsSql): don't leak idle slot when validation throws OperationCanceledException
The previous AcquireAsync removed an idle connection from the channel, called validateConnection on it, and only disposed + decremented _count when validation returned false. When validation threw — most commonly OperationCanceledException from the request CT, but anything propagating out — the connection was already pulled from _idle and never put back, disposed, or counted down. Each cancelled validate leaks one slot. After enough leaks _count exceeds _maxConnections even though the channel is empty, and AcquireAsync deadlocks at _idle.Reader.ReadAsync waiting for a write that will never come. This wedged CosmoS3 PUTs through cosmoproxy: every aws s3 cp triggered parallel cred + user lookups whose CT got cancelled when AWS CLI hit its read timeout. Wrap both validation sites in TryValidateOrDiscardAsync, which always disposes and decrements before propagating an exception. Add a regression test that fires N cancelled acquires and asserts _count returns to zero. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent bcefce9 commit 0aed7c1

2 files changed

Lines changed: 97 additions & 6 deletions

File tree

src/CosmoSQLClient.MsSql/MsSqlConnectionPool.cs

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -205,11 +205,8 @@ public async Task<MsSqlConnection> AcquireAsync(CancellationToken ct = default)
205205
{
206206
if (_idle.Reader.TryRead(out var idleConn))
207207
{
208-
if (await _validateConnection(idleConn, ct).ConfigureAwait(false))
208+
if (await TryValidateOrDiscardAsync(idleConn, ct).ConfigureAwait(false))
209209
return idleConn;
210-
211-
await idleConn.DisposeAsync().ConfigureAwait(false);
212-
Interlocked.Decrement(ref _count);
213210
continue;
214211
}
215212

@@ -222,12 +219,42 @@ public async Task<MsSqlConnection> AcquireAsync(CancellationToken ct = default)
222219

223220
Interlocked.Decrement(ref _count);
224221
var pooledConn = await _idle.Reader.ReadAsync(ct).ConfigureAwait(false);
225-
if (await _validateConnection(pooledConn, ct).ConfigureAwait(false))
222+
if (await TryValidateOrDiscardAsync(pooledConn, ct).ConfigureAwait(false))
226223
return pooledConn;
224+
}
225+
}
227226

228-
await pooledConn.DisposeAsync().ConfigureAwait(false);
227+
// Validate a connection that's been pulled out of the idle channel.
228+
// Returns true if the connection is healthy and the caller may keep it;
229+
// returns false when validation rejected it (already disposed + decremented).
230+
// CRITICAL: cancellation during _validateConnection propagates as
231+
// OperationCanceledException, so the connection is already removed
232+
// from the channel. We MUST dispose it and decrement _count before
233+
// re-throwing — otherwise the slot is leaked and, after enough
234+
// cancellations, the pool deadlocks waiting on an empty channel
235+
// because _count says we're at capacity.
236+
private async ValueTask<bool> TryValidateOrDiscardAsync(MsSqlConnection conn, CancellationToken ct)
237+
{
238+
try
239+
{
240+
if (await _validateConnection(conn, ct).ConfigureAwait(false))
241+
return true;
242+
}
243+
catch
244+
{
245+
await SafeDisposeAsync(conn).ConfigureAwait(false);
229246
Interlocked.Decrement(ref _count);
247+
throw;
230248
}
249+
250+
await SafeDisposeAsync(conn).ConfigureAwait(false);
251+
Interlocked.Decrement(ref _count);
252+
return false;
253+
}
254+
255+
private static async ValueTask SafeDisposeAsync(MsSqlConnection conn)
256+
{
257+
try { await conn.DisposeAsync().ConfigureAwait(false); } catch { /* best effort */ }
231258
}
232259

233260
internal async Task InjectIdleConnectionAsync(MsSqlConnection conn)

tests/CosmoSQLClient.MsSql.Tests/MsSqlConnectionPoolTests.cs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
using System;
2+
using System.Reflection;
13
using System.Threading;
24
using System.Threading.Tasks;
35
using CosmoSQLClient.MsSql;
@@ -7,6 +9,14 @@ namespace CosmoSQLClient.MsSql.Tests;
79

810
public sealed class MsSqlConnectionPoolTests
911
{
12+
private static int ReadCount(MsSqlConnectionPool pool)
13+
{
14+
// _count is a private int field accessed via Interlocked. Reflection
15+
// is the only way to read it from a test without changing visibility.
16+
var f = typeof(MsSqlConnectionPool).GetField("_count", BindingFlags.NonPublic | BindingFlags.Instance);
17+
return (int)f!.GetValue(pool)!;
18+
}
19+
1020
[Fact]
1121
public async Task AcquireAsync_RecreatesConnectionWhenValidationFails()
1222
{
@@ -42,4 +52,58 @@ public async Task AcquireAsync_RecreatesConnectionWhenValidationFails()
4252
Assert.Equal(1, factoryCalls);
4353
Assert.Equal(1, validatorCalls);
4454
}
55+
56+
[Fact]
57+
public async Task AcquireAsync_CancelledValidation_DoesNotLeakIdleSlot()
58+
{
59+
// Regression for the bug that wedged CosmoS3 PUTs through cosmoproxy:
60+
// when validateConnection threw OperationCanceledException, the
61+
// already-claimed idle connection was neither disposed nor counted
62+
// back. After enough cancellations, _count exceeded _maxConnections
63+
// even though _idle was empty, and AcquireAsync deadlocked on
64+
// _idle.Reader.ReadAsync waiting for a write that would never come.
65+
66+
var config = new MsSqlConfiguration
67+
{
68+
Host = "localhost",
69+
Database = "master",
70+
TrustServerCertificate = true
71+
};
72+
73+
var pool = new MsSqlConnectionPool(
74+
config,
75+
maxConnections: 2,
76+
minIdle: 0,
77+
validateConnection: (conn, ct) =>
78+
{
79+
ct.ThrowIfCancellationRequested();
80+
throw new OperationCanceledException(ct);
81+
},
82+
connectionFactory: ct =>
83+
{
84+
ct.ThrowIfCancellationRequested();
85+
return Task.FromResult(new MsSqlConnection("Server=localhost;Database=master;User Id=sa;Password=pass;Pooling=false;"));
86+
});
87+
88+
// Pre-warm two idle connections so the validation path is exercised.
89+
await pool.InjectIdleConnectionAsync(new MsSqlConnection("Server=localhost;Database=master;User Id=sa;Password=pass;Pooling=false;"));
90+
await pool.InjectIdleConnectionAsync(new MsSqlConnection("Server=localhost;Database=master;User Id=sa;Password=pass;Pooling=false;"));
91+
Assert.Equal(2, ReadCount(pool));
92+
93+
var alreadyCancelled = new CancellationToken(canceled: true);
94+
95+
for (int i = 0; i < 5; i++)
96+
{
97+
await Assert.ThrowsAsync<OperationCanceledException>(async () =>
98+
{
99+
await pool.AcquireAsync(alreadyCancelled);
100+
});
101+
}
102+
103+
// After all the cancelled acquires, the pool's accounting should still
104+
// be correct — every connection that was pulled from _idle must have
105+
// been disposed and decremented. With the old code _count would still
106+
// be 2, leaving the pool unable to open new connections.
107+
Assert.Equal(0, ReadCount(pool));
108+
}
45109
}

0 commit comments

Comments
 (0)