Skip to content

Commit 7ba3d2e

Browse files
DeagleGrossCopilot
andauthored
[release/10.0] Fix KeyRingProvider thread pool starvation on cold start (#66737)
* Fix KeyRingProvider thread pool starvation on cold start (#66683) * fix thread starvation * address PR comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 16dd15a commit 7ba3d2e

2 files changed

Lines changed: 81 additions & 3 deletions

File tree

src/DataProtection/DataProtection/src/KeyManagement/KeyRingProvider.cs

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,44 @@ private IKeyRing GetCurrentKeyRingCoreNew(DateTime utcNow, bool forceRefresh)
360360
existingTask = _cacheableKeyRingTask;
361361
if (existingTask is null)
362362
{
363-
// If there's no existing task, make one now
363+
// The forceRefresh path skipped reading _cacheableKeyRing above; read it now.
364+
existingCacheableKeyRing ??= Volatile.Read(ref _cacheableKeyRing);
365+
366+
if (existingCacheableKeyRing is null)
367+
{
368+
// Cold start: run the refresh inline on this thread. Scheduling the work
369+
// onto TaskScheduler.Default would risk thread-pool starvation - if all
370+
// available pool threads are blocked waiting on the result, the queued
371+
// work item can't be picked up and the app hangs until the runtime
372+
// injects more threads. See https://github.com/dotnet/aspnetcore/issues/66380.
373+
//
374+
// Other concurrent callers are parked on _cacheableKeyRingLockObj while we
375+
// run; once we publish _cacheableKeyRing and release the lock, they wake up.
376+
// Non-force-refresh callers can then re-check the cache and return the
377+
// freshly populated ring, while forceRefresh callers observe that a cached
378+
// ring now exists and continue through the stale-cache refresh path below.
379+
// This matches the pre-#54675 behavior (https://github.com/dotnet/aspnetcore/pull/54675).
380+
CacheableKeyRing newCacheableKeyRing;
381+
try
382+
{
383+
newCacheableKeyRing = CacheableKeyRingProvider.GetCacheableKeyRing(utcNow);
384+
}
385+
catch (Exception ex)
386+
{
387+
// Cold-start branch: always a "reading" rather than "refreshing" failure, and
388+
// there's no stale ring to extend via WithTemporaryExtendedLifetime.
389+
// The async refresh path handles both concerns. We leave _cacheableKeyRing
390+
// and _cacheableKeyRingTask null, so the next caller retries from scratch.
391+
_logger.ErrorOccurredWhileReadingKeyRing(ex);
392+
throw;
393+
}
394+
395+
Volatile.Write(ref _cacheableKeyRing, newCacheableKeyRing);
396+
return newCacheableKeyRing.KeyRing;
397+
}
398+
399+
// Stale ring exists: dispatch the refresh asynchronously and let every other
400+
// caller return the stale ring without waiting. This is the perf win from #54675.
364401
// PERF: Closing over utcNow substantially slows down the fast case (valid cache) in micro-benchmarks
365402
// (closing over `this` for CacheableKeyRingProvider doesn't seem impactful)
366403
existingTask = Task.Factory.StartNew(
@@ -390,9 +427,8 @@ private IKeyRing GetCurrentKeyRingCoreNew(DateTime utcNow, bool forceRefresh)
390427
}
391428

392429
// Prefer a stale cached key ring to blocking
393-
if (existingCacheableKeyRing is not null)
430+
if (!forceRefresh && existingCacheableKeyRing is not null)
394431
{
395-
Debug.Assert(!forceRefresh, "Consumed cached key ring even though forceRefresh is true");
396432
Debug.Assert(!CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow), "Should have returned a valid cached key ring above");
397433
return existingCacheableKeyRing.KeyRing;
398434
}

src/DataProtection/DataProtection/test/Microsoft.AspNetCore.DataProtection.Tests/KeyManagement/KeyRingProviderTests.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,48 @@ private static ICacheableKeyRingProvider SetupCreateCacheableKeyRingTestAndCreat
812812
return CreateKeyRingProvider(mockKeyManager.Object, mockDefaultKeyResolver.Object, keyManagementOptions);
813813
}
814814

815+
// Regression test for https://github.com/dotnet/aspnetcore/issues/66380.
816+
// On cold start (no cached key ring) the new async-refresh code path used to schedule
817+
// the refresh work to TaskScheduler.Default and then block every caller on Task.Wait().
818+
// When all available thread-pool threads were callers, the queued refresh work could
819+
// not be picked up - producing thread-pool starvation and a multi-second freeze on
820+
// start until the runtime's hill-climbing injected more threads.
821+
//
822+
// The starvation can't be reproduced deterministically in a unit test (the runtime's
823+
// hill-climbing will eventually rescue the buggy code, so any timing-based assertion
824+
// is flaky). We instead verify the underlying invariant that prevents starvation:
825+
// on cold start, the calling thread itself must perform the refresh, not a pool worker.
826+
// If this invariant holds, starvation is impossible by construction regardless of pool
827+
// size or hill-climbing behavior.
828+
[Fact]
829+
public void GetCurrentKeyRing_ColdStart_RefreshRunsOnCallingThread()
830+
{
831+
var now = StringToDateTime("2015-03-01 00:00:00Z");
832+
var expectedKeyRing = new Mock<IKeyRing>().Object;
833+
834+
var callingThreadId = Environment.CurrentManagedThreadId;
835+
int? refreshThreadId = null;
836+
837+
var mockCacheableKeyRingProvider = new Mock<ICacheableKeyRingProvider>();
838+
mockCacheableKeyRingProvider
839+
.Setup(o => o.GetCacheableKeyRing(now))
840+
.Returns(() =>
841+
{
842+
refreshThreadId = Environment.CurrentManagedThreadId;
843+
return new CacheableKeyRing(
844+
expirationToken: CancellationToken.None,
845+
expirationTime: now.AddDays(1),
846+
keyRing: expectedKeyRing);
847+
});
848+
849+
var keyRingProvider = CreateKeyRingProvider(mockCacheableKeyRingProvider.Object);
850+
851+
var keyRing = keyRingProvider.GetCurrentKeyRingCore(now);
852+
853+
Assert.Same(expectedKeyRing, keyRing);
854+
Assert.Equal(callingThreadId, refreshThreadId);
855+
}
856+
815857
[Fact]
816858
public async Task MultipleThreadsSeeExpiredCachedValue()
817859
{

0 commit comments

Comments
 (0)