From 5b57ac9892b0e30259c27cab181412962a8e114f Mon Sep 17 00:00:00 2001 From: George Cordalis <8468312+gcordalis@users.noreply.github.com> Date: Fri, 18 Jul 2025 10:13:46 +1200 Subject: [PATCH] feat(pool-manager): rework stuck-runner cleanup to eliminate OOMs and cut DB load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Why * A production incident (`System.OutOfMemoryException` during `CheckForStuckRunners`) showed that we were loading **all** runners and their full `Lifecycle` collections into memory, then letting EF Core’s change-tracker churn on tens of thousands of entities. * Each cleanup cycle recreated that pressure, eventually crashing the autoscaler pod and stopping the host. ### What * **Query trimmed to SQL only** * Replaced `Include(...).AsEnumerable().Where(...)` with a *pure* LINQ-to-Entities filter: `LastState == Created && CreatedTime < now-10min`. * Removed unconditional `Include(x => x.Lifecycle)`. * Added `AsNoTracking()` and a **projection** to an anonymous type (`Select(r ⇒ new { … })`) so no full `Runner` entities are tracked. * **Context lifetime & tracking** * Scoped `ActionsRunnerContext` with `await using` – guarantees disposal after the method exits. * Disabled auto-detection of changes only for this batch (`ChangeTracker.AutoDetectChangesEnabled = false`) to minimise change-tracker work. * **Batch insert of lifecycle events** * Accumulate new `RunnerLifecycle` rows in an in-memory list and call `AddRange` once instead of adding per runner. * Single `SaveChangesAsync()` at the end → one DB round-trip. * **Queue deletion without materialising collections** * For every stuck runner enqueue a `DeleteRunnerTask` directly using the projected key data (no need for full entity). * **Logging** * Added explicit warning log per stuck runner **and** kept the original message structure for easy grepping. 
### Result * `CheckForStuckRunners` now pulls only the small “actually stuck” set into memory and never tracks existing `Lifecycle` rows. **IN CLAUDE WE TRUST** --- PoolManager.cs | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/PoolManager.cs b/PoolManager.cs index 948cc90..2fd2520 100644 --- a/PoolManager.cs +++ b/PoolManager.cs @@ -146,34 +146,50 @@ private async Task CheckForStuckRunners(List targetCo { // check the database for runners that are in "created" state for more then 5 minutes. - var db = new ActionsRunnerContext(); - foreach(var stuckRunner in db.Runners.Include(x => x.Lifecycle).AsEnumerable().Where(x => x.LastState == RunnerStatus.Created)) + await using var db = new ActionsRunnerContext(); + var cutoffTime = DateTime.UtcNow - TimeSpan.FromMinutes(10); + + // Query stuck runners without loading lifecycle collections + var stuckRunners = await db.Runners + .AsNoTracking() + .Where(x => x.LastState == RunnerStatus.Created && x.CreatedTime < cutoffTime) + .Select(x => new { x.RunnerId, x.CloudServerId, x.Hostname, x.Cloud }) + .ToListAsync(); + + if (stuckRunners.Count == 0) + return; + + // Process stuck runners and create lifecycle entries + var lifecycleEntries = new List<RunnerLifecycle>(); + + foreach(var stuckRunner in stuckRunners) { - - // check if runner is old enough to be stuck - if (stuckRunner.CreatedTime + TimeSpan.FromMinutes(10) > DateTime.UtcNow) - continue; + // Add to deletion queue + _queues.DeleteTasks.Enqueue(new DeleteRunnerTask + { + ServerId = stuckRunner.CloudServerId, + RunnerDbId = stuckRunner.RunnerId + }); - // Note stuckness in lifecycle and add runner to deletion queue - stuckRunner.Lifecycle.Add(new RunnerLifecycle + // Create lifecycle entry for batch insert + lifecycleEntries.Add(new RunnerLifecycle { + RunnerId = stuckRunner.RunnerId, Event = "Stuck in provisioning. 
Killing.", EventTimeUtc = DateTime.UtcNow, Status = RunnerStatus.Failure }); - - _queues.DeleteTasks.Enqueue(new DeleteRunnerTask - { - ServerId = stuckRunner.CloudServerId, - RunnerDbId = stuckRunner.RunnerId - }); _logger.LogWarning($"Killing Runner stuck in provisioning: {stuckRunner.Hostname} on {stuckRunner.Cloud}"); - } - // write to DB - await db.SaveChangesAsync(); + // Batch insert lifecycle entries without change tracking + if (lifecycleEntries.Count > 0) + { + db.ChangeTracker.AutoDetectChangesEnabled = false; + db.RunnerLifecycles.AddRange(lifecycleEntries); + await db.SaveChangesAsync(); + } } private async Task ProcessStats(List targetConfig)