Skip to content

Commit a57bc5a

Browse files
improve stuck job detection
1 parent e351779 commit a57bc5a

File tree

2 files changed

+50
-40
lines changed

2 files changed

+50
-40
lines changed

Models/RunnerQueue.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ public int CountMatchingRunners(string size, string owner, string profile)
7676
ctq => ctq.RunnerDbId,
7777
r => r.RunnerId,
7878
(ctq, r) => r)
79-
.Count(r => r.Size == size && r.Owner == owner && r.Profile == profile);
79+
.Count(r => r.Size == size && r.Owner == owner && r.Profile == profile && r.CreationQueuedTime < (DateTime.UtcNow - TimeSpan.FromHours(2)));
8080
}
8181

8282
public bool TryDequeue(out CreateRunnerTask? task)

PoolManager.cs

Lines changed: 49 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -547,8 +547,10 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
547547
.Where(x => (x.State == JobState.Queued || x.State == JobState.Throttled) && x.RunnerId == null && x.QueueTime < stuckTime)
548548
.ToListAsync();
549549

550-
activity?.SetTag("stuck_jobs.count", stuckJobs.Count);
550+
activity?.SetTag("stuck_jobs.candidate_count", stuckJobs.Count);
551551

552+
// First pass: verify with GitHub which jobs are actually still queued
553+
var confirmedStuckJobs = new List<Job>();
552554
foreach (var stuckJob in stuckJobs)
553555
{
554556
var owner = targetConfig.FirstOrDefault(x => x.Name == stuckJob.Owner);
@@ -583,7 +585,48 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
583585
}
584586
}
585587

586-
// Job is genuinely stuck (not due to quota) - create replacement runner
588+
// Verify job is still queued on GitHub before treating it as stuck
589+
GitHubApiWorkflowRun ghJob = await GitHubApi.GetJobInfoForRepo(stuckJob.GithubJobId, stuckJob.Repository , owner.GitHubToken);
590+
if (ghJob == null || ghJob.Status != "queued")
591+
{
592+
if (ghJob == null)
593+
{
594+
_logger.LogWarning($"GHjob for {stuckJob.JobId} is null - not actually stuck");
595+
}
596+
else if (ghJob.Status == "completed")
597+
{
598+
_logger.LogInformation($"Job {stuckJob.JobId} already completed on GitHub - updating local state");
599+
stuckJob.State = JobState.Completed;
600+
stuckJob.CompleteTime = DateTime.UtcNow;
601+
await db.SaveChangesAsync();
602+
}
603+
else
604+
{
605+
_logger.LogInformation($"Job {stuckJob.JobId} has GitHub status '{ghJob.Status}' - not queued");
606+
607+
if (stuckJob.QueueTime + TimeSpan.FromHours(2) < DateTime.UtcNow)
608+
{
609+
_logger.LogWarning($"Marking job {stuckJob.GithubJobId} as vanished - no longer queued on GitHub for over 2h.");
610+
stuckJob.State = JobState.Vanished;
611+
stuckJob.CompleteTime = DateTime.UtcNow;
612+
await db.SaveChangesAsync();
613+
}
614+
}
615+
616+
continue;
617+
}
618+
619+
// Job is confirmed stuck on GitHub
620+
confirmedStuckJobs.Add(stuckJob);
621+
}
622+
623+
activity?.SetTag("stuck_jobs.confirmed_count", confirmedStuckJobs.Count);
624+
625+
// Second pass: create replacement runners for confirmed stuck jobs
626+
foreach (var stuckJob in confirmedStuckJobs)
627+
{
628+
var owner = targetConfig.First(x => x.Name == stuckJob.Owner);
629+
587630
_logger.LogWarning($"Found stuck Job: {stuckJob.JobId} in {stuckJob.Repository}. Starting new runner to compensate...");
588631

589632
// Check if there is already a runner in queue to unstuck
@@ -593,13 +636,13 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
593636
continue;
594637
}
595638

596-
// Count-based check: compare matching runners in pipeline vs stuck jobs needing them
639+
// Count-based check: compare matching runners in pipeline vs confirmed stuck jobs needing them
597640
var profile = stuckJob.RequestedProfile ?? "default";
598641
int matchingRunnersInPipeline =
599642
_queues.CreateTasks.CountMatchingRunners(stuckJob.RequestedSize, stuckJob.Owner, profile)
600643
+ _queues.CreatedRunners.CountMatchingRunners(stuckJob.RequestedSize, stuckJob.Owner, profile);
601644

602-
int stuckJobsWithSameRequirements = stuckJobs.Count(j =>
645+
int stuckJobsWithSameRequirements = confirmedStuckJobs.Count(j =>
603646
j.RequestedSize == stuckJob.RequestedSize
604647
&& j.Owner == stuckJob.Owner
605648
&& (j.RequestedProfile ?? "default") == profile);
@@ -619,39 +662,6 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
619662
_logger.LogWarning($"Creating queue already has {replacementsInQueue} stuck jobs replacements. Not adding more strain.");
620663
continue;
621664
}
622-
623-
// check job on github
624-
GitHubApiWorkflowRun ghJob = await GitHubApi.GetJobInfoForRepo(stuckJob.GithubJobId, stuckJob.Repository , owner.GitHubToken);
625-
if (ghJob == null || ghJob.Status != "queued")
626-
{
627-
_logger.LogWarning($"job info for {stuckJob.JobId} not found or job not queued anymore on github.");
628-
629-
if (ghJob == null)
630-
{
631-
_logger.LogWarning($"GHjob for {stuckJob.JobId} is null");
632-
}
633-
else if (ghJob.Status == "completed")
634-
{
635-
_logger.LogWarning($"GHjob status for {stuckJob.JobId} is {ghJob.Status} - Marking job accordingly");
636-
stuckJob.State = JobState.Completed;
637-
stuckJob.CompleteTime = DateTime.UtcNow;
638-
await db.SaveChangesAsync();
639-
}
640-
else if (ghJob.Status != "queued")
641-
{
642-
_logger.LogWarning($"GHjob status for {stuckJob.JobId} is {ghJob.Status}");
643-
644-
if (stuckJob.QueueTime + TimeSpan.FromHours(2) < DateTime.UtcNow)
645-
{
646-
_logger.LogWarning($"Marking stuck job {stuckJob.GithubJobId} vanished as it's no longer in the GitHub queued state for more than 2h.");
647-
stuckJob.State = JobState.Vanished;
648-
stuckJob.CompleteTime = DateTime.UtcNow;
649-
await db.SaveChangesAsync();
650-
}
651-
}
652-
653-
continue;
654-
}
655665

656666
string arch = Program.Config.Sizes.FirstOrDefault(x => x.Name == stuckJob.RequestedSize)?.Arch;
657667
Runner newRunner = new()
@@ -675,11 +685,11 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
675685
IsCustom = profile != "default",
676686
Owner = stuckJob.Owner,
677687
StuckJobReplacement = true
678-
688+
679689
};
680690
await db.Runners.AddAsync(newRunner);
681691
await db.SaveChangesAsync();
682-
692+
683693
_queues.CreateTasks.Enqueue(new CreateRunnerTask
684694
{
685695
RepoName = stuckJob.Repository,

0 commit comments

Comments
 (0)