@@ -547,8 +547,10 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
547547 . Where ( x => ( x . State == JobState . Queued || x . State == JobState . Throttled ) && x . RunnerId == null && x . QueueTime < stuckTime )
548548 . ToListAsync ( ) ;
549549
550- activity ? . SetTag ( "stuck_jobs.count " , stuckJobs . Count ) ;
550+ activity ? . SetTag ( "stuck_jobs.candidate_count " , stuckJobs . Count ) ;
551551
552+ // First pass: verify with GitHub which jobs are actually still queued
553+ var confirmedStuckJobs = new List < Job > ( ) ;
552554 foreach ( var stuckJob in stuckJobs )
553555 {
554556 var owner = targetConfig . FirstOrDefault ( x => x . Name == stuckJob . Owner ) ;
@@ -583,7 +585,48 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
583585 }
584586 }
585587
586- // Job is genuinely stuck (not due to quota) - create replacement runner
588+ // Verify job is still queued on GitHub before treating it as stuck
589+ GitHubApiWorkflowRun ghJob = await GitHubApi . GetJobInfoForRepo ( stuckJob . GithubJobId , stuckJob . Repository , owner . GitHubToken ) ;
590+ if ( ghJob == null || ghJob . Status != "queued" )
591+ {
592+ if ( ghJob == null )
593+ {
594+ _logger . LogWarning ( $ "GHjob for { stuckJob . JobId } is null - not actually stuck") ;
595+ }
596+ else if ( ghJob . Status == "completed" )
597+ {
598+ _logger . LogInformation ( $ "Job { stuckJob . JobId } already completed on GitHub - updating local state") ;
599+ stuckJob . State = JobState . Completed ;
600+ stuckJob . CompleteTime = DateTime . UtcNow ;
601+ await db . SaveChangesAsync ( ) ;
602+ }
603+ else
604+ {
605+ _logger . LogInformation ( $ "Job { stuckJob . JobId } has GitHub status '{ ghJob . Status } ' - not queued") ;
606+
607+ if ( stuckJob . QueueTime + TimeSpan . FromHours ( 2 ) < DateTime . UtcNow )
608+ {
609+ _logger . LogWarning ( $ "Marking job { stuckJob . GithubJobId } as vanished - no longer queued on GitHub for over 2h.") ;
610+ stuckJob . State = JobState . Vanished ;
611+ stuckJob . CompleteTime = DateTime . UtcNow ;
612+ await db . SaveChangesAsync ( ) ;
613+ }
614+ }
615+
616+ continue ;
617+ }
618+
619+ // Job is confirmed stuck on GitHub
620+ confirmedStuckJobs . Add ( stuckJob ) ;
621+ }
622+
623+ activity ? . SetTag ( "stuck_jobs.confirmed_count" , confirmedStuckJobs . Count ) ;
624+
625+ // Second pass: create replacement runners for confirmed stuck jobs
626+ foreach ( var stuckJob in confirmedStuckJobs )
627+ {
628+ var owner = targetConfig . First ( x => x . Name == stuckJob . Owner ) ;
629+
587630 _logger . LogWarning ( $ "Found stuck Job: { stuckJob . JobId } in { stuckJob . Repository } . Starting new runner to compensate...") ;
588631
589632 // Check if there is already a runner in queue to unstuck
@@ -593,13 +636,13 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
593636 continue ;
594637 }
595638
596- // Count-based check: compare matching runners in pipeline vs stuck jobs needing them
639+ // Count-based check: compare matching runners in pipeline vs confirmed stuck jobs needing them
597640 var profile = stuckJob . RequestedProfile ?? "default" ;
598641 int matchingRunnersInPipeline =
599642 _queues . CreateTasks . CountMatchingRunners ( stuckJob . RequestedSize , stuckJob . Owner , profile )
600643 + _queues . CreatedRunners . CountMatchingRunners ( stuckJob . RequestedSize , stuckJob . Owner , profile ) ;
601644
602- int stuckJobsWithSameRequirements = stuckJobs . Count ( j =>
645+ int stuckJobsWithSameRequirements = confirmedStuckJobs . Count ( j =>
603646 j . RequestedSize == stuckJob . RequestedSize
604647 && j . Owner == stuckJob . Owner
605648 && ( j . RequestedProfile ?? "default" ) == profile ) ;
@@ -619,39 +662,6 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
619662 _logger . LogWarning ( $ "Creating queue already has { replacementsInQueue } stuck jobs replacements. Not adding more strain.") ;
620663 continue ;
621664 }
622-
623- // check job on github
624- GitHubApiWorkflowRun ghJob = await GitHubApi . GetJobInfoForRepo ( stuckJob . GithubJobId , stuckJob . Repository , owner . GitHubToken ) ;
625- if ( ghJob == null || ghJob . Status != "queued" )
626- {
627- _logger . LogWarning ( $ "job info for { stuckJob . JobId } not found or job not queued anymore on github.") ;
628-
629- if ( ghJob == null )
630- {
631- _logger . LogWarning ( $ "GHjob for { stuckJob . JobId } is null") ;
632- }
633- else if ( ghJob . Status == "completed" )
634- {
635- _logger . LogWarning ( $ "GHjob status for { stuckJob . JobId } is { ghJob . Status } - Marking job accordingly") ;
636- stuckJob . State = JobState . Completed ;
637- stuckJob . CompleteTime = DateTime . UtcNow ;
638- await db . SaveChangesAsync ( ) ;
639- }
640- else if ( ghJob . Status != "queued" )
641- {
642- _logger . LogWarning ( $ "GHjob status for { stuckJob . JobId } is { ghJob . Status } ") ;
643-
644- if ( stuckJob . QueueTime + TimeSpan . FromHours ( 2 ) < DateTime . UtcNow )
645- {
646- _logger . LogWarning ( $ "Marking stuck job { stuckJob . GithubJobId } vanished as it's no longer in the GitHub queued state for more than 2h.") ;
647- stuckJob . State = JobState . Vanished ;
648- stuckJob . CompleteTime = DateTime . UtcNow ;
649- await db . SaveChangesAsync ( ) ;
650- }
651- }
652-
653- continue ;
654- }
655665
656666 string arch = Program . Config . Sizes . FirstOrDefault ( x => x . Name == stuckJob . RequestedSize ) ? . Arch ;
657667 Runner newRunner = new ( )
@@ -675,11 +685,11 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
675685 IsCustom = profile != "default" ,
676686 Owner = stuckJob . Owner ,
677687 StuckJobReplacement = true
678-
688+
679689 } ;
680690 await db . Runners . AddAsync ( newRunner ) ;
681691 await db . SaveChangesAsync ( ) ;
682-
692+
683693 _queues . CreateTasks . Enqueue ( new CreateRunnerTask
684694 {
685695 RepoName = stuckJob . Repository ,
0 commit comments