@@ -529,31 +529,93 @@ func (b *Backend) getFailedPodInfo(ctx context.Context, jobName string) string {
529529 return result
530530}
531531
532- // hasJobFailedCreateEvent returns true if the Kubernetes Job has emitted at
533- // least one FailedCreate event — meaning the Job controller tried to create a
534- // pod but was rejected before the pod object was ever persisted (e.g. due to
535- // Pod Security Admission enforcement). In that case there are no pod objects
536- // to inspect, so hasTerminalContainerWaitingError cannot detect the failure.
537- // The most recent event message is returned as the reason string.
538- func (b * Backend ) hasJobFailedCreateEvent (ctx context.Context , jobName string ) (bool , string ) {
539- evList , err := b .client .CoreV1 ().Events (b .conf .Kubernetes .JobsNamespace ).List (ctx , metav1.ListOptions {
532+ // hasJobFailedCreateEvent returns the total occurrence count of FailedCreate
533+ // events emitted by the Kubernetes Job controller for jobName, along with the
534+ // message from the most recent such event. A FailedCreate event is emitted
535+ // when the Job controller tried to create a pod but was rejected before the
536+ // pod object was ever persisted (e.g. due to Pod Security Admission
537+ // enforcement). In that case there are no pod objects to inspect, so
538+ // hasTerminalContainerWaitingError cannot detect the failure.
539+ //
540+ // Callers should require a minimum count before treating the situation as a
541+ // permanent failure, since a single FailedCreate event may be transient.
542+ //
543+ // A count of 0 is returned when listing FailedCreate events fails, when none
544+ // exist, or when a SuccessfulCreate event with a later timestamp is found —
545+ // meaning the Job controller recovered and successfully created a pod after the failures.
546+ // Kubernetes deduplicates repeated identical events into a single Event object
547+ // with an incremented Count field, so this function sums Count across all
548+ // FailedCreate event objects rather than using len(evList.Items).
549+ func (b * Backend ) hasJobFailedCreateEvent (ctx context.Context , jobName string ) (int , string ) {
550+ ns := b .conf .Kubernetes .JobsNamespace
551+
552+ failedList , err := b .client .CoreV1 ().Events (ns ).List (ctx , metav1.ListOptions {
540553 FieldSelector : fmt .Sprintf ("involvedObject.name=%s,reason=FailedCreate" , jobName ),
541554 })
542555 if err != nil {
543- b .log .Error ("reconcile: listing events for job" , "taskID" , jobName , "error" , err )
544- b .log .Debug ("assuming no FailedCreate events due to error listing events" , "taskID" , jobName )
545- return false , ""
556+ b .log .Error ("reconcile: listing FailedCreate events for job" , "taskID" , jobName , "error" , err )
557+ return 0 , ""
546558 }
547- if len (evList .Items ) == 0 {
548- b .log .Debug ("no FailedCreate events found for job" , "taskID" , jobName )
549- return false , ""
559+ if len (failedList .Items ) == 0 {
560+ return 0 , ""
561+ }
562+
563+ // Find the most recent FailedCreate timestamp and sum occurrence counts.
564+ // Kubernetes deduplicates rapid-fire identical events into a single Event
565+ // object with Count > 1, so we sum Count rather than len(failedList.Items).
566+ var latestFailed metav1.Time
567+ var latestMsg string
568+ var totalCount int
569+ for _ , ev := range failedList .Items {
570+ c := int (ev .Count )
571+ if c < 1 {
572+ c = 1 // Count is 0 for brand-new singleton events; treat as 1
573+ }
574+ totalCount += c
575+ ts := ev .LastTimestamp
576+ if ts .IsZero () {
577+ ts = metav1.Time {Time : ev .CreationTimestamp .Time }
578+ }
579+ if latestFailed .IsZero () || ts .After (latestFailed .Time ) {
580+ latestFailed = ts
581+ latestMsg = ev .Message
582+ }
550583 }
551- // Return the message from the most recent event.
552- latest := evList .Items [len (evList .Items )- 1 ]
553- b .log .Debug ("found FailedCreate event for job" , "taskID" , jobName , "reason" , latest .Message )
554- return true , latest .Message
584+
585+ // Check whether a SuccessfulCreate event exists with a timestamp after the
586+ // last FailedCreate. If so, the Job controller recovered on its own
587+ // (e.g. a missing ServiceAccount was created moments later by Helm) and we
588+ // should not surface this as an error.
589+ successList , err := b .client .CoreV1 ().Events (ns ).List (ctx , metav1.ListOptions {
590+ FieldSelector : fmt .Sprintf ("involvedObject.name=%s,reason=SuccessfulCreate" , jobName ),
591+ })
592+ if err != nil {
593+ b .log .Error ("reconcile: listing SuccessfulCreate events for job" , "taskID" , jobName , "error" , err )
594+ // Proceed conservatively: treat as unresolved so the count is returned.
595+ } else {
596+ for _ , ev := range successList .Items {
597+ ts := ev .LastTimestamp
598+ if ts .IsZero () {
599+ ts = metav1.Time {Time : ev .CreationTimestamp .Time }
600+ }
601+ if ts .After (latestFailed .Time ) {
602+ b .log .Debug ("reconcile: FailedCreate resolved by later SuccessfulCreate" , "taskID" , jobName )
603+ return 0 , ""
604+ }
605+ }
606+ }
607+
608+ b .log .Debug ("found unresolved FailedCreate events for job" , "taskID" , jobName , "count" , totalCount , "reason" , latestMsg )
609+ return totalCount , latestMsg
555610}
556611
612+ // maxErrEventWrites is the minimum number of FailedCreate event occurrences
613+ // (summed across all Event objects for a Job) required before the reconciler
614+ // treats the situation as a permanent failure and marks the task SYSTEM_ERROR.
615+ // This guards against false positives from transient API hiccups that produce
616+ // a single FailedCreate before self-resolving.
617+ const maxErrEventWrites = 2
618+
557619// Reconcile loops through tasks and checks the status from Funnel's database
558620// against the status reported by Kubernetes. This allows the backend to report
559621// system errors that prevented the worker process from running.
@@ -617,7 +679,6 @@ func (b *Backend) reconcile(ctx context.Context, rate time.Duration, disableClea
617679
618680 ticker := time .NewTicker (rate )
619681 failedJobEvents := make (map [string ]int )
620- const maxErrEventWrites = 2
621682
622683 for {
623684 select {
@@ -708,9 +769,12 @@ func (b *Backend) reconcile(ctx context.Context, rate time.Duration, disableClea
708769 // cases where pod creation is rejected before a pod object is
709770 // ever persisted (e.g. Pod Security Admission enforcement blocks
710771 // the pod), so there are no pod container statuses to inspect.
772+ // We require at least maxErrEventWrites events before treating
773+ // the situation as permanent, to avoid false positives from
774+ // transient API hiccups.
711775 b .log .Debug ("checking for FailedCreate events on job" , "taskID" , jobName )
712- if failed , reason := b .hasJobFailedCreateEvent (ctx , jobName ); failed {
713- b .log .Debug ("reconcile: worker job has FailedCreate event" , "taskID" , jobName , "reason" , reason )
776+ if count , reason := b .hasJobFailedCreateEvent (ctx , jobName ); count >= maxErrEventWrites {
777+ b .log .Debug ("reconcile: worker job has FailedCreate event" , "taskID" , jobName , "count" , count , " reason" , reason )
714778 b .event .WriteEvent (ctx , events .NewState (jobName , tes .SystemError ))
715779 b .event .WriteEvent (ctx , events .NewSystemLog (
716780 jobName , 0 , 0 , "error" ,
@@ -806,8 +870,11 @@ func (b *Backend) reconcile(ctx context.Context, rate time.Duration, disableClea
806870 // persists a pod object (e.g. Pod Security Admission blocks the
807871 // pod). Check for FailedCreate events which are the only signal
808872 // available in this state.
809- if failed , reason := b .hasJobFailedCreateEvent (ctx , jobName ); failed {
810- b .log .Debug ("reconcile: worker job has FailedCreate event (zero-status)" , "taskID" , jobName , "reason" , reason )
873+ // We require at least maxErrEventWrites events before treating
874+ // the situation as permanent, to avoid false positives from
875+ // transient API hiccups.
876+ if count , reason := b .hasJobFailedCreateEvent (ctx , jobName ); count >= maxErrEventWrites {
877+ b .log .Debug ("reconcile: worker job has FailedCreate event (zero-status)" , "taskID" , jobName , "count" , count , "reason" , reason )
811878 b .event .WriteEvent (ctx , events .NewState (jobName , tes .SystemError ))
812879 b .event .WriteEvent (ctx , events .NewSystemLog (
813880 jobName , 0 , 0 , "error" ,
0 commit comments