Skip to content

Commit bddf1d9

Browse files
committed
feat: Add support for graceful FailedCreateEvents
Signed-off-by: Liam Beckman <lbeckman314@gmail.com>
1 parent c089317 commit bddf1d9

2 files changed

Lines changed: 117 additions & 39 deletions

File tree

compute/kubernetes/backend.go

Lines changed: 90 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -529,31 +529,93 @@ func (b *Backend) getFailedPodInfo(ctx context.Context, jobName string) string {
529529
return result
530530
}
531531

532-
// hasJobFailedCreateEvent returns true if the Kubernetes Job has emitted at
533-
// least one FailedCreate event — meaning the Job controller tried to create a
534-
// pod but was rejected before the pod object was ever persisted (e.g. due to
535-
// Pod Security Admission enforcement). In that case there are no pod objects
536-
// to inspect, so hasTerminalContainerWaitingError cannot detect the failure.
537-
// The most recent event message is returned as the reason string.
538-
func (b *Backend) hasJobFailedCreateEvent(ctx context.Context, jobName string) (bool, string) {
539-
evList, err := b.client.CoreV1().Events(b.conf.Kubernetes.JobsNamespace).List(ctx, metav1.ListOptions{
532+
// hasJobFailedCreateEvent returns the total occurrence count of FailedCreate
533+
// events emitted by the Kubernetes Job controller for jobName, along with the
534+
// message from the most recent such event. A FailedCreate event is emitted
535+
// when the Job controller tried to create a pod but was rejected before the
536+
// pod object was ever persisted (e.g. due to Pod Security Admission
537+
// enforcement). In that case there are no pod objects to inspect, so
538+
// hasTerminalContainerWaitingError cannot detect the failure.
539+
//
540+
// Callers should require a minimum count before treating the situation as a
541+
// permanent failure, since a single FailedCreate event may be transient.
542+
//
543+
// A count of 0 is returned when no FailedCreate events exist, or when a
544+
// SuccessfulCreate event with a later timestamp is found — meaning the Job
545+
// controller recovered and successfully created a pod after the failures.
546+
// Kubernetes deduplicates repeated identical events into a single Event object
547+
// with an incremented Count field, so this function sums Count across all
548+
// FailedCreate event objects rather than using len(evList.Items).
549+
func (b *Backend) hasJobFailedCreateEvent(ctx context.Context, jobName string) (int, string) {
550+
ns := b.conf.Kubernetes.JobsNamespace
551+
552+
failedList, err := b.client.CoreV1().Events(ns).List(ctx, metav1.ListOptions{
540553
FieldSelector: fmt.Sprintf("involvedObject.name=%s,reason=FailedCreate", jobName),
541554
})
542555
if err != nil {
543-
b.log.Error("reconcile: listing events for job", "taskID", jobName, "error", err)
544-
b.log.Debug("assuming no FailedCreate events due to error listing events", "taskID", jobName)
545-
return false, ""
556+
b.log.Error("reconcile: listing FailedCreate events for job", "taskID", jobName, "error", err)
557+
return 0, ""
546558
}
547-
if len(evList.Items) == 0 {
548-
b.log.Debug("no FailedCreate events found for job", "taskID", jobName)
549-
return false, ""
559+
if len(failedList.Items) == 0 {
560+
return 0, ""
561+
}
562+
563+
// Find the most recent FailedCreate timestamp and sum occurrence counts.
564+
// Kubernetes deduplicates rapid-fire identical events into a single Event
565+
// object with Count > 1, so we sum Count rather than len(failedList.Items).
566+
var latestFailed metav1.Time
567+
var latestMsg string
568+
var totalCount int
569+
for _, ev := range failedList.Items {
570+
c := int(ev.Count)
571+
if c < 1 {
572+
c = 1 // Count is 0 for brand-new singleton events; treat as 1
573+
}
574+
totalCount += c
575+
ts := ev.LastTimestamp
576+
if ts.IsZero() {
577+
ts = metav1.Time{Time: ev.CreationTimestamp.Time}
578+
}
579+
if latestFailed.IsZero() || ts.After(latestFailed.Time) {
580+
latestFailed = ts
581+
latestMsg = ev.Message
582+
}
550583
}
551-
// Return the message from the most recent event.
552-
latest := evList.Items[len(evList.Items)-1]
553-
b.log.Debug("found FailedCreate event for job", "taskID", jobName, "reason", latest.Message)
554-
return true, latest.Message
584+
585+
// Check whether a SuccessfulCreate event exists with a timestamp after the
586+
// last FailedCreate. If so, the Job controller recovered on its own
587+
// (e.g. a missing ServiceAccount was created moments later by Helm) and we
588+
// should not surface this as an error.
589+
successList, err := b.client.CoreV1().Events(ns).List(ctx, metav1.ListOptions{
590+
FieldSelector: fmt.Sprintf("involvedObject.name=%s,reason=SuccessfulCreate", jobName),
591+
})
592+
if err != nil {
593+
b.log.Error("reconcile: listing SuccessfulCreate events for job", "taskID", jobName, "error", err)
594+
// Proceed conservatively: treat as unresolved so the count is returned.
595+
} else {
596+
for _, ev := range successList.Items {
597+
ts := ev.LastTimestamp
598+
if ts.IsZero() {
599+
ts = metav1.Time{Time: ev.CreationTimestamp.Time}
600+
}
601+
if ts.After(latestFailed.Time) {
602+
b.log.Debug("reconcile: FailedCreate resolved by later SuccessfulCreate", "taskID", jobName)
603+
return 0, ""
604+
}
605+
}
606+
}
607+
608+
b.log.Debug("found unresolved FailedCreate events for job", "taskID", jobName, "count", totalCount, "reason", latestMsg)
609+
return totalCount, latestMsg
555610
}
556611

612+
// maxErrEventWrites is the minimum number of FailedCreate event occurrences
613+
// (summed across all Event objects for a Job) required before the reconciler
614+
// treats the situation as a permanent failure and marks the task SYSTEM_ERROR.
615+
// This guards against false positives from transient API hiccups that produce
616+
// a single FailedCreate before self-resolving.
617+
const maxErrEventWrites = 2
618+
557619
// Reconcile loops through tasks and checks the status from Funnel's database
558620
// against the status reported by Kubernetes. This allows the backend to report
559621
// system error's that prevented the worker process from running.
@@ -617,7 +679,6 @@ func (b *Backend) reconcile(ctx context.Context, rate time.Duration, disableClea
617679

618680
ticker := time.NewTicker(rate)
619681
failedJobEvents := make(map[string]int)
620-
const maxErrEventWrites = 2
621682

622683
for {
623684
select {
@@ -708,9 +769,12 @@ func (b *Backend) reconcile(ctx context.Context, rate time.Duration, disableClea
708769
// cases where pod creation is rejected before a pod object is
709770
// ever persisted (e.g. Pod Security Admission enforcement blocks
710771
// the pod), so there are no pod container statuses to inspect.
772+
// We require at least maxErrEventWrites events before treating
773+
// the situation as permanent, to avoid false positives from
774+
// transient API hiccups.
711775
b.log.Debug("checking for FailedCreate events on job", "taskID", jobName)
712-
if failed, reason := b.hasJobFailedCreateEvent(ctx, jobName); failed {
713-
b.log.Debug("reconcile: worker job has FailedCreate event", "taskID", jobName, "reason", reason)
776+
if count, reason := b.hasJobFailedCreateEvent(ctx, jobName); count >= maxErrEventWrites {
777+
b.log.Debug("reconcile: worker job has FailedCreate event", "taskID", jobName, "count", count, "reason", reason)
714778
b.event.WriteEvent(ctx, events.NewState(jobName, tes.SystemError))
715779
b.event.WriteEvent(ctx, events.NewSystemLog(
716780
jobName, 0, 0, "error",
@@ -806,8 +870,11 @@ func (b *Backend) reconcile(ctx context.Context, rate time.Duration, disableClea
806870
// persists a pod object (e.g. Pod Security Admission blocks the
807871
// pod). Check for FailedCreate events which are the only signal
808872
// available in this state.
809-
if failed, reason := b.hasJobFailedCreateEvent(ctx, jobName); failed {
810-
b.log.Debug("reconcile: worker job has FailedCreate event (zero-status)", "taskID", jobName, "reason", reason)
873+
// We require at least maxErrEventWrites events before treating
874+
// the situation as permanent, to avoid false positives from
875+
// transient API hiccups.
876+
if count, reason := b.hasJobFailedCreateEvent(ctx, jobName); count >= maxErrEventWrites {
877+
b.log.Debug("reconcile: worker job has FailedCreate event (zero-status)", "taskID", jobName, "count", count, "reason", reason)
811878
b.event.WriteEvent(ctx, events.NewState(jobName, tes.SystemError))
812879
b.event.WriteEvent(ctx, events.NewSystemLog(
813880
jobName, 0, 0, "error",

compute/kubernetes/backend_test.go

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -535,16 +535,16 @@ func TestHasJobFailedCreateEvent(t *testing.T) {
535535
cases := []struct {
536536
name string
537537
events []corev1.Event
538-
wantFailed bool
538+
wantCount int
539539
wantReasonPart string
540540
}{
541541
{
542-
name: "no events → no failure",
543-
events: nil,
544-
wantFailed: false,
542+
name: "no events → count 0",
543+
events: nil,
544+
wantCount: 0,
545545
},
546546
{
547-
name: "unrelated event reason → no failure",
547+
name: "unrelated event reason → count 0",
548548
events: []corev1.Event{
549549
{
550550
ObjectMeta: metav1.ObjectMeta{Name: "ev1", Namespace: ns},
@@ -553,10 +553,10 @@ func TestHasJobFailedCreateEvent(t *testing.T) {
553553
Message: "Successfully assigned pod",
554554
},
555555
},
556-
wantFailed: false,
556+
wantCount: 0,
557557
},
558558
{
559-
name: "FailedCreate event from PSA enforcement → failure detected",
559+
name: "one FailedCreate event from PSA enforcement → count 1 with message",
560560
events: []corev1.Event{
561561
{
562562
ObjectMeta: metav1.ObjectMeta{Name: "ev-fc", Namespace: ns},
@@ -565,11 +565,11 @@ func TestHasJobFailedCreateEvent(t *testing.T) {
565565
Message: psaMessage,
566566
},
567567
},
568-
wantFailed: true,
568+
wantCount: 1,
569569
wantReasonPart: "violates PodSecurity",
570570
},
571571
{
572-
name: "FailedCreate for missing service account → failure detected",
572+
name: "one FailedCreate for missing service account → count 1 with message",
573573
events: []corev1.Event{
574574
{
575575
ObjectMeta: metav1.ObjectMeta{Name: "ev-sa", Namespace: ns},
@@ -578,11 +578,11 @@ func TestHasJobFailedCreateEvent(t *testing.T) {
578578
Message: `pods "test-job-" is forbidden: error looking up service account jobs/funnel-worker-sa: serviceaccount "funnel-worker-sa" not found`,
579579
},
580580
},
581-
wantFailed: true,
581+
wantCount: 1,
582582
wantReasonPart: "serviceaccount",
583583
},
584584
{
585-
name: "multiple events, last is FailedCreate → failure detected with last message",
585+
name: "multiple FailedCreate events → count reflects all, last message returned",
586586
events: []corev1.Event{
587587
{
588588
ObjectMeta: metav1.ObjectMeta{Name: "ev1", Namespace: ns},
@@ -594,10 +594,16 @@ func TestHasJobFailedCreateEvent(t *testing.T) {
594594
ObjectMeta: metav1.ObjectMeta{Name: "ev2", Namespace: ns},
595595
InvolvedObject: corev1.ObjectReference{Name: jobName},
596596
Reason: "FailedCreate",
597+
Message: "earlier failure",
598+
},
599+
{
600+
ObjectMeta: metav1.ObjectMeta{Name: "ev3", Namespace: ns},
601+
InvolvedObject: corev1.ObjectReference{Name: jobName},
602+
Reason: "FailedCreate",
597603
Message: psaMessage,
598604
},
599605
},
600-
wantFailed: true,
606+
wantCount: 2,
601607
wantReasonPart: "violates PodSecurity",
602608
},
603609
}
@@ -648,11 +654,11 @@ func TestHasJobFailedCreateEvent(t *testing.T) {
648654
conf: conf,
649655
}
650656

651-
got, reason := b.hasJobFailedCreateEvent(ctx, jobName)
652-
if got != tc.wantFailed {
653-
t.Errorf("hasJobFailedCreateEvent() = %v, want %v (reason=%q)", got, tc.wantFailed, reason)
657+
gotCount, reason := b.hasJobFailedCreateEvent(ctx, jobName)
658+
if gotCount != tc.wantCount {
659+
t.Errorf("hasJobFailedCreateEvent() count = %d, want %d (reason=%q)", gotCount, tc.wantCount, reason)
654660
}
655-
if tc.wantFailed && tc.wantReasonPart != "" {
661+
if tc.wantCount > 0 && tc.wantReasonPart != "" {
656662
if !strings.Contains(reason, tc.wantReasonPart) {
657663
t.Errorf("reason %q does not contain %q", reason, tc.wantReasonPart)
658664
}
@@ -745,11 +751,16 @@ func TestReconcile_ZeroStatusFailedCreate(t *testing.T) {
745751
}
746752

747753
// FailedCreate event on the Job (emitted by the Job controller).
754+
// Count is set to maxErrEventWrites so that the reconciler's threshold is
755+
// met by a single deduplicated event object, as Kubernetes would produce
756+
// after the Job controller retries pod creation repeatedly.
748757
failedCreateEvent := &corev1.Event{
749758
ObjectMeta: metav1.ObjectMeta{Name: "ev-fc", Namespace: ns},
750759
InvolvedObject: corev1.ObjectReference{Name: taskID},
751760
Reason: "FailedCreate",
752761
Message: psaMsg,
762+
Count: maxErrEventWrites,
763+
LastTimestamp: metav1.Now(),
753764
}
754765

755766
fakeClient := fake.NewSimpleClientset(job, failedCreateEvent)

0 commit comments

Comments
 (0)