Skip to content

Commit 1e60658

Browse files
fix: Allow Retrying and other reasons to override ProgressDeadlineExceeded
The markAsProgressing guard was too strict, blocking all reasons except Succeeded when ProgressDeadlineExceeded was set. This prevented proper status updates during archival, causing e2e test timeouts. The guard is now precise: it only blocks RollingOut (which causes the reconcile loop) while allowing Retrying, Succeeded, and any future reasons to update the condition. Generated-by: Cursor:Claude
1 parent 4510b1b commit 1e60658

4 files changed

Lines changed: 349 additions & 18 deletions

File tree

internal/operator-controller/controllers/clusterobjectset_controller.go

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import (
3131
"sigs.k8s.io/controller-runtime/pkg/builder"
3232
"sigs.k8s.io/controller-runtime/pkg/client"
3333
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
34-
"sigs.k8s.io/controller-runtime/pkg/event"
3534
"sigs.k8s.io/controller-runtime/pkg/handler"
3635
"sigs.k8s.io/controller-runtime/pkg/log"
3736
"sigs.k8s.io/controller-runtime/pkg/predicate"
@@ -344,29 +343,12 @@ type Sourcoser interface {
344343
}
345344

346345
func (c *ClusterObjectSetReconciler) SetupWithManager(mgr ctrl.Manager) error {
347-
skipProgressDeadlineExceededPredicate := predicate.Funcs{
348-
UpdateFunc: func(e event.UpdateEvent) bool {
349-
rev, ok := e.ObjectNew.(*ocv1.ClusterObjectSet)
350-
if !ok {
351-
return true
352-
}
353-
// allow deletions to happen
354-
if !rev.DeletionTimestamp.IsZero() {
355-
return true
356-
}
357-
if cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded {
358-
return false
359-
}
360-
return true
361-
},
362-
}
363346
c.Clock = clock.RealClock{}
364347
return ctrl.NewControllerManagedBy(mgr).
365348
For(
366349
&ocv1.ClusterObjectSet{},
367350
builder.WithPredicates(
368351
predicate.ResourceVersionChangedPredicate{},
369-
skipProgressDeadlineExceededPredicate,
370352
),
371353
).
372354
WatchesRawSource(
@@ -684,7 +666,31 @@ func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string) {
684666
}
685667
}
686668

669+
// markAsProgressing sets the Progressing condition to True with the given reason.
670+
//
671+
// Once ProgressDeadlineExceeded has been set for the current generation, this function
672+
// blocks only the RollingOut reason to prevent a reconcile loop. The loop occurs when
673+
// the reconciler sets Progressing=True with RollingOut on each run, only for the
674+
// deadline check to immediately reset it back to ProgressDeadlineExceeded. This would
675+
// produce an unnecessary status update that triggers another reconcile.
676+
//
677+
// If the generation changed (spec update) since ProgressDeadlineExceeded was recorded,
678+
// the guard allows the update through so the condition reflects the new generation.
679+
//
680+
// The RollingOut-only guard means that other reasons like Succeeded or Retrying can
681+
// still update the condition. This allows a revision that exceeded its deadline to:
682+
// - Transition to Succeeded if the rollout eventually completes
683+
// - Show Retrying state when encountering errors (e.g., during archival)
684+
// - Support any future status reasons that need to override ProgressDeadlineExceeded
687685
func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string) {
686+
if cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil &&
687+
cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded &&
688+
cnd.ObservedGeneration == cos.Generation &&
689+
reason == ocv1.ReasonRollingOut {
690+
log.Log.V(1).Info("skipping markAsProgressing: ProgressDeadlineExceeded is sticky for RollingOut reason",
691+
"requestedReason", reason, "revision", cos.Name)
692+
return
693+
}
688694
meta.SetStatusCondition(&cos.Status.Conditions, metav1.Condition{
689695
Type: ocv1.ClusterObjectSetTypeProgressing,
690696
Status: metav1.ConditionTrue,

internal/operator-controller/controllers/clusterobjectset_controller_internal_test.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"time"
77

88
"github.com/stretchr/testify/require"
9+
"k8s.io/apimachinery/pkg/api/meta"
910
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1011
"k8s.io/apimachinery/pkg/runtime"
1112
"k8s.io/apimachinery/pkg/runtime/schema"
@@ -235,3 +236,113 @@ func (m *mockTrackingCacheInternal) Watch(ctx context.Context, user client.Objec
235236
func (m *mockTrackingCacheInternal) Source(h handler.EventHandler, predicates ...predicate.Predicate) source.Source {
236237
return nil
237238
}
239+
240+
func Test_markAsProgressing_doesNotOverwriteProgressDeadlineExceeded(t *testing.T) {
241+
for _, tc := range []struct {
242+
name string
243+
generation int64
244+
existingConditions []metav1.Condition
245+
reason string
246+
expectProgressing metav1.ConditionStatus
247+
expectReason string
248+
}{
249+
{
250+
name: "sets Progressing when no condition exists",
251+
generation: 1,
252+
existingConditions: nil,
253+
reason: ocv1.ReasonRollingOut,
254+
expectProgressing: metav1.ConditionTrue,
255+
expectReason: ocv1.ReasonRollingOut,
256+
},
257+
{
258+
name: "sets Progressing when currently RollingOut",
259+
generation: 1,
260+
existingConditions: []metav1.Condition{
261+
{
262+
Type: ocv1.ClusterObjectSetTypeProgressing,
263+
Status: metav1.ConditionTrue,
264+
Reason: ocv1.ReasonRollingOut,
265+
ObservedGeneration: 1,
266+
},
267+
},
268+
reason: ocv1.ReasonSucceeded,
269+
expectProgressing: metav1.ConditionTrue,
270+
expectReason: ocv1.ReasonSucceeded,
271+
},
272+
{
273+
name: "does not overwrite ProgressDeadlineExceeded with RollingOut (same generation)",
274+
generation: 1,
275+
existingConditions: []metav1.Condition{
276+
{
277+
Type: ocv1.ClusterObjectSetTypeProgressing,
278+
Status: metav1.ConditionFalse,
279+
Reason: ocv1.ReasonProgressDeadlineExceeded,
280+
ObservedGeneration: 1,
281+
},
282+
},
283+
reason: ocv1.ReasonRollingOut,
284+
expectProgressing: metav1.ConditionFalse,
285+
expectReason: ocv1.ReasonProgressDeadlineExceeded,
286+
},
287+
{
288+
name: "allows Succeeded to overwrite ProgressDeadlineExceeded",
289+
generation: 1,
290+
existingConditions: []metav1.Condition{
291+
{
292+
Type: ocv1.ClusterObjectSetTypeProgressing,
293+
Status: metav1.ConditionFalse,
294+
Reason: ocv1.ReasonProgressDeadlineExceeded,
295+
ObservedGeneration: 1,
296+
},
297+
},
298+
reason: ocv1.ReasonSucceeded,
299+
expectProgressing: metav1.ConditionTrue,
300+
expectReason: ocv1.ReasonSucceeded,
301+
},
302+
{
303+
name: "allows Retrying to overwrite ProgressDeadlineExceeded (same generation)",
304+
generation: 1,
305+
existingConditions: []metav1.Condition{
306+
{
307+
Type: ocv1.ClusterObjectSetTypeProgressing,
308+
Status: metav1.ConditionFalse,
309+
Reason: ocv1.ReasonProgressDeadlineExceeded,
310+
ObservedGeneration: 1,
311+
},
312+
},
313+
reason: ocv1.ClusterObjectSetReasonRetrying,
314+
expectProgressing: metav1.ConditionTrue,
315+
expectReason: ocv1.ClusterObjectSetReasonRetrying,
316+
},
317+
{
318+
name: "allows RollingOut when generation changed since ProgressDeadlineExceeded",
319+
generation: 2,
320+
existingConditions: []metav1.Condition{
321+
{
322+
Type: ocv1.ClusterObjectSetTypeProgressing,
323+
Status: metav1.ConditionFalse,
324+
Reason: ocv1.ReasonProgressDeadlineExceeded,
325+
ObservedGeneration: 1,
326+
},
327+
},
328+
reason: ocv1.ReasonRollingOut,
329+
expectProgressing: metav1.ConditionTrue,
330+
expectReason: ocv1.ReasonRollingOut,
331+
},
332+
} {
333+
t.Run(tc.name, func(t *testing.T) {
334+
cos := &ocv1.ClusterObjectSet{
335+
ObjectMeta: metav1.ObjectMeta{Generation: tc.generation},
336+
Status: ocv1.ClusterObjectSetStatus{
337+
Conditions: tc.existingConditions,
338+
},
339+
}
340+
markAsProgressing(cos, tc.reason, "test message")
341+
342+
cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
343+
require.NotNil(t, cnd)
344+
require.Equal(t, tc.expectProgressing, cnd.Status)
345+
require.Equal(t, tc.expectReason, cnd.Reason)
346+
})
347+
}
348+
}

internal/operator-controller/controllers/clusterobjectset_controller_test.go

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,185 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) {
10701070
}
10711071
}
10721072

1073+
// Test_ClusterObjectSetReconciler_Reconcile_ArchivalAfterProgressDeadlineExceeded verifies that
1074+
// a COS with ProgressDeadlineExceeded can still be archived. It simulates the real scenario:
1075+
// the COS starts as Active with ProgressDeadlineExceeded, then a spec patch sets
1076+
// lifecycleState to Archived (as a succeeding revision would do), and a subsequent
1077+
// reconcile processes the archival.
1078+
func Test_ClusterObjectSetReconciler_Reconcile_ArchivalAfterProgressDeadlineExceeded(t *testing.T) {
1079+
const clusterObjectSetName = "test-ext-1"
1080+
1081+
testScheme := newScheme(t)
1082+
require.NoError(t, corev1.AddToScheme(testScheme))
1083+
1084+
ext := newTestClusterExtension()
1085+
rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme)
1086+
rev1.Finalizers = []string{"olm.operatorframework.io/teardown"}
1087+
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
1088+
Type: ocv1.ClusterObjectSetTypeProgressing,
1089+
Status: metav1.ConditionFalse,
1090+
Reason: ocv1.ReasonProgressDeadlineExceeded,
1091+
Message: "Revision has not rolled out for 1 minute(s).",
1092+
ObservedGeneration: rev1.Generation,
1093+
})
1094+
1095+
testClient := fake.NewClientBuilder().
1096+
WithScheme(testScheme).
1097+
WithStatusSubresource(&ocv1.ClusterObjectSet{}).
1098+
WithObjects(rev1, ext).
1099+
Build()
1100+
1101+
// Simulate the patch that a succeeding revision would apply.
1102+
patch := []byte(`{"spec":{"lifecycleState":"Archived"}}`)
1103+
require.NoError(t, testClient.Patch(t.Context(),
1104+
&ocv1.ClusterObjectSet{ObjectMeta: metav1.ObjectMeta{Name: clusterObjectSetName}},
1105+
client.RawPatch(types.MergePatchType, patch),
1106+
))
1107+
1108+
mockEngine := &mockRevisionEngine{
1109+
teardown: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionTeardownOption) (machinery.RevisionTeardownResult, error) {
1110+
return &mockRevisionTeardownResult{isComplete: true}, nil
1111+
},
1112+
}
1113+
1114+
result, err := (&controllers.ClusterObjectSetReconciler{
1115+
Client: testClient,
1116+
RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine},
1117+
TrackingCache: &mockTrackingCache{client: testClient},
1118+
}).Reconcile(t.Context(), ctrl.Request{
1119+
NamespacedName: types.NamespacedName{Name: clusterObjectSetName},
1120+
})
1121+
require.NoError(t, err)
1122+
require.Equal(t, ctrl.Result{}, result)
1123+
1124+
rev := &ocv1.ClusterObjectSet{}
1125+
require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev))
1126+
1127+
cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
1128+
require.NotNil(t, cond)
1129+
require.Equal(t, metav1.ConditionFalse, cond.Status)
1130+
require.Equal(t, ocv1.ClusterObjectSetReasonArchived, cond.Reason)
1131+
1132+
cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable)
1133+
require.NotNil(t, cond)
1134+
require.Equal(t, metav1.ConditionUnknown, cond.Status)
1135+
require.Equal(t, ocv1.ClusterObjectSetReasonArchived, cond.Reason)
1136+
}
1137+
1138+
// Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_StaysSticky verifies that
1139+
// once ProgressDeadlineExceeded is set, it is not overwritten when the reconciler runs again
1140+
// and sees objects still in transition. The Progressing condition should stay at
1141+
// ProgressDeadlineExceeded instead of being set back to RollingOut.
1142+
func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_StaysSticky(t *testing.T) {
1143+
const clusterObjectSetName = "test-ext-1"
1144+
1145+
testScheme := newScheme(t)
1146+
require.NoError(t, corev1.AddToScheme(testScheme))
1147+
1148+
ext := newTestClusterExtension()
1149+
rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme)
1150+
rev1.Spec.ProgressDeadlineMinutes = 1
1151+
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-5 * time.Minute))
1152+
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
1153+
Type: ocv1.ClusterObjectSetTypeProgressing,
1154+
Status: metav1.ConditionFalse,
1155+
Reason: ocv1.ReasonProgressDeadlineExceeded,
1156+
Message: "Revision has not rolled out for 1 minute(s).",
1157+
ObservedGeneration: rev1.Generation,
1158+
})
1159+
1160+
testClient := fake.NewClientBuilder().
1161+
WithScheme(testScheme).
1162+
WithStatusSubresource(&ocv1.ClusterObjectSet{}).
1163+
WithObjects(rev1, ext).
1164+
Build()
1165+
1166+
mockEngine := &mockRevisionEngine{
1167+
reconcile: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionReconcileOption) (machinery.RevisionResult, error) {
1168+
return &mockRevisionResult{inTransition: true}, nil
1169+
},
1170+
}
1171+
1172+
result, err := (&controllers.ClusterObjectSetReconciler{
1173+
Client: testClient,
1174+
RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine},
1175+
TrackingCache: &mockTrackingCache{client: testClient},
1176+
}).Reconcile(t.Context(), ctrl.Request{
1177+
NamespacedName: types.NamespacedName{Name: clusterObjectSetName},
1178+
})
1179+
require.NoError(t, err)
1180+
require.Equal(t, ctrl.Result{}, result)
1181+
1182+
rev := &ocv1.ClusterObjectSet{}
1183+
require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev))
1184+
1185+
cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
1186+
require.NotNil(t, cond)
1187+
require.Equal(t, metav1.ConditionFalse, cond.Status, "Progressing should stay False")
1188+
require.Equal(t, ocv1.ReasonProgressDeadlineExceeded, cond.Reason, "Reason should stay ProgressDeadlineExceeded")
1189+
}
1190+
1191+
// Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_SucceededOverrides verifies
1192+
// that if a revision's objects eventually complete rolling out after the deadline was exceeded,
1193+
// the COS can still transition to Succeeded. ProgressDeadlineExceeded should not prevent a
1194+
// revision from completing when its objects become ready.
1195+
func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_SucceededOverrides(t *testing.T) {
1196+
const clusterObjectSetName = "test-ext-1"
1197+
1198+
testScheme := newScheme(t)
1199+
require.NoError(t, corev1.AddToScheme(testScheme))
1200+
1201+
ext := newTestClusterExtension()
1202+
rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme)
1203+
rev1.Spec.ProgressDeadlineMinutes = 1
1204+
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-5 * time.Minute))
1205+
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
1206+
Type: ocv1.ClusterObjectSetTypeProgressing,
1207+
Status: metav1.ConditionFalse,
1208+
Reason: ocv1.ReasonProgressDeadlineExceeded,
1209+
Message: "Revision has not rolled out for 1 minute(s).",
1210+
ObservedGeneration: rev1.Generation,
1211+
})
1212+
1213+
testClient := fake.NewClientBuilder().
1214+
WithScheme(testScheme).
1215+
WithStatusSubresource(&ocv1.ClusterObjectSet{}).
1216+
WithObjects(rev1, ext).
1217+
Build()
1218+
1219+
mockEngine := &mockRevisionEngine{
1220+
reconcile: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionReconcileOption) (machinery.RevisionResult, error) {
1221+
return &mockRevisionResult{isComplete: true}, nil
1222+
},
1223+
}
1224+
1225+
result, err := (&controllers.ClusterObjectSetReconciler{
1226+
Client: testClient,
1227+
RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine},
1228+
TrackingCache: &mockTrackingCache{client: testClient},
1229+
}).Reconcile(t.Context(), ctrl.Request{
1230+
NamespacedName: types.NamespacedName{Name: clusterObjectSetName},
1231+
})
1232+
require.NoError(t, err)
1233+
require.Equal(t, ctrl.Result{}, result)
1234+
1235+
rev := &ocv1.ClusterObjectSet{}
1236+
require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev))
1237+
1238+
cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
1239+
require.NotNil(t, cond)
1240+
require.Equal(t, metav1.ConditionTrue, cond.Status, "Progressing should transition to True")
1241+
require.Equal(t, ocv1.ReasonSucceeded, cond.Reason, "Reason should be Succeeded")
1242+
1243+
cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable)
1244+
require.NotNil(t, cond)
1245+
require.Equal(t, metav1.ConditionTrue, cond.Status)
1246+
1247+
cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded)
1248+
require.NotNil(t, cond)
1249+
require.Equal(t, metav1.ConditionTrue, cond.Status)
1250+
}
1251+
10731252
func newTestClusterExtension() *ocv1.ClusterExtension {
10741253
return &ocv1.ClusterExtension{
10751254
ObjectMeta: metav1.ObjectMeta{

0 commit comments

Comments
 (0)