Skip to content

Commit 451e43a

Browse files
fix: Allow spec changes after ProgressDeadlineExceeded
This commit addresses the reconcile loop and archival timeout issues when ProgressDeadlineExceeded is set on a ClusterObjectSet. Generated-By: Claude
1 parent 4510b1b commit 451e43a

File tree

4 files changed

+358
-19
lines changed

4 files changed

+358
-19
lines changed

internal/operator-controller/controllers/clusterobjectset_controller.go

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import (
3131
"sigs.k8s.io/controller-runtime/pkg/builder"
3232
"sigs.k8s.io/controller-runtime/pkg/client"
3333
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
34-
"sigs.k8s.io/controller-runtime/pkg/event"
3534
"sigs.k8s.io/controller-runtime/pkg/handler"
3635
"sigs.k8s.io/controller-runtime/pkg/log"
3736
"sigs.k8s.io/controller-runtime/pkg/predicate"
@@ -81,7 +80,10 @@ func (c *ClusterObjectSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
8180
reconciledRev := existingRev.DeepCopy()
8281
res, reconcileErr := c.reconcile(ctx, reconciledRev)
8382

84-
if pd := existingRev.Spec.ProgressDeadlineMinutes; pd > 0 {
83+
// Progress deadline enforcement only applies to Active revisions, not Archived ones.
84+
// Archived revisions may need to retry teardown operations and should not have their
85+
// Progressing condition or requeue behavior overridden by the deadline check.
86+
if pd := existingRev.Spec.ProgressDeadlineMinutes; pd > 0 && reconciledRev.Spec.LifecycleState != ocv1.ClusterObjectSetLifecycleStateArchived {
8587
cnd := meta.FindStatusCondition(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
8688
isStillProgressing := cnd != nil && cnd.Status == metav1.ConditionTrue && cnd.Reason != ocv1.ReasonSucceeded
8789
succeeded := meta.IsStatusConditionTrue(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded)
@@ -344,29 +346,12 @@ type Sourcoser interface {
344346
}
345347

346348
func (c *ClusterObjectSetReconciler) SetupWithManager(mgr ctrl.Manager) error {
347-
skipProgressDeadlineExceededPredicate := predicate.Funcs{
348-
UpdateFunc: func(e event.UpdateEvent) bool {
349-
rev, ok := e.ObjectNew.(*ocv1.ClusterObjectSet)
350-
if !ok {
351-
return true
352-
}
353-
// allow deletions to happen
354-
if !rev.DeletionTimestamp.IsZero() {
355-
return true
356-
}
357-
if cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded {
358-
return false
359-
}
360-
return true
361-
},
362-
}
363349
c.Clock = clock.RealClock{}
364350
return ctrl.NewControllerManagedBy(mgr).
365351
For(
366352
&ocv1.ClusterObjectSet{},
367353
builder.WithPredicates(
368354
predicate.ResourceVersionChangedPredicate{},
369-
skipProgressDeadlineExceededPredicate,
370355
),
371356
).
372357
WatchesRawSource(
@@ -684,7 +669,33 @@ func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string) {
684669
}
685670
}
686671

672+
// markAsProgressing sets the Progressing condition to True with the given reason.
673+
//
674+
// Once ProgressDeadlineExceeded has been set for the current generation, this function
675+
// keeps that condition sticky for non-terminal progressing reasons that would otherwise
676+
// cause the reconciler to oscillate between Progressing=True and
677+
// ProgressDeadlineExceeded=False for the same generation.
678+
//
679+
// In particular, both RollingOut and Retrying are blocked for Active revisions.
680+
// Allowing those reasons to overwrite ProgressDeadlineExceeded can trigger unnecessary
681+
// status-only reconciles and interfere with retry/requeue behavior due to the progress
682+
// deadline check resetting the condition.
683+
//
684+
// Archived revisions are exempt from this guard because the progress deadline check
685+
// does not apply to them, allowing archival teardown retries to show Retrying status.
686+
//
687+
// If the generation changed (spec update) since ProgressDeadlineExceeded was recorded,
688+
// the guard allows the update through so the condition reflects the new generation.
687689
func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string) {
690+
if cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil &&
691+
cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded &&
692+
cnd.ObservedGeneration == cos.Generation &&
693+
(reason == ocv1.ReasonRollingOut || reason == ocv1.ClusterObjectSetReasonRetrying) &&
694+
cos.Spec.LifecycleState != ocv1.ClusterObjectSetLifecycleStateArchived {
695+
log.Log.V(1).Info("skipping markAsProgressing: ProgressDeadlineExceeded is sticky for non-terminal progressing reasons",
696+
"requestedReason", reason, "revision", cos.Name, "lifecycleState", cos.Spec.LifecycleState)
697+
return
698+
}
688699
meta.SetStatusCondition(&cos.Status.Conditions, metav1.Condition{
689700
Type: ocv1.ClusterObjectSetTypeProgressing,
690701
Status: metav1.ConditionTrue,

internal/operator-controller/controllers/clusterobjectset_controller_internal_test.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"time"
77

88
"github.com/stretchr/testify/require"
9+
"k8s.io/apimachinery/pkg/api/meta"
910
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1011
"k8s.io/apimachinery/pkg/runtime"
1112
"k8s.io/apimachinery/pkg/runtime/schema"
@@ -235,3 +236,113 @@ func (m *mockTrackingCacheInternal) Watch(ctx context.Context, user client.Objec
235236
func (m *mockTrackingCacheInternal) Source(h handler.EventHandler, predicates ...predicate.Predicate) source.Source {
236237
return nil
237238
}
239+
240+
func Test_markAsProgressing_doesNotOverwriteProgressDeadlineExceeded(t *testing.T) {
241+
for _, tc := range []struct {
242+
name string
243+
generation int64
244+
existingConditions []metav1.Condition
245+
reason string
246+
expectProgressing metav1.ConditionStatus
247+
expectReason string
248+
}{
249+
{
250+
name: "sets Progressing when no condition exists",
251+
generation: 1,
252+
existingConditions: nil,
253+
reason: ocv1.ReasonRollingOut,
254+
expectProgressing: metav1.ConditionTrue,
255+
expectReason: ocv1.ReasonRollingOut,
256+
},
257+
{
258+
name: "sets Progressing when currently RollingOut",
259+
generation: 1,
260+
existingConditions: []metav1.Condition{
261+
{
262+
Type: ocv1.ClusterObjectSetTypeProgressing,
263+
Status: metav1.ConditionTrue,
264+
Reason: ocv1.ReasonRollingOut,
265+
ObservedGeneration: 1,
266+
},
267+
},
268+
reason: ocv1.ReasonSucceeded,
269+
expectProgressing: metav1.ConditionTrue,
270+
expectReason: ocv1.ReasonSucceeded,
271+
},
272+
{
273+
name: "does not overwrite ProgressDeadlineExceeded with RollingOut (same generation)",
274+
generation: 1,
275+
existingConditions: []metav1.Condition{
276+
{
277+
Type: ocv1.ClusterObjectSetTypeProgressing,
278+
Status: metav1.ConditionFalse,
279+
Reason: ocv1.ReasonProgressDeadlineExceeded,
280+
ObservedGeneration: 1,
281+
},
282+
},
283+
reason: ocv1.ReasonRollingOut,
284+
expectProgressing: metav1.ConditionFalse,
285+
expectReason: ocv1.ReasonProgressDeadlineExceeded,
286+
},
287+
{
288+
name: "allows Succeeded to overwrite ProgressDeadlineExceeded",
289+
generation: 1,
290+
existingConditions: []metav1.Condition{
291+
{
292+
Type: ocv1.ClusterObjectSetTypeProgressing,
293+
Status: metav1.ConditionFalse,
294+
Reason: ocv1.ReasonProgressDeadlineExceeded,
295+
ObservedGeneration: 1,
296+
},
297+
},
298+
reason: ocv1.ReasonSucceeded,
299+
expectProgressing: metav1.ConditionTrue,
300+
expectReason: ocv1.ReasonSucceeded,
301+
},
302+
{
303+
name: "does not overwrite ProgressDeadlineExceeded with Retrying (same generation)",
304+
generation: 1,
305+
existingConditions: []metav1.Condition{
306+
{
307+
Type: ocv1.ClusterObjectSetTypeProgressing,
308+
Status: metav1.ConditionFalse,
309+
Reason: ocv1.ReasonProgressDeadlineExceeded,
310+
ObservedGeneration: 1,
311+
},
312+
},
313+
reason: ocv1.ClusterObjectSetReasonRetrying,
314+
expectProgressing: metav1.ConditionFalse,
315+
expectReason: ocv1.ReasonProgressDeadlineExceeded,
316+
},
317+
{
318+
name: "allows RollingOut when generation changed since ProgressDeadlineExceeded",
319+
generation: 2,
320+
existingConditions: []metav1.Condition{
321+
{
322+
Type: ocv1.ClusterObjectSetTypeProgressing,
323+
Status: metav1.ConditionFalse,
324+
Reason: ocv1.ReasonProgressDeadlineExceeded,
325+
ObservedGeneration: 1,
326+
},
327+
},
328+
reason: ocv1.ReasonRollingOut,
329+
expectProgressing: metav1.ConditionTrue,
330+
expectReason: ocv1.ReasonRollingOut,
331+
},
332+
} {
333+
t.Run(tc.name, func(t *testing.T) {
334+
cos := &ocv1.ClusterObjectSet{
335+
ObjectMeta: metav1.ObjectMeta{Generation: tc.generation},
336+
Status: ocv1.ClusterObjectSetStatus{
337+
Conditions: tc.existingConditions,
338+
},
339+
}
340+
markAsProgressing(cos, tc.reason, "test message")
341+
342+
cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
343+
require.NotNil(t, cnd)
344+
require.Equal(t, tc.expectProgressing, cnd.Status)
345+
require.Equal(t, tc.expectReason, cnd.Reason)
346+
})
347+
}
348+
}

internal/operator-controller/controllers/clusterobjectset_controller_test.go

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,188 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) {
10701070
}
10711071
}
10721072

1073+
// Test_ClusterObjectSetReconciler_Reconcile_ArchivalAfterProgressDeadlineExceeded verifies that
1074+
// a COS with ProgressDeadlineExceeded can still be archived. It simulates the real scenario:
1075+
// the COS starts as Active with ProgressDeadlineExceeded, then a spec patch sets
1076+
// lifecycleState to Archived (as a succeeding revision would do), and a subsequent
1077+
// reconcile processes the archival.
1078+
func Test_ClusterObjectSetReconciler_Reconcile_ArchivalAfterProgressDeadlineExceeded(t *testing.T) {
1079+
const clusterObjectSetName = "test-ext-1"
1080+
1081+
testScheme := newScheme(t)
1082+
require.NoError(t, corev1.AddToScheme(testScheme))
1083+
1084+
ext := newTestClusterExtension()
1085+
rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme)
1086+
rev1.Finalizers = []string{"olm.operatorframework.io/teardown"}
1087+
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
1088+
Type: ocv1.ClusterObjectSetTypeProgressing,
1089+
Status: metav1.ConditionFalse,
1090+
Reason: ocv1.ReasonProgressDeadlineExceeded,
1091+
Message: "Revision has not rolled out for 1 minute(s).",
1092+
ObservedGeneration: rev1.Generation,
1093+
})
1094+
1095+
testClient := fake.NewClientBuilder().
1096+
WithScheme(testScheme).
1097+
WithStatusSubresource(&ocv1.ClusterObjectSet{}).
1098+
WithObjects(rev1, ext).
1099+
Build()
1100+
1101+
// Simulate the patch that a succeeding revision would apply.
1102+
patch := []byte(`{"spec":{"lifecycleState":"Archived"}}`)
1103+
require.NoError(t, testClient.Patch(t.Context(),
1104+
&ocv1.ClusterObjectSet{ObjectMeta: metav1.ObjectMeta{Name: clusterObjectSetName}},
1105+
client.RawPatch(types.MergePatchType, patch),
1106+
))
1107+
1108+
mockEngine := &mockRevisionEngine{
1109+
teardown: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionTeardownOption) (machinery.RevisionTeardownResult, error) {
1110+
return &mockRevisionTeardownResult{isComplete: true}, nil
1111+
},
1112+
}
1113+
1114+
result, err := (&controllers.ClusterObjectSetReconciler{
1115+
Client: testClient,
1116+
RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine},
1117+
TrackingCache: &mockTrackingCache{client: testClient},
1118+
Clock: clock.RealClock{},
1119+
}).Reconcile(t.Context(), ctrl.Request{
1120+
NamespacedName: types.NamespacedName{Name: clusterObjectSetName},
1121+
})
1122+
require.NoError(t, err)
1123+
require.Equal(t, ctrl.Result{}, result)
1124+
1125+
rev := &ocv1.ClusterObjectSet{}
1126+
require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev))
1127+
1128+
cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
1129+
require.NotNil(t, cond)
1130+
require.Equal(t, metav1.ConditionFalse, cond.Status)
1131+
require.Equal(t, ocv1.ClusterObjectSetReasonArchived, cond.Reason)
1132+
1133+
cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable)
1134+
require.NotNil(t, cond)
1135+
require.Equal(t, metav1.ConditionUnknown, cond.Status)
1136+
require.Equal(t, ocv1.ClusterObjectSetReasonArchived, cond.Reason)
1137+
}
1138+
1139+
// Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_StaysSticky verifies that
1140+
// once ProgressDeadlineExceeded is set, it is not overwritten when the reconciler runs again
1141+
// and sees objects still in transition. The Progressing condition should stay at
1142+
// ProgressDeadlineExceeded instead of being set back to RollingOut.
1143+
func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_StaysSticky(t *testing.T) {
1144+
const clusterObjectSetName = "test-ext-1"
1145+
1146+
testScheme := newScheme(t)
1147+
require.NoError(t, corev1.AddToScheme(testScheme))
1148+
1149+
ext := newTestClusterExtension()
1150+
rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme)
1151+
rev1.Spec.ProgressDeadlineMinutes = 1
1152+
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-5 * time.Minute))
1153+
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
1154+
Type: ocv1.ClusterObjectSetTypeProgressing,
1155+
Status: metav1.ConditionFalse,
1156+
Reason: ocv1.ReasonProgressDeadlineExceeded,
1157+
Message: "Revision has not rolled out for 1 minute(s).",
1158+
ObservedGeneration: rev1.Generation,
1159+
})
1160+
1161+
testClient := fake.NewClientBuilder().
1162+
WithScheme(testScheme).
1163+
WithStatusSubresource(&ocv1.ClusterObjectSet{}).
1164+
WithObjects(rev1, ext).
1165+
Build()
1166+
1167+
mockEngine := &mockRevisionEngine{
1168+
reconcile: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionReconcileOption) (machinery.RevisionResult, error) {
1169+
return &mockRevisionResult{inTransition: true}, nil
1170+
},
1171+
}
1172+
1173+
result, err := (&controllers.ClusterObjectSetReconciler{
1174+
Client: testClient,
1175+
RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine},
1176+
TrackingCache: &mockTrackingCache{client: testClient},
1177+
Clock: clock.RealClock{},
1178+
}).Reconcile(t.Context(), ctrl.Request{
1179+
NamespacedName: types.NamespacedName{Name: clusterObjectSetName},
1180+
})
1181+
require.NoError(t, err)
1182+
require.Equal(t, ctrl.Result{}, result)
1183+
1184+
rev := &ocv1.ClusterObjectSet{}
1185+
require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev))
1186+
1187+
cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
1188+
require.NotNil(t, cond)
1189+
require.Equal(t, metav1.ConditionFalse, cond.Status, "Progressing should stay False")
1190+
require.Equal(t, ocv1.ReasonProgressDeadlineExceeded, cond.Reason, "Reason should stay ProgressDeadlineExceeded")
1191+
}
1192+
1193+
// Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_SucceededOverrides verifies
1194+
// that if a revision's objects eventually complete rolling out after the deadline was exceeded,
1195+
// the COS can still transition to Succeeded. ProgressDeadlineExceeded should not prevent a
1196+
// revision from completing when its objects become ready.
1197+
func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadlineExceeded_SucceededOverrides(t *testing.T) {
1198+
const clusterObjectSetName = "test-ext-1"
1199+
1200+
testScheme := newScheme(t)
1201+
require.NoError(t, corev1.AddToScheme(testScheme))
1202+
1203+
ext := newTestClusterExtension()
1204+
rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme)
1205+
rev1.Spec.ProgressDeadlineMinutes = 1
1206+
rev1.CreationTimestamp = metav1.NewTime(time.Now().Add(-5 * time.Minute))
1207+
meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{
1208+
Type: ocv1.ClusterObjectSetTypeProgressing,
1209+
Status: metav1.ConditionFalse,
1210+
Reason: ocv1.ReasonProgressDeadlineExceeded,
1211+
Message: "Revision has not rolled out for 1 minute(s).",
1212+
ObservedGeneration: rev1.Generation,
1213+
})
1214+
1215+
testClient := fake.NewClientBuilder().
1216+
WithScheme(testScheme).
1217+
WithStatusSubresource(&ocv1.ClusterObjectSet{}).
1218+
WithObjects(rev1, ext).
1219+
Build()
1220+
1221+
mockEngine := &mockRevisionEngine{
1222+
reconcile: func(ctx context.Context, rev machinerytypes.Revision, opts ...machinerytypes.RevisionReconcileOption) (machinery.RevisionResult, error) {
1223+
return &mockRevisionResult{isComplete: true}, nil
1224+
},
1225+
}
1226+
1227+
result, err := (&controllers.ClusterObjectSetReconciler{
1228+
Client: testClient,
1229+
RevisionEngineFactory: &mockRevisionEngineFactory{engine: mockEngine},
1230+
TrackingCache: &mockTrackingCache{client: testClient},
1231+
Clock: clock.RealClock{},
1232+
}).Reconcile(t.Context(), ctrl.Request{
1233+
NamespacedName: types.NamespacedName{Name: clusterObjectSetName},
1234+
})
1235+
require.NoError(t, err)
1236+
require.Equal(t, ctrl.Result{}, result)
1237+
1238+
rev := &ocv1.ClusterObjectSet{}
1239+
require.NoError(t, testClient.Get(t.Context(), client.ObjectKey{Name: clusterObjectSetName}, rev))
1240+
1241+
cond := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
1242+
require.NotNil(t, cond)
1243+
require.Equal(t, metav1.ConditionTrue, cond.Status, "Progressing should transition to True")
1244+
require.Equal(t, ocv1.ReasonSucceeded, cond.Reason, "Reason should be Succeeded")
1245+
1246+
cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable)
1247+
require.NotNil(t, cond)
1248+
require.Equal(t, metav1.ConditionTrue, cond.Status)
1249+
1250+
cond = meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded)
1251+
require.NotNil(t, cond)
1252+
require.Equal(t, metav1.ConditionTrue, cond.Status)
1253+
}
1254+
10731255
func newTestClusterExtension() *ocv1.ClusterExtension {
10741256
return &ocv1.ClusterExtension{
10751257
ObjectMeta: metav1.ObjectMeta{

0 commit comments

Comments
 (0)