Skip to content

Commit 96d0dc1

Browse files
fix: Allow spec changes after ProgressDeadlineExceeded
This commit addresses the reconcile loop and archival timeout issues when ProgressDeadlineExceeded is set on a ClusterObjectSet. Generated-By: Claude
1 parent 4510b1b commit 96d0dc1

File tree

9 files changed

+480
-55
lines changed

9 files changed

+480
-55
lines changed

internal/operator-controller/controllers/boxcutter_reconcile_steps.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,11 @@ func (d *BoxcutterRevisionStatesGetter) GetRevisionStates(ctx context.Context, e
6262
// is fairly decoupled from this code where we get the annotations back out. We may want to co-locate
6363
// the set/get logic a bit better to make it more maintainable and less likely to get out of sync.
6464
rm := &RevisionMetadata{
65-
RevisionName: rev.Name,
66-
Package: rev.Annotations[labels.PackageNameKey],
67-
Image: rev.Annotations[labels.BundleReferenceKey],
68-
Conditions: rev.Status.Conditions,
65+
RevisionName: rev.Name,
66+
Package: rev.Annotations[labels.PackageNameKey],
67+
Image: rev.Annotations[labels.BundleReferenceKey],
68+
ResolutionDigest: rev.Annotations[labels.ResolutionDigestKey],
69+
Conditions: rev.Status.Conditions,
6970
BundleMetadata: ocv1.BundleMetadata{
7071
Name: rev.Annotations[labels.BundleNameKey],
7172
Version: rev.Annotations[labels.BundleVersionKey],
@@ -100,10 +101,11 @@ func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, e
100101
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
101102
l := log.FromContext(ctx)
102103
revisionAnnotations := map[string]string{
103-
labels.BundleNameKey: state.resolvedRevisionMetadata.Name,
104-
labels.PackageNameKey: state.resolvedRevisionMetadata.Package,
105-
labels.BundleVersionKey: state.resolvedRevisionMetadata.Version,
106-
labels.BundleReferenceKey: state.resolvedRevisionMetadata.Image,
104+
labels.BundleNameKey: state.resolvedRevisionMetadata.Name,
105+
labels.PackageNameKey: state.resolvedRevisionMetadata.Package,
106+
labels.BundleVersionKey: state.resolvedRevisionMetadata.Version,
107+
labels.BundleReferenceKey: state.resolvedRevisionMetadata.Image,
108+
labels.ResolutionDigestKey: state.resolvedRevisionMetadata.ResolutionDigest,
107109
}
108110
objLbls := map[string]string{
109111
labels.OwnerKindKey: ocv1.ClusterExtensionKind,

internal/operator-controller/controllers/clusterextension_controller.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -499,9 +499,10 @@ func clusterExtensionRequestsForCatalog(c client.Reader, logger logr.Logger) crh
499499
}
500500

501501
type RevisionMetadata struct {
502-
RevisionName string
503-
Package string
504-
Image string
502+
RevisionName string
503+
Package string
504+
Image string
505+
ResolutionDigest string
505506
ocv1.BundleMetadata
506507
Conditions []metav1.Condition
507508
}
@@ -535,8 +536,9 @@ func (d *HelmRevisionStatesGetter) GetRevisionStates(ctx context.Context, ext *o
535536
for _, rel := range relhis {
536537
if rel.Info != nil && rel.Info.Status == release.StatusDeployed {
537538
rs.Installed = &RevisionMetadata{
538-
Package: rel.Labels[labels.PackageNameKey],
539-
Image: rel.Labels[labels.BundleReferenceKey],
539+
Package: rel.Labels[labels.PackageNameKey],
540+
Image: rel.Labels[labels.BundleReferenceKey],
541+
ResolutionDigest: rel.Labels[labels.ResolutionDigestKey],
540542
BundleMetadata: ocv1.BundleMetadata{
541543
Name: rel.Labels[labels.BundleNameKey],
542544
Version: rel.Labels[labels.BundleVersionKey],

internal/operator-controller/controllers/clusterextension_controller_test.go

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,12 @@ func TestClusterExtensionBoxcutterApplierFailsDoesNotLeakDeprecationErrors(t *te
785785
// the same inputs the runtime would see.
786786
d.RevisionStatesGetter = &MockRevisionStatesGetter{
787787
RevisionStates: &controllers.RevisionStates{
788-
RollingOut: []*controllers.RevisionMetadata{{}},
788+
RollingOut: []*controllers.RevisionMetadata{{
789+
// Different digest from current spec - ensures we call the resolver
790+
// (This simulates a rolling-out revision from a previous reconcile)
791+
ResolutionDigest: "different-digest-from-previous-reconcile",
792+
BundleMetadata: ocv1.BundleMetadata{Version: "1.0.0"},
793+
}},
789794
},
790795
}
791796
d.Resolver = resolve.Func(func(ctx context.Context, ext *ocv1.ClusterExtension, installedBundle *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) {
@@ -845,21 +850,18 @@ func TestClusterExtensionBoxcutterApplierFailsDoesNotLeakDeprecationErrors(t *te
845850

846851
deprecatedCond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeDeprecated)
847852
require.NotNil(t, deprecatedCond)
848-
require.Equal(t, metav1.ConditionUnknown, deprecatedCond.Status, "no catalog data during rollout, so Unknown")
849-
require.Equal(t, ocv1.ReasonDeprecationStatusUnknown, deprecatedCond.Reason)
850-
require.Equal(t, "deprecation status unknown: catalog data unavailable", deprecatedCond.Message)
853+
require.Equal(t, metav1.ConditionFalse, deprecatedCond.Status, "catalog data available from resolution, not deprecated")
854+
require.Equal(t, ocv1.ReasonNotDeprecated, deprecatedCond.Reason)
851855

852856
packageCond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypePackageDeprecated)
853857
require.NotNil(t, packageCond)
854-
require.Equal(t, metav1.ConditionUnknown, packageCond.Status, "no catalog data during rollout, so Unknown")
855-
require.Equal(t, ocv1.ReasonDeprecationStatusUnknown, packageCond.Reason)
856-
require.Equal(t, "deprecation status unknown: catalog data unavailable", packageCond.Message)
858+
require.Equal(t, metav1.ConditionFalse, packageCond.Status, "catalog data available from resolution, package not deprecated")
859+
require.Equal(t, ocv1.ReasonNotDeprecated, packageCond.Reason)
857860

858861
channelCond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeChannelDeprecated)
859862
require.NotNil(t, channelCond)
860-
require.Equal(t, metav1.ConditionUnknown, channelCond.Status, "no catalog data during rollout, so Unknown")
861-
require.Equal(t, ocv1.ReasonDeprecationStatusUnknown, channelCond.Reason)
862-
require.Equal(t, "deprecation status unknown: catalog data unavailable", channelCond.Message)
863+
require.Equal(t, metav1.ConditionFalse, channelCond.Status, "catalog data available from resolution, channel not deprecated")
864+
require.Equal(t, ocv1.ReasonNotDeprecated, channelCond.Reason)
863865

864866
bundleCond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeBundleDeprecated)
865867
require.NotNil(t, bundleCond)

internal/operator-controller/controllers/clusterextension_reconcile_steps.go

Lines changed: 86 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ package controllers
1818

1919
import (
2020
"context"
21+
"crypto/sha256"
22+
"encoding/json"
2123
"errors"
2224
"fmt"
2325

@@ -37,6 +39,51 @@ import (
3739
imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image"
3840
)
3941

42+
// resolutionInputs captures the catalog filter fields that affect bundle resolution.
43+
// Changes to any of these fields require re-resolution.
44+
type resolutionInputs struct {
45+
PackageName string `json:"packageName"`
46+
Version string `json:"version"`
47+
Channels []string `json:"channels,omitempty"`
48+
Selector string `json:"selector,omitempty"`
49+
UpgradeConstraintPolicy string `json:"upgradeConstraintPolicy,omitempty"`
50+
}
51+
52+
// calculateResolutionDigest computes a SHA256 hash of the catalog filter inputs
53+
// that affect bundle resolution. This digest enables detection of spec changes that
54+
// require re-resolution versus when a rolling-out revision can be reused.
55+
func calculateResolutionDigest(catalog *ocv1.CatalogFilter) (string, error) {
56+
if catalog == nil {
57+
return "", nil
58+
}
59+
60+
inputs := resolutionInputs{
61+
PackageName: catalog.PackageName,
62+
Version: catalog.Version,
63+
Channels: catalog.Channels,
64+
UpgradeConstraintPolicy: string(catalog.UpgradeConstraintPolicy),
65+
}
66+
67+
// Convert selector to a canonical string representation
68+
if catalog.Selector != nil {
69+
selectorBytes, err := json.Marshal(catalog.Selector)
70+
if err != nil {
71+
return "", fmt.Errorf("marshaling selector: %w", err)
72+
}
73+
inputs.Selector = string(selectorBytes)
74+
}
75+
76+
// Marshal to JSON for consistent hashing
77+
inputsBytes, err := json.Marshal(inputs)
78+
if err != nil {
79+
return "", fmt.Errorf("marshaling resolution inputs: %w", err)
80+
}
81+
82+
// Compute SHA256 hash and return hex string
83+
hash := sha256.Sum256(inputsBytes)
84+
return fmt.Sprintf("%x", hash), nil
85+
}
86+
4087
func HandleFinalizers(f finalizer.Finalizer) ReconcileStepFunc {
4188
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
4289
l := log.FromContext(ctx)
@@ -140,15 +187,32 @@ func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc {
140187
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
141188
l := log.FromContext(ctx)
142189

143-
// If already rolling out, use existing revision and set deprecation to Unknown (no catalog check)
190+
// If already rolling out, check if the latest rolling-out revision matches the current spec.
191+
// If spec has changed (e.g., version, channels, selector, upgradeConstraintPolicy), we need
192+
// to resolve a new bundle instead of reusing the rolling-out revision.
193+
//
194+
// Note: RollingOut slice is sorted oldest→newest, so use the last element (most recent).
195+
// This avoids re-resolving when a newer revision already matches the spec, even if an
196+
// older revision is still stuck rolling out.
144197
if len(state.revisionStates.RollingOut) > 0 {
145-
installedBundleName := ""
146-
if state.revisionStates.Installed != nil {
147-
installedBundleName = state.revisionStates.Installed.Name
198+
latestRollingOutRevision := state.revisionStates.RollingOut[len(state.revisionStates.RollingOut)-1]
199+
200+
// Calculate digest of current spec's catalog filter
201+
currentDigest, err := calculateResolutionDigest(ext.Spec.Source.Catalog)
202+
if err != nil {
203+
l.Error(err, "failed to calculate resolution digest from spec")
204+
// On digest calculation error, fall through to re-resolve for safety
205+
} else if currentDigest == latestRollingOutRevision.ResolutionDigest {
206+
// Resolution inputs haven't changed - reuse the rolling-out revision
207+
installedBundleName := ""
208+
if state.revisionStates.Installed != nil {
209+
installedBundleName = state.revisionStates.Installed.Name
210+
}
211+
SetDeprecationStatus(ext, installedBundleName, nil, false)
212+
state.resolvedRevisionMetadata = latestRollingOutRevision
213+
return nil, nil
148214
}
149-
SetDeprecationStatus(ext, installedBundleName, nil, false)
150-
state.resolvedRevisionMetadata = state.revisionStates.RollingOut[0]
151-
return nil, nil
215+
// Resolution inputs changed - fall through to resolve new bundle from catalog
152216
}
153217

154218
// Resolve a new bundle from the catalog
@@ -188,9 +252,17 @@ func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc {
188252
return handleResolutionError(ctx, c, state, ext, err)
189253
}
190254

255+
// Calculate digest of the resolution inputs for change detection
256+
digest, err := calculateResolutionDigest(ext.Spec.Source.Catalog)
257+
if err != nil {
258+
l.Error(err, "failed to calculate resolution digest, continuing without it")
259+
digest = ""
260+
}
261+
191262
state.resolvedRevisionMetadata = &RevisionMetadata{
192-
Package: resolvedBundle.Package,
193-
Image: resolvedBundle.Image,
263+
Package: resolvedBundle.Package,
264+
Image: resolvedBundle.Image,
265+
ResolutionDigest: digest,
194266
// TODO: Right now, operator-controller only supports registry+v1 bundles and has no concept
195267
// of a "release" field. If/when we add a release field concept or a new bundle format
196268
// we need to re-evaluate use of `AsLegacyRegistryV1Version` so that we avoid propagating
@@ -409,10 +481,11 @@ func ApplyBundle(a Applier) ReconcileStepFunc {
409481
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
410482
l := log.FromContext(ctx)
411483
revisionAnnotations := map[string]string{
412-
labels.BundleNameKey: state.resolvedRevisionMetadata.Name,
413-
labels.PackageNameKey: state.resolvedRevisionMetadata.Package,
414-
labels.BundleVersionKey: state.resolvedRevisionMetadata.Version,
415-
labels.BundleReferenceKey: state.resolvedRevisionMetadata.Image,
484+
labels.BundleNameKey: state.resolvedRevisionMetadata.Name,
485+
labels.PackageNameKey: state.resolvedRevisionMetadata.Package,
486+
labels.BundleVersionKey: state.resolvedRevisionMetadata.Version,
487+
labels.BundleReferenceKey: state.resolvedRevisionMetadata.Image,
488+
labels.ResolutionDigestKey: state.resolvedRevisionMetadata.ResolutionDigest,
416489
}
417490
objLbls := map[string]string{
418491
labels.OwnerKindKey: ocv1.ClusterExtensionKind,

internal/operator-controller/controllers/clusterobjectset_controller.go

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import (
3131
"sigs.k8s.io/controller-runtime/pkg/builder"
3232
"sigs.k8s.io/controller-runtime/pkg/client"
3333
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
34-
"sigs.k8s.io/controller-runtime/pkg/event"
3534
"sigs.k8s.io/controller-runtime/pkg/handler"
3635
"sigs.k8s.io/controller-runtime/pkg/log"
3736
"sigs.k8s.io/controller-runtime/pkg/predicate"
@@ -81,7 +80,10 @@ func (c *ClusterObjectSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
8180
reconciledRev := existingRev.DeepCopy()
8281
res, reconcileErr := c.reconcile(ctx, reconciledRev)
8382

84-
if pd := existingRev.Spec.ProgressDeadlineMinutes; pd > 0 {
83+
// Progress deadline enforcement only applies to Active revisions, not Archived ones.
84+
// Archived revisions may need to retry teardown operations and should not have their
85+
// Progressing condition or requeue behavior overridden by the deadline check.
86+
if pd := existingRev.Spec.ProgressDeadlineMinutes; pd > 0 && reconciledRev.Spec.LifecycleState != ocv1.ClusterObjectSetLifecycleStateArchived {
8587
cnd := meta.FindStatusCondition(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing)
8688
isStillProgressing := cnd != nil && cnd.Status == metav1.ConditionTrue && cnd.Reason != ocv1.ReasonSucceeded
8789
succeeded := meta.IsStatusConditionTrue(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded)
@@ -344,29 +346,12 @@ type Sourcoser interface {
344346
}
345347

346348
func (c *ClusterObjectSetReconciler) SetupWithManager(mgr ctrl.Manager) error {
347-
skipProgressDeadlineExceededPredicate := predicate.Funcs{
348-
UpdateFunc: func(e event.UpdateEvent) bool {
349-
rev, ok := e.ObjectNew.(*ocv1.ClusterObjectSet)
350-
if !ok {
351-
return true
352-
}
353-
// allow deletions to happen
354-
if !rev.DeletionTimestamp.IsZero() {
355-
return true
356-
}
357-
if cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded {
358-
return false
359-
}
360-
return true
361-
},
362-
}
363349
c.Clock = clock.RealClock{}
364350
return ctrl.NewControllerManagedBy(mgr).
365351
For(
366352
&ocv1.ClusterObjectSet{},
367353
builder.WithPredicates(
368354
predicate.ResourceVersionChangedPredicate{},
369-
skipProgressDeadlineExceededPredicate,
370355
),
371356
).
372357
WatchesRawSource(
@@ -684,7 +669,33 @@ func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string) {
684669
}
685670
}
686671

672+
// markAsProgressing sets the Progressing condition to True with the given reason.
673+
//
674+
// Once ProgressDeadlineExceeded has been set for the current generation, this function
675+
// blocks only the RollingOut reason to prevent a reconcile loop for Active revisions.
676+
// The loop would occur when reconcile() sees in-transition objects and sets
677+
// Progressing=True/RollingOut, then the progress deadline check immediately resets it
678+
// to ProgressDeadlineExceeded, causing a status-only update that triggers another reconcile.
679+
//
680+
// Archived revisions are exempt because they skip progress deadline enforcement entirely,
681+
// so their status updates won't be overridden.
682+
//
683+
// Other reasons like Succeeded and Retrying are always allowed because:
684+
// - Succeeded represents terminal success (rollout eventually completed)
685+
// - Retrying represents transient errors that need to be visible in status
686+
//
687+
// If the generation changed since ProgressDeadlineExceeded was recorded, the guard
688+
// allows all updates through so the condition reflects the new generation.
687689
func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string) {
690+
if cnd := meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil &&
691+
cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded &&
692+
cnd.ObservedGeneration == cos.Generation &&
693+
reason == ocv1.ReasonRollingOut &&
694+
cos.Spec.LifecycleState != ocv1.ClusterObjectSetLifecycleStateArchived {
695+
log.Log.V(1).Info("skipping markAsProgressing: ProgressDeadlineExceeded is sticky for RollingOut on Active revisions",
696+
"requestedReason", reason, "revision", cos.Name, "lifecycleState", cos.Spec.LifecycleState)
697+
return
698+
}
688699
meta.SetStatusCondition(&cos.Status.Conditions, metav1.Condition{
689700
Type: ocv1.ClusterObjectSetTypeProgressing,
690701
Status: metav1.ConditionTrue,

0 commit comments

Comments
 (0)