Skip to content

Commit d31cbc3

Browse files
fix(Boxcutter): Fix ClusterExtension spec changes ignored while revision is rolling out
When a ClusterExtensionRevision was stuck rolling out (e.g. probe failure), any changes to the ClusterExtension spec (like requesting a different version) were silently ignored. The controller kept retrying the old stuck version instead of re-resolving from the catalog. The root cause was in ResolveBundle: it unconditionally short-circuited resolution when any rolling out revision existed, without checking if the spec had changed since that revision was created. The fix adds a check that compares the rolling out revision's package name and version against the current spec constraints. If they no longer match (meaning the user changed the spec), the controller re-resolves from the catalog to honor the new desired state. Generated-by: Cursor/Claude
1 parent 147cfa8 commit d31cbc3

File tree

3 files changed

+263
-9
lines changed

3 files changed

+263
-9
lines changed

internal/operator-controller/controllers/clusterextension_controller_test.go

Lines changed: 169 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -680,10 +680,18 @@ func TestClusterExtensionBoxcutterApplierFailsDoesNotLeakDeprecationErrors(t *te
680680

681681
cl, reconciler := newClientAndReconciler(t, func(d *deps) {
682682
// Boxcutter keeps a rolling revision when apply fails. We mirror that state so the test uses
683-
// the same inputs the runtime would see.
683+
// the same inputs the runtime would see. The revision metadata must match the spec so the
684+
// controller recognizes this as a still-valid rollout (not a spec change).
684685
d.RevisionStatesGetter = &MockRevisionStatesGetter{
685686
RevisionStates: &controllers.RevisionStates{
686-
RollingOut: []*controllers.RevisionMetadata{{}},
687+
RollingOut: []*controllers.RevisionMetadata{{
688+
Package: "prometheus",
689+
Image: "quay.io/operatorhubio/prometheus@fake1.0.0",
690+
BundleMetadata: ocv1.BundleMetadata{
691+
Name: "prometheus.v1.0.0",
692+
Version: "1.0.0",
693+
},
694+
}},
687695
},
688696
}
689697
d.Resolver = resolve.Func(func(ctx context.Context, ext *ocv1.ClusterExtension, installedBundle *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) {
@@ -768,6 +776,165 @@ func TestClusterExtensionBoxcutterApplierFailsDoesNotLeakDeprecationErrors(t *te
768776
require.NoError(t, cl.DeleteAllOf(ctx, &ocv1.ClusterExtension{}))
769777
}
770778

779+
// TestClusterExtensionSpecMismatchWhileRollingOutReResolvesBundle verifies that when a
780+
// ClusterExtension's spec requests a different version than the revision currently rolling out,
781+
// the controller re-resolves the bundle from the catalog instead of continuing to use the stale revision.
782+
//
783+
// Scenario:
784+
// - A revision for v1.0.2 is rolling out (e.g., stuck due to deployment probe failure)
785+
// - The ClusterExtension spec already requests v1.0.3 at reconcile time (e.g., previously updated by the user)
786+
// - The controller detects the spec/revision mismatch and re-resolves from the catalog
787+
// - The resolver returns v1.0.3, which is used for the new rollout
788+
func TestClusterExtensionSpecMismatchWhileRollingOutReResolvesBundle(t *testing.T) {
789+
require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=true", features.BoxcutterRuntime)))
790+
t.Cleanup(func() {
791+
require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=false", features.BoxcutterRuntime)))
792+
})
793+
794+
resolverCalled := false
795+
cl, reconciler := newClientAndReconciler(t, func(d *deps) {
796+
d.RevisionStatesGetter = &MockRevisionStatesGetter{
797+
RevisionStates: &controllers.RevisionStates{
798+
Installed: &controllers.RevisionMetadata{
799+
Package: "nginx88138",
800+
Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.1-nginxolm88138",
801+
BundleMetadata: ocv1.BundleMetadata{
802+
Name: "nginx88138.v1.0.1",
803+
Version: "1.0.1",
804+
},
805+
},
806+
RollingOut: []*controllers.RevisionMetadata{{
807+
RevisionName: "extension-88138-2",
808+
Package: "nginx88138",
809+
Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.2-nginx88138",
810+
BundleMetadata: ocv1.BundleMetadata{
811+
Name: "nginx88138.v1.0.2",
812+
Version: "1.0.2",
813+
},
814+
}},
815+
},
816+
}
817+
d.Resolver = resolve.Func(func(ctx context.Context, ext *ocv1.ClusterExtension, installedBundle *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) {
818+
resolverCalled = true
819+
v := bundle.VersionRelease{
820+
Version: bsemver.MustParse("1.0.3"),
821+
}
822+
return &declcfg.Bundle{
823+
Name: "nginx88138.v1.0.3",
824+
Package: "nginx88138",
825+
Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.3-nginxolm88138",
826+
}, &v, nil, nil
827+
})
828+
d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}}
829+
d.Applier = &MockApplier{installCompleted: true}
830+
})
831+
832+
ctx := context.Background()
833+
extKey := types.NamespacedName{Name: fmt.Sprintf("cluster-extension-test-%s", rand.String(8))}
834+
835+
clusterExtension := &ocv1.ClusterExtension{
836+
ObjectMeta: metav1.ObjectMeta{Name: extKey.Name},
837+
Spec: ocv1.ClusterExtensionSpec{
838+
Source: ocv1.SourceConfig{
839+
SourceType: "Catalog",
840+
Catalog: &ocv1.CatalogFilter{
841+
PackageName: "nginx88138",
842+
Version: "1.0.3",
843+
},
844+
},
845+
Namespace: "default",
846+
ServiceAccount: ocv1.ServiceAccountReference{
847+
Name: "default",
848+
},
849+
},
850+
}
851+
require.NoError(t, cl.Create(ctx, clusterExtension))
852+
853+
res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey})
854+
require.Equal(t, ctrl.Result{}, res)
855+
require.NoError(t, err)
856+
857+
require.True(t, resolverCalled,
858+
"resolver should be called because the rolling out revision (v1.0.2) does not match the spec (v1.0.3)")
859+
860+
require.NoError(t, cl.Get(ctx, extKey, clusterExtension))
861+
862+
installedCond := apimeta.FindStatusCondition(clusterExtension.Status.Conditions, ocv1.TypeInstalled)
863+
require.NotNil(t, installedCond)
864+
require.Equal(t, metav1.ConditionTrue, installedCond.Status)
865+
require.Contains(t, installedCond.Message, "1.0.3")
866+
867+
require.NoError(t, cl.DeleteAllOf(ctx, &ocv1.ClusterExtension{}))
868+
}
869+
870+
// TestClusterExtensionRollingOutRevisionMatchesSpecNoReResolve verifies that when a
871+
// revision is rolling out and the spec hasn't changed, the controller uses the existing
872+
// rolling out revision without re-resolving from the catalog.
873+
func TestClusterExtensionRollingOutRevisionMatchesSpecNoReResolve(t *testing.T) {
874+
require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=true", features.BoxcutterRuntime)))
875+
t.Cleanup(func() {
876+
require.NoError(t, features.OperatorControllerFeatureGate.Set(fmt.Sprintf("%s=false", features.BoxcutterRuntime)))
877+
})
878+
879+
resolverCalled := false
880+
cl, reconciler := newClientAndReconciler(t, func(d *deps) {
881+
d.RevisionStatesGetter = &MockRevisionStatesGetter{
882+
RevisionStates: &controllers.RevisionStates{
883+
RollingOut: []*controllers.RevisionMetadata{{
884+
Package: "nginx88138",
885+
Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.2-nginx88138",
886+
BundleMetadata: ocv1.BundleMetadata{
887+
Name: "nginx88138.v1.0.2",
888+
Version: "1.0.2",
889+
},
890+
}},
891+
},
892+
}
893+
d.Resolver = resolve.Func(func(ctx context.Context, ext *ocv1.ClusterExtension, installedBundle *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) {
894+
resolverCalled = true
895+
v := bundle.VersionRelease{
896+
Version: bsemver.MustParse("1.0.2"),
897+
}
898+
return &declcfg.Bundle{
899+
Name: "nginx88138.v1.0.2",
900+
Package: "nginx88138",
901+
Image: "quay.io/olmqe/nginxolm-operator-bundle:v1.0.2-nginx88138",
902+
}, &v, nil, nil
903+
})
904+
d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}}
905+
d.Applier = &MockApplier{installCompleted: true}
906+
})
907+
908+
ctx := context.Background()
909+
extKey := types.NamespacedName{Name: fmt.Sprintf("cluster-extension-test-%s", rand.String(8))}
910+
911+
clusterExtension := &ocv1.ClusterExtension{
912+
ObjectMeta: metav1.ObjectMeta{Name: extKey.Name},
913+
Spec: ocv1.ClusterExtensionSpec{
914+
Source: ocv1.SourceConfig{
915+
SourceType: "Catalog",
916+
Catalog: &ocv1.CatalogFilter{
917+
PackageName: "nginx88138",
918+
Version: "1.0.2",
919+
},
920+
},
921+
Namespace: "default",
922+
ServiceAccount: ocv1.ServiceAccountReference{
923+
Name: "default",
924+
},
925+
},
926+
}
927+
require.NoError(t, cl.Create(ctx, clusterExtension))
928+
929+
_, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey})
930+
require.NoError(t, err)
931+
932+
require.False(t, resolverCalled,
933+
"resolver should NOT be called because the rolling out revision (v1.0.2) matches the spec (v1.0.2)")
934+
935+
require.NoError(t, cl.DeleteAllOf(ctx, &ocv1.ClusterExtension{}))
936+
}
937+
771938
func TestValidateClusterExtension(t *testing.T) {
772939
tests := []struct {
773940
name string

internal/operator-controller/controllers/clusterextension_reconcile_steps.go

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"errors"
2222
"fmt"
2323

24+
bsemver "github.com/blang/semver/v4"
2425
apierrors "k8s.io/apimachinery/pkg/api/errors"
2526
apimeta "k8s.io/apimachinery/pkg/api/meta"
2627
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -32,6 +33,7 @@ import (
3233

3334
ocv1 "github.com/operator-framework/operator-controller/api/v1"
3435
"github.com/operator-framework/operator-controller/internal/operator-controller/bundleutil"
36+
"github.com/operator-framework/operator-controller/internal/operator-controller/catalogmetadata/compare"
3537
"github.com/operator-framework/operator-controller/internal/operator-controller/labels"
3638
"github.com/operator-framework/operator-controller/internal/operator-controller/resolve"
3739
imageutil "github.com/operator-framework/operator-controller/internal/shared/util/image"
@@ -140,15 +142,32 @@ func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc {
140142
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
141143
l := log.FromContext(ctx)
142144

143-
// If already rolling out, use existing revision and set deprecation to Unknown (no catalog check)
145+
// If already rolling out, check if any rolling-out revision still matches the current spec.
146+
// When the spec hasn't changed, reuse an existing matching revision to avoid unnecessary catalog
147+
// lookups and provide resilience during catalog outages.
148+
// When the spec has changed (e.g., user requested a different version), we must re-resolve
149+
// from the catalog to honor the new desired state.
144150
if len(state.revisionStates.RollingOut) > 0 {
145-
installedBundleName := ""
146-
if state.revisionStates.Installed != nil {
147-
installedBundleName = state.revisionStates.Installed.Name
151+
for i := len(state.revisionStates.RollingOut) - 1; i >= 0; i-- {
152+
rollingOut := state.revisionStates.RollingOut[i]
153+
if rollingOutRevisionMatchesSpec(rollingOut, ext) {
154+
installedBundleName := ""
155+
if state.revisionStates.Installed != nil {
156+
installedBundleName = state.revisionStates.Installed.Name
157+
}
158+
SetDeprecationStatus(ext, installedBundleName, nil, false)
159+
state.resolvedRevisionMetadata = rollingOut
160+
return nil, nil
161+
}
148162
}
149-
SetDeprecationStatus(ext, installedBundleName, nil, false)
150-
state.resolvedRevisionMetadata = state.revisionStates.RollingOut[0]
151-
return nil, nil
163+
specVersion := ""
164+
if ext.Spec.Source.Catalog != nil {
165+
specVersion = ext.Spec.Source.Catalog.Version
166+
}
167+
l.Info("spec changed while revisions are rolling out, re-resolving bundle",
168+
"rollingOutCount", len(state.revisionStates.RollingOut),
169+
"specVersion", specVersion,
170+
)
152171
}
153172

154173
// Resolve a new bundle from the catalog
@@ -199,6 +218,43 @@ func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc {
199218
}
200219
}
201220

221+
// rollingOutRevisionMatchesSpec checks whether a rolling out revision is still compatible
222+
// with the current ClusterExtension spec. It compares the revision's package name and
223+
// version against the spec's constraints.
224+
//
225+
// Returns true if the revision matches the spec (meaning we should continue rolling it out).
226+
// Returns false if the spec has changed and we should re-resolve from the catalog.
227+
func rollingOutRevisionMatchesSpec(rm *RevisionMetadata, ext *ocv1.ClusterExtension) bool {
228+
if ext.Spec.Source.Catalog == nil {
229+
return false
230+
}
231+
232+
if rm.Package != ext.Spec.Source.Catalog.PackageName {
233+
return false
234+
}
235+
236+
specVersion := ext.Spec.Source.Catalog.Version
237+
if specVersion == "" {
238+
return true
239+
}
240+
241+
if rm.Version == "" {
242+
return false
243+
}
244+
245+
versionRange, err := compare.NewVersionRange(specVersion)
246+
if err != nil {
247+
return false
248+
}
249+
250+
revVersion, err := bsemver.Parse(rm.Version)
251+
if err != nil {
252+
return false
253+
}
254+
255+
return versionRange(revVersion)
256+
}
257+
202258
// handleResolutionError handles the case when bundle resolution fails.
203259
//
204260
// Decision logic (evaluated in order):

test/e2e/features/update.feature

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,3 +243,34 @@ Feature: Update ClusterExtension
243243
And ClusterExtensionRevision "${NAME}-2" reports Progressing as True with Reason RollingOut
244244
And ClusterExtensionRevision "${NAME}-2" reports Available as False with Reason ProbeFailure
245245

246+
@BoxcutterRuntime
247+
Scenario: Version change while a revision is stuck rolling out re-resolves from catalog
248+
Given ClusterExtension is applied
249+
"""
250+
apiVersion: olm.operatorframework.io/v1
251+
kind: ClusterExtension
252+
metadata:
253+
name: ${NAME}
254+
spec:
255+
namespace: ${TEST_NAMESPACE}
256+
serviceAccount:
257+
name: olm-sa
258+
source:
259+
sourceType: Catalog
260+
catalog:
261+
packageName: test
262+
selector:
263+
matchLabels:
264+
"olm.operatorframework.io/metadata.name": test-catalog
265+
version: 1.0.0
266+
upgradeConstraintPolicy: SelfCertified
267+
"""
268+
And ClusterExtension is rolled out
269+
And ClusterExtension is available
270+
When ClusterExtension is updated to version "1.0.2"
271+
And ClusterExtensionRevision "${NAME}-2" reports Available as False with Reason ProbeFailure
272+
And ClusterExtension is updated to version "1.0.1"
273+
Then ClusterExtension is rolled out
274+
And ClusterExtension is available
275+
And bundle "test-operator.1.0.1" is installed in version "1.0.1"
276+

0 commit comments

Comments
 (0)