Skip to content

Commit 8851183

Browse files
(fix) catalog deletion resilience support
Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails because the catalog is unavailable, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: (1) resolution falls back to the installed bundle when the catalog is unavailable; (2) unpacking is skipped when maintaining the current installed state; (3) the Helm and Boxcutter appliers handle a nil contentFS gracefully; (4) version upgrades are properly blocked without catalog access. This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access.
1 parent d15b7bf commit 8851183

4 files changed

Lines changed: 240 additions & 12 deletions

File tree

internal/operator-controller/applier/boxcutter.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,6 @@ func (m *BoxcutterStorageMigrator) Migrate(ctx context.Context, ext *ocv1.Cluste
267267
}
268268

269269
// Set initial status on the migrated revision to mark it as succeeded.
270-
//
271270
// The revision must have a Succeeded=True status condition immediately after creation.
272271
//
273272
// A revision is only considered "Installed" (vs "RollingOut") when it has this condition.
@@ -330,22 +329,35 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, obj client.Object) erro
330329
}
331330

332331
func (bc *Boxcutter) apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) {
333-
// Generate desired revision
334-
desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations)
332+
// List all existing revisions
333+
existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName())
335334
if err != nil {
336335
return false, "", err
337336
}
338337

339-
if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil {
340-
return false, "", fmt.Errorf("set ownerref: %w", err)
338+
// If contentFS is nil, we're maintaining the current state without catalog access.
339+
// In this case, we should use the existing installed revision without generating a new one.
340+
if contentFS == nil {
341+
if len(existingRevisions) == 0 {
342+
return false, "", fmt.Errorf("no bundle content available and no existing revisions found")
343+
}
344+
// At least one revision already exists: leave the existing revisions untouched and rely on their controller loop (don't create a new one).
345+
// Returning true here signals that the rollout has succeeded using the current revision; the
346+
// ClusterExtensionRevision controller will continue to reconcile and maintain the resources
347+
// independently of this apply call.
348+
return true, "", nil
341349
}
342350

343-
// List all existing revisions
344-
existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName())
351+
// Generate desired revision
352+
desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations)
345353
if err != nil {
346354
return false, "", err
347355
}
348356

357+
if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil {
358+
return false, "", fmt.Errorf("set ownerref: %w", err)
359+
}
360+
349361
currentRevision := &ocv1.ClusterExtensionRevision{}
350362
state := StateNeedsInstall
351363
// check if we can update the current revision.

internal/operator-controller/applier/helm.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE
103103
}
104104

105105
func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) {
106+
// If contentFS is nil, we're maintaining the current state without catalog access.
107+
// In this case, reconcile the existing Helm release if it exists.
108+
if contentFS == nil {
109+
ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
110+
if err != nil {
111+
return false, "", err
112+
}
113+
return h.reconcileExistingRelease(ctx, ac, ext)
114+
}
115+
106116
chrt, err := h.buildHelmChart(contentFS, ext)
107117
if err != nil {
108118
return false, "", err
@@ -197,6 +207,45 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
197207
return true, "", nil
198208
}
199209

210+
// reconcileExistingRelease reconciles an existing Helm release without catalog access.
211+
// This is used when the catalog is unavailable but we need to maintain the current installation.
212+
// It reconciles the release and sets up watchers to ensure resources are maintained.
//
// The release is looked up by ext.GetName(). Return values, in order, are
// (rollout succeeded, status string, error):
//   - (false, "", err) when the release cannot be fetched or ac.Reconcile fails;
//   - (true, "", err) when ac.Reconcile succeeded but parsing the release manifest,
//     obtaining the cache, or registering the watches failed;
//   - (true, "", nil) when every step succeeded.
//
// The status string is always empty in this function.
// NOTE(review): callers receive a non-nil error together with true on the
// watch-setup failure paths - confirm every caller handles that combination.
213+
func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) {
214+
rel, err := ac.Get(ext.GetName())
215+
// A missing release is reported with a dedicated message. The returned error is
// built without %w, so it does not wrap driver.ErrReleaseNotFound.
if errors.Is(err, driver.ErrReleaseNotFound) {
216+
return false, "", fmt.Errorf("no bundle content available and no existing release found")
217+
}
218+
if err != nil {
219+
return false, "", fmt.Errorf("getting current release: %w", err)
220+
}
221+
222+
// Reconcile the existing release to ensure resources are maintained
223+
if err := ac.Reconcile(rel); err != nil {
224+
// Reconcile failed - resources NOT maintained
225+
// Return false (rollout failed) with error
226+
return false, "", err
227+
}
228+
229+
// At this point: Reconcile succeeded - resources ARE maintained
230+
// The operations below are for setting up monitoring (watches).
231+
// If they fail, the resources are still successfully reconciled and maintained,
232+
// so we return true (rollout succeeded) even though monitoring setup failed.
233+
// The objects to watch are parsed from the stored release manifest (rel.Manifest),
// not from bundle content, which is unavailable on this path.
relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name))
234+
if err != nil {
235+
return true, "", err
236+
}
237+
klog.FromContext(ctx).Info("watching managed objects")
238+
cache, err := h.Manager.Get(ctx, ext)
239+
if err != nil {
240+
return true, "", err
241+
}
242+
if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil {
243+
return true, "", err
244+
}
245+
246+
return true, "", nil
247+
}
248+
200249
func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) {
201250
if h.HelmChartProvider == nil {
202251
return nil, errors.New("HelmChartProvider is nil")

internal/operator-controller/controllers/clusterextension_controller_test.go

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1611,3 +1611,116 @@ func TestGetInstalledBundleHistory(t *testing.T) {
16111611
}
16121612
}
16131613
}
1614+
1615+
// TestResolutionFallbackToInstalledBundle tests the catalog deletion resilience fallback logic
//
// Both subtests install a resolver that always returns a "catalog unavailable" error
// and a RevisionStatesGetter that reports bundle "test.1.0.0" (version "1.0.0") as installed.
// They differ in the ClusterExtension spec: the first leaves Catalog.Version empty,
// the second sets it to "1.0.1".
1616+
func TestResolutionFallbackToInstalledBundle(t *testing.T) {
1617+
t.Run("falls back when catalog unavailable and no version change", func(t *testing.T) {
1618+
cl, reconciler := newClientAndReconciler(t, func(d *deps) {
1619+
// Resolver fails (simulating catalog unavailable)
1620+
d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) {
1621+
return nil, nil, nil, fmt.Errorf("catalog unavailable")
1622+
})
1623+
// Applier succeeds (resources maintained)
1624+
d.Applier = &MockApplier{
1625+
installCompleted: true,
1626+
installStatus: "",
1627+
err: nil,
1628+
}
1629+
d.RevisionStatesGetter = &MockRevisionStatesGetter{
1630+
RevisionStates: &controllers.RevisionStates{
1631+
Installed: &controllers.RevisionMetadata{
1632+
BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"},
1633+
Image: "test-image:1.0.0",
1634+
},
1635+
},
1636+
}
1637+
})
1638+
1639+
ctx := context.Background()
1640+
// A random suffix gives each run its own object name.
extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))}
1641+
1642+
// Create ClusterExtension with no version specified
1643+
ext := &ocv1.ClusterExtension{
1644+
ObjectMeta: metav1.ObjectMeta{Name: extKey.Name},
1645+
Spec: ocv1.ClusterExtensionSpec{
1646+
Source: ocv1.SourceConfig{
1647+
SourceType: "Catalog",
1648+
Catalog: &ocv1.CatalogFilter{
1649+
PackageName: "test-pkg",
1650+
// No version - should fall back
1651+
},
1652+
},
1653+
Namespace: "default",
1654+
ServiceAccount: ocv1.ServiceAccountReference{Name: "default"},
1655+
},
1656+
}
1657+
require.NoError(t, cl.Create(ctx, ext))
1658+
1659+
// Reconcile should succeed (fallback to installed, then apply succeeds)
1660+
res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey})
1661+
require.NoError(t, err)
1662+
require.Equal(t, ctrl.Result{}, res)
1663+
1664+
// Verify status shows successful reconciliation
1665+
require.NoError(t, cl.Get(ctx, extKey, ext))
1666+
1667+
// Progressing should be Succeeded (apply completed successfully)
// i.e. the condition must have Status=True and Reason=Succeeded despite the resolver error.
1668+
progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing)
1669+
require.NotNil(t, progCond)
1670+
require.Equal(t, metav1.ConditionTrue, progCond.Status)
1671+
require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason)
1672+
1673+
// Installed should be True (maintaining current version)
1674+
instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled)
1675+
require.NotNil(t, instCond)
1676+
require.Equal(t, metav1.ConditionTrue, instCond.Status)
1677+
require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason)
1678+
})
1679+
1680+
t.Run("fails when version upgrade requested without catalog", func(t *testing.T) {
1681+
// Only the resolver and the revision states are overridden here; no Applier is
// configured in this subtest.
cl, reconciler := newClientAndReconciler(t, func(d *deps) {
1682+
d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) {
1683+
return nil, nil, nil, fmt.Errorf("catalog unavailable")
1684+
})
1685+
d.RevisionStatesGetter = &MockRevisionStatesGetter{
1686+
RevisionStates: &controllers.RevisionStates{
1687+
Installed: &controllers.RevisionMetadata{
1688+
BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"},
1689+
},
1690+
},
1691+
}
1692+
})
1693+
1694+
ctx := context.Background()
1695+
extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))}
1696+
1697+
// Create ClusterExtension requesting version upgrade
1698+
ext := &ocv1.ClusterExtension{
1699+
ObjectMeta: metav1.ObjectMeta{Name: extKey.Name},
1700+
Spec: ocv1.ClusterExtensionSpec{
1701+
Source: ocv1.SourceConfig{
1702+
SourceType: "Catalog",
1703+
Catalog: &ocv1.CatalogFilter{
1704+
PackageName: "test-pkg",
1705+
Version: "1.0.1", // Requesting upgrade
1706+
},
1707+
},
1708+
Namespace: "default",
1709+
ServiceAccount: ocv1.ServiceAccountReference{Name: "default"},
1710+
},
1711+
}
1712+
require.NoError(t, cl.Create(ctx, ext))
1713+
1714+
// Reconcile should fail (can't upgrade without catalog)
1715+
res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey})
1716+
require.Error(t, err)
1717+
require.Equal(t, ctrl.Result{}, res)
1718+
1719+
// Verify status shows Retrying
// The Progressing condition is expected to be Status=True with Reason=Retrying.
1720+
require.NoError(t, cl.Get(ctx, extKey, ext))
1721+
cond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing)
1722+
require.NotNil(t, cond)
1723+
require.Equal(t, metav1.ConditionTrue, cond.Status)
1724+
require.Equal(t, ocv1.ReasonRetrying, cond.Reason)
1725+
})
1726+
}

internal/operator-controller/controllers/clusterextension_reconcile_steps.go

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,7 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc {
9595
}
9696
resolvedBundle, resolvedBundleVersion, resolvedDeprecation, err := r.Resolve(ctx, ext, bm)
9797
if err != nil {
98-
// Note: We don't distinguish between resolution-specific errors and generic errors
99-
setStatusProgressing(ext, err)
100-
setInstalledStatusFromRevisionStates(ext, state.revisionStates)
101-
ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error())
102-
return nil, err
98+
return handleResolutionError(ctx, state, ext, err)
10399
}
104100

105101
// set deprecation status after _successful_ resolution
@@ -134,9 +130,67 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc {
134130
}
135131
}
136132

133+
// handleResolutionError handles the case when bundle resolution fails.
134+
// If a bundle is already installed and the spec isn't requesting a version change,
135+
// we fall back to using the installed bundle to maintain the current state.
136+
// This enables workload resilience when the catalog becomes unavailable.
137+
// However, if the spec explicitly requests a different version, we must fail and retry.
//
// Outcomes:
//   - installed bundle present and spec version empty or equal to the installed version:
//     state.resolvedRevisionMetadata is set to the installed revision and (nil, nil) is returned;
//   - installed bundle present and spec version differs: conditions are set with
//     ReasonRetrying and (nil, err) is returned;
//   - no installed bundle: conditions are set with ReasonFailed and (nil, err) is returned.
//
// NOTE(review): err is not inspected, so the fallback applies to every resolver
// error, not only to catalog unavailability - confirm this is intended.
138+
func handleResolutionError(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension, err error) (*ctrl.Result, error) {
139+
l := log.FromContext(ctx)
140+
141+
// If we have an installed bundle, check if we can fall back to it
// NOTE(review): state.revisionStates is dereferenced without a nil check - presumably
// an earlier reconcile step always populates it; verify.
142+
if state.revisionStates.Installed != nil {
143+
// Check if the spec is requesting a specific version that differs from installed
144+
specVersion := ""
145+
if ext.Spec.Source.Catalog != nil {
146+
specVersion = ext.Spec.Source.Catalog.Version
147+
}
148+
installedVersion := state.revisionStates.Installed.Version
149+
150+
// If spec requests a different version, we cannot fall back - must fail and retry
// NOTE(review): this is an exact string comparison. If Catalog.Version may hold a
// range or a differently formatted version (e.g. ">=1.0.0" or "v1.0.0"), it will
// never equal the installed version and fallback is always refused - confirm.
151+
if specVersion != "" && specVersion != installedVersion {
152+
l.Info("resolution failed and spec requests version change - cannot fall back",
153+
"error", err,
154+
"requestedVersion", specVersion,
155+
"installedVersion", installedVersion)
156+
setStatusProgressing(ext, err)
157+
setInstalledStatusFromRevisionStates(ext, state.revisionStates)
158+
ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error())
159+
return nil, err
160+
}
161+
162+
// No version change requested - safe to fall back to maintain current state
163+
l.Info("resolution failed but continuing with installed bundle", "error", err, "installedBundle", state.revisionStates.Installed.BundleMetadata)
164+
// Set Progressing condition to indicate we're operating in degraded mode (retrying resolution)
165+
// but maintaining the current state successfully
166+
setStatusProgressing(ext, err)
167+
setInstalledStatusFromRevisionStates(ext, state.revisionStates)
168+
// Hand the installed revision to the subsequent steps as if it had been resolved;
// the nil error return lets reconciliation continue.
state.resolvedRevisionMetadata = state.revisionStates.Installed
169+
return nil, nil
170+
}
171+
172+
// No installed bundle and resolution failed - cannot proceed
173+
setStatusProgressing(ext, err)
174+
setInstalledStatusFromRevisionStates(ext, state.revisionStates)
175+
ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error())
176+
return nil, err
177+
}
178+
137179
func UnpackBundle(i imageutil.Puller, cache imageutil.Cache) ReconcileStepFunc {
138180
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
139181
l := log.FromContext(ctx)
182+
183+
// Skip unpacking if we're using an already-installed bundle
184+
// (e.g., when catalog is unavailable but we're maintaining current state)
185+
if state.revisionStates.Installed != nil &&
186+
state.resolvedRevisionMetadata.BundleMetadata == state.revisionStates.Installed.BundleMetadata {
187+
l.Info("skipping unpack - using installed bundle content")
188+
// imageFS will remain nil; applier implementations MUST handle nil imageFS by using
189+
// existing installed content. See Helm.reconcileExistingRelease() and Boxcutter.apply()
190+
// for nil contentFS handling.
191+
return nil, nil
192+
}
193+
140194
l.Info("unpacking resolved bundle")
141195
imageFS, _, _, err := i.Pull(ctx, ext.GetName(), state.resolvedRevisionMetadata.Image, cache)
142196
if err != nil {

0 commit comments

Comments
 (0)