Skip to content

Commit cb490a4

Browse files
(fix) catalog deletion resilience support
Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails due to catalog unavailability, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: - Resolution falls back to installed bundle when catalog unavailable - Unpacking skipped when maintaining current installed state - Helm and Boxcutter appliers handle nil contentFS gracefully - Version upgrades properly blocked without catalog access This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access. Assisted-by: Cursor
1 parent cc41210 commit cb490a4

14 files changed

Lines changed: 753 additions & 69 deletions

cmd/operator-controller/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl
623623
controllers.HandleFinalizers(c.finalizers),
624624
controllers.MigrateStorage(storageMigrator),
625625
controllers.RetrieveRevisionStates(revisionStatesGetter),
626-
controllers.ResolveBundle(c.resolver),
626+
controllers.ResolveBundle(c.resolver, c.mgr.GetClient()),
627627
controllers.UnpackBundle(c.imagePuller, c.imageCache),
628628
controllers.ApplyBundleWithBoxcutter(appl.Apply),
629629
}
@@ -742,7 +742,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster
742742
ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{
743743
controllers.HandleFinalizers(c.finalizers),
744744
controllers.RetrieveRevisionStates(revisionStatesGetter),
745-
controllers.ResolveBundle(c.resolver),
745+
controllers.ResolveBundle(c.resolver, c.mgr.GetClient()),
746746
controllers.UnpackBundle(c.imagePuller, c.imageCache),
747747
controllers.ApplyBundle(appl),
748748
}

internal/operator-controller/applier/boxcutter.go

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -307,21 +307,37 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, obj client.Object) erro
307307
return bc.Client.Patch(ctx, obj, client.Apply, client.FieldOwner(bc.FieldOwner), client.ForceOwnership)
308308
}
309309

310-
func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error {
311-
// Generate desired revision
312-
desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations)
310+
func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) {
311+
// Note: We list revisions first (before checking contentFS) because we need the revision list
312+
// to determine if we can fall back when contentFS is nil. If the API call fails here,
313+
// it indicates a serious cluster connectivity issue, and we should not proceed even in fallback mode
314+
// since the ClusterExtensionRevision controller also requires API access to maintain resources.
315+
existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName())
313316
if err != nil {
314-
return err
317+
return false, "", err
315318
}
316319

317-
if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil {
318-
return fmt.Errorf("set ownerref: %w", err)
320+
// If contentFS is nil, we're maintaining the current state without catalog access.
321+
// In this case, we should use the existing installed revision without generating a new one.
322+
if contentFS == nil {
323+
if len(existingRevisions) == 0 {
324+
return false, "", fmt.Errorf("catalog content unavailable and no revision installed")
325+
}
326+
// Returning true here signals that the rollout has succeeded using the current revision. The
327+
// ClusterExtensionRevision controller will continue to reconcile, apply, and maintain the
328+
// resources defined in that revision via Server-Side Apply, ensuring the workload keeps running
329+
// even when catalog access (and thus new revision content) is unavailable.
330+
return true, "", nil
319331
}
320332

321-
// List all existing revisions
322-
existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName())
333+
// Generate desired revision
334+
desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations)
323335
if err != nil {
324-
return err
336+
return false, "", err
337+
}
338+
339+
if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil {
340+
return false, "", fmt.Errorf("set ownerref: %w", err)
325341
}
326342

327343
currentRevision := &ocv1.ClusterExtensionRevision{}
@@ -343,7 +359,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
343359
// inplace patch was successful, no changes in phases
344360
state = StateUnchanged
345361
default:
346-
return fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err)
362+
return false, "", fmt.Errorf("patching %s Revision: %w", desiredRevision.Name, err)
347363
}
348364
}
349365

@@ -357,7 +373,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
357373
case StateNeedsInstall:
358374
err := preflight.Install(ctx, plainObjs)
359375
if err != nil {
360-
return err
376+
return false, "", err
361377
}
362378
// TODO: jlanford's IDE says that "StateNeedsUpgrade" condition is always true, but
363379
// it isn't immediately obvious why that is. Perhaps len(existingRevisions) is
@@ -366,7 +382,7 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
366382
case StateNeedsUpgrade:
367383
err := preflight.Upgrade(ctx, plainObjs)
368384
if err != nil {
369-
return err
385+
return false, "", err
370386
}
371387
}
372388
}
@@ -380,15 +396,15 @@ func (bc *Boxcutter) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.Clust
380396
desiredRevision.Spec.Revision = revisionNumber
381397

382398
if err = bc.garbageCollectOldRevisions(ctx, prevRevisions); err != nil {
383-
return fmt.Errorf("garbage collecting old revisions: %w", err)
399+
return false, "", fmt.Errorf("garbage collecting old revisions: %w", err)
384400
}
385401

386402
if err := bc.createOrUpdate(ctx, desiredRevision); err != nil {
387-
return fmt.Errorf("creating new Revision: %w", err)
403+
return false, "", fmt.Errorf("creating new Revision: %w", err)
388404
}
389405
}
390406

391-
return nil
407+
return true, "", nil
392408
}
393409

394410
// garbageCollectOldRevisions deletes archived revisions beyond ClusterExtensionRevisionRetentionLimit.

internal/operator-controller/applier/boxcutter_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -986,14 +986,18 @@ func TestBoxcutter_Apply(t *testing.T) {
986986
labels.PackageNameKey: "test-package",
987987
}
988988
}
989-
err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations)
989+
completed, status, err := boxcutter.Apply(t.Context(), testFS, ext, nil, revisionAnnotations)
990990

991991
// Assert
992992
if tc.expectedErr != "" {
993993
require.Error(t, err)
994994
assert.Contains(t, err.Error(), tc.expectedErr)
995+
assert.False(t, completed)
996+
assert.Empty(t, status)
995997
} else {
996998
require.NoError(t, err)
999+
assert.True(t, completed)
1000+
assert.Empty(t, status)
9971001
}
9981002

9991003
if tc.validate != nil {

internal/operator-controller/applier/helm.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE
103103
}
104104

105105
func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) {
106+
// If contentFS is nil, we're maintaining the current state without catalog access.
107+
// In this case, reconcile the existing Helm release if it exists.
108+
if contentFS == nil {
109+
ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext)
110+
if err != nil {
111+
return false, "", err
112+
}
113+
return h.reconcileExistingRelease(ctx, ac, ext)
114+
}
115+
106116
chrt, err := h.buildHelmChart(contentFS, ext)
107117
if err != nil {
108118
return false, "", err
@@ -197,6 +207,62 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte
197207
return true, "", nil
198208
}
199209

210+
// reconcileExistingRelease reconciles an existing Helm release without catalog access.
211+
// This is used when the catalog is unavailable but we need to maintain the current installation.
212+
// It reconciles the release to actively maintain resources, and sets up watchers for monitoring/observability.
213+
func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) {
214+
rel, err := ac.Get(ext.GetName())
215+
if errors.Is(err, driver.ErrReleaseNotFound) {
216+
return false, "", fmt.Errorf("catalog content unavailable and no release installed")
217+
}
218+
if err != nil {
219+
return false, "", fmt.Errorf("failed to get current release: %w", err)
220+
}
221+
222+
// Reconcile the existing release to ensure resources are maintained
223+
if err := ac.Reconcile(rel); err != nil {
224+
// Reconcile failed - resources NOT maintained
225+
// Return false (rollout failed) with error
226+
return false, "", err
227+
}
228+
229+
// At this point: Reconcile succeeded - resources ARE maintained (applied to cluster via Server-Side Apply)
230+
// The operations below are for setting up watches to detect drift (i.e., if someone manually modifies the
231+
// resources). If watch setup fails, the resources are still successfully maintained, but we won't detect
232+
// and auto-correct manual modifications. We return true (rollout succeeded) and log watch errors.
233+
logger := klog.FromContext(ctx)
234+
235+
relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name))
236+
if err != nil {
237+
logger.Error(err, "failed to parse manifest objects, cannot set up drift detection watches (resources are applied but drift detection disabled)")
238+
return true, "", nil
239+
}
240+
241+
logger.V(1).Info("setting up drift detection watches on managed objects")
242+
243+
// Defensive nil checks to prevent panics if Manager or Watcher not properly initialized
244+
if h.Manager == nil {
245+
logger.Error(fmt.Errorf("manager is nil"), "Manager not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)")
246+
return true, "", nil
247+
}
248+
cache, err := h.Manager.Get(ctx, ext)
249+
if err != nil {
250+
logger.Error(err, "failed to get managed content cache, cannot set up drift detection watches (resources are applied but drift detection disabled)")
251+
return true, "", nil
252+
}
253+
254+
if h.Watcher == nil {
255+
logger.Error(fmt.Errorf("watcher is nil"), "Watcher not initialized, cannot set up drift detection watches (resources are applied but drift detection disabled)")
256+
return true, "", nil
257+
}
258+
if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil {
259+
logger.Error(err, "failed to set up drift detection watches (resources are applied but drift detection disabled)")
260+
return true, "", nil
261+
}
262+
263+
return true, "", nil
264+
}
265+
200266
func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) {
201267
if h.HelmChartProvider == nil {
202268
return nil, errors.New("HelmChartProvider is nil")

internal/operator-controller/controllers/boxcutter_reconcile_steps.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func MigrateStorage(m StorageMigrator) ReconcileStepFunc {
9494
}
9595
}
9696

97-
func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) error) ReconcileStepFunc {
97+
func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error)) ReconcileStepFunc {
9898
return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) {
9999
l := log.FromContext(ctx)
100100
revisionAnnotations := map[string]string{
@@ -109,7 +109,8 @@ func ApplyBundleWithBoxcutter(apply func(ctx context.Context, contentFS fs.FS, e
109109
}
110110

111111
l.Info("applying bundle contents")
112-
if err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations); err != nil {
112+
_, _, err := apply(ctx, state.imageFS, ext, objLbls, revisionAnnotations)
113+
if err != nil {
113114
// If there was an error applying the resolved bundle,
114115
// report the error via the Progressing condition.
115116
setStatusProgressing(ext, wrapErrorWithResolutionInfo(state.resolvedRevisionMetadata.BundleMetadata, err))

internal/operator-controller/controllers/boxcutter_reconcile_steps_apply_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ func TestApplyBundleWithBoxcutter(t *testing.T) {
133133
imageFS: fstest.MapFS{},
134134
}
135135

136-
stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) error {
137-
return nil
136+
stepFunc := ApplyBundleWithBoxcutter(func(_ context.Context, _ fs.FS, _ *ocv1.ClusterExtension, _, _ map[string]string) (bool, string, error) {
137+
return true, "", nil
138138
})
139139
result, err := stepFunc(ctx, state, ext)
140140
require.NoError(t, err)

internal/operator-controller/controllers/clusterextension_admission_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ import (
1313
)
1414

1515
func TestClusterExtensionSourceConfig(t *testing.T) {
16-
sourceTypeEmptyError := "Invalid value: null"
16+
// NOTE: Kubernetes validation error format for JSON null values varies across K8s versions.
17+
// We check for the common part "Invalid value:" which appears in all versions.
18+
sourceTypeEmptyError := "Invalid value:"
1719
sourceTypeMismatchError := "spec.source.sourceType: Unsupported value"
1820
sourceConfigInvalidError := "spec.source: Invalid value"
1921
// unionField represents the required Catalog or (future) Bundle field required by SourceConfig

internal/operator-controller/controllers/clusterextension_controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req
168168

169169
// ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension,
170170
// and assigns a specified reason and custom message to any missing condition.
171+
//
172+
//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value
171173
func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) {
172174
for _, condType := range conditionsets.ConditionTypes {
173175
cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType)

0 commit comments

Comments
 (0)