Skip to content

Commit d0c58d3

Browse files
(chore): add e2e tests for workload resilience when catalog is deleted
Assisted-by: Cursor
1 parent a9e5614 commit d0c58d3

3 files changed

Lines changed: 196 additions & 10 deletions

File tree

test/e2e/features/recover.feature

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,153 @@ Feature: Recover cluster extension from errors that might occur during its lifet
115115
Then ClusterExtension is available
116116
And ClusterExtension reports Progressing as True with Reason Succeeded
117117
And ClusterExtension reports Installed as True
118+
119+
# CATALOG DELETION RESILIENCE SCENARIOS
120+
121+
Scenario: Extension continues running after catalog deletion
122+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
123+
And ClusterExtension is applied
124+
"""
125+
apiVersion: olm.operatorframework.io/v1
126+
kind: ClusterExtension
127+
metadata:
128+
name: ${NAME}
129+
spec:
130+
namespace: ${TEST_NAMESPACE}
131+
serviceAccount:
132+
name: olm-sa
133+
source:
134+
sourceType: Catalog
135+
catalog:
136+
packageName: test
137+
selector:
138+
matchLabels:
139+
"olm.operatorframework.io/metadata.name": test-catalog
140+
"""
141+
And ClusterExtension is rolled out
142+
And ClusterExtension is available
143+
And resource "deployment/test-operator" is available
144+
And resource "configmap/test-configmap" is available
145+
When ClusterCatalog "test" is deleted
146+
# Verify controller still maintains resources after catalog deletion by removing and restoring a resource.
147+
# This approach avoids race conditions because:
148+
# - We don't rely on status flags that might be unchanged (e.g., Installed=True before and after)
149+
# - Resource restoration is an observable event that PROVES the controller reconciled after deletion
150+
# - The controller must actively apply manifests to restore the removed resource
151+
And resource "configmap/test-configmap" is removed
152+
Then resource "configmap/test-configmap" is eventually restored
153+
And resource "deployment/test-operator" is available
154+
155+
Scenario: Resources are restored after catalog deletion
156+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
157+
And ClusterExtension is applied
158+
"""
159+
apiVersion: olm.operatorframework.io/v1
160+
kind: ClusterExtension
161+
metadata:
162+
name: ${NAME}
163+
spec:
164+
namespace: ${TEST_NAMESPACE}
165+
serviceAccount:
166+
name: olm-sa
167+
source:
168+
sourceType: Catalog
169+
catalog:
170+
packageName: test
171+
selector:
172+
matchLabels:
173+
"olm.operatorframework.io/metadata.name": test-catalog
174+
"""
175+
And ClusterExtension is rolled out
176+
And ClusterExtension is available
177+
And resource "configmap/test-configmap" exists
178+
And ClusterCatalog "test" is deleted
179+
When resource "configmap/test-configmap" is removed
180+
Then resource "configmap/test-configmap" is eventually restored
181+
182+
Scenario: Config changes are allowed even when the catalog does not exist anymore
183+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
184+
And ClusterExtension is applied
185+
"""
186+
apiVersion: olm.operatorframework.io/v1
187+
kind: ClusterExtension
188+
metadata:
189+
name: ${NAME}
190+
spec:
191+
namespace: ${TEST_NAMESPACE}
192+
serviceAccount:
193+
name: olm-sa
194+
source:
195+
sourceType: Catalog
196+
catalog:
197+
packageName: test
198+
selector:
199+
matchLabels:
200+
"olm.operatorframework.io/metadata.name": test-catalog
201+
"""
202+
And ClusterExtension is rolled out
203+
And ClusterExtension is available
204+
And ClusterCatalog "test" is deleted
205+
When ClusterExtension is updated to add preflight config
206+
"""
207+
apiVersion: olm.operatorframework.io/v1
208+
kind: ClusterExtension
209+
metadata:
210+
name: ${NAME}
211+
spec:
212+
namespace: ${TEST_NAMESPACE}
213+
serviceAccount:
214+
name: olm-sa
215+
install:
216+
preflight:
217+
crdUpgradeSafety:
218+
enforcement: None
219+
source:
220+
sourceType: Catalog
221+
catalog:
222+
packageName: test
223+
selector:
224+
matchLabels:
225+
"olm.operatorframework.io/metadata.name": test-catalog
226+
"""
227+
# Wait for reconciliation of the updated spec (config change should succeed without catalog)
228+
# First ensure the controller has reconciled the new generation (spec update)
229+
And ClusterExtension has reconciled the latest generation
230+
# Config-only changes don't trigger resolution failure (bundle unchanged), so resolution succeeds
231+
# using the installed bundle metadata. Verify reconciliation completed successfully.
232+
And ClusterExtension reports Progressing as True with Reason Succeeded
233+
Then ClusterExtension is available
234+
And ClusterExtension reports Installed as True
235+
236+
Scenario: Version upgrade does not proceed when catalog does not exist
237+
Given ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
238+
And ClusterExtension is applied
239+
"""
240+
apiVersion: olm.operatorframework.io/v1
241+
kind: ClusterExtension
242+
metadata:
243+
name: ${NAME}
244+
spec:
245+
namespace: ${TEST_NAMESPACE}
246+
serviceAccount:
247+
name: olm-sa
248+
source:
249+
sourceType: Catalog
250+
catalog:
251+
packageName: test
252+
version: "1.0.0"
253+
selector:
254+
matchLabels:
255+
"olm.operatorframework.io/metadata.name": test-catalog
256+
"""
257+
And ClusterExtension is rolled out
258+
And ClusterExtension is available
259+
And bundle "test-operator.1.0.0" is installed in version "1.0.0"
260+
When ClusterCatalog "test" is deleted
261+
And ClusterExtension is updated to version "1.0.1"
262+
# Wait for reconciliation after the version change request
263+
# Note: Retrying status means controller will auto-upgrade when catalog becomes available
264+
And ClusterExtension reports Progressing as True with Reason Retrying
265+
# Verify upgrade did not proceed: version remains at 1.0.0 (not 1.0.1)
266+
Then bundle "test-operator.1.0.0" is installed in version "1.0.0"
267+
And ClusterExtension reports Installed as True

test/e2e/steps/hooks.go

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,28 +139,32 @@ func stderrOutput(err error) string {
139139
return ""
140140
}
141141

142-
func ScenarioCleanup(ctx context.Context, _ *godog.Scenario, err error) (context.Context, error) {
142+
func ScenarioCleanup(ctx context.Context, _ *godog.Scenario, scenarioErr error) (context.Context, error) {
143143
sc := scenarioCtx(ctx)
144144
for _, bgCmd := range sc.backGroundCmds {
145145
if p := bgCmd.Process; p != nil {
146146
_ = p.Kill()
147147
}
148148
}
149-
if err != nil {
150-
return ctx, err
151-
}
152149

150+
// Run cleanup ALWAYS, even if scenario failed (to prevent resource leaks into next scenario)
153151
forDeletion := []resource{}
154152
if sc.clusterExtensionName != "" {
155153
forDeletion = append(forDeletion, resource{name: sc.clusterExtensionName, kind: "clusterextension"})
156154
}
157155
forDeletion = append(forDeletion, resource{name: sc.namespace, kind: "namespace"})
158-
go func() {
159-
for _, r := range forDeletion {
160-
if _, err := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true"); err != nil {
161-
logger.Info("Error deleting resource", "name", r.name, "namespace", sc.namespace, "stderr", stderrOutput(err))
156+
157+
// Cleanup must be synchronous to ensure proper test isolation.
158+
// If cleanup runs in background, the next scenario may start before resources are deleted.
159+
for _, r := range forDeletion {
160+
// Try graceful deletion first (60s timeout), fall back to force if stuck
161+
if _, err := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true", "--wait=true", "--timeout=60s"); err != nil {
162+
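// Retry with --force --grace-period=0. NOTE: this skips graceful termination but does not remove finalizers, so a resource blocked on a finalizer can still linger (test isolation > graceful cleanup)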
163+
if _, forceErr := k8sClient("delete", r.kind, r.name, "--ignore-not-found=true", "--force", "--grace-period=0"); forceErr != nil {
164+
logger.Info("Error force deleting resource", "kind", r.kind, "name", r.name, "stderr", stderrOutput(forceErr))
162165
}
163166
}
164-
}()
165-
return ctx, nil
167+
}
168+
169+
return ctx, scenarioErr
166170
}

test/e2e/steps/steps.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ func RegisterSteps(sc *godog.ScenarioContext) {
5555
sc.Step(`^(?i)ClusterExtension is updated(?:\s+.*)?$`, ResourceIsApplied)
5656
sc.Step(`^(?i)ClusterExtension is available$`, ClusterExtensionIsAvailable)
5757
sc.Step(`^(?i)ClusterExtension is rolled out$`, ClusterExtensionIsRolledOut)
58+
sc.Step(`^(?i)ClusterExtension has reconciled the latest generation$`, ClusterExtensionReconciledLatestGeneration)
5859
sc.Step(`^(?i)ClusterExtension reports "([^"]+)" as active revision(s?)$`, ClusterExtensionReportsActiveRevisions)
5960
sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+) and Message:$`, ClusterExtensionReportsCondition)
6061
sc.Step(`^(?i)ClusterExtension reports ([[:alnum:]]+) as ([[:alnum:]]+) with Reason ([[:alnum:]]+)$`, ClusterExtensionReportsConditionWithoutMsg)
@@ -86,6 +87,7 @@ func RegisterSteps(sc *godog.ScenarioContext) {
8687
sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles)
8788
sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage)
8889
sc.Step(`^(?i)ClusterCatalog "([^"]+)" image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage)
90+
sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted)
8991

9092
sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace)
9193
sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned)
@@ -249,6 +251,25 @@ func ClusterExtensionIsAvailable(ctx context.Context) error {
249251
return nil
250252
}
251253

254+
func ClusterExtensionReconciledLatestGeneration(ctx context.Context) error {
255+
sc := scenarioCtx(ctx)
256+
require.Eventually(godog.T(ctx), func() bool {
257+
// Get generation from metadata
258+
genOutput, err := k8sClient("get", "clusterextension", sc.clusterExtensionName, "-o", "jsonpath={.metadata.generation}")
259+
if err != nil || genOutput == "" {
260+
return false
261+
}
262+
// Get observedGeneration from Progressing condition (each condition tracks its own observedGeneration)
263+
obsGenOutput, err := k8sClient("get", "clusterextension", sc.clusterExtensionName, "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].observedGeneration}")
264+
if err != nil || obsGenOutput == "" {
265+
return false
266+
}
267+
// Both exist and are equal means reconciliation happened
268+
return genOutput == obsGenOutput
269+
}, timeout, tick)
270+
return nil
271+
}
272+
252273
func ClusterExtensionIsRolledOut(ctx context.Context) error {
253274
sc := scenarioCtx(ctx)
254275
require.Eventually(godog.T(ctx), func() bool {
@@ -697,6 +718,17 @@ func TagCatalogImage(name, oldTag, newTag string) error {
697718
return crane.Tag(imageRef, newTag, crane.Insecure)
698719
}
699720

721+
func CatalogIsDeleted(ctx context.Context, catalogName string) error {
722+
catalogFullName := fmt.Sprintf("%s-catalog", catalogName)
723+
// Using --wait=true makes kubectl wait for the resource to be fully deleted,
724+
// eliminating the need for manual polling
725+
_, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true", "--wait=true")
726+
if err != nil {
727+
return fmt.Errorf("failed to delete catalog: %v", err)
728+
}
729+
return nil
730+
}
731+
700732
func PrometheusMetricsAreReturned(ctx context.Context) error {
701733
sc := scenarioCtx(ctx)
702734
for podName, mr := range sc.metricsResponse {

0 commit comments

Comments
 (0)