Skip to content

Commit cda37d2

Browse files
test: add e2e test for UIPlugin post-uninstall cleanup (COO-1404)
Adds TestUIPluginUninstallCleanup to verify that deleting a UIPlugin CR after the operator has been uninstalled via OLM correctly cascade-deletes all child resources (Deployments, Services, ServiceAccounts, ClusterRoles, ClusterRoleBindings, pods) through Kubernetes garbage collection.

Per OLM design, uninstalling an operator (deleting CSV + Subscription) does NOT remove CRDs or CRs. The admin is expected to delete CRs manually. This test verifies that when they do, the OwnerReference chain works without the operator running.

Before the finalizer fix: UIPlugin CR gets stuck in Terminating forever.
After the fix: CR deletes immediately, GC cascades to all children.

Also adds:
- --postpone-restoration flag to run-e2e.sh for manual cluster inspection
- --openshift flag forwarding for UIPlugin tests on OpenShift clusters

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent a55377c commit cda37d2

3 files changed

Lines changed: 424 additions & 3 deletions

File tree

test/e2e/main_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ var (
2929
const e2eTestNamespace = "e2e-tests"
3030

3131
var (
32-
retain = flag.Bool("retain", false, "When set, the namespace in which tests are run will not be cleaned up")
33-
operatorInstallNS = flag.String("operatorInstallNS", "openshift-operator", "The namespace where the operator is installed")
32+
retain = flag.Bool("retain", false, "When set, the namespace in which tests are run will not be cleaned up")
33+
operatorInstallNS = flag.String("operatorInstallNS", "openshift-operator", "The namespace where the operator is installed")
34+
postponeRestoration = flag.Duration("postpone-restoration", 0, "Wait this duration before restoring the operator Subscription after uninstall tests (e.g. 10m for manual inspection)")
3435
)
3536

3637
func TestMain(m *testing.M) {
Lines changed: 373 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,373 @@
1+
package e2e
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strings"
7+
"testing"
8+
"time"
9+
10+
olmv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1"
11+
"gotest.tools/v3/assert"
12+
appsv1 "k8s.io/api/apps/v1"
13+
corev1 "k8s.io/api/core/v1"
14+
rbacv1 "k8s.io/api/rbac/v1"
15+
apierrors "k8s.io/apimachinery/pkg/api/errors"
16+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
17+
"k8s.io/apimachinery/pkg/util/wait"
18+
"sigs.k8s.io/controller-runtime/pkg/client"
19+
20+
uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
21+
"github.com/rhobs/observability-operator/test/e2e/framework"
22+
)
23+
24+
// TestUIPluginUninstallCleanup verifies that UIPlugin operands are properly
25+
// cleaned up when an admin deletes the UIPlugin CR after the operator has been
26+
// uninstalled via OLM.
27+
//
28+
// Per OLM design, uninstalling an operator (deleting CSV + Subscription) does
29+
// NOT remove CRDs or CRs — this is intentional to prevent data loss. The admin
30+
// is expected to delete CRs manually (OLM uninstall Step 1). This test verifies
31+
// that when the admin does delete the UIPlugin CR post-uninstall, the child
32+
// resources are properly cascade-deleted via Kubernetes garbage collection
33+
// (OwnerReferences), without requiring the operator to be running.
34+
//
35+
// Before the fix (finalizers): UIPlugin CR gets stuck in Terminating forever
36+
// because the operator is gone and can't remove the finalizer.
37+
// After the fix (no finalizers + OwnerReferences): UIPlugin CR deletes
38+
// immediately and Kubernetes GC cascade-deletes all children.
39+
//
40+
// The test:
41+
// 1. Creates a monitoring UIPlugin with health-analyzer enabled
42+
// 2. Waits for operand deployments to be ready
43+
// 3. Simulates OLM uninstall by deleting the CSV and Subscription
44+
// 4. Deletes the UIPlugin CR (simulating admin Step 1 post-uninstall)
45+
// 5. Verifies that all child resources are cascade-deleted
46+
func TestUIPluginUninstallCleanup(t *testing.T) {
47+
if !f.IsOpenshiftCluster {
48+
t.Skip("Skipping: requires OpenShift cluster")
49+
}
50+
51+
f.SkipIfClusterVersionBelow(t, "4.19")
52+
53+
assertCRDExists(t, "uiplugins.observability.openshift.io")
54+
55+
ctx := context.Background()
56+
ns := f.OperatorNamespace
57+
58+
// --- Phase 0: Clean up any leftover UIPlugins from previous runs ---
59+
60+
t.Log("Phase 0: Ensuring no stale UIPlugins exist")
61+
forceDeleteAllUIPlugins(t, ctx)
62+
63+
// --- Phase 1: Create UIPlugin and verify operands are running ---
64+
65+
t.Log("Phase 1: Creating monitoring UIPlugin with health-analyzer enabled")
66+
plugin := &uiv1.UIPlugin{
67+
ObjectMeta: metav1.ObjectMeta{
68+
Name: "monitoring",
69+
},
70+
Spec: uiv1.UIPluginSpec{
71+
Type: uiv1.TypeMonitoring,
72+
Monitoring: &uiv1.MonitoringConfig{
73+
ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
74+
Enabled: true,
75+
},
76+
},
77+
},
78+
}
79+
80+
err := f.K8sClient.Create(ctx, plugin)
81+
assert.NilError(t, err, "failed to create monitoring UIPlugin")
82+
83+
t.Log("Waiting for monitoring plugin deployment to be ready...")
84+
f.AssertDeploymentReady("monitoring", ns, framework.WithTimeout(5*time.Minute))(t)
85+
86+
t.Log("Waiting for health-analyzer deployment to be ready...")
87+
f.AssertDeploymentReady("health-analyzer", ns, framework.WithTimeout(5*time.Minute))(t)
88+
89+
// --- Phase 2: Simulate OLM uninstall (delete CSV + Subscription) ---
90+
91+
t.Log("Phase 2: Simulating OLM uninstall by deleting CSV and Subscription")
92+
93+
csv, sub := findOLMResources(t, ctx, ns)
94+
95+
if sub != nil && !f.Retain {
96+
savedSub := &olmv1alpha1.Subscription{
97+
ObjectMeta: metav1.ObjectMeta{
98+
Name: sub.Name,
99+
Namespace: sub.Namespace,
100+
},
101+
Spec: sub.Spec.DeepCopy(),
102+
}
103+
t.Cleanup(func() {
104+
if delay := *postponeRestoration; delay > 0 {
105+
t.Logf("Cleanup: Waiting %v before restoring operator (inspect the cluster now)", delay)
106+
time.Sleep(delay)
107+
}
108+
t.Log("Cleanup: Reinstalling operator Subscription so the cluster is usable for next run")
109+
forceDeleteAllUIPlugins(t, context.Background())
110+
if err := f.K8sClient.Create(context.Background(), savedSub); err != nil {
111+
if apierrors.IsAlreadyExists(err) {
112+
t.Log("Cleanup: Subscription already exists, skipping")
113+
return
114+
}
115+
t.Logf("Cleanup: WARNING — failed to recreate Subscription: %v", err)
116+
t.Log("Cleanup: Reinstall manually with: oc apply -f <subscription.yaml>")
117+
return
118+
}
119+
t.Log("Cleanup: Subscription recreated, OLM will reinstall the operator")
120+
})
121+
}
122+
123+
if sub != nil {
124+
t.Logf("Deleting Subscription %s/%s", sub.Namespace, sub.Name)
125+
err = f.K8sClient.Delete(ctx, sub)
126+
if err != nil && !apierrors.IsNotFound(err) {
127+
t.Fatalf("failed to delete Subscription: %v", err)
128+
}
129+
}
130+
131+
if csv != nil {
132+
t.Logf("Deleting CSV %s/%s", csv.Namespace, csv.Name)
133+
err = f.K8sClient.Delete(ctx, csv)
134+
if err != nil && !apierrors.IsNotFound(err) {
135+
t.Fatalf("failed to delete CSV: %v", err)
136+
}
137+
}
138+
139+
t.Log("Waiting for operator deployment to be removed...")
140+
waitForResourceAbsent(t, "observability-operator", ns, &appsv1.Deployment{}, 5*time.Minute)
141+
142+
// --- Phase 3: Delete UIPlugin CR (admin cleanup step) ---
143+
// Per OLM docs, the admin is responsible for deleting CRs after uninstall.
144+
// This step simulates that. With the finalizer fix, the CR should delete
145+
// immediately (no operator needed). Without the fix, this would hang forever.
146+
147+
t.Log("Phase 3: Deleting UIPlugin CR (simulating admin post-uninstall cleanup)")
148+
149+
// Re-fetch the UIPlugin to get the latest version.
150+
currentPlugin := &uiv1.UIPlugin{}
151+
err = f.K8sClient.Get(ctx, client.ObjectKey{Name: "monitoring"}, currentPlugin)
152+
assert.NilError(t, err, "UIPlugin should still exist after operator uninstall")
153+
154+
if len(currentPlugin.Finalizers) > 0 {
155+
t.Logf("UIPlugin has finalizers %v — this will block deletion (pre-fix behavior)", currentPlugin.Finalizers)
156+
} else {
157+
t.Log("UIPlugin has no finalizers — deletion should proceed immediately (post-fix behavior)")
158+
}
159+
160+
err = f.K8sClient.Delete(ctx, currentPlugin)
161+
assert.NilError(t, err, "failed to delete UIPlugin CR")
162+
163+
// The UIPlugin CR itself should be gone quickly (no finalizer to block it).
164+
// Allow a short timeout — if it exceeds this, the finalizer is likely stuck.
165+
// Use Errorf (not Fatalf) so Phase 4 still runs even if the CR is stuck —
166+
// this shows the full scope of failure on pre-fix builds.
167+
t.Log("Waiting for UIPlugin CR to be fully deleted...")
168+
if !waitForResourceGone(t, "monitoring", "", &uiv1.UIPlugin{}, 1*time.Minute) {
169+
t.Errorf("UIPlugin CR stuck in Terminating (finalizer not removed) — pre-fix behavior confirmed")
170+
}
171+
172+
// --- Phase 4: Verify cascade deletion of child resources ---
173+
174+
t.Log("Phase 4: Verifying child resource cascade deletion")
175+
176+
cleanupTimeout := 3 * time.Minute
177+
178+
t.Run("cascade deletion", func(t *testing.T) {
179+
t.Run("monitoring plugin deployment is deleted", func(t *testing.T) {
180+
t.Parallel()
181+
waitForResourceAbsent(t, "monitoring", ns, &appsv1.Deployment{}, cleanupTimeout)
182+
})
183+
184+
t.Run("health-analyzer deployment is deleted", func(t *testing.T) {
185+
t.Parallel()
186+
waitForResourceAbsent(t, "health-analyzer", ns, &appsv1.Deployment{}, cleanupTimeout)
187+
})
188+
189+
t.Run("health-analyzer service is deleted", func(t *testing.T) {
190+
t.Parallel()
191+
waitForResourceAbsent(t, "health-analyzer", ns, &corev1.Service{}, cleanupTimeout)
192+
})
193+
194+
t.Run("monitoring plugin service is deleted", func(t *testing.T) {
195+
t.Parallel()
196+
waitForResourceAbsent(t, "monitoring", ns, &corev1.Service{}, cleanupTimeout)
197+
})
198+
199+
t.Run("monitoring plugin service account is deleted", func(t *testing.T) {
200+
t.Parallel()
201+
waitForResourceAbsent(t, "monitoring-sa", ns, &corev1.ServiceAccount{}, cleanupTimeout)
202+
})
203+
204+
t.Run("components-health-view ClusterRole is deleted", func(t *testing.T) {
205+
t.Parallel()
206+
waitForResourceAbsent(t, "components-health-view", "", &rbacv1.ClusterRole{}, cleanupTimeout)
207+
})
208+
209+
t.Run("components-health-view ClusterRoleBinding is deleted", func(t *testing.T) {
210+
t.Parallel()
211+
waitForResourceAbsent(t, "monitoring-components-health-view", "", &rbacv1.ClusterRoleBinding{}, cleanupTimeout)
212+
})
213+
214+
t.Run("no UIPlugin-managed pods remain in operator namespace", func(t *testing.T) {
215+
t.Parallel()
216+
assertNoManagedPodsRemain(t, ctx, ns)
217+
})
218+
})
219+
220+
t.Log("Phase 4: All cascade deletion checks completed")
221+
}
222+
223+
// findOLMResources locates the COO Subscription and CSV in the given namespace.
224+
func findOLMResources(t *testing.T, ctx context.Context, ns string) (*olmv1alpha1.ClusterServiceVersion, *olmv1alpha1.Subscription) {
225+
t.Helper()
226+
227+
var foundCSV *olmv1alpha1.ClusterServiceVersion
228+
var foundSub *olmv1alpha1.Subscription
229+
230+
subs := &olmv1alpha1.SubscriptionList{}
231+
err := f.K8sClient.List(ctx, subs, &client.ListOptions{Namespace: ns})
232+
if err != nil {
233+
t.Logf("warning: failed to list subscriptions: %v", err)
234+
} else {
235+
for i := range subs.Items {
236+
if subs.Items[i].Spec.Package == "observability-operator" ||
237+
subs.Items[i].Spec.Package == "cluster-observability-operator" {
238+
foundSub = &subs.Items[i]
239+
t.Logf("Found Subscription: %s (package: %s)", foundSub.Name, foundSub.Spec.Package)
240+
break
241+
}
242+
}
243+
}
244+
245+
csvs := &olmv1alpha1.ClusterServiceVersionList{}
246+
err = f.K8sClient.List(ctx, csvs, &client.ListOptions{Namespace: ns})
247+
if err != nil {
248+
t.Logf("warning: failed to list CSVs: %v", err)
249+
} else {
250+
for i := range csvs.Items {
251+
if strings.Contains(csvs.Items[i].Name, "observability-operator") {
252+
foundCSV = &csvs.Items[i]
253+
t.Logf("Found CSV: %s", foundCSV.Name)
254+
break
255+
}
256+
}
257+
}
258+
259+
if foundCSV == nil && foundSub == nil {
260+
t.Fatal("Could not find COO Subscription or CSV — operator may not be installed via OLM")
261+
}
262+
263+
return foundCSV, foundSub
264+
}
265+
266+
// waitForResourceGone polls until the named resource no longer exists.
267+
// Returns true if the resource disappeared, false if the timeout was reached.
268+
func waitForResourceGone(t *testing.T, name, namespace string, obj client.Object, timeout time.Duration) bool {
269+
t.Helper()
270+
key := client.ObjectKey{Name: name, Namespace: namespace}
271+
err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
272+
if err := f.K8sClient.Get(ctx, key, obj); apierrors.IsNotFound(err) {
273+
return true, nil
274+
}
275+
return false, nil
276+
})
277+
return !wait.Interrupted(err)
278+
}
279+
280+
// waitForResourceAbsent polls until the named resource no longer exists.
281+
func waitForResourceAbsent(t *testing.T, name, namespace string, obj client.Object, timeout time.Duration) {
282+
t.Helper()
283+
key := client.ObjectKey{Name: name, Namespace: namespace}
284+
err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
285+
if err := f.K8sClient.Get(ctx, key, obj); apierrors.IsNotFound(err) {
286+
return true, nil
287+
}
288+
return false, nil
289+
})
290+
if wait.Interrupted(err) {
291+
kind := fmt.Sprintf("%T", obj)
292+
t.Fatalf("%s %s/%s was not cleaned up (waited %v)", kind, namespace, name, timeout)
293+
}
294+
}
295+
296+
// forceDeleteAllUIPlugins removes all UIPlugin CRs, stripping finalizers if
297+
// necessary. This handles the case where a previous test left UIPlugins stuck
298+
// in Terminating because the operator was already gone.
299+
func forceDeleteAllUIPlugins(t *testing.T, ctx context.Context) {
300+
t.Helper()
301+
302+
var plugins uiv1.UIPluginList
303+
if err := f.K8sClient.List(ctx, &plugins); err != nil {
304+
t.Logf("Could not list UIPlugins (CRD may not exist yet): %v", err)
305+
return
306+
}
307+
308+
for i := range plugins.Items {
309+
p := &plugins.Items[i]
310+
311+
if len(p.Finalizers) > 0 {
312+
t.Logf("Stripping finalizers from UIPlugin %s", p.Name)
313+
patch := client.MergeFrom(p.DeepCopy())
314+
p.Finalizers = nil
315+
if err := f.K8sClient.Patch(ctx, p, patch); err != nil && !apierrors.IsNotFound(err) {
316+
t.Logf("warning: failed to strip finalizers from %s: %v", p.Name, err)
317+
}
318+
}
319+
320+
if p.DeletionTimestamp.IsZero() {
321+
t.Logf("Deleting UIPlugin %s", p.Name)
322+
if err := f.K8sClient.Delete(ctx, p); err != nil && !apierrors.IsNotFound(err) {
323+
t.Logf("warning: failed to delete UIPlugin %s: %v", p.Name, err)
324+
}
325+
}
326+
}
327+
328+
err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
329+
var remaining uiv1.UIPluginList
330+
if err := f.K8sClient.List(ctx, &remaining); err != nil {
331+
return false, nil
332+
}
333+
return len(remaining.Items) == 0, nil
334+
})
335+
if wait.Interrupted(err) {
336+
t.Fatal("Stale UIPlugins still exist after force cleanup")
337+
}
338+
}
339+
340+
// assertNoManagedPodsRemain verifies that no UIPlugin-managed pods are left
341+
// running in the operator namespace after uninstall.
342+
func assertNoManagedPodsRemain(t *testing.T, ctx context.Context, namespace string) {
343+
t.Helper()
344+
345+
managedLabels := map[string]string{
346+
"app.kubernetes.io/managed-by": "observability-operator",
347+
}
348+
349+
var lastSeen []string
350+
err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
351+
pods := &corev1.PodList{}
352+
if err := f.K8sClient.List(ctx, pods,
353+
client.InNamespace(namespace),
354+
client.MatchingLabels(managedLabels),
355+
); err != nil {
356+
return false, nil
357+
}
358+
359+
if len(pods.Items) == 0 {
360+
return true, nil
361+
}
362+
363+
lastSeen = make([]string, 0, len(pods.Items))
364+
for _, p := range pods.Items {
365+
lastSeen = append(lastSeen, fmt.Sprintf("%s (phase=%s)", p.Name, p.Status.Phase))
366+
}
367+
return false, nil
368+
})
369+
370+
if wait.Interrupted(err) {
371+
t.Fatalf("managed pods not cleaned up after UIPlugin deletion: %v", lastSeen)
372+
}
373+
}

0 commit comments

Comments
 (0)