From 6c7de57bb4be6e0680e07c6eee553bb1deef17d1 Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Mon, 4 May 2026 20:34:05 +0200 Subject: [PATCH 1/8] OLM tests --- .github/workflows/create-dev-cluster.yml | 11 ++++- .github/workflows/e2e-tests.yml | 17 ++++++- .github/workflows/main-push.yml | 4 +- .github/workflows/pr.yml | 62 +++++++++++++++++++++--- 4 files changed, 81 insertions(+), 13 deletions(-) diff --git a/.github/workflows/create-dev-cluster.yml b/.github/workflows/create-dev-cluster.yml index b790e721..3274cf18 100644 --- a/.github/workflows/create-dev-cluster.yml +++ b/.github/workflows/create-dev-cluster.yml @@ -6,6 +6,13 @@ on: cluster-name: required: true type: string + flavor: + required: true + type: string + args: + required: false + type: string + default: '' outputs: cluster-name: description: "Name of the created cluster" @@ -19,9 +26,9 @@ jobs: steps: - uses: stackrox/actions/infra/create-cluster@v1 with: - flavor: gke-default + flavor: ${{ inputs.flavor }} name: ${{ inputs.cluster-name }} - args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd + args: ${{ inputs.args }} lifespan: "2h" wait: true token: ${{ secrets.INFRA_CI_TOKEN }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 322defb6..bdfe2880 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -9,6 +9,14 @@ on: image: required: true type: string + cluster-type: + required: false + type: string + default: 'gke' + skip-olm-tests: + required: false + type: string + default: 'true' env: REGISTRY: quay.io IMAGE_NAME: rhacs-eng/roxie @@ -23,7 +31,6 @@ jobs: KUBECONFIG: /github/home/artifacts/kubeconfig INFRA_TOKEN: ${{ secrets.INFRA_CI_TOKEN }} INFRACTL: bin/infractl -k -e localhost:8443 - USE_GKE_GCLOUD_AUTH_PLUGIN: "True" steps: - name: Checkout uses: actions/checkout@v6 @@ -65,15 +72,21 @@ jobs: roxctl version - name: Authenticate to GCloud + if: inputs.cluster-type == 'gke' uses: google-github-actions/auth@v3 with: credentials_json: ${{ secrets.ROXIE_CI_AUTOMATION_GCP_SA }} - name: Set up Cloud SDK + if: inputs.cluster-type == 'gke' uses: "google-github-actions/setup-gcloud@v3" with: install_components: "gke-gcloud-auth-plugin" + - name: Configure GKE auth plugin + if: inputs.cluster-type == 'gke' + run: echo "USE_GKE_GCLOUD_AUTH_PLUGIN=True" >> "$GITHUB_ENV" + - name: Download production infractl uses: stackrox/actions/infra/install-infractl@v1 @@ -89,7 +102,7 @@ jobs: env: REGISTRY_USERNAME: ${{ secrets.QUAY_RHACS_ENG_RO_USERNAME }} REGISTRY_PASSWORD: ${{ secrets.QUAY_RHACS_ENG_RO_PASSWORD }} - SKIP_OLM_TESTS: "true" + SKIP_OLM_TESTS: ${{ inputs.skip-olm-tests == 'true' && 'true' || '' }} run: | make run-test-e2e diff --git a/.github/workflows/main-push.yml b/.github/workflows/main-push.yml index 064187c0..32b0322d 100644 --- a/.github/workflows/main-push.yml +++ b/.github/workflows/main-push.yml @@ -15,7 +15,9 @@ jobs: create-dev-cluster: uses: ./.github/workflows/create-dev-cluster.yml with: - cluster-name: infra-roxie-main-${{ github.run_number }} + cluster-name: infra-roxie-main-${{ github.run_number }}-gke + flavor: gke-default + args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd secrets: inherit build-roxie-image: diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 4e27e6a4..46498ba1 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -12,10 +12,38 @@ jobs: unit-tests: uses: ./.github/workflows/unit-tests.yml - create-dev-cluster: + check-olm-label: + runs-on: ubuntu-latest + outputs: + has-label: ${{ steps.check.outputs.has-label }} + steps: + - name: Check for olm-tests label + id: check + run: | + has_label="${{ contains(github.event.pull_request.labels.*.name, 'olm-tests') }}" + echo "has-label=${has_label}" >> "$GITHUB_OUTPUT" + if [ "$has_label" = "true" ]; then + echo "::notice::olm-tests label is set — OpenShift cluster will be created" + else + echo "::notice::olm-tests label is not set — skipping OpenShift cluster" + fi + + create-gke-cluster: uses: ./.github/workflows/create-dev-cluster.yml with: - cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }} + cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }}-gke + flavor: gke-default + args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd + secrets: inherit + + create-openshift-cluster: + needs: check-olm-label + if: needs.check-olm-label.outputs.has-label == 'true' + uses: ./.github/workflows/create-dev-cluster.yml + with: + cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }}-openshift + flavor: ocp-4 + args: master-node-type=e2-standard-4,worker-node-type=e2-standard-4,master-node-count=3,worker-node-count=3 secrets: inherit build-roxie-image: @@ -26,17 +54,35 @@ jobs: secrets: inherit e2e-tests: - needs: [ create-dev-cluster, build-roxie-image ] + needs: [ create-gke-cluster, build-roxie-image ] + uses: ./.github/workflows/e2e-tests.yml + with: + cluster-name: ${{ needs.create-gke-cluster.outputs.cluster-name }} + image: ${{ needs.build-roxie-image.outputs.image }} + secrets: inherit + + e2e-tests-openshift: + needs: [ create-openshift-cluster, build-roxie-image ] uses: ./.github/workflows/e2e-tests.yml with: - cluster-name: ${{ needs.create-dev-cluster.outputs.cluster-name }} + cluster-name: ${{ needs.create-openshift-cluster.outputs.cluster-name }} image: ${{ needs.build-roxie-image.outputs.image }} + cluster-type: openshift + skip-olm-tests: 'false' + secrets: inherit + + delete-gke-cluster: + if: ${{ always() && needs.create-gke-cluster.result == 'success' }} + needs: [ create-gke-cluster, e2e-tests ] + uses: ./.github/workflows/delete-dev-cluster.yml + with: + cluster-name: ${{ needs.create-gke-cluster.outputs.cluster-name }} secrets: inherit - delete-dev-cluster: - if: ${{ always() && needs.create-dev-cluster.result == 'success' }} - needs: [ create-dev-cluster, e2e-tests ] + delete-openshift-cluster: + if: ${{ always() && needs.create-openshift-cluster.result == 'success' }} + needs: [ create-openshift-cluster, e2e-tests-openshift ] uses: ./.github/workflows/delete-dev-cluster.yml with: - cluster-name: ${{ needs.create-dev-cluster.outputs.cluster-name }} + cluster-name: ${{ needs.create-openshift-cluster.outputs.cluster-name }} secrets: inherit From 37772a6bc95ceb03f0b7b31004a737f04966c4bd Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Tue, 5 May 2026 10:24:53 +0200 Subject: [PATCH 2/8] Bump worker node (experiment), because the cluster didn't come up --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 46498ba1..8da10822 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -43,7 +43,7 @@ jobs: with: cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }}-openshift flavor: ocp-4 - args: master-node-type=e2-standard-4,worker-node-type=e2-standard-4,master-node-count=3,worker-node-count=3 + args: master-node-type=e2-standard-4,worker-node-type=e2-standard-8,master-node-count=3,worker-node-count=3 secrets: inherit build-roxie-image: From 0185b5efdfab5a32dc9882585aa5f15b7e9f5dbd Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Wed, 6 May 2026 10:03:05 +0200 Subject: [PATCH 3/8] Dump cluster state on failure --- tests/e2e/basic_test.go | 2 + tests/e2e/e2e_test.go | 2 + tests/e2e/helpers.go | 109 +++++++++++++++++++++++++++++++++++ tests/e2e/olm_switch_test.go | 8 +++ 4 files changed, 121 insertions(+) diff --git a/tests/e2e/basic_test.go b/tests/e2e/basic_test.go index 048ad36d..5ad1766c 100644 --- a/tests/e2e/basic_test.go +++ b/tests/e2e/basic_test.go @@ -11,6 +11,8 @@ import ( // TestDeployBothSimple tests deploying both components together (simplest scenario) func TestDeployBothSimple(t *testing.T) { + dumpClusterOnFailure(t) + // Create temporary envrc file envrcFile, err := os.CreateTemp(t.TempDir(), ".envrc.roxie-test-*") if err != nil { diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 5f85da97..4763082c 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -39,6 +39,8 @@ func TestMain(m *testing.M) { } func TestDeployBothComponentsTogetherInSingleNamespace(t *testing.T) { + dumpClusterOnFailure(t) + // Create temporary envrc file. envrcFile, err := os.CreateTemp(t.TempDir(), ".envrc.roxie-test-*") if err != nil { diff --git a/tests/e2e/helpers.go b/tests/e2e/helpers.go index a97c1653..9eba4d5c 100644 --- a/tests/e2e/helpers.go +++ b/tests/e2e/helpers.go @@ -239,6 +239,115 @@ func verifySecuredClusterNotInstalled(t *testing.T, namespace string) { } } +var clusterDumpNamespaces = []string{ + "rhacs-operator-system", + "acs-central", + "acs-sensor", + "stackrox", +} + +func dumpClusterOnFailure(t *testing.T) { + t.Helper() + t.Cleanup(func() { + if !t.Failed() { + return + } + dumpClusterResources(t) + }) +} + +func dumpClusterResources(t *testing.T) { + t.Helper() + fmt.Fprintf(os.Stderr, "=== CLUSTER RESOURCE DUMP (test %s failed) ===\n", t.Name()) + + runKubectlDump("get", "namespaces") + + for _, ns := range clusterDumpNamespaces { + fmt.Fprintf(os.Stderr, "--- Namespace: %s ---\n", ns) + runKubectlDump("get", "pods", "-n", ns, "-o", "wide") + runKubectlDump("describe", "pods", "-n", ns) + runKubectlDump("get", "deployments", "-n", ns, "-o", "wide") + runKubectlDump("describe", "deployments", "-n", ns) + runKubectlDump("get", "daemonsets", "-n", ns, "-o", "wide") + runKubectlDump("describe", "daemonsets", "-n", ns) + runKubectlDump("get", "events", "-n", ns, "--sort-by=.lastTimestamp") + dumpLogsForFailingPods(ns) + } + + dumpACSCustomResources() + dumpOLMResources() + + fmt.Fprintln(os.Stderr, "=== END CLUSTER RESOURCE DUMP ===") +} + +func dumpACSCustomResources() { + fmt.Fprintln(os.Stderr, "--- ACS Custom Resources ---") + for _, ns := range clusterDumpNamespaces { + runKubectlDump("get", "centrals.platform.stackrox.io", "-n", ns, "-o", "yaml") + runKubectlDump("get", "securedclusters.platform.stackrox.io", "-n", ns, "-o", "yaml") + } +} + +func dumpOLMResources() { + cmd := exec.Command("kubectl", "api-resources", "--api-group=operators.coreos.com", "-o", "name") + output, err := cmd.Output() + if err != nil || strings.TrimSpace(string(output)) == "" { + fmt.Fprintln(os.Stderr, "[dump] OLM not installed, skipping OLM resource dump") + return + } + + fmt.Fprintln(os.Stderr, "--- OLM Resources ---") + olmNamespace := "rhacs-operator-system" + runKubectlDump("get", "subscriptions.operators.coreos.com", "-n", olmNamespace, "-o", "wide") + runKubectlDump("describe", "subscriptions.operators.coreos.com", "-n", olmNamespace) + runKubectlDump("get", "installplans.operators.coreos.com", "-n", olmNamespace, "-o", "wide") + runKubectlDump("describe", "installplans.operators.coreos.com", "-n", olmNamespace) + runKubectlDump("get", "catalogsources.operators.coreos.com", "-n", olmNamespace, "-o", "wide") + runKubectlDump("describe", "catalogsources.operators.coreos.com", "-n", olmNamespace) + runKubectlDump("get", "clusterserviceversions.operators.coreos.com", "-n", olmNamespace, "-o", "wide") + runKubectlDump("describe", "clusterserviceversions.operators.coreos.com", "-n", olmNamespace) + runKubectlDump("get", "operatorgroups.operators.coreos.com", "-n", olmNamespace, "-o", "wide") + runKubectlDump("describe", "operatorgroups.operators.coreos.com", "-n", olmNamespace) +} + +func runKubectlDump(args ...string) { + fmt.Fprintf(os.Stderr, "## kubectl %s\n", strings.Join(args, " ")) + cmd := exec.Command("kubectl", args...) + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + fmt.Fprintf(os.Stderr, "kubectl failed: %v\n", err) + } + fmt.Fprintln(os.Stderr) +} + +func dumpLogsForFailingPods(namespace string) { + cmd := exec.Command("kubectl", "get", "pods", "-n", namespace, + "-o", "jsonpath={range .items[*]}{.metadata.name}{\"\\t\"}{.status.phase}{\"\\n\"}{end}") + output, err := cmd.Output() + if err != nil { + fmt.Fprintf(os.Stderr, "[dump] failed to list pods in %s: %v\n", namespace, err) + return + } + + for line := range strings.SplitSeq(strings.TrimSpace(string(output)), "\n") { + if line == "" { + continue + } + parts := strings.SplitN(line, "\t", 2) + if len(parts) != 2 { + continue + } + podName, phase := parts[0], parts[1] + if phase == "Running" || phase == "Succeeded" { + continue + } + fmt.Fprintf(os.Stderr, "[dump] logs for pod %s/%s (phase=%s):\n", namespace, podName, phase) + runKubectlDump("logs", "-n", namespace, podName, "--all-containers", "--tail=100") + runKubectlDump("logs", "-n", namespace, podName, "--all-containers", "--previous", "--tail=50") + } +} + func verifyAnnotation(t *testing.T, resourceType, resourceName, namespace, annotationKey, expectedValue string) { t.Helper() diff --git a/tests/e2e/olm_switch_test.go b/tests/e2e/olm_switch_test.go index 09b45079..6b56c8be 100644 --- a/tests/e2e/olm_switch_test.go +++ b/tests/e2e/olm_switch_test.go @@ -65,6 +65,8 @@ func verifyOperatorDeploymentExists(t *testing.T) { // TestOLMToNonOLMSwitch tests switching from OLM operator to non-OLM operator func TestOLMToNonOLMSwitch(t *testing.T) { + dumpClusterOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } @@ -113,6 +115,8 @@ func TestOLMToNonOLMSwitch(t *testing.T) { // TestNonOLMToOLMSwitch tests switching from non-OLM operator to OLM operator func TestNonOLMToOLMSwitch(t *testing.T) { + dumpClusterOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } @@ -161,6 +165,8 @@ func TestNonOLMToOLMSwitch(t *testing.T) { // TestOLMOperatorVersionUpgrade tests that OLM operator version mismatches trigger teardown and redeploy func TestOLMOperatorVersionUpgrade(t *testing.T) { + dumpClusterOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } @@ -223,6 +229,8 @@ func TestOLMOperatorVersionUpgrade(t *testing.T) { // TestSecuredClusterWithOLMSwitch tests that secured-cluster deployment also respects OLM mode switches func TestSecuredClusterWithOLMSwitch(t *testing.T) { + dumpClusterOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } From c3c115bb8abe9c7407d1e8c5bce8bea8ae912b71 Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Thu, 7 May 2026 12:44:15 +0200 Subject: [PATCH 4/8] Rename dumpClusterOnFailure to dumpClusterStateOnFailure Co-Authored-By: Claude Opus 4.6 --- tests/e2e/basic_test.go | 2 +- tests/e2e/e2e_test.go | 2 +- tests/e2e/helpers.go | 2 +- tests/e2e/olm_switch_test.go | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/e2e/basic_test.go b/tests/e2e/basic_test.go index 5ad1766c..ac91591f 100644 --- a/tests/e2e/basic_test.go +++ b/tests/e2e/basic_test.go @@ -11,7 +11,7 @@ import ( // TestDeployBothSimple tests deploying both components together (simplest scenario) func TestDeployBothSimple(t *testing.T) { - dumpClusterOnFailure(t) + dumpClusterStateOnFailure(t) // Create temporary envrc file envrcFile, err := os.CreateTemp(t.TempDir(), ".envrc.roxie-test-*") diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 4763082c..425ec7e9 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -39,7 +39,7 @@ func TestMain(m *testing.M) { } func TestDeployBothComponentsTogetherInSingleNamespace(t *testing.T) { - dumpClusterOnFailure(t) + dumpClusterStateOnFailure(t) // Create temporary envrc file. envrcFile, err := os.CreateTemp(t.TempDir(), ".envrc.roxie-test-*") diff --git a/tests/e2e/helpers.go b/tests/e2e/helpers.go index 9eba4d5c..5d56b531 100644 --- a/tests/e2e/helpers.go +++ b/tests/e2e/helpers.go @@ -246,7 +246,7 @@ var clusterDumpNamespaces = []string{ "stackrox", } -func dumpClusterOnFailure(t *testing.T) { +func dumpClusterStateOnFailure(t *testing.T) { t.Helper() t.Cleanup(func() { if !t.Failed() { diff --git a/tests/e2e/olm_switch_test.go b/tests/e2e/olm_switch_test.go index 6b56c8be..fe9512b0 100644 --- a/tests/e2e/olm_switch_test.go +++ b/tests/e2e/olm_switch_test.go @@ -65,7 +65,7 @@ func verifyOperatorDeploymentExists(t *testing.T) { // TestOLMToNonOLMSwitch tests switching from OLM operator to non-OLM operator func TestOLMToNonOLMSwitch(t *testing.T) { - dumpClusterOnFailure(t) + dumpClusterStateOnFailure(t) if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") @@ -115,7 +115,7 @@ func TestOLMToNonOLMSwitch(t *testing.T) { // TestNonOLMToOLMSwitch tests switching from non-OLM operator to OLM operator func TestNonOLMToOLMSwitch(t *testing.T) { - dumpClusterOnFailure(t) + dumpClusterStateOnFailure(t) if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") @@ -165,7 +165,7 @@ func TestNonOLMToOLMSwitch(t *testing.T) { // TestOLMOperatorVersionUpgrade tests that OLM operator version mismatches trigger teardown and redeploy func TestOLMOperatorVersionUpgrade(t *testing.T) { - dumpClusterOnFailure(t) + dumpClusterStateOnFailure(t) if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") @@ -229,7 +229,7 @@ func TestOLMOperatorVersionUpgrade(t *testing.T) { // TestSecuredClusterWithOLMSwitch tests that secured-cluster deployment also respects OLM mode switches func TestSecuredClusterWithOLMSwitch(t *testing.T) { - dumpClusterOnFailure(t) + dumpClusterStateOnFailure(t) if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") From 0f11508fe1fec0572b0f7bc5910e7a9410edc86b Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Thu, 7 May 2026 12:45:05 +0200 Subject: [PATCH 5/8] Rename olmNamespace to operatorNamespace in dumpOLMResources Co-Authored-By: Claude Opus 4.6 --- tests/e2e/helpers.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/e2e/helpers.go b/tests/e2e/helpers.go index 5d56b531..14c20fce 100644 --- a/tests/e2e/helpers.go +++ b/tests/e2e/helpers.go @@ -297,17 +297,17 @@ func dumpOLMResources() { } fmt.Fprintln(os.Stderr, "--- OLM Resources ---") - olmNamespace := "rhacs-operator-system" - runKubectlDump("get", "subscriptions.operators.coreos.com", "-n", olmNamespace, "-o", "wide") - runKubectlDump("describe", "subscriptions.operators.coreos.com", "-n", olmNamespace) - runKubectlDump("get", "installplans.operators.coreos.com", "-n", olmNamespace, "-o", "wide") - runKubectlDump("describe", "installplans.operators.coreos.com", "-n", olmNamespace) - runKubectlDump("get", "catalogsources.operators.coreos.com", "-n", olmNamespace, "-o", "wide") - runKubectlDump("describe", "catalogsources.operators.coreos.com", "-n", olmNamespace) - runKubectlDump("get", "clusterserviceversions.operators.coreos.com", "-n", olmNamespace, "-o", "wide") - runKubectlDump("describe", "clusterserviceversions.operators.coreos.com", "-n", olmNamespace) - runKubectlDump("get", "operatorgroups.operators.coreos.com", "-n", olmNamespace, "-o", "wide") - runKubectlDump("describe", "operatorgroups.operators.coreos.com", "-n", olmNamespace) + operatorNamespace := "rhacs-operator-system" + runKubectlDump("get", "subscriptions.operators.coreos.com", "-n", operatorNamespace, "-o", "wide") + runKubectlDump("describe", "subscriptions.operators.coreos.com", "-n", operatorNamespace) + runKubectlDump("get", "installplans.operators.coreos.com", "-n", operatorNamespace, "-o", "wide") + runKubectlDump("describe", "installplans.operators.coreos.com", "-n", operatorNamespace) + runKubectlDump("get", "catalogsources.operators.coreos.com", "-n", operatorNamespace, "-o", "wide") + runKubectlDump("describe", "catalogsources.operators.coreos.com", "-n", operatorNamespace) + runKubectlDump("get", "clusterserviceversions.operators.coreos.com", "-n", operatorNamespace, "-o", "wide") + runKubectlDump("describe", "clusterserviceversions.operators.coreos.com", "-n", operatorNamespace) + runKubectlDump("get", "operatorgroups.operators.coreos.com", "-n", operatorNamespace, "-o", "wide") + runKubectlDump("describe", "operatorgroups.operators.coreos.com", "-n", operatorNamespace) } func runKubectlDump(args ...string) { From 47e80f80f7d02415f5423dfa41ebabf35ad70ba4 Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier <111092021+mclasmeier@users.noreply.github.com> Date: Fri, 8 May 2026 12:08:45 +0200 Subject: [PATCH 6/8] Prevent leaking of injected-cabundle during securedcluster teardown (#162) Co-authored-by: Moritz Clasmeier --- internal/deployer/deployer.go | 44 +++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/internal/deployer/deployer.go b/internal/deployer/deployer.go index ab7c2695..dc69a149 100644 --- a/internal/deployer/deployer.go +++ b/internal/deployer/deployer.go @@ -98,10 +98,10 @@ var ( "storageclasses", "validatingwebhookconfigurations", } -) -const ( - injectedCABundleConfigMap = "injected-cabundle-stackrox-central-services" + injectedCABundleConfigMapPrefix = "injected-cabundle-" + injectedCABundleConfigMapCentral = injectedCABundleConfigMapPrefix + centralCrName + injectedCABundleConfigMapSecuredCluster = injectedCABundleConfigMapPrefix + securedClusterCrName ) // Deployer is the base deployer for ACS @@ -209,14 +209,13 @@ func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error // Pause reconciliation for other controllers, not just our RHACS operator. // This is needed to ensure that there is no race causing the Cluster Network Operator // to re-create the injected-ca-bundle ConfigMap during resource deletion. - err := d.preventOtherControllersFromReconciling(ctx) - if err != nil { - return fmt.Errorf("failed to prevent other controllers from reconciling: %w", err) + if err := d.preventOtherControllersFromReconciling(ctx, component.Central); err != nil { + return fmt.Errorf("failed to prevent other controllers from reconciling Central resources: %w", err) } // Delete other resources by brute force. resourceKinds := d.filterResourceKinds(allInstallableCentralResourceKinds) - err = d.deleteResources(ctx, d.centralNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-central-services") + err := d.deleteResources(ctx, d.centralNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-central-services") if err != nil { return err } @@ -226,9 +225,9 @@ func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error {Name: "central-db-backup", Kind: "pvc", OwnerName: centralCrName}, {Name: "admin-password", Kind: "secret"}, {Name: "scanner-db-password", Kind: "secret", OwnerName: centralCrName}, - // In case the Cluster Network Operator has succeeded in re-creating the injectedCABundleConfigMap + // In case the Cluster Network Operator has succeeded in re-creating the injected-cabundle configmap // after our operator has already deleted it. - {Name: injectedCABundleConfigMap, Kind: "configmap"}, + {Name: injectedCABundleConfigMapCentral, Kind: "configmap"}, } { d.logger.Dimf("Attempting to delete %s/%s", resource.Kind, resource.Name) if resource.OwnerName != "" { @@ -263,17 +262,22 @@ func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error return nil } -func (d *Deployer) preventOtherControllersFromReconciling(ctx context.Context) error { - return d.preventCABundleInjection(ctx) +func (d *Deployer) preventOtherControllersFromReconciling(ctx context.Context, comp component.Component) error { + switch comp { + case component.Central: + return d.preventCABundleInjection(ctx, injectedCABundleConfigMapCentral, d.centralNamespace) + case component.SecuredCluster: + return d.preventCABundleInjection(ctx, injectedCABundleConfigMapSecuredCluster, d.sensorNamespace) + default: + return nil + } } -func (d *Deployer) preventCABundleInjection(ctx context.Context) error { - configMapName := injectedCABundleConfigMap - +func (d *Deployer) preventCABundleInjection(ctx context.Context, configMapName, namespace string) error { d.logger.Info("Removing CNO label from injected-cabundle ConfigMap to prevent CNO from injecting the CA bundle during cleanup") _, err := d.runKubectl(ctx, k8s.KubectlOptions{ Args: []string{ - "label", "configmap", configMapName, "-n", d.centralNamespace, + "label", "configmap", configMapName, "-n", namespace, "config.openshift.io/inject-trusted-cabundle-", }, IgnoreErrors: true, @@ -305,6 +309,13 @@ func (d *Deployer) deleteSecuredClusterResources(ctx context.Context, wait bool) } } + // Pause reconciliation for other controllers, not just our RHACS operator. + // This is needed to ensure that there is no race causing the Cluster Network Operator + // to re-create the injected-ca-bundle ConfigMap during resource deletion. + if err := d.preventOtherControllersFromReconciling(ctx, component.SecuredCluster); err != nil { + return fmt.Errorf("failed to prevent other controllers from reconciling SecuredCluster resources: %w", err) + } + // In the meantime, delete other resources by brute force. resourceKinds := d.filterResourceKinds(allInstallableSecuredClusterResourceKinds) err := d.deleteResources(ctx, d.sensorNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-secured-cluster-services") @@ -317,6 +328,9 @@ func (d *Deployer) deleteSecuredClusterResources(ctx context.Context, wait bool) // We need to make sure that don't accidentally delete a scanner-db-password belonging to the central CR, // when both are deployed into the same namespace. {Name: "scanner-db-password", Kind: "secret", OwnerName: securedClusterCrName}, + // In case the Cluster Network Operator has succeeded in re-creating the injected-cabundle configmap + // after our operator has already deleted it. + {Name: injectedCABundleConfigMapSecuredCluster, Kind: "configmap"}, } { d.logger.Dimf("Attempting to delete %s/%s", resource.Kind, resource.Name) if resource.OwnerName != "" { From 2d6340b84275253710621a43b35266fce2f535ed Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Sun, 10 May 2026 11:50:40 +0200 Subject: [PATCH 7/8] Simply teardown, hopefully making it more robust against CNO races --- internal/deployer/deployer.go | 223 ++++++---------------------------- 1 file changed, 38 insertions(+), 185 deletions(-) diff --git a/internal/deployer/deployer.go b/internal/deployer/deployer.go index dc69a149..3451b5fd 100644 --- a/internal/deployer/deployer.go +++ b/internal/deployer/deployer.go @@ -38,70 +38,6 @@ var ( // AdminUsername is the default admin username for StackRox Central AdminUsername = "admin" - - // TODO(#91): at some point this will get out of date. If we filter by the app.../part-of - // label anyway, then maybe we should just delete all resource kinds present on cluster? - // also we should use the fully-qualified types - allInstallableCentralResourceKinds = []string{ - "applications", - "clusterroles", - "configmaps", - "deployments", - "destinationrules", - "endpoints", - "endpointslices", - "horizontalpodautoscalers", - "networkpolicys", - "leases", - "persistentvolumes", - "persistentvolumeclaims", - "pods", - "podsecuritypolicys", - "prometheusrules", - "roles", - "rolebindings", - "replicasets", - "routes", - "secrets", - "services", - "serviceaccounts", - "servicemonitors", - "storageclasses", - } - - allInstallableSecuredClusterResourceKinds = []string{ - "clusterroles", - "clusterrolebindings", - "configmaps", - "consoleplugins", - "controllerrevisions", - "daemonsets", - "deployments", - "endpoints", - "endpointslices", - "destinationrules", - "horizontalpodautoscalers", - "networkpolicys", - "leases", - "persistentvolumes", - "persistentvolumeclaims", - "pods", - "podsecuritypolicys", - "prometheusrules", - "replicasets", - "roles", - "rolebindings", - "secrets", - "services", - "serviceaccounts", - "servicemonitors", - "storageclasses", - "validatingwebhookconfigurations", - } - - injectedCABundleConfigMapPrefix = "injected-cabundle-" - injectedCABundleConfigMapCentral = injectedCABundleConfigMapPrefix + centralCrName - injectedCABundleConfigMapSecuredCluster = injectedCABundleConfigMapPrefix + securedClusterCrName ) // Deployer is the base deployer for ACS @@ -146,16 +82,6 @@ type ResourceToDelete struct { OwnerName string } -func (d *Deployer) filterResourceKinds(resourceKinds []string) []string { - filteredResourceKinds := make([]string, 0, len(resourceKinds)) - for _, resourceKind := range resourceKinds { - if _, ok := d.clusterResourceKinds[resourceKind]; ok { - filteredResourceKinds = append(filteredResourceKinds, resourceKind) - } - } - return filteredResourceKinds -} - func (d *Deployer) deleteResource(ctx context.Context, namespace, resourceType, resourceName string, args ...string) error { return d.deleteResources(ctx, namespace, []string{resourceType}, append([]string{resourceName}, args...)...) } @@ -175,59 +101,31 @@ func (d *Deployer) deleteResources(ctx context.Context, namespace string, resour return err } -func (d *Deployer) deleteFinalizers(ctx context.Context, namespace, resourceType, resourceName string) error { - _, err := d.runKubectl(ctx, k8s.KubectlOptions{ - Args: []string{ - "-n", namespace, "patch", resourceType, resourceName, - "-p", `{"metadata":{"finalizers":null}}`, - "--type=merge", - }, - }) - return err -} - // Expects that reconciliation for the RHACS operator is paused. -func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error { +func (d *Deployer) deleteCentralResources(ctx context.Context) error { d.logger.Info("Deleting Central resources") - var crExists bool - - if d.doesResourceExist(ctx, "central", "stackrox-central-services", d.centralNamespace) { - crExists = true - - // Trigger async deletion of the Central CR. - err := d.deleteResource(ctx, d.centralNamespace, "central", "stackrox-central-services", "--wait=false") - if err != nil { - return fmt.Errorf("failed to asynchronously delete Central CR: %w", err) - } - err = d.deleteFinalizers(ctx, d.centralNamespace, "central", "stackrox-central-services") - if err != nil { - return fmt.Errorf("failed to delete finalizers on Central CR: %w", err) - } + d.logger.Info("Removing any pause-reconcile annotation from Central") + if err := d.removePauseReconcileAnnotation(ctx, "central", "stackrox-central-services", d.centralNamespace); err != nil { + return err } - - // Pause reconciliation for other controllers, not just our RHACS operator. - // This is needed to ensure that there is no race causing the Cluster Network Operator - // to re-create the injected-ca-bundle ConfigMap during resource deletion. - if err := d.preventOtherControllersFromReconciling(ctx, component.Central); err != nil { - return fmt.Errorf("failed to prevent other controllers from reconciling Central resources: %w", err) + if d.verbose { + d.logger.Dim("Removed any pause-reconcile annotation from Central") } - // Delete other resources by brute force. - resourceKinds := d.filterResourceKinds(allInstallableCentralResourceKinds) - err := d.deleteResources(ctx, d.centralNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-central-services") + err := d.deleteResource(ctx, d.centralNamespace, "central", "stackrox-central-services", "--wait") if err != nil { return err } + if d.verbose { + d.logger.Dim("Deleted Central CR") + } for _, resource := range []ResourceToDelete{ {Name: "central-db", Kind: "pvc", OwnerName: centralCrName}, {Name: "central-db-backup", Kind: "pvc", OwnerName: centralCrName}, {Name: "admin-password", Kind: "secret"}, {Name: "scanner-db-password", Kind: "secret", OwnerName: centralCrName}, - // In case the Cluster Network Operator has succeeded in re-creating the injected-cabundle configmap - // after our operator has already deleted it. - {Name: injectedCABundleConfigMapCentral, Kind: "configmap"}, } { d.logger.Dimf("Attempting to delete %s/%s", resource.Kind, resource.Name) if resource.OwnerName != "" { @@ -251,86 +149,34 @@ func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error } } - if crExists { - // Now delete the Central CR synchronously. - err := d.deleteResource(ctx, d.centralNamespace, "central", "stackrox-central-services") - if err != nil { - return fmt.Errorf("failed to delete Central CR: %w", err) - } - } - - return nil -} - -func (d *Deployer) preventOtherControllersFromReconciling(ctx context.Context, comp component.Component) error { - switch comp { - case component.Central: - return d.preventCABundleInjection(ctx, injectedCABundleConfigMapCentral, d.centralNamespace) - case component.SecuredCluster: - return d.preventCABundleInjection(ctx, injectedCABundleConfigMapSecuredCluster, d.sensorNamespace) - default: - return nil - } -} - -func (d *Deployer) preventCABundleInjection(ctx context.Context, configMapName, namespace string) error { - d.logger.Info("Removing CNO label from injected-cabundle ConfigMap to prevent CNO from injecting the CA bundle during cleanup") - _, err := d.runKubectl(ctx, k8s.KubectlOptions{ - Args: []string{ - "label", "configmap", configMapName, "-n", namespace, - "config.openshift.io/inject-trusted-cabundle-", - }, - IgnoreErrors: true, - }) - - if err != nil { - d.logger.Warningf("Failed to remove CNO label from %s: %v", configMapName, err) - } - return nil } -func (d *Deployer) deleteSecuredClusterResources(ctx context.Context, wait bool) error { +func (d *Deployer) deleteSecuredClusterResources(ctx context.Context) error { d.logger.Info("Deleting SecuredCluster resources") - var crExists bool - - if d.doesResourceExist(ctx, "securedcluster", "stackrox-secured-cluster-services", d.sensorNamespace) { - crExists = true - - // Trigger async deletion of the SecuredCluster CR. - err := d.deleteResource(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services", "--wait=false") - if err != nil { - return err - } - err = d.deleteFinalizers(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services") - if err != nil { - return fmt.Errorf("failed to delete finalizers on SecuredCluster CR: %w", err) - } + d.logger.Info("Removing any pause-reconcile annotation from SecuredCluster") + if err := d.removePauseReconcileAnnotation(ctx, "securedcluster", "stackrox-secured-cluster-services", d.sensorNamespace); err != nil { + return err } - - // Pause reconciliation for other controllers, not just our RHACS operator. - // This is needed to ensure that there is no race causing the Cluster Network Operator - // to re-create the injected-ca-bundle ConfigMap during resource deletion. - if err := d.preventOtherControllersFromReconciling(ctx, component.SecuredCluster); err != nil { - return fmt.Errorf("failed to prevent other controllers from reconciling SecuredCluster resources: %w", err) + if d.verbose { + d.logger.Dim("Removed any pause-reconcile annotation from SecuredCluster") } - // In the meantime, delete other resources by brute force. - resourceKinds := d.filterResourceKinds(allInstallableSecuredClusterResourceKinds) - err := d.deleteResources(ctx, d.sensorNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-secured-cluster-services") + err := d.deleteResource(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services", "--wait") if err != nil { return err } + if d.verbose { + d.logger.Dim("Deleted SecuredCluster CR") + } + // Delete resources, which are treated special. for _, resource := range []ResourceToDelete{ {Name: "cluster-registration-secret", Kind: "secret"}, // We need to make sure that don't accidentally delete a scanner-db-password belonging to the central CR, // when both are deployed into the same namespace. {Name: "scanner-db-password", Kind: "secret", OwnerName: securedClusterCrName}, - // In case the Cluster Network Operator has succeeded in re-creating the injected-cabundle configmap - // after our operator has already deleted it. - {Name: injectedCABundleConfigMapSecuredCluster, Kind: "configmap"}, } { d.logger.Dimf("Attempting to delete %s/%s", resource.Kind, resource.Name) if resource.OwnerName != "" { @@ -353,14 +199,6 @@ func (d *Deployer) deleteSecuredClusterResources(ctx context.Context, wait bool) } } - if crExists { - // Now delete the SecuredCluster CR synchronously. - err := d.deleteResource(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services") - if err != nil { - return fmt.Errorf("failed to delete SecuredCluster CR: %w", err) - } - } - return nil } @@ -780,7 +618,7 @@ func (d *Deployer) teardownCentral(ctx context.Context) error { } d.logger.Info("⏳ Waiting for Central resources to be fully deleted...") - err := d.deleteCentralResources(ctx, true) + err := d.deleteCentralResources(ctx) if err != nil { return fmt.Errorf("failed to delete Central resources: %w", err) } @@ -805,7 +643,7 @@ func (d *Deployer) teardownSecuredCluster(ctx context.Context) error { } d.logger.Info("⏳ Waiting for SecuredCluster resources to be fully deleted...") - err := d.deleteSecuredClusterResources(ctx, true) + err := d.deleteSecuredClusterResources(ctx) if err != nil { return fmt.Errorf("failed to delete SecuredCluster resources: %w", err) } @@ -1022,6 +860,21 @@ func (d *Deployer) addPauseReconcileAnnotation(ctx context.Context, resourceType return nil } +func (d *Deployer) removePauseReconcileAnnotation(ctx context.Context, resourceType, resourceName, namespace string) error { + _, err := d.runKubectl(ctx, k8s.KubectlOptions{ + Args: []string{ + "annotate", resourceType, resourceName, + "-n", namespace, + fmt.Sprintf("%s-", pauseReconcileAnnotationKey), + }, + }) + if err != nil { + return fmt.Errorf("failed to remove pause-reconcile annotation: %w", err) + } + + return nil +} + func (d *Deployer) SetDeployOperator(deployOperator bool) { d.shouldDeployOperator = deployOperator } From 3c763fe2380dcec397196073ff4a66db338e1143 Mon Sep 17 00:00:00 2001 From: Moritz Clasmeier Date: Sun, 10 May 2026 16:05:16 +0200 Subject: [PATCH 8/8] Don't check ownerRef for central storages --- internal/deployer/deployer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/deployer/deployer.go b/internal/deployer/deployer.go index 3451b5fd..f74ffd62 100644 --- a/internal/deployer/deployer.go +++ b/internal/deployer/deployer.go @@ -122,8 +122,8 @@ func (d *Deployer) deleteCentralResources(ctx context.Context) error { } for _, resource := range []ResourceToDelete{ - {Name: "central-db", Kind: "pvc", OwnerName: centralCrName}, - {Name: "central-db-backup", Kind: "pvc", OwnerName: centralCrName}, + {Name: "central-db", Kind: "pvc"}, + {Name: "central-db-backup", Kind: "pvc"}, {Name: "admin-password", Kind: "secret"}, {Name: "scanner-db-password", Kind: "secret", OwnerName: centralCrName}, } {