diff --git a/.github/workflows/create-dev-cluster.yml b/.github/workflows/create-dev-cluster.yml index b790e721..3274cf18 100644 --- a/.github/workflows/create-dev-cluster.yml +++ b/.github/workflows/create-dev-cluster.yml @@ -6,6 +6,13 @@ on: cluster-name: required: true type: string + flavor: + required: true + type: string + args: + required: false + type: string + default: '' outputs: cluster-name: description: "Name of the created cluster" @@ -19,9 +26,9 @@ jobs: steps: - uses: stackrox/actions/infra/create-cluster@v1 with: - flavor: gke-default + flavor: ${{ inputs.flavor }} name: ${{ inputs.cluster-name }} - args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd + args: ${{ inputs.args }} lifespan: "2h" wait: true token: ${{ secrets.INFRA_CI_TOKEN }} diff --git a/.github/workflows/e2e-tests-kind.yml b/.github/workflows/e2e-tests-kind.yml new file mode 100644 index 00000000..fda6be0a --- /dev/null +++ b/.github/workflows/e2e-tests-kind.yml @@ -0,0 +1,72 @@ +name: E2E Tests (kind) + +on: + workflow_call: + inputs: + image: + required: true + type: string + +env: + REGISTRY: quay.io + IMAGE_NAME: rhacs-eng/roxie + +jobs: + e2e-tests-kind: + runs-on: ubuntu-latest + env: + SKIP_OLM_TESTS: "true" + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha || github.sha }} + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: go.mod + cache: true + + - name: Log in to Quay.io + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.REGISTRY_TOKEN }} + + - name: Extract roxie binary from image + run: | + docker create --name roxie-extract "${{ inputs.image }}" + docker cp roxie-extract:/usr/local/bin/roxie "$GITHUB_WORKSPACE/roxie" + docker rm roxie-extract + + - name: Install roxie binary + run: | + cp "${GITHUB_WORKSPACE}/roxie" /usr/local/bin/roxie + chmod +x 
/usr/local/bin/roxie + roxie version + + - name: Install roxctl + env: + ROXCTL_VERSION: "4.10.0" + ROXCTL_SHA256: "5db647b14569465866c0162522e83393ebf02f671f4556b1b3ed551b9f8433bc" + run: | + curl -fsSLo /usr/local/bin/roxctl \ + "https://mirror.openshift.com/pub/rhacs/assets/${ROXCTL_VERSION}/bin/Linux/roxctl" + echo "${ROXCTL_SHA256} /usr/local/bin/roxctl" | sha256sum -c - + chmod +x /usr/local/bin/roxctl + roxctl version + + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: roxie-e2e + + - name: Run e2e tests + env: + REGISTRY_USERNAME: ${{ secrets.QUAY_RHACS_ENG_RO_USERNAME }} + REGISTRY_PASSWORD: ${{ secrets.QUAY_RHACS_ENG_RO_PASSWORD }} + run: | + make run-test-e2e diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 322defb6..bdfe2880 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -9,6 +9,14 @@ on: image: required: true type: string + cluster-type: + required: false + type: string + default: 'gke' + skip-olm-tests: + required: false + type: string + default: 'true' env: REGISTRY: quay.io IMAGE_NAME: rhacs-eng/roxie @@ -23,7 +31,6 @@ jobs: KUBECONFIG: /github/home/artifacts/kubeconfig INFRA_TOKEN: ${{ secrets.INFRA_CI_TOKEN }} INFRACTL: bin/infractl -k -e localhost:8443 - USE_GKE_GCLOUD_AUTH_PLUGIN: "True" steps: - name: Checkout uses: actions/checkout@v6 @@ -65,15 +72,21 @@ jobs: roxctl version - name: Authenticate to GCloud + if: inputs.cluster-type == 'gke' uses: google-github-actions/auth@v3 with: credentials_json: ${{ secrets.ROXIE_CI_AUTOMATION_GCP_SA }} - name: Set up Cloud SDK + if: inputs.cluster-type == 'gke' uses: "google-github-actions/setup-gcloud@v3" with: install_components: "gke-gcloud-auth-plugin" + - name: Configure GKE auth plugin + if: inputs.cluster-type == 'gke' + run: echo "USE_GKE_GCLOUD_AUTH_PLUGIN=True" >> "$GITHUB_ENV" + - name: Download production infractl uses: stackrox/actions/infra/install-infractl@v1 @@ -89,7 +102,7 @@ jobs: 
env: REGISTRY_USERNAME: ${{ secrets.QUAY_RHACS_ENG_RO_USERNAME }} REGISTRY_PASSWORD: ${{ secrets.QUAY_RHACS_ENG_RO_PASSWORD }} - SKIP_OLM_TESTS: "true" + SKIP_OLM_TESTS: ${{ inputs.skip-olm-tests == 'true' && 'true' || '' }} run: | make run-test-e2e diff --git a/.github/workflows/main-push.yml b/.github/workflows/main-push.yml index 064187c0..9beecbab 100644 --- a/.github/workflows/main-push.yml +++ b/.github/workflows/main-push.yml @@ -15,7 +15,9 @@ jobs: create-dev-cluster: uses: ./.github/workflows/create-dev-cluster.yml with: - cluster-name: infra-roxie-main-${{ github.run_number }} + cluster-name: infra-roxie-main-${{ github.run_number }}-gke + flavor: gke-default + args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd secrets: inherit build-roxie-image: @@ -33,6 +35,13 @@ jobs: image: ${{ needs.build-roxie-image.outputs.image }} secrets: inherit + e2e-tests-kind: + needs: [ build-roxie-image ] + uses: ./.github/workflows/e2e-tests-kind.yml + with: + image: ${{ needs.build-roxie-image.outputs.image }} + secrets: inherit + delete-dev-cluster: if: ${{ always() && needs.create-dev-cluster.result == 'success' }} needs: [ create-dev-cluster, e2e-tests ] diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 4e27e6a4..65567c61 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -12,10 +12,38 @@ jobs: unit-tests: uses: ./.github/workflows/unit-tests.yml - create-dev-cluster: + check-olm-label: + runs-on: ubuntu-latest + outputs: + has-label: ${{ steps.check.outputs.has-label }} + steps: + - name: Check for olm-tests label + id: check + run: | + has_label="${{ contains(github.event.pull_request.labels.*.name, 'olm-tests') }}" + echo "has-label=${has_label}" >> "$GITHUB_OUTPUT" + if [ "$has_label" = "true" ]; then + echo "::notice::olm-tests label is set — OpenShift cluster will be created" + else + echo "::notice::olm-tests label is not set — skipping OpenShift cluster" + fi + + create-gke-cluster: + uses: 
./.github/workflows/create-dev-cluster.yml + with: + cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }}-gke + flavor: gke-default + args: machine-type=e2-standard-4,nodes=3,gcp-image-type=ubuntu_containerd + secrets: inherit + + create-openshift-cluster: + needs: check-olm-label + if: needs.check-olm-label.outputs.has-label == 'true' uses: ./.github/workflows/create-dev-cluster.yml with: - cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }} + cluster-name: infra-roxie-pr-${{ github.event.pull_request.number }}-openshift + flavor: ocp-4 + args: master-node-type=e2-standard-4,worker-node-type=e2-standard-8,master-node-count=3,worker-node-count=3 secrets: inherit build-roxie-image: @@ -26,17 +54,42 @@ jobs: secrets: inherit e2e-tests: - needs: [ create-dev-cluster, build-roxie-image ] + needs: [ create-gke-cluster, build-roxie-image ] + uses: ./.github/workflows/e2e-tests.yml + with: + cluster-name: ${{ needs.create-gke-cluster.outputs.cluster-name }} + image: ${{ needs.build-roxie-image.outputs.image }} + secrets: inherit + + e2e-tests-kind: + needs: [ build-roxie-image ] + uses: ./.github/workflows/e2e-tests-kind.yml + with: + image: ${{ needs.build-roxie-image.outputs.image }} + secrets: inherit + + e2e-tests-openshift: + needs: [ create-openshift-cluster, build-roxie-image ] uses: ./.github/workflows/e2e-tests.yml with: - cluster-name: ${{ needs.create-dev-cluster.outputs.cluster-name }} + cluster-name: ${{ needs.create-openshift-cluster.outputs.cluster-name }} image: ${{ needs.build-roxie-image.outputs.image }} + cluster-type: openshift + skip-olm-tests: 'false' + secrets: inherit + + delete-gke-cluster: + if: ${{ always() && needs.create-gke-cluster.result == 'success' }} + needs: [ create-gke-cluster, e2e-tests ] + uses: ./.github/workflows/delete-dev-cluster.yml + with: + cluster-name: ${{ needs.create-gke-cluster.outputs.cluster-name }} secrets: inherit - delete-dev-cluster: - if: ${{ always() && 
needs.create-dev-cluster.result == 'success' }} - needs: [ create-dev-cluster, e2e-tests ] + delete-openshift-cluster: + if: ${{ always() && needs.create-openshift-cluster.result == 'success' }} + needs: [ create-openshift-cluster, e2e-tests-openshift ] uses: ./.github/workflows/delete-dev-cluster.yml with: - cluster-name: ${{ needs.create-dev-cluster.outputs.cluster-name }} + cluster-name: ${{ needs.create-openshift-cluster.outputs.cluster-name }} secrets: inherit diff --git a/cmd/deploy.go b/cmd/deploy.go index a5a6131a..a324a5cc 100644 --- a/cmd/deploy.go +++ b/cmd/deploy.go @@ -76,14 +76,6 @@ func runDeploy(cmd *cobra.Command, args []string) error { return errors.New("running without a controlling terminal requires --envrc to be set") } - if envrc != "" && portForwarding { - return errors.New("cannot use --envrc with --port-forwarding. The --envrc flag is for non-interactive mode with remote cluster access") - } - - if envrc != "" && exposure == "none" { - return errors.New("cannot use --envrc with --exposure=none. 
The --envrc flag requires a remotely accessible endpoint (e.g., --exposure=loadbalancer)") - } - portForwardEnabledFinal := portForwarding || exposure == "none" if env.RunningInRoxieContainer { diff --git a/internal/deployer/deploy_via_operator.go b/internal/deployer/deploy_via_operator.go index 2bc1ca73..6365a870 100644 --- a/internal/deployer/deploy_via_operator.go +++ b/internal/deployer/deploy_via_operator.go @@ -607,11 +607,20 @@ func (d *Deployer) configureCentralEndpoint(ctx context.Context, exposure string } } - endpoint, err := d.portForward.Start(d.centralNamespace, serviceName, 443, 8443) - if err != nil { - return fmt.Errorf("failed to start port-forward: %w", err) + if d.envrcFile != "" { + endpoint, pid, err := d.portForward.StartDetached(d.centralNamespace, serviceName, 443, 8443) + if err != nil { + return fmt.Errorf("failed to start detached port-forward: %w", err) + } + d.centralEndpoint = endpoint + d.portForwardPID = pid + } else { + endpoint, err := d.portForward.Start(d.centralNamespace, serviceName, 443, 8443) + if err != nil { + return fmt.Errorf("failed to start port-forward: %w", err) + } + d.centralEndpoint = endpoint } - d.centralEndpoint = endpoint } else if exposure == "loadbalancer" { endpoint, err := d.waitForLoadBalancer(ctx, d.centralNamespace, "central-loadbalancer", 300) if err != nil { diff --git a/internal/deployer/deployer.go b/internal/deployer/deployer.go index dc69a149..a30372a0 100644 --- a/internal/deployer/deployer.go +++ b/internal/deployer/deployer.go @@ -7,8 +7,10 @@ import ( "fmt" "os" "os/exec" + "strconv" "strings" "sync" + "syscall" "time" "github.com/fatih/color" @@ -38,70 +40,6 @@ var ( // AdminUsername is the default admin username for StackRox Central AdminUsername = "admin" - - // TODO(#91): at some point this will get out of date. If we filter by the app.../part-of - // label anyway, then maybe we should just delete all resource kinds present on cluster? 
- // also we should use the fully-qualified types - allInstallableCentralResourceKinds = []string{ - "applications", - "clusterroles", - "configmaps", - "deployments", - "destinationrules", - "endpoints", - "endpointslices", - "horizontalpodautoscalers", - "networkpolicys", - "leases", - "persistentvolumes", - "persistentvolumeclaims", - "pods", - "podsecuritypolicys", - "prometheusrules", - "roles", - "rolebindings", - "replicasets", - "routes", - "secrets", - "services", - "serviceaccounts", - "servicemonitors", - "storageclasses", - } - - allInstallableSecuredClusterResourceKinds = []string{ - "clusterroles", - "clusterrolebindings", - "configmaps", - "consoleplugins", - "controllerrevisions", - "daemonsets", - "deployments", - "endpoints", - "endpointslices", - "destinationrules", - "horizontalpodautoscalers", - "networkpolicys", - "leases", - "persistentvolumes", - "persistentvolumeclaims", - "pods", - "podsecuritypolicys", - "prometheusrules", - "replicasets", - "roles", - "rolebindings", - "secrets", - "services", - "serviceaccounts", - "servicemonitors", - "storageclasses", - "validatingwebhookconfigurations", - } - - injectedCABundleConfigMapPrefix = "injected-cabundle-" - injectedCABundleConfigMapCentral = injectedCABundleConfigMapPrefix + centralCrName - injectedCABundleConfigMapSecuredCluster = injectedCABundleConfigMapPrefix + securedClusterCrName ) // Deployer is the base deployer for ACS @@ -128,6 +66,7 @@ type Deployer struct { securedClusterOverrides map[string]interface{} featureFlagOverrides map[string]interface{} envrcFile string + portForwardPID int useOLM bool useKonflux bool shouldDeployOperator bool @@ -146,16 +85,6 @@ type ResourceToDelete struct { OwnerName string } -func (d *Deployer) filterResourceKinds(resourceKinds []string) []string { - filteredResourceKinds := make([]string, 0, len(resourceKinds)) - for _, resourceKind := range resourceKinds { - if _, ok := d.clusterResourceKinds[resourceKind]; ok { - filteredResourceKinds = 
append(filteredResourceKinds, resourceKind) - } - } - return filteredResourceKinds -} - func (d *Deployer) deleteResource(ctx context.Context, namespace, resourceType, resourceName string, args ...string) error { return d.deleteResources(ctx, namespace, []string{resourceType}, append([]string{resourceName}, args...)...) } @@ -175,59 +104,46 @@ func (d *Deployer) deleteResources(ctx context.Context, namespace string, resour return err } -func (d *Deployer) deleteFinalizers(ctx context.Context, namespace, resourceType, resourceName string) error { - _, err := d.runKubectl(ctx, k8s.KubectlOptions{ - Args: []string{ - "-n", namespace, "patch", resourceType, resourceName, - "-p", `{"metadata":{"finalizers":null}}`, - "--type=merge", - }, - }) - return err -} - // Expects that reconciliation for the RHACS operator is paused. -func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error { +func (d *Deployer) deleteCentralResources(ctx context.Context) error { d.logger.Info("Deleting Central resources") - var crExists bool + crExists := true - if d.doesResourceExist(ctx, "central", "stackrox-central-services", d.centralNamespace) { - crExists = true + if _, err := k8s.RetrieveResourceFromCluster(ctx, d.logger, d.centralNamespace, "central", "stackrox-central-services"); err != nil { + if !k8s.IsResourceNotFound(err) { + return fmt.Errorf("retrieving Central CR: %w", err) + } + crExists = false + } - // Trigger async deletion of the Central CR. 
- err := d.deleteResource(ctx, d.centralNamespace, "central", "stackrox-central-services", "--wait=false") - if err != nil { - return fmt.Errorf("failed to asynchronously delete Central CR: %w", err) + if crExists { + d.logger.Info("Removing any pause-reconcile annotation from Central") + if err := d.removePauseReconcileAnnotation(ctx, "central", "stackrox-central-services", d.centralNamespace); err != nil { + return err + } + if d.verbose { + d.logger.Dim("Removed any pause-reconcile annotation from Central") } - err = d.deleteFinalizers(ctx, d.centralNamespace, "central", "stackrox-central-services") + err := d.deleteResource(ctx, d.centralNamespace, "central", "stackrox-central-services", "--wait") if err != nil { - return fmt.Errorf("failed to delete finalizers on Central CR: %w", err) + return err + } + if d.verbose { + d.logger.Dim("Deleted Central CR") } + } else { + d.logger.Info("Deletion of Central resources requested, but Central CR is not present anymore") } - - // Pause reconciliation for other controllers, not just our RHACS operator. - // This is needed to ensure that there is no race causing the Cluster Network Operator - // to re-create the injected-ca-bundle ConfigMap during resource deletion. - if err := d.preventOtherControllersFromReconciling(ctx, component.Central); err != nil { - return fmt.Errorf("failed to prevent other controllers from reconciling Central resources: %w", err) - } - - // Delete other resources by brute force. 
- resourceKinds := d.filterResourceKinds(allInstallableCentralResourceKinds) - err := d.deleteResources(ctx, d.centralNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-central-services") - if err != nil { - return err + if d.verbose { + d.logger.Dim("Deleted Central CR") } for _, resource := range []ResourceToDelete{ - {Name: "central-db", Kind: "pvc", OwnerName: centralCrName}, - {Name: "central-db-backup", Kind: "pvc", OwnerName: centralCrName}, + {Name: "central-db", Kind: "pvc"}, + {Name: "central-db-backup", Kind: "pvc"}, {Name: "admin-password", Kind: "secret"}, {Name: "scanner-db-password", Kind: "secret", OwnerName: centralCrName}, - // In case the Cluster Network Operator has succeeded in re-creating the injected-cabundle configmap - // after our operator has already deleted it. - {Name: injectedCABundleConfigMapCentral, Kind: "configmap"}, } { d.logger.Dimf("Attempting to delete %s/%s", resource.Kind, resource.Name) if resource.OwnerName != "" { @@ -251,86 +167,46 @@ func (d *Deployer) deleteCentralResources(ctx context.Context, wait bool) error } } - if crExists { - // Now delete the Central CR synchronously. 
- err := d.deleteResource(ctx, d.centralNamespace, "central", "stackrox-central-services") - if err != nil { - return fmt.Errorf("failed to delete Central CR: %w", err) - } - } - return nil } -func (d *Deployer) preventOtherControllersFromReconciling(ctx context.Context, comp component.Component) error { - switch comp { - case component.Central: - return d.preventCABundleInjection(ctx, injectedCABundleConfigMapCentral, d.centralNamespace) - case component.SecuredCluster: - return d.preventCABundleInjection(ctx, injectedCABundleConfigMapSecuredCluster, d.sensorNamespace) - default: - return nil - } -} - -func (d *Deployer) preventCABundleInjection(ctx context.Context, configMapName, namespace string) error { - d.logger.Info("Removing CNO label from injected-cabundle ConfigMap to prevent CNO from injecting the CA bundle during cleanup") - _, err := d.runKubectl(ctx, k8s.KubectlOptions{ - Args: []string{ - "label", "configmap", configMapName, "-n", namespace, - "config.openshift.io/inject-trusted-cabundle-", - }, - IgnoreErrors: true, - }) - - if err != nil { - d.logger.Warningf("Failed to remove CNO label from %s: %v", configMapName, err) - } - - return nil -} - -func (d *Deployer) deleteSecuredClusterResources(ctx context.Context, wait bool) error { +func (d *Deployer) deleteSecuredClusterResources(ctx context.Context) error { d.logger.Info("Deleting SecuredCluster resources") - var crExists bool + crExists := true - if d.doesResourceExist(ctx, "securedcluster", "stackrox-secured-cluster-services", d.sensorNamespace) { - crExists = true + if _, err := k8s.RetrieveResourceFromCluster(ctx, d.logger, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services"); err != nil { + if !k8s.IsResourceNotFound(err) { + return fmt.Errorf("retrieving SecuredCluster CR: %w", err) + } + crExists = false + } - // Trigger async deletion of the SecuredCluster CR. 
- err := d.deleteResource(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services", "--wait=false") - if err != nil { + if crExists { + d.logger.Info("Removing any pause-reconcile annotation from SecuredCluster") + if err := d.removePauseReconcileAnnotation(ctx, "securedcluster", "stackrox-secured-cluster-services", d.sensorNamespace); err != nil { return err } + if d.verbose { + d.logger.Dim("Removed any pause-reconcile annotation from SecuredCluster") + } - err = d.deleteFinalizers(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services") + err := d.deleteResource(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services", "--wait") if err != nil { - return fmt.Errorf("failed to delete finalizers on SecuredCluster CR: %w", err) + return err } + if d.verbose { + d.logger.Dim("Deleted SecuredCluster CR") + } + } else { + d.logger.Info("Deletion of SecuredCluster resources requested, but SecuredCluster CR is not present anymore") } - // Pause reconciliation for other controllers, not just our RHACS operator. - // This is needed to ensure that there is no race causing the Cluster Network Operator - // to re-create the injected-ca-bundle ConfigMap during resource deletion. - if err := d.preventOtherControllersFromReconciling(ctx, component.SecuredCluster); err != nil { - return fmt.Errorf("failed to prevent other controllers from reconciling SecuredCluster resources: %w", err) - } - - // In the meantime, delete other resources by brute force. - resourceKinds := d.filterResourceKinds(allInstallableSecuredClusterResourceKinds) - err := d.deleteResources(ctx, d.sensorNamespace, resourceKinds, "-l=app.kubernetes.io/part-of=stackrox-secured-cluster-services") - if err != nil { - return err - } - + // Delete resources, which are treated special. 
for _, resource := range []ResourceToDelete{ {Name: "cluster-registration-secret", Kind: "secret"}, // We need to make sure that don't accidentally delete a scanner-db-password belonging to the central CR, // when both are deployed into the same namespace. {Name: "scanner-db-password", Kind: "secret", OwnerName: securedClusterCrName}, - // In case the Cluster Network Operator has succeeded in re-creating the injected-cabundle configmap - // after our operator has already deleted it. - {Name: injectedCABundleConfigMapSecuredCluster, Kind: "configmap"}, } { d.logger.Dimf("Attempting to delete %s/%s", resource.Kind, resource.Name) if resource.OwnerName != "" { @@ -353,14 +229,6 @@ func (d *Deployer) deleteSecuredClusterResources(ctx context.Context, wait bool) } } - if crExists { - // Now delete the SecuredCluster CR synchronously. - err := d.deleteResource(ctx, d.sensorNamespace, "securedcluster", "stackrox-secured-cluster-services") - if err != nil { - return fmt.Errorf("failed to delete SecuredCluster CR: %w", err) - } - } - return nil } @@ -569,6 +437,12 @@ func New(log *logger.Logger) (*Deployer, error) { d.roxCACertFile = caCert } + if pidStr := os.Getenv("ROXIE_PORT_FORWARD_PID"); pidStr != "" { + if pid, err := strconv.Atoi(pidStr); err == nil { + d.portForwardPID = pid + } + } + d.kubeContext = env.GetCurrentContext() clusterResourceKinds, err := d.getClusterResourceKinds() @@ -615,6 +489,22 @@ func (d *Deployer) Cleanup() { } } +func (d *Deployer) stopDetachedPortForward() { + if d.portForwardPID == 0 { + return + } + proc, err := os.FindProcess(d.portForwardPID) + if err != nil { + return + } + if err := proc.Signal(syscall.SIGKILL); err != nil { + d.logger.Dimf("Detached port-forward (pid %d) already gone", d.portForwardPID) + return + } + d.logger.Dimf("Stopped detached port-forward (pid %d)", d.portForwardPID) + d.portForwardPID = 0 +} + // Deploy deploys the specified components to the cluster. 
func (d *Deployer) Deploy(ctx context.Context, components component.Component, resources, exposure string) error { adjustedResources, adjustedExposure, adjustedPortForward := d.clusterDefaults.ApplyConvenienceDefaults( @@ -694,7 +584,6 @@ func (d *Deployer) deployCentral(ctx context.Context, resources, exposure string return err } - // envrc may be used from different processes, so use actual endpoint not port-forward if d.envrcFile != "" { d.logger.Dimf("Writing environment variables to %s", d.envrcFile) if err := d.writeEnvrcFile(ctx, exposure, portForwardWanted); err != nil { @@ -771,6 +660,7 @@ func (d *Deployer) teardownCentral(ctx context.Context) error { } d.portForward.Stop() + d.stopDetachedPortForward() // Add pause-reconcile annotation to not have the operator interfere during resource deletion. if d.doesResourceExist(ctx, "central", "stackrox-central-services", d.centralNamespace) { @@ -780,7 +670,7 @@ func (d *Deployer) teardownCentral(ctx context.Context) error { } d.logger.Info("⏳ Waiting for Central resources to be fully deleted...") - err := d.deleteCentralResources(ctx, true) + err := d.deleteCentralResources(ctx) if err != nil { return fmt.Errorf("failed to delete Central resources: %w", err) } @@ -805,7 +695,7 @@ func (d *Deployer) teardownSecuredCluster(ctx context.Context) error { } d.logger.Info("⏳ Waiting for SecuredCluster resources to be fully deleted...") - err := d.deleteSecuredClusterResources(ctx, true) + err := d.deleteSecuredClusterResources(ctx) if err != nil { return fmt.Errorf("failed to delete SecuredCluster resources: %w", err) } @@ -1022,6 +912,22 @@ func (d *Deployer) addPauseReconcileAnnotation(ctx context.Context, resourceType return nil } +func (d *Deployer) removePauseReconcileAnnotation(ctx context.Context, resourceType, resourceName, namespace string) error { + _, err := d.runKubectl(ctx, k8s.KubectlOptions{ + Args: []string{ + "annotate", resourceType, resourceName, + "-n", namespace, + fmt.Sprintf("%s-", 
pauseReconcileAnnotationKey), + }, + IgnoreErrors: true, + }) + if err != nil { + return fmt.Errorf("failed to remove pause-reconcile annotation: %w", err) + } + + return nil +} + func (d *Deployer) SetDeployOperator(deployOperator bool) { d.shouldDeployOperator = deployOperator } @@ -1117,6 +1023,9 @@ func (d *Deployer) writeEnvrcFile(ctx context.Context, exposure string, portForw fmt.Fprintf(&content, "export ROX_USERNAME=%q\n", AdminUsername) fmt.Fprintf(&content, "export ROX_ADMIN_PASSWORD=%q\n", d.centralPassword) fmt.Fprintf(&content, "export ROX_CA_CERT_FILE=%q\n", d.roxCACertFile) + if d.portForwardPID != 0 { + fmt.Fprintf(&content, "export ROXIE_PORT_FORWARD_PID=%d\n", d.portForwardPID) + } if err := os.WriteFile(d.envrcFile, []byte(content.String()), 0600); err != nil { return fmt.Errorf("failed to write envrc file: %w", err) diff --git a/internal/portforward/portforward.go b/internal/portforward/portforward.go index 4dcbd0cf..b62f69cb 100644 --- a/internal/portforward/portforward.go +++ b/internal/portforward/portforward.go @@ -120,6 +120,51 @@ func (m *Manager) Start(namespace, serviceName string, remotePort, preferredLoca return endpoint, nil } +// StartDetached starts port-forward as a detached process that survives the +// parent process exiting. Returns the endpoint and the PID of the subprocess. +// The caller is responsible for killing the process when done. 
+func (m *Manager) StartDetached(namespace, serviceName string, remotePort, preferredLocalPort int) (string, int, error) { + localPort, err := m.findFreeLocalPort(preferredLocalPort) + if err != nil { + return "", 0, fmt.Errorf("failed to find free port: %w", err) + } + + cmd := exec.Command( + m.kubectl, + "-n", namespace, + "port-forward", + fmt.Sprintf("svc/%s", serviceName), + fmt.Sprintf("%d:%d", localPort, remotePort), + "--address", "127.0.0.1", + ) + + cmd.SysProcAttr = &syscall.SysProcAttr{ + Setsid: true, + } + + cmd.Stdout = nil + cmd.Stderr = nil + + if err := cmd.Start(); err != nil { + return "", 0, fmt.Errorf("failed to start port-forward: %w", err) + } + + pid := cmd.Process.Pid + + // Release the process so it won't be waited on by this process. + cmd.Process.Release() + + if !m.waitTCPReady("127.0.0.1", localPort, 20.0) { + syscall.Kill(pid, syscall.SIGTERM) + return "", 0, fmt.Errorf("port-forward did not become ready") + } + + endpoint := fmt.Sprintf("127.0.0.1:%d", localPort) + m.logger.Successf("✓ Detached port-forward active at https://%s (pid %d)", endpoint, pid) + + return endpoint, pid, nil +} + // Stop stops the active port-forward if running func (m *Manager) Stop() { if m.proc == nil || m.proc.Process == nil { diff --git a/tests/e2e/basic_test.go b/tests/e2e/basic_test.go index 048ad36d..e4f58ef6 100644 --- a/tests/e2e/basic_test.go +++ b/tests/e2e/basic_test.go @@ -11,6 +11,8 @@ import ( // TestDeployBothSimple tests deploying both components together (simplest scenario) func TestDeployBothSimple(t *testing.T) { + dumpClusterStateOnFailure(t) + // Create temporary envrc file envrcFile, err := os.CreateTemp(t.TempDir(), ".envrc.roxie-test-*") if err != nil { @@ -20,7 +22,7 @@ func TestDeployBothSimple(t *testing.T) { envrcFile.Close() t.Log("=== Deploying both components together ===") - args := append([]string{roxieBinary, "deploy", "--early-readiness", "both", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) 
+ args := append([]string{roxieBinary, "deploy", "--early-readiness", "both", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout*2, nil, args...) // Verify namespaces exist and have managed-by labels diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 5f85da97..cb42af42 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -39,6 +39,8 @@ func TestMain(m *testing.M) { } func TestDeployBothComponentsTogetherInSingleNamespace(t *testing.T) { + dumpClusterStateOnFailure(t) + // Create temporary envrc file. envrcFile, err := os.CreateTemp(t.TempDir(), ".envrc.roxie-test-*") if err != nil { @@ -48,7 +50,7 @@ func TestDeployBothComponentsTogetherInSingleNamespace(t *testing.T) { envrcFile.Close() t.Log("=== Deploying both components in single namespace ===") - args := append([]string{roxieBinary, "deploy", "both", "--single-namespace", "--early-readiness", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args := append([]string{roxieBinary, "deploy", "both", "--single-namespace", "--early-readiness", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout*2, nil, args...) 
verifyCentralInstalled(t, "stackrox")
diff --git a/tests/e2e/helpers.go b/tests/e2e/helpers.go
index a97c1653..27afb22a 100644
--- a/tests/e2e/helpers.go
+++ b/tests/e2e/helpers.go
@@ -26,8 +26,7 @@ const (
 )
 
 var (
-	commonDeployArgs = []string{"--port-forwarding", "--exposure=none", "--resources=small"}
-	commonDeployArgsNoPortForward = []string{"--exposure=loadbalancer", "--resources=small"}
+	commonDeployArgs = []string{"--resources=small"}
 	roxieBinary = "roxie"
 )
 
@@ -239,6 +238,176 @@ func verifySecuredClusterNotInstalled(t *testing.T, namespace string) {
 	}
 }
 
+// clusterDumpNamespaces lists the namespaces inspected by the failure dump.
+var clusterDumpNamespaces = []string{
+	"rhacs-operator-system",
+	"acs-central",
+	"acs-sensor",
+	"stackrox",
+}
+
+// kubectlDumpRequestTimeout bounds every kubectl request issued by the dump
+// helpers so that an unreachable API server cannot stall test cleanup.
+const kubectlDumpRequestTimeout = "30s"
+
+// dumpClusterStateOnFailure registers a cleanup hook that writes a cluster
+// resource dump to stderr when the test has failed by the time cleanup runs.
+// Passing and skipped tests produce no output.
+func dumpClusterStateOnFailure(t *testing.T) {
+	t.Helper()
+	t.Cleanup(func() {
+		if !t.Failed() {
+			return
+		}
+		dumpClusterResources(t)
+	})
+}
+
+// dumpClusterResources writes a best-effort snapshot of the cluster to stderr:
+// all namespaces, then workloads, events and unhealthy-pod logs for each entry
+// of clusterDumpNamespaces, then the ACS custom resources and OLM resources.
+func dumpClusterResources(t *testing.T) {
+	t.Helper()
+	fmt.Fprintf(os.Stderr, "=== CLUSTER RESOURCE DUMP (test %s failed) ===\n", t.Name())
+
+	runKubectlDump("get", "namespaces")
+
+	for _, ns := range clusterDumpNamespaces {
+		fmt.Fprintf(os.Stderr, "--- Namespace: %s ---\n", ns)
+		runKubectlDump("get", "pods", "-n", ns, "-o", "wide")
+		runKubectlDump("describe", "pods", "-n", ns)
+		runKubectlDump("get", "deployments", "-n", ns, "-o", "wide")
+		runKubectlDump("describe", "deployments", "-n", ns)
+		runKubectlDump("get", "daemonsets", "-n", ns, "-o", "wide")
+		runKubectlDump("describe", "daemonsets", "-n", ns)
+		runKubectlDump("get", "events", "-n", ns, "--sort-by=.lastTimestamp")
+		dumpLogsForFailingPods(ns)
+	}
+
+	dumpACSCustomResources()
+	dumpOLMResources()
+
+	fmt.Fprintln(os.Stderr, "=== END CLUSTER RESOURCE DUMP ===")
+}
+
+// dumpACSCustomResources prints the Central and SecuredCluster custom resources
+// of every namespace in clusterDumpNamespaces as YAML.
+func dumpACSCustomResources() {
+	fmt.Fprintln(os.Stderr, "--- ACS Custom Resources ---")
+	for _, ns := range clusterDumpNamespaces {
+		runKubectlDump("get", "centrals.platform.stackrox.io", "-n", ns, "-o", "yaml")
+		runKubectlDump("get", "securedclusters.platform.stackrox.io", "-n", ns, "-o", "yaml")
+	}
+}
+
+// dumpOLMResources prints the OLM objects found in the operator namespace. It is
+// skipped when the operators.coreos.com API group lookup fails or returns nothing.
+func dumpOLMResources() {
+	cmd := kubectlDumpCommand("api-resources", "--api-group=operators.coreos.com", "-o", "name")
+	output, err := cmd.Output()
+	if err != nil || strings.TrimSpace(string(output)) == "" {
+		fmt.Fprintln(os.Stderr, "[dump] OLM not installed, skipping OLM resource dump")
+		return
+	}
+
+	fmt.Fprintln(os.Stderr, "--- OLM Resources ---")
+	operatorNamespace := "rhacs-operator-system"
+	runKubectlDump("get", "subscriptions.operators.coreos.com", "-n", operatorNamespace, "-o", "wide")
+	runKubectlDump("describe", "subscriptions.operators.coreos.com", "-n", operatorNamespace)
+	runKubectlDump("get", "installplans.operators.coreos.com", "-n", operatorNamespace, "-o", "wide")
+	runKubectlDump("describe", "installplans.operators.coreos.com", "-n", operatorNamespace)
+	runKubectlDump("get", "catalogsources.operators.coreos.com", "-n", operatorNamespace, "-o", "wide")
+	runKubectlDump("describe", "catalogsources.operators.coreos.com", "-n", operatorNamespace)
+	runKubectlDump("get", "clusterserviceversions.operators.coreos.com", "-n", operatorNamespace, "-o", "wide")
+	runKubectlDump("describe", "clusterserviceversions.operators.coreos.com", "-n", operatorNamespace)
+	runKubectlDump("get", "operatorgroups.operators.coreos.com", "-n", operatorNamespace, "-o", "wide")
+	runKubectlDump("describe", "operatorgroups.operators.coreos.com", "-n", operatorNamespace)
+}
+
+// kubectlDumpCommand builds a kubectl invocation for the dump helpers with
+// --request-timeout prepended, so a hung API server cannot block test cleanup.
+func kubectlDumpCommand(args ...string) *exec.Cmd {
+	fullArgs := append([]string{"--request-timeout=" + kubectlDumpRequestTimeout}, args...)
+	return exec.Command("kubectl", fullArgs...)
+}
+
+// runKubectlDump runs kubectl with args, sending its stdout and stderr to this
+// process's stderr. A failing command is reported and otherwise ignored.
+func runKubectlDump(args ...string) {
+	fmt.Fprintf(os.Stderr, "## kubectl %s\n", strings.Join(args, " "))
+	cmd := kubectlDumpCommand(args...)
+	cmd.Stdout = os.Stderr
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		fmt.Fprintf(os.Stderr, "kubectl failed: %v\n", err)
+	}
+	fmt.Fprintln(os.Stderr)
+}
+
+// podNeedsLogDump reports whether a pod's logs should be included in the dump.
+// Succeeded pods never qualify and pods in any phase other than Running always
+// do. A Running pod qualifies when a container is not ready or has restarted:
+// crash-looping pods stay in phase Running, so the phase alone would miss them.
+// readyFlags and restartCounts are the space-separated per-container values.
+func podNeedsLogDump(phase, readyFlags, restartCounts string) bool {
+	switch phase {
+	case "Succeeded":
+		return false
+	case "Running":
+		// Decided by the container-level checks below.
+	default:
+		return true
+	}
+	if strings.Contains(readyFlags, "false") {
+		return true
+	}
+	for _, count := range strings.Fields(restartCounts) {
+		if count != "0" {
+			return true
+		}
+	}
+	return false
+}
+
+// dumpLogsForFailingPods prints the current and previous container logs of
+// every pod in namespace that podNeedsLogDump considers unhealthy.
+func dumpLogsForFailingPods(namespace string) {
+	// One line per pod: name, phase, container ready flags, container restart
+	// counts; the last two are expected to be empty without container statuses.
+	cmd := kubectlDumpCommand("get", "pods", "-n", namespace,
+		"-o", "jsonpath={range .items[*]}{.metadata.name}{\"\\t\"}{.status.phase}{\"\\t\"}"+
+			"{.status.containerStatuses[*].ready}{\"\\t\"}{.status.containerStatuses[*].restartCount}{\"\\n\"}{end}")
+	output, err := cmd.Output()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "[dump] failed to list pods in %s: %v\n", namespace, err)
+		return
+	}
+
+	for line := range strings.SplitSeq(strings.TrimSpace(string(output)), "\n") {
+		if line == "" {
+			continue
+		}
+		fields := strings.Split(line, "\t")
+		if len(fields) < 2 {
+			continue
+		}
+		podName, phase := fields[0], fields[1]
+		var readyFlags, restartCounts string
+		if len(fields) > 2 {
+			readyFlags = fields[2]
+		}
+		if len(fields) > 3 {
+			restartCounts = fields[3]
+		}
+		if !podNeedsLogDump(phase, readyFlags, restartCounts) {
+			continue
+		}
+		fmt.Fprintf(os.Stderr, "[dump] logs for pod %s/%s (phase=%s):\n", namespace, podName, phase)
+		runKubectlDump("logs", "-n", namespace, podName, "--all-containers", "--tail=100")
+		runKubectlDump("logs", "-n", namespace, podName, "--all-containers", "--previous", "--tail=50")
+	}
+}
+
 func verifyAnnotation(t *testing.T, resourceType, resourceName, namespace, annotationKey, expectedValue string) {
 	t.Helper()
 
diff --git a/tests/e2e/olm_switch_test.go b/tests/e2e/olm_switch_test.go
index 09b45079..0c6fa480 100644
--- a/tests/e2e/olm_switch_test.go
+++ b/tests/e2e/olm_switch_test.go
@@ -65,6 +65,8 @@ func verifyOperatorDeploymentExists(t *testing.T) {
 
 // TestOLMToNonOLMSwitch tests switching from OLM operator to non-OLM operator
 func TestOLMToNonOLMSwitch(t *testing.T) {
+	dumpClusterStateOnFailure(t)
+
 	if os.Getenv("SKIP_OLM_TESTS") != "" {
 		t.Skip("SKIP_OLM_TESTS is set")
 	}
@@ -79,7 +81,7 @@ func TestOLMToNonOLMSwitch(t *testing.T) {
 
 	// Step 1: Deploy central with OLM operator
 	t.Log("=== Step 1: Deploy central with OLM operator ===")
-	args := append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgsNoPortForward...)
+ args := append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) // Verify operator is in OLM mode @@ -92,7 +94,7 @@ func TestOLMToNonOLMSwitch(t *testing.T) { // Step 2: Deploy central again without OLM (should switch modes) t.Log("=== Step 2: Redeploy central without OLM (triggering mode switch) ===") - args = append([]string{roxieBinary, "deploy", "central", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args = append([]string{roxieBinary, "deploy", "central", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) // Verify operator switched to non-OLM mode @@ -113,6 +115,8 @@ func TestOLMToNonOLMSwitch(t *testing.T) { // TestNonOLMToOLMSwitch tests switching from non-OLM operator to OLM operator func TestNonOLMToOLMSwitch(t *testing.T) { + dumpClusterStateOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } @@ -127,7 +131,7 @@ func TestNonOLMToOLMSwitch(t *testing.T) { // Step 1: Deploy central without OLM (non-OLM operator) t.Log("=== Step 1: Deploy central with non-OLM operator ===") - args := append([]string{roxieBinary, "deploy", "central", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args := append([]string{roxieBinary, "deploy", "central", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) // Verify operator is in non-OLM mode @@ -140,7 +144,7 @@ func TestNonOLMToOLMSwitch(t *testing.T) { // Step 2: Deploy central again with OLM (should switch modes) t.Log("=== Step 2: Redeploy central with OLM (triggering mode switch) ===") - args = append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args = append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) 
// Verify operator switched to OLM mode @@ -161,6 +165,8 @@ func TestNonOLMToOLMSwitch(t *testing.T) { // TestOLMOperatorVersionUpgrade tests that OLM operator version mismatches trigger teardown and redeploy func TestOLMOperatorVersionUpgrade(t *testing.T) { + dumpClusterStateOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } @@ -179,7 +185,7 @@ func TestOLMOperatorVersionUpgrade(t *testing.T) { // Step 1: Deploy central with OLM operator t.Log("=== Step 1: Deploy central with OLM operator ===") - args := append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args := append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) // Verify operator is in OLM mode @@ -201,7 +207,7 @@ func TestOLMOperatorVersionUpgrade(t *testing.T) { // Step 2: Redeploy with same version (should skip if version matches) t.Log("=== Step 2: Redeploy with same version (should detect correct version) ===") - args = append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args = append([]string{roxieBinary, "deploy", "central", "--olm", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) 
// Verify operator is still in OLM mode and deployment exists @@ -223,6 +229,8 @@ func TestOLMOperatorVersionUpgrade(t *testing.T) { // TestSecuredClusterWithOLMSwitch tests that secured-cluster deployment also respects OLM mode switches func TestSecuredClusterWithOLMSwitch(t *testing.T) { + dumpClusterStateOnFailure(t) + if os.Getenv("SKIP_OLM_TESTS") != "" { t.Skip("SKIP_OLM_TESTS is set") } @@ -237,7 +245,7 @@ func TestSecuredClusterWithOLMSwitch(t *testing.T) { // Step 1: Deploy central with OLM t.Log("=== Step 1: Deploy central with OLM ===") - args := append([]string{roxieBinary, "deploy", "--early-readiness", "central", "--olm", "--envrc", envrcPath}, commonDeployArgsNoPortForward...) + args := append([]string{roxieBinary, "deploy", "--early-readiness", "central", "--olm", "--envrc", envrcPath}, commonDeployArgs...) runCommand(t, deployTimeout, nil, args...) verifyOperatorMode(t, true) @@ -251,7 +259,7 @@ func TestSecuredClusterWithOLMSwitch(t *testing.T) { // Step 2: Deploy secured-cluster (should reuse OLM operator) t.Log("=== Step 2: Deploy secured-cluster (should reuse OLM operator) ===") - args = append([]string{roxieBinary, "deploy", "--early-readiness", "secured-cluster", "--olm"}, commonDeployArgsNoPortForward...) + args = append([]string{roxieBinary, "deploy", "--early-readiness", "secured-cluster", "--olm"}, commonDeployArgs...) runCommand(t, deployTimeout, envrcEnv, args...) // Verify operator is still in OLM mode @@ -260,7 +268,7 @@ func TestSecuredClusterWithOLMSwitch(t *testing.T) { // Step 3: Switch to non-OLM by redeploying secured-cluster without --olm t.Log("=== Step 3: Redeploy secured-cluster without OLM (triggering mode switch) ===") - args = append([]string{roxieBinary, "deploy", "--early-readiness", "secured-cluster"}, commonDeployArgsNoPortForward...) + args = append([]string{roxieBinary, "deploy", "--early-readiness", "secured-cluster"}, commonDeployArgs...) runCommand(t, deployTimeout, envrcEnv, args...) 
// Verify operator switched to non-OLM mode