diff --git a/charts/shield/Chart.yaml b/charts/shield/Chart.yaml
index 49221b8d9..5f45199a8 100644
--- a/charts/shield/Chart.yaml
+++ b/charts/shield/Chart.yaml
@@ -13,5 +13,5 @@ maintainers:
   - name: mavimo
     email: marcovito.moscaritolo@sysdig.com
 type: application
-version: 1.37.1
+version: 1.38.0
 appVersion: "1.0.0"
diff --git a/charts/shield/README.md b/charts/shield/README.md
index e157a66d0..6c3a0145e 100644
--- a/charts/shield/README.md
+++ b/charts/shield/README.md
@@ -297,7 +297,10 @@ The following table lists the configurable parameters of the `shield` chart and
 | gke_autopilot.allowlist_version | The Allowlist version label applied to host-shield workloads. Must match an AllowlistSynchronizer the cluster knows about. (Replaces the deprecated top-level "gke_autopilot_allowlist", which is still honored and takes precedence when set.) | sysdig-agent-v1.1.4 |
 | gke_autopilot.allowlist_waiter.enabled | Enable the waiter Job | false |
 | gke_autopilot.allowlist_waiter.timeout | Maximum time the Job will block on the AllowlistSynchronizer reaching Ready | 120s |
+| gke_autopilot.allowlist_waiter.active_deadline_seconds | Deadline in seconds for the whole waiter Job (`spec.activeDeadlineSeconds`). It spans all retry attempts: once it elapses, Kubernetes terminates the running waiter Pod and marks the Job failed. Guards against the Pod hanging before the inner `kubectl wait` timeout fires (image-pull stalls, scheduler delays, admission webhook hangs). Should be greater than `timeout` to leave headroom for pod startup. | 300 |
 | gke_autopilot.allowlist_waiter.service_account_name | Override the name of the waiter ServiceAccount (defaults to -allowlist-waiter) | |
+| gke_autopilot.allowlist_waiter.create_rbac | Create the RBAC resources (ServiceAccount, ClusterRole, ClusterRoleBinding) for the allowlist waiter Job. Set to false to manage them externally. | true |
+| gke_autopilot.allowlist_waiter.rbac_annotations | Additional annotations applied to the waiter SA/ClusterRole/ClusterRoleBinding | {} |
 | gke_autopilot.allowlist_waiter.image.registry | The registry where the kubectl image is stored | quay.io |
 | gke_autopilot.allowlist_waiter.image.repository | The repository where the kubectl image is stored | sysdig/kubectl |
 | gke_autopilot.allowlist_waiter.image.tag | The tag for the kubectl image | 1.34.3-1.6.21 |
diff --git a/charts/shield/templates/host/gke-allowlist-waiter-clusterrole.yaml b/charts/shield/templates/host/gke-allowlist-waiter-clusterrole.yaml
index 20f2390d7..6bc08241c 100644
--- a/charts/shield/templates/host/gke-allowlist-waiter-clusterrole.yaml
+++ b/charts/shield/templates/host/gke-allowlist-waiter-clusterrole.yaml
@@ -1,5 +1,5 @@
 {{- if (include "host.allowlist_waiter.enabled" .) -}}
-{{- if .Values.host.rbac.create }}
+{{- if .Values.gke_autopilot.allowlist_waiter.create_rbac }}
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
@@ -9,8 +9,8 @@ metadata:
   annotations:
     helm.sh/hook: "pre-install,pre-upgrade"
     helm.sh/hook-weight: "-5"
-    helm.sh/hook-delete-policy: "before-hook-creation,hook-succeeded"
-    {{- with .Values.host.rbac.annotations }}
+    helm.sh/hook-delete-policy: "before-hook-creation"
+    {{- with .Values.gke_autopilot.allowlist_waiter.rbac_annotations }}
       {{- toYaml . | nindent 4 }}
     {{- end }}
 rules:
diff --git a/charts/shield/templates/host/gke-allowlist-waiter-clusterrolebinding.yaml b/charts/shield/templates/host/gke-allowlist-waiter-clusterrolebinding.yaml
index b469d4157..9f195e98a 100644
--- a/charts/shield/templates/host/gke-allowlist-waiter-clusterrolebinding.yaml
+++ b/charts/shield/templates/host/gke-allowlist-waiter-clusterrolebinding.yaml
@@ -1,5 +1,5 @@
 {{- if (include "host.allowlist_waiter.enabled" .) -}}
-{{- if .Values.host.rbac.create }}
+{{- if .Values.gke_autopilot.allowlist_waiter.create_rbac }}
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
@@ -9,8 +9,8 @@ metadata:
   annotations:
     helm.sh/hook: "pre-install,pre-upgrade"
     helm.sh/hook-weight: "-5"
-    helm.sh/hook-delete-policy: "before-hook-creation,hook-succeeded"
-    {{- with .Values.host.rbac.annotations }}
+    helm.sh/hook-delete-policy: "before-hook-creation"
+    {{- with .Values.gke_autopilot.allowlist_waiter.rbac_annotations }}
       {{- toYaml . | nindent 4 }}
     {{- end }}
 roleRef:
diff --git a/charts/shield/templates/host/gke-allowlist-waiter-job.yaml b/charts/shield/templates/host/gke-allowlist-waiter-job.yaml
index 968951752..23e0529c3 100644
--- a/charts/shield/templates/host/gke-allowlist-waiter-job.yaml
+++ b/charts/shield/templates/host/gke-allowlist-waiter-job.yaml
@@ -9,9 +9,10 @@ metadata:
   annotations:
     helm.sh/hook: "pre-install,pre-upgrade"
     helm.sh/hook-weight: "5"
-    helm.sh/hook-delete-policy: "before-hook-creation,hook-succeeded"
+    helm.sh/hook-delete-policy: "before-hook-creation,hook-succeeded,hook-failed"
 spec:
   backoffLimit: 3
+  activeDeadlineSeconds: {{ .Values.gke_autopilot.allowlist_waiter.active_deadline_seconds }}
   template:
     metadata:
       name: {{ include "host.allowlist_waiter.fullname" . }}
@@ -35,12 +36,27 @@ spec:
             - /bin/bash
             - -c
             - |
-              set -euo pipefail
-              echo "Waiting for AllowlistSynchronizer/sysdig-agent-allowlist-synchronizer to become Ready..."
+              # No `-e`: a failed `kubectl wait` must not abort the script before diagnostics are dumped.
+              set -uo pipefail
+              NAMESPACE={{ .Release.Namespace }}
+              SYNC_NAME=sysdig-agent-allowlist-synchronizer
+              TIMEOUT={{ .Values.gke_autopilot.allowlist_waiter.timeout }}
+
+              echo "Waiting for AllowlistSynchronizer/$SYNC_NAME to become Ready (timeout $TIMEOUT)..."
               kubectl wait --for=condition=Ready \
-                allowlistsynchronizer/sysdig-agent-allowlist-synchronizer \
-                -n {{ .Release.Namespace }} \
-                --timeout={{ .Values.gke_autopilot.allowlist_waiter.timeout }}
+                allowlistsynchronizer/"$SYNC_NAME" \
+                -n "$NAMESPACE" \
+                --timeout="$TIMEOUT"
+              ec=$?
+
+              if [ "$ec" -ne 0 ]; then
+                echo "---" >&2
+                echo "kubectl wait failed (exit $ec). Dumping AllowlistSynchronizer state for diagnostics:" >&2
+                kubectl describe -n "$NAMESPACE" allowlistsynchronizer/"$SYNC_NAME" >&2 || true
+                echo "---" >&2
+                kubectl get -n "$NAMESPACE" allowlistsynchronizer/"$SYNC_NAME" -o yaml >&2 || true
+                exit "$ec"
+              fi
               echo "AllowlistSynchronizer is Ready."
           resources:
             {{- toYaml .Values.gke_autopilot.allowlist_waiter.resources | nindent 12 }}
diff --git a/charts/shield/templates/host/gke-allowlist-waiter-serviceaccount.yaml b/charts/shield/templates/host/gke-allowlist-waiter-serviceaccount.yaml
index 46e401057..039c8c7e2 100644
--- a/charts/shield/templates/host/gke-allowlist-waiter-serviceaccount.yaml
+++ b/charts/shield/templates/host/gke-allowlist-waiter-serviceaccount.yaml
@@ -1,5 +1,5 @@
 {{- if (include "host.allowlist_waiter.enabled" .) -}}
-{{- if .Values.host.rbac.create }}
+{{- if .Values.gke_autopilot.allowlist_waiter.create_rbac }}
 apiVersion: v1
 kind: ServiceAccount
 metadata:
@@ -10,8 +10,8 @@ metadata:
   annotations:
     helm.sh/hook: "pre-install,pre-upgrade"
     helm.sh/hook-weight: "-5"
-    helm.sh/hook-delete-policy: "before-hook-creation,hook-succeeded"
-    {{- with .Values.host.rbac.annotations }}
+    helm.sh/hook-delete-policy: "before-hook-creation"
+    {{- with .Values.gke_autopilot.allowlist_waiter.rbac_annotations }}
       {{- toYaml . | nindent 4 }}
     {{- end }}
 {{- end }}
diff --git a/charts/shield/tests/host/gke-allowlist-synchronizer_test.yaml b/charts/shield/tests/host/gke-allowlist-synchronizer_test.yaml
index 07d9eb255..7fcaa95eb 100644
--- a/charts/shield/tests/host/gke-allowlist-synchronizer_test.yaml
+++ b/charts/shield/tests/host/gke-allowlist-synchronizer_test.yaml
@@ -65,10 +65,28 @@ tests:
           count: 0
         template: templates/host/gke-allowlist-waiter-job.yaml
 
-  - it: Does not render waiter SA/CR/CRB when host.rbac.create is false
+  - it: Renders waiter SA/CR/CRB even when host.rbac.create is false (decoupled from host RBAC)
     set:
       gke_autopilot.allowlist_waiter.enabled: true
       host.rbac.create: false
+    asserts:
+      - hasDocuments:
+          count: 1
+        template: templates/host/gke-allowlist-waiter-serviceaccount.yaml
+      - hasDocuments:
+          count: 1
+        template: templates/host/gke-allowlist-waiter-clusterrole.yaml
+      - hasDocuments:
+          count: 1
+        template: templates/host/gke-allowlist-waiter-clusterrolebinding.yaml
+      - hasDocuments:
+          count: 1
+        template: templates/host/gke-allowlist-waiter-job.yaml
+
+  - it: Does not render waiter SA/CR/CRB when gke_autopilot.allowlist_waiter.create_rbac is false
+    set:
+      gke_autopilot.allowlist_waiter.enabled: true
+      gke_autopilot.allowlist_waiter.create_rbac: false
     asserts:
       - hasDocuments:
           count: 0
@@ -83,6 +101,25 @@ tests:
           count: 1
         template: templates/host/gke-allowlist-waiter-job.yaml
 
+  - it: Propagates gke_autopilot.allowlist_waiter.rbac_annotations to all waiter RBAC objects
+    set:
+      gke_autopilot.allowlist_waiter.enabled: true
+      gke_autopilot.allowlist_waiter.rbac_annotations:
+        custom.example.com/owner: shield-team
+    asserts:
+      - equal:
+          path: metadata.annotations["custom.example.com/owner"]
+          value: shield-team
+        template: templates/host/gke-allowlist-waiter-serviceaccount.yaml
+      - equal:
+          path: metadata.annotations["custom.example.com/owner"]
+          value: shield-team
+        template: templates/host/gke-allowlist-waiter-clusterrole.yaml
+      - equal:
+          path: metadata.annotations["custom.example.com/owner"]
+          value: shield-team
+        template: templates/host/gke-allowlist-waiter-clusterrolebinding.yaml
+
   - it: Renders the waiter ServiceAccount with hook annotations
     set:
       gke_autopilot.allowlist_waiter.enabled: true
@@ -102,7 +139,7 @@ tests:
           value: "-5"
       - equal:
           path: metadata.annotations["helm.sh/hook-delete-policy"]
-          value: before-hook-creation,hook-succeeded
+          value: before-hook-creation
         template: templates/host/gke-allowlist-waiter-serviceaccount.yaml
 
   - it: Renders the waiter ClusterRole limited to allowlistsynchronizers
@@ -119,6 +156,9 @@ tests:
       - equal:
           path: metadata.annotations["helm.sh/hook-weight"]
           value: "-5"
+      - equal:
+          path: metadata.annotations["helm.sh/hook-delete-policy"]
+          value: before-hook-creation
       - contains:
           path: rules
           content:
@@ -146,6 +186,9 @@ tests:
       - equal:
           path: metadata.annotations["helm.sh/hook-weight"]
           value: "-5"
+      - equal:
+          path: metadata.annotations["helm.sh/hook-delete-policy"]
+          value: before-hook-creation
       - equal:
           path: roleRef.kind
           value: ClusterRole
@@ -182,7 +225,7 @@ tests:
           value: "5"
       - equal:
           path: metadata.annotations["helm.sh/hook-delete-policy"]
-          value: before-hook-creation,hook-succeeded
+          value: before-hook-creation,hook-succeeded,hook-failed
       - equal:
           path: spec.template.spec.serviceAccountName
           value: release-name-shield-host-allowlist-waiter
@@ -194,7 +237,17 @@ tests:
           pattern: "^quay\\.io/sysdig/kubectl:[^\\s]+$"
       - matchRegex:
           path: spec.template.spec.containers[0].command[2]
-          pattern: "--timeout=120s"
+          pattern: "TIMEOUT=120s"
+      # The waiter script must dump AllowlistSynchronizer state on wait failure
+      # so the on-call engineer has actionable diagnostics instead of a bare exit.
+      - matchRegex:
+          path: spec.template.spec.containers[0].command[2]
+          pattern: "kubectl describe .* allowlistsynchronizer"
+      # Job-level guard against pod hangs before the inner `kubectl wait` timeout fires
+      # (image-pull stalls, scheduler delays, admission webhook hangs).
+      - equal:
+          path: spec.activeDeadlineSeconds
+          value: 300
       # The waiter Pod must NOT carry cloud.google.com/matching-allowlist —
       # otherwise GKE Autopilot would block it on the very allowlist it waits to load.
       - notExists:
@@ -203,6 +256,16 @@ tests:
           path: spec.template.metadata.labels["autopilot.gke.io/no-connect"]
         template: templates/host/gke-allowlist-waiter-job.yaml
 
+  - it: Honors gke_autopilot.allowlist_waiter.active_deadline_seconds override
+    set:
+      gke_autopilot.allowlist_waiter.enabled: true
+      gke_autopilot.allowlist_waiter.active_deadline_seconds: 600
+    asserts:
+      - equal:
+          path: spec.activeDeadlineSeconds
+          value: 600
+        template: templates/host/gke-allowlist-waiter-job.yaml
+
   - it: Renders imagePullSecrets when gke_autopilot.allowlist_waiter.image.pull_secrets is set
     set:
       gke_autopilot.allowlist_waiter.enabled: true
diff --git a/charts/shield/values.yaml b/charts/shield/values.yaml
index b51203d0b..3940728a7 100644
--- a/charts/shield/values.yaml
+++ b/charts/shield/values.yaml
@@ -660,8 +660,17 @@ gke_autopilot:
     enabled: false
     # Maximum time the Job will block on the AllowlistSynchronizer reaching Ready
     timeout: 120s
+    # Deadline in seconds for the whole waiter Job (`spec.activeDeadlineSeconds`). It spans all retry attempts:
+    # once it elapses, Kubernetes terminates the running waiter Pod and marks the Job failed. Guards against the
+    # Pod hanging before the inner `kubectl wait` timeout fires (image-pull stalls, scheduler delays, admission
+    # webhook hangs). Should be greater than `timeout` to leave headroom for pod startup.
+    active_deadline_seconds: 300
     # Override the name of the waiter ServiceAccount (defaults to -allowlist-waiter)
     service_account_name:
+    # Create the RBAC resources (ServiceAccount, ClusterRole, ClusterRoleBinding) for the allowlist waiter Job. Set to false to manage them externally.
+    create_rbac: true
+    # Additional annotations applied to the waiter SA/ClusterRole/ClusterRoleBinding
+    rbac_annotations: {}
     image:
       # The registry where the kubectl image is stored
       registry: quay.io