From 457554a55a296cf08e36341cc188d6e4b7c1622c Mon Sep 17 00:00:00 2001 From: zeevdr Date: Wed, 29 Apr 2026 15:40:56 +0300 Subject: [PATCH] helm: ship sane resource defaults, NetworkPolicy template, pull-Always Pods previously shipped with `resources: {}` and `imagePullPolicy: IfNotPresent`, so a single replica could starve its node and security-patch updates on a moving tag never propagated. The chart also had no NetworkPolicy, leaving egress wide open in multi-tenant clusters. - Default `requests: 100m/128Mi`, `limits: 1/512Mi`; documented as override-friendly for benchmarking and larger sizing. - Default `imagePullPolicy: Always`; production guidance is to pin `image.tag` by digest and flip back to `IfNotPresent`. - Add `networkPolicy.yaml` gated on `networkPolicy.enabled` (off by default during alpha). Allows ingress to gRPC/HTTP ports plus configurable egress CIDRs for PG, Redis, JWKS, OTel, and DNS. - Add `tests/template_test.sh` asserting render contents + helm lint. - Wire a `helm` job into CI gated on `deploy/helm/**` paths. Closes #220. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 34 ++++++++- deploy/helm/decree/README.md | 10 +++ .../helm/decree/templates/networkpolicy.yaml | 74 +++++++++++++++++++ deploy/helm/decree/tests/template_test.sh | 49 ++++++++++++ deploy/helm/decree/values.yaml | 51 +++++++++++-- 5 files changed, 208 insertions(+), 10 deletions(-) create mode 100644 deploy/helm/decree/templates/networkpolicy.yaml create mode 100755 deploy/helm/decree/tests/template_test.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8947f3a0..73e4f0a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,6 +59,7 @@ jobs: timeout-minutes: 5 outputs: code: ${{ steps.filter.outputs.code }} + helm: ${{ steps.filter.outputs.helm }} steps: - name: Checkout uses: actions/checkout@v6 @@ -81,6 +82,9 @@ jobs: - 'Makefile' - 'docker-compose*.yml' - '.github/workflows/**' + helm: + - 'deploy/helm/**' + - '.github/workflows/ci.yml' # Builds/pushes the shared decree-tools Docker image used by downstream jobs. # Skips the build if an image tagged with the Dockerfile hash already exists @@ -525,12 +529,38 @@ jobs: - name: Validate meta-schemas run: make validate-meta-schemas + # Renders the Helm chart with default + NetworkPolicy-enabled values and + # asserts the documented requests/limits, NetworkPolicy egress rules, and + # imagePullPolicy. Also runs `helm lint`. + helm: + name: Helm chart + runs-on: ubuntu-latest + needs: changes + if: needs.changes.outputs.helm == 'true' + permissions: + contents: read + timeout-minutes: 5 + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + persist-credentials: false + + - name: Set up Helm + # azure/setup-helm@v4.3.1 + uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 + with: + version: v3.18.4 + + - name: Run chart render tests + run: ./deploy/helm/decree/tests/template_test.sh + # Aggregates all job results for branch protection. 
A single required check # that passes iff every listed job passed or was legitimately skipped. check: name: CI check if: always() - needs: [lint, test, sdk-compat, docs, e2e, examples, govulncheck, deps-review, meta-schemas] + needs: [lint, test, sdk-compat, docs, e2e, examples, govulncheck, deps-review, meta-schemas, helm] runs-on: ubuntu-latest timeout-minutes: 5 steps: @@ -539,4 +569,4 @@ jobs: uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe with: jobs: ${{ toJSON(needs) }} - allowed-skips: lint, test, sdk-compat, docs, e2e, examples, govulncheck, deps-review + allowed-skips: lint, test, sdk-compat, docs, e2e, examples, govulncheck, deps-review, helm diff --git a/deploy/helm/decree/README.md b/deploy/helm/decree/README.md index e7843d40..66332599 100644 --- a/deploy/helm/decree/README.md +++ b/deploy/helm/decree/README.md @@ -30,3 +30,13 @@ See [values.yaml](values.yaml) for all options. Key settings: | `auth.jwksUrl` | JWKS URL for JWT auth | `""` (metadata auth) | | `ingress.enabled` | Enable Ingress | `false` | | `otel.enabled` | Enable OpenTelemetry | `false` | +| `image.pullPolicy` | Defaults to `Always` so security-patch updates on a moving tag propagate; set to `IfNotPresent` only when pinning by digest | `Always` | +| `resources.requests` / `resources.limits` | Default `100m / 128Mi` requests, `1 / 512Mi` limits — override (or set `resources: {}`) for benchmarking, dev, or larger sizing | sane defaults | +| `networkPolicy.enabled` | Restrict ingress + egress to documented dependencies. Off by default in alpha; recommended for any multi-tenant or production cluster. See `networkPolicy.egress.*CIDR` to whitelist PG, Redis, JWKS, OTel | `false` | + +## Production hardening checklist + +- Pin `image.tag` to an immutable digest (`sha256:…`) and switch `image.pullPolicy` to `IfNotPresent`. +- Override `resources.requests` / `resources.limits` to match your traffic profile. 
+- Enable `networkPolicy.enabled=true` and populate the `networkPolicy.egress.*CIDR` keys for PostgreSQL, Redis, the JWKS endpoint, and (if used) the OTel collector. +- Use `database.existingSecret` / `redis.existingSecret` instead of plaintext URLs in values. diff --git a/deploy/helm/decree/templates/networkpolicy.yaml b/deploy/helm/decree/templates/networkpolicy.yaml new file mode 100644 index 00000000..34a3cb7a --- /dev/null +++ b/deploy/helm/decree/templates/networkpolicy.yaml @@ -0,0 +1,74 @@ +{{- if .Values.networkPolicy.enabled }} +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "decree.fullname" . }} + labels: + {{- include "decree.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "decree.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + - from: + {{- if .Values.networkPolicy.ingressFrom }} + {{- toYaml .Values.networkPolicy.ingressFrom | nindent 8 }} + {{- else }} + - podSelector: {} + {{- end }} + ports: + - port: {{ .Values.config.grpcPort }} + protocol: TCP + {{- if .Values.config.httpPort }} + - port: {{ .Values.config.httpPort }} + protocol: TCP + {{- end }} + egress: + {{- if .Values.networkPolicy.egress.allowDNS }} + - to: + - namespaceSelector: {} + podSelector: + matchLabels: + k8s-app: kube-dns + ports: + - port: 53 + protocol: UDP + - port: 53 + protocol: TCP + {{- end }} + {{- with .Values.networkPolicy.egress.postgresCIDR }} + - to: + - ipBlock: + cidr: {{ . }} + ports: + - port: {{ $.Values.networkPolicy.egress.postgresPort }} + protocol: TCP + {{- end }} + {{- with .Values.networkPolicy.egress.redisCIDR }} + - to: + - ipBlock: + cidr: {{ . }} + ports: + - port: {{ $.Values.networkPolicy.egress.redisPort }} + protocol: TCP + {{- end }} + {{- with .Values.networkPolicy.egress.jwksCIDR }} + - to: + - ipBlock: + cidr: {{ . 
}} + ports: + - port: {{ $.Values.networkPolicy.egress.jwksPort }} + protocol: TCP + {{- end }} + {{- with .Values.networkPolicy.egress.otelCIDR }} + - to: + - ipBlock: + cidr: {{ . }} + ports: + - port: {{ $.Values.networkPolicy.egress.otelPort }} + protocol: TCP + {{- end }} +{{- end }} diff --git a/deploy/helm/decree/tests/template_test.sh b/deploy/helm/decree/tests/template_test.sh new file mode 100755 index 00000000..e6215a28 --- /dev/null +++ b/deploy/helm/decree/tests/template_test.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Helm chart render tests. Asserts that: +# - default render includes the documented resource requests/limits +# - NetworkPolicy is omitted by default and rendered when enabled +# - imagePullPolicy defaults to Always +# Run from repo root: ./deploy/helm/decree/tests/template_test.sh +set -euo pipefail + +CHART="$(cd "$(dirname "$0")/.." && pwd)" +TMP="$(mktemp -d)" +trap 'rm -rf "$TMP"' EXIT + +fail() { echo "FAIL: $*" >&2; exit 1; } +pass() { echo "PASS: $*"; } + +# --- defaults --- +helm template decree "$CHART" --set database.writeUrl=postgres://x \ + --set redis.url=redis://x >"$TMP/default.yaml" + +grep -q 'imagePullPolicy: Always' "$TMP/default.yaml" || fail "default imagePullPolicy not Always" +grep -q 'cpu: 100m' "$TMP/default.yaml" || fail "default requests.cpu missing" +grep -q 'memory: 128Mi' "$TMP/default.yaml" || fail "default requests.memory missing" +grep -qE 'cpu: "?1"?$' "$TMP/default.yaml" || fail "default limits.cpu missing" +grep -q 'memory: 512Mi' "$TMP/default.yaml" || fail "default limits.memory missing" +grep -q 'kind: NetworkPolicy' "$TMP/default.yaml" && fail "NetworkPolicy emitted when disabled" +pass "defaults" + +# --- NetworkPolicy enabled --- +helm template decree "$CHART" \ + --set database.writeUrl=postgres://x \ + --set redis.url=redis://x \ + --set networkPolicy.enabled=true \ + --set networkPolicy.egress.postgresCIDR=10.0.0.0/24 \ + --set networkPolicy.egress.redisCIDR=10.0.1.0/24 \ + --set 
networkPolicy.egress.jwksCIDR=0.0.0.0/0 \ + --set auth.jwksUrl=https://example.test/jwks >"$TMP/np.yaml" + +grep -q 'kind: NetworkPolicy' "$TMP/np.yaml" || fail "NetworkPolicy not emitted when enabled" +grep -q 'cidr: 10.0.0.0/24' "$TMP/np.yaml" || fail "postgres egress CIDR missing" +grep -q 'cidr: 10.0.1.0/24' "$TMP/np.yaml" || fail "redis egress CIDR missing" +grep -q 'k8s-app: kube-dns' "$TMP/np.yaml" || fail "DNS egress missing" +grep -q 'port: 5432' "$TMP/np.yaml" || fail "postgres port missing" +pass "networkPolicy enabled" + +# --- helm lint --- +helm lint "$CHART" >"$TMP/lint.out" 2>&1 || { cat "$TMP/lint.out"; fail "helm lint failed"; } +pass "helm lint" + +echo "All helm template tests passed." diff --git a/deploy/helm/decree/values.yaml b/deploy/helm/decree/values.yaml index a95aef04..94e15998 100644 --- a/deploy/helm/decree/values.yaml +++ b/deploy/helm/decree/values.yaml @@ -5,7 +5,9 @@ replicaCount: 1 image: repository: ghcr.io/opendecree/decree tag: "" # defaults to Chart.appVersion - pullPolicy: IfNotPresent + # Always pull so pods pick up security-patch updates on a moving tag whenever they are (re)created; running pods are not replaced until restarted. + # Override to IfNotPresent only when pinning by digest (recommended for production). + pullPolicy: Always imagePullSecrets: [] nameOverride: "" @@ -97,13 +99,46 @@ ingress: pathType: Prefix tls: [] -resources: {} - # limits: - # cpu: 500m - # memory: 256Mi - # requests: - # cpu: 100m - # memory: 128Mi +# Default resource requests/limits keep a single pod from starving its node. +# Override (or set to {}) for benchmarking, dev, or larger production sizing. +resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: "1" + memory: 512Mi + +# NetworkPolicy restricts egress to the documented dependencies (PG, Redis, JWKS, +# DNS, OTel collector) and ingress to the configured client paths. Disabled by +# default during alpha; recommended for any multi-tenant or production cluster. 
+networkPolicy: + enabled: false + # Pod selectors / namespace selectors permitted to reach the gRPC + HTTP ports. + # Empty = allow from any pod in the same namespace. + ingressFrom: [] + # - podSelector: + # matchLabels: + # app: my-client + # - namespaceSelector: + # matchLabels: + # name: trusted-ns + egress: + # CIDR for PostgreSQL (rendered as an ipBlock; selectors are not supported). Empty = no rule emitted. + postgresCIDR: "" + postgresPort: 5432 + # CIDR for Redis (rendered as an ipBlock). Empty = no rule emitted. + redisCIDR: "" + redisPort: 6379 + # JWKS endpoint host CIDR (when JWT auth is enabled). + jwksCIDR: "" + jwksPort: 443 + # OpenTelemetry collector CIDR (when otel.enabled). + otelCIDR: "" + otelPort: 4317 + # Allow DNS resolution (kube-dns / CoreDNS). Disable only in clusters with + # custom service discovery. + allowDNS: true nodeSelector: {} tolerations: []