Improve deployment reliability with tuned K8s probes and rollout strategy (#60542)

heiskr · Copilot · web-flow · commit 771735dd465d · 2026-03-30T19:13:44.000Z
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/config/kubernetes/default/deployments/webapp.yaml b/config/kubernetes/default/deployments/webapp.yaml
@@ -22,6 +22,7 @@ spec:
         ad.datadoghq.com/tolerate-unready: 'true'
     spec:
       dnsPolicy: Default
+      terminationGracePeriodSeconds: 60
       containers:
         - name: webapp
           image: docs-internal
@@ -67,18 +68,19 @@ spec:
             preStop:
               exec:
                 command: ['sleep', '5']
+          # See production/deployments/webapp.yaml for detailed comments on probe config.
           startupProbe:
             httpGet:
               path: /healthcheck
               port: http
-            initialDelaySeconds: 5
+            initialDelaySeconds: 30
             periodSeconds: 5
-            failureThreshold: 12
+            failureThreshold: 30
             timeoutSeconds: 5
           readinessProbe:
-            timeoutSeconds: 5
-            periodSeconds: 10
-            failureThreshold: 3
             httpGet:
               path: /healthcheck
               port: http
+            periodSeconds: 10
+            failureThreshold: 5
+            timeoutSeconds: 5
diff --git a/config/kubernetes/production/deployments/webapp.yaml b/config/kubernetes/production/deployments/webapp.yaml
@@ -3,15 +3,18 @@ kind: Deployment
 metadata:
   name: webapp
   annotations:
-    moda.github.net/allow-missing-ready-pods: '1'
+    moda.github.net/allow-missing-ready-pods: '0'
     moda.github.net/inject-unified-service-tag-env-var: docs-internal
 spec:
   replicas: 12
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxUnavailable: 1
-      maxSurge: 2
+      # Don't kill old pods until new ones pass readiness.
+      # Prevents capacity loss during deploys. Safe because we're over-provisioned.
+      maxUnavailable: 0
+      # Percentage so it scales with replica count changes.
+      maxSurge: '25%'
   selector:
     matchLabels:
       app: webapp
@@ -28,6 +31,10 @@ spec:
         ad.datadoghq.com/tolerate-unready: 'true'
     spec:
       dnsPolicy: Default
+      # Hard deadline for pod shutdown, counted from the start of termination
+      # (preStop sleep runs first, then SIGTERM). Default is 30s; 60s gives plenty
+      # of room for in-flight request draining and OTEL SDK shutdown even if DNS is slow.
+      terminationGracePeriodSeconds: 60
       containers:
         - name: webapp
           image: docs-internal
@@ -74,18 +81,34 @@ spec:
             preStop:
               exec:
                 command: ['sleep', '5']
+          # warmServer() loads ~3500 content files × 9 languages × 9 versions.
+          # Avg startup: ~25s, worst observed: ~48s (Datadog: docs.warm_server).
+          # Server does not listen until warmup completes, so probes fail at
+          # TCP level during boot — no app-level readiness flag needed.
           startupProbe:
             httpGet:
               path: /healthcheck
               port: http
-            initialDelaySeconds: 5
+            # Server can't respond until warmup finishes (~25s avg), so don't
+            # waste probes checking before that.
+            initialDelaySeconds: 30
             periodSeconds: 5
-            failureThreshold: 12
+            # Total runway: 30s + (30 × 5s) = 180s. Covers worst-case startup
+            # plus resource contention when multiple pods boot during a deploy.
+            failureThreshold: 30
             timeoutSeconds: 5
           readinessProbe:
-            timeoutSeconds: 5
-            periodSeconds: 10
-            failureThreshold: 3
             httpGet:
               path: /healthcheck
               port: http
+            periodSeconds: 10
+            # 5 × 10s = 50s before pulling pod from load balancer.
+            # Healthcheck is always-200 (no app-level logic), so failures
+            # mean the process is hung or under extreme pressure.
+            failureThreshold: 5
+            timeoutSeconds: 5
+          # No livenessProbe: healthcheck always returns 200 with no app-level
+          # checks, so a liveness probe would only catch a fully hung process.
+          # Readiness already removes hung pods from the load balancer, and we
+          # intentionally avoid liveness restarts — they risk killing pods
+          # during GC pauses or transient load spikes.