@@ -3,15 +3,18 @@ kind: Deployment
 metadata:
   name: webapp
   annotations:
-    moda.github.net/allow-missing-ready-pods: '1'
+    moda.github.net/allow-missing-ready-pods: '0'
     moda.github.net/inject-unified-service-tag-env-var: docs-internal
 spec:
   replicas: 12
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxUnavailable: 1
-      maxSurge: 2
+      # Don't kill old pods until new ones pass readiness.
+      # Prevents capacity loss during deploys. Safe because we're over-provisioned.
+      maxUnavailable: 0
+      # Percentage so it scales with replica count changes.
+      maxSurge: '25%'
   selector:
     matchLabels:
       app: webapp
@@ -28,6 +31,10 @@
         ad.datadoghq.com/tolerate-unready: 'true'
     spec:
       dnsPolicy: Default
+      # Hard deadline for pod shutdown after SIGTERM (includes preStop sleep).
+      # Default is 30s; 60s gives plenty of room for in-flight request draining
+      # and OTEL SDK shutdown even if DNS is slow.
+      terminationGracePeriodSeconds: 60
       containers:
         - name: webapp
          image: docs-internal
@@ -74,18 +81,34 @@ spec:
             preStop:
               exec:
                 command: ['sleep', '5']
+          # warmServer() loads ~3500 content files × 9 languages × 9 versions.
+          # Avg startup: ~25s, worst observed: ~48s (Datadog: docs.warm_server).
+          # Server does not listen until warmup completes, so probes fail at
+          # TCP level during boot — no app-level readiness flag needed.
           startupProbe:
             httpGet:
               path: /healthcheck
               port: http
-            initialDelaySeconds: 5
+            # Server can't respond until warmup finishes (~25s avg), so don't
+            # waste probes checking before that.
+            initialDelaySeconds: 30
             periodSeconds: 5
-            failureThreshold: 12
+            # Total runway: 30s + (30 × 5s) = 180s. Covers worst-case startup
+            # plus resource contention when multiple pods boot during a deploy.
+            failureThreshold: 30
             timeoutSeconds: 5
           readinessProbe:
-            timeoutSeconds: 5
-            periodSeconds: 10
-            failureThreshold: 3
             httpGet:
               path: /healthcheck
               port: http
+            periodSeconds: 10
+            # 5 × 10s = 50s before pulling pod from load balancer.
+            # Healthcheck is always-200 (no app-level logic), so failures
+            # mean the process is hung or under extreme pressure.
+            failureThreshold: 5
+            timeoutSeconds: 5
+          # No livenessProbe: healthcheck always returns 200 with no app-level
+          # checks, so a liveness probe would only catch a fully hung process.
+          # Readiness already removes hung pods from the load balancer, and we
+          # intentionally avoid liveness restarts — they risk killing pods
+          # during GC pauses or transient load spikes.