Skip to content

Commit 771735d

Browse files
heiskr and Copilot authored
Improve deployment reliability with tuned K8s probes and rollout strategy (#60542)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent b0cbab7 commit 771735d

File tree

2 files changed

+38
-13
lines changed

2 files changed

+38
-13
lines changed

config/kubernetes/default/deployments/webapp.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ spec:
2222
ad.datadoghq.com/tolerate-unready: 'true'
2323
spec:
2424
dnsPolicy: Default
25+
terminationGracePeriodSeconds: 60
2526
containers:
2627
- name: webapp
2728
image: docs-internal
@@ -67,18 +68,19 @@ spec:
6768
preStop:
6869
exec:
6970
command: ['sleep', '5']
71+
# See production/deployments/webapp.yaml for detailed comments on probe config.
7072
startupProbe:
7173
httpGet:
7274
path: /healthcheck
7375
port: http
74-
initialDelaySeconds: 5
76+
initialDelaySeconds: 30
7577
periodSeconds: 5
76-
failureThreshold: 12
78+
failureThreshold: 30
7779
timeoutSeconds: 5
7880
readinessProbe:
79-
timeoutSeconds: 5
80-
periodSeconds: 10
81-
failureThreshold: 3
8281
httpGet:
8382
path: /healthcheck
8483
port: http
84+
periodSeconds: 10
85+
failureThreshold: 5
86+
timeoutSeconds: 5

config/kubernetes/production/deployments/webapp.yaml

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,18 @@ kind: Deployment
33
metadata:
44
name: webapp
55
annotations:
6-
moda.github.net/allow-missing-ready-pods: '1'
6+
moda.github.net/allow-missing-ready-pods: '0'
77
moda.github.net/inject-unified-service-tag-env-var: docs-internal
88
spec:
99
replicas: 12
1010
strategy:
1111
type: RollingUpdate
1212
rollingUpdate:
13-
maxUnavailable: 1
14-
maxSurge: 2
13+
# Don't kill old pods until new ones pass readiness.
14+
# Prevents capacity loss during deploys. Safe because we're over-provisioned.
15+
maxUnavailable: 0
16+
# Percentage so it scales with replica count changes.
17+
maxSurge: '25%'
1518
selector:
1619
matchLabels:
1720
app: webapp
@@ -28,6 +31,10 @@ spec:
2831
ad.datadoghq.com/tolerate-unready: 'true'
2932
spec:
3033
dnsPolicy: Default
34+
# Hard deadline for pod shutdown, counted from the start of termination (the preStop sleep runs first, then SIGTERM is sent).
35+
# Default is 30s; 60s gives plenty of room for in-flight request draining
36+
# and OTEL SDK shutdown even if DNS is slow.
37+
terminationGracePeriodSeconds: 60
3138
containers:
3239
- name: webapp
3340
image: docs-internal
@@ -74,18 +81,34 @@ spec:
7481
preStop:
7582
exec:
7683
command: ['sleep', '5']
84+
# warmServer() loads ~3500 content files × 9 languages × 9 versions.
85+
# Avg startup: ~25s, worst observed: ~48s (Datadog: docs.warm_server).
86+
# Server does not listen until warmup completes, so probes fail at
87+
# TCP level during boot — no app-level readiness flag needed.
7788
startupProbe:
7889
httpGet:
7990
path: /healthcheck
8091
port: http
81-
initialDelaySeconds: 5
92+
# Server can't respond until warmup finishes (~25s avg), so don't
93+
# waste probes checking before that.
94+
initialDelaySeconds: 30
8295
periodSeconds: 5
83-
failureThreshold: 12
96+
# Total runway: 30s + (30 × 5s) = 180s. Covers worst-case startup
97+
# plus resource contention when multiple pods boot during a deploy.
98+
failureThreshold: 30
8499
timeoutSeconds: 5
85100
readinessProbe:
86-
timeoutSeconds: 5
87-
periodSeconds: 10
88-
failureThreshold: 3
89101
httpGet:
90102
path: /healthcheck
91103
port: http
104+
periodSeconds: 10
105+
# 5 × 10s = 50s before pulling pod from load balancer.
106+
# Healthcheck is always-200 (no app-level logic), so failures
107+
# mean the process is hung or under extreme pressure.
108+
failureThreshold: 5
109+
timeoutSeconds: 5
110+
# No livenessProbe: healthcheck always returns 200 with no app-level
111+
# checks, so a liveness probe would only catch a fully hung process.
112+
# Readiness already removes hung pods from the load balancer, and we
113+
# intentionally avoid liveness restarts — they risk killing pods
114+
# during GC pauses or transient load spikes.

0 commit comments

Comments
 (0)