Skip to content

Commit 69c7ff5

Browse files
scotwellsclaude
andcommitted
fix: wait for instances to be ready before reporting rollout complete
The `compute deploy` rollout watcher reported PHASE=Done and exited within seconds of creating the workload, before any instances were scheduled. A WorkloadDeployment's Status.DesiredReplicas stays at zero until the controller first reconciles it, and computePhase treated zero desired as Done — so the very first poll of a fresh deployment looked complete. Resolve the wait target from the spec minimum while the controller has not yet reported a desired count, and require that no stale replicas remain before reporting Done so scale-downs and rolling updates aren't declared complete while old instances are still draining. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 291f5a4 commit 69c7ff5

2 files changed

Lines changed: 200 additions & 10 deletions

File tree

internal/cmd/compute/watch/watch.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,11 @@ func processDeployments(
112112
key := d.Spec.CityCode
113113
prev, exists := states[key]
114114

115-
desired := d.Status.DesiredReplicas
115+
desired := resolveDesired(d)
116116
ready := d.Status.ReadyReplicas
117117
current := d.Status.CurrentReplicas
118118

119-
newPhase := computePhase(desired, ready, current, prev)
119+
newPhase := computePhase(desired, ready, current, d.Status.Replicas)
120120

121121
if !exists || prev.desired != desired || prev.ready != ready || prev.current != current || prev.phase != newPhase {
122122
newPhase = updateDeploymentState(states, key, d, exists, prev, desired, ready, current, newPhase)
@@ -186,10 +186,7 @@ func printDeploymentRow(
186186
current, ready int32,
187187
newPhase deploymentPhase,
188188
) {
189-
old := d.Status.Replicas - d.Status.CurrentReplicas
190-
if old < 0 {
191-
old = 0
192-
}
189+
old := max(d.Status.Replicas-d.Status.CurrentReplicas, 0)
193190

194191
_, _ = fmt.Fprintf(tw, " %s\t%s\t%d\t%d\t%d\t%s\n",
195192
d.Spec.PlacementName,
@@ -217,16 +214,28 @@ func printElapsed(out io.Writer, elapsed time.Duration) {
217214
}
218215
}
219216

220-
func computePhase(desired, ready, current int32, _ *deploymentState) deploymentPhase {
217+
// resolveDesired returns the replica count the rollout should wait for.
218+
// Status.DesiredReplicas stays at zero until the controller first reconciles
219+
// the deployment; until then, fall back to the spec minimum so a freshly
220+
// created deployment isn't reported Done before any instances are scheduled.
221+
// Once the controller has reported a desired count, trust it.
222+
func resolveDesired(d computev1alpha.WorkloadDeployment) int32 {
223+
if d.Status.DesiredReplicas == 0 {
224+
return d.Spec.ScaleSettings.MinReplicas
225+
}
226+
return d.Status.DesiredReplicas
227+
}
228+
229+
func computePhase(desired, ready, current, replicas int32) deploymentPhase {
221230
if desired == 0 {
222231
return phaseDone
223232
}
233+
if ready >= desired && current >= desired && replicas <= current {
234+
return phaseDone
235+
}
224236
if current == 0 {
225237
return phasePending
226238
}
227-
if ready >= desired && current >= desired {
228-
return phaseDone
229-
}
230239
return phaseUpdating
231240
}
232241

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// SPDX-License-Identifier: AGPL-3.0-only
2+
3+
package watch
4+
5+
import (
6+
"testing"
7+
"time"
8+
9+
computev1alpha "go.datum.net/compute/api/v1alpha"
10+
)
11+
12+
func TestResolveDesired(t *testing.T) {
13+
tests := []struct {
14+
name string
15+
statusDesired int32
16+
specMinReplicas int32
17+
want int32
18+
}{
19+
{
20+
name: "unreconciled status falls back to spec min",
21+
statusDesired: 0,
22+
specMinReplicas: 1,
23+
want: 1,
24+
},
25+
{
26+
name: "genuine scale-to-zero",
27+
statusDesired: 0,
28+
specMinReplicas: 0,
29+
want: 0,
30+
},
31+
{
32+
name: "controller desired above min (e.g. autoscaled)",
33+
statusDesired: 3,
34+
specMinReplicas: 1,
35+
want: 3,
36+
},
37+
{
38+
// Once the controller has spoken, trust its value even if below spec min.
39+
name: "controller desired below min — trust controller",
40+
statusDesired: 1,
41+
specMinReplicas: 2,
42+
want: 1,
43+
},
44+
}
45+
46+
for _, tc := range tests {
47+
t.Run(tc.name, func(t *testing.T) {
48+
d := computev1alpha.WorkloadDeployment{}
49+
d.Status.DesiredReplicas = tc.statusDesired
50+
d.Spec.ScaleSettings.MinReplicas = tc.specMinReplicas
51+
52+
got := resolveDesired(d)
53+
if got != tc.want {
54+
t.Errorf("resolveDesired() = %d, want %d", got, tc.want)
55+
}
56+
})
57+
}
58+
}
59+
60+
func TestComputePhase(t *testing.T) {
61+
tests := []struct {
62+
name string
63+
desired int32
64+
ready int32
65+
current int32
66+
replicas int32
67+
want deploymentPhase
68+
}{
69+
{
70+
name: "zero desired is Done",
71+
desired: 0,
72+
ready: 0,
73+
current: 0,
74+
replicas: 0,
75+
want: phaseDone,
76+
},
77+
{
78+
name: "fresh create with no instances yet is Pending",
79+
desired: 1,
80+
ready: 0,
81+
current: 0,
82+
replicas: 0,
83+
want: phasePending,
84+
},
85+
{
86+
name: "instance scheduled but not ready is Updating",
87+
desired: 1,
88+
ready: 0,
89+
current: 1,
90+
replicas: 1,
91+
want: phaseUpdating,
92+
},
93+
{
94+
name: "single instance ready is Done",
95+
desired: 1,
96+
ready: 1,
97+
current: 1,
98+
replicas: 1,
99+
want: phaseDone,
100+
},
101+
{
102+
// OLD replicas still draining after scale-down must not report Done.
103+
name: "scale-down with old replicas still draining is Updating",
104+
desired: 1,
105+
ready: 1,
106+
current: 1,
107+
replicas: 5,
108+
want: phaseUpdating,
109+
},
110+
{
111+
name: "partial readiness is Updating",
112+
desired: 3,
113+
ready: 1,
114+
current: 2,
115+
replicas: 3,
116+
want: phaseUpdating,
117+
},
118+
{
119+
name: "all replicas ready is Done",
120+
desired: 2,
121+
ready: 2,
122+
current: 2,
123+
replicas: 2,
124+
want: phaseDone,
125+
},
126+
}
127+
128+
for _, tc := range tests {
129+
t.Run(tc.name, func(t *testing.T) {
130+
got := computePhase(tc.desired, tc.ready, tc.current, tc.replicas)
131+
if got != tc.want {
132+
t.Errorf("computePhase(%d, %d, %d, %d) = %q, want %q",
133+
tc.desired, tc.ready, tc.current, tc.replicas, got, tc.want)
134+
}
135+
})
136+
}
137+
}
138+
139+
func TestUpdateDeploymentState(t *testing.T) {
140+
const key = "DFW"
141+
142+
makeDeployment := func() computev1alpha.WorkloadDeployment {
143+
var d computev1alpha.WorkloadDeployment
144+
d.Spec.PlacementName = "default"
145+
d.Spec.CityCode = key
146+
return d
147+
}
148+
149+
t.Run("stalled updating for 40s is promoted to Blocked", func(t *testing.T) {
150+
states := map[string]*deploymentState{}
151+
stalledAt := time.Now().Add(-40 * time.Second)
152+
states[key] = &deploymentState{
153+
phase: phaseUpdating,
154+
ready: 1,
155+
current: 2,
156+
stalledSince: stalledAt,
157+
}
158+
159+
got := updateDeploymentState(states, key, makeDeployment(), true, states[key], 3, 1, 2, phaseUpdating)
160+
161+
if got != phaseBlocked {
162+
t.Errorf("updateDeploymentState() phase = %q, want %q", got, phaseBlocked)
163+
}
164+
if states[key].phase != phaseBlocked {
165+
t.Errorf("states[key].phase = %q, want %q", states[key].phase, phaseBlocked)
166+
}
167+
})
168+
169+
t.Run("first observation of Updating is not yet Blocked", func(t *testing.T) {
170+
states := map[string]*deploymentState{}
171+
172+
got := updateDeploymentState(states, key, makeDeployment(), false, nil, 3, 1, 2, phaseUpdating)
173+
174+
if got != phaseUpdating {
175+
t.Errorf("updateDeploymentState() phase = %q, want %q", got, phaseUpdating)
176+
}
177+
if states[key].phase != phaseUpdating {
178+
t.Errorf("states[key].phase = %q, want %q", states[key].phase, phaseUpdating)
179+
}
180+
})
181+
}

0 commit comments

Comments
 (0)