Skip to content

Commit c51ec3e

Browse files
committed
fix(services): HealthChecker.Register immediately populates cache when running
When Register() is called on an already-started HealthChecker, the new service's health state now immediately reflects in IsReady() / IsHealthy() without waiting for the next 15-second polling tick. Root cause: the job spawner starts a service and then registers it with the HealthChecker (spawner.go). Before this fix, the service would appear absent from health reports for up to 15 seconds even though Ready() already returned nil, because the HealthChecker cache was only refreshed by a background ticker. The servicesMu lock is released before calling IfStarted to avoid a deadlock with Start(), which holds the StateMachine lock while calling update() (which acquires servicesMu.RLock).
1 parent 8115835 commit c51ec3e

2 files changed

Lines changed: 94 additions & 7 deletions

File tree

pkg/services/health.go

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -218,14 +218,39 @@ func (c *HealthChecker) Register(service HealthReporter) error {
218218
return fmt.Errorf("misconfigured check %#v for %T", name, service)
219219
}
220220

221-
c.servicesMu.Lock()
222-
defer c.servicesMu.Unlock()
223-
if testing.Testing() {
224-
if orig, ok := c.services[name]; ok {
225-
panic(fmt.Errorf("duplicate name %q: service names must be unique: types %T & %T", name, service, orig))
221+
func() {
222+
c.servicesMu.Lock()
223+
defer c.servicesMu.Unlock()
224+
if testing.Testing() {
225+
if orig, ok := c.services[name]; ok {
226+
panic(fmt.Errorf("duplicate name %q: service names must be unique: types %T & %T", name, service, orig))
227+
}
226228
}
227-
}
228-
c.services[name] = service
229+
c.services[name] = service
230+
}()
231+
232+
// If already started, immediately populate the health state for this
233+
// service so callers don't wait up to <interval> for the next tick.
234+
// servicesMu must be released before calling IfStarted to avoid a
235+
// deadlock with Start(), which holds the StateMachine lock while
236+
// calling update() (which acquires servicesMu.RLock).
237+
c.IfStarted(func() {
238+
ctx := context.Background()
239+
ready := service.Ready()
240+
241+
c.stateMu.Lock()
242+
defer c.stateMu.Unlock()
243+
c.ready[name] = ready
244+
for n, err := range service.HealthReport() {
245+
c.healthy[n] = err
246+
value := 0
247+
if err == nil {
248+
value = 1
249+
}
250+
c.cfg.SetStatus(ctx, name, value)
251+
}
252+
})
253+
229254
return nil
230255
}
231256

pkg/services/health_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,65 @@ func TestContainsError(t *testing.T) {
5656
})
5757
}
5858
}
59+
60+
// testHealthReporter is a minimal HealthReporter for use in tests.
61+
type testHealthReporter struct {
62+
name string
63+
ready error
64+
healthy error
65+
}
66+
67+
func (r *testHealthReporter) Name() string { return r.name }
68+
func (r *testHealthReporter) Ready() error { return r.ready }
69+
func (r *testHealthReporter) HealthReport() map[string]error {
70+
return map[string]error{r.name: r.healthy}
71+
}
72+
73+
// TestHealthChecker_Register_ImmediateUpdate verifies that Register() immediately
74+
// populates the health state when the HealthChecker is already running, so
75+
// callers don't wait up to 15 s for the next polling tick.
76+
func TestHealthChecker_Register_ImmediateUpdate(t *testing.T) {
77+
hc := HealthCheckerConfig{Ver: "test", Sha: "abc1234"}.New()
78+
if err := hc.Start(); err != nil {
79+
t.Fatalf("Start: %v", err)
80+
}
81+
t.Cleanup(func() { _ = hc.Close() })
82+
83+
svc := &testHealthReporter{name: "svc", ready: nil, healthy: nil}
84+
if err := hc.Register(svc); err != nil {
85+
t.Fatalf("Register: %v", err)
86+
}
87+
88+
// No sleep needed — Register() must have synchronously populated the state.
89+
ready, readyErrs := hc.IsReady()
90+
assert.True(t, ready, "expected ready after Register, got errors: %v", readyErrs)
91+
assert.Nil(t, readyErrs["svc"], "expected svc ready, got: %v", readyErrs["svc"])
92+
93+
healthy, healthyErrs := hc.IsHealthy()
94+
assert.True(t, healthy, "expected healthy after Register, got errors: %v", healthyErrs)
95+
assert.Nil(t, healthyErrs["svc"], "expected svc healthy, got: %v", healthyErrs["svc"])
96+
}
97+
98+
// TestHealthChecker_Register_ImmediateUpdate_Unhealthy verifies that an unhealthy
99+
// service registered on a running HealthChecker is immediately visible.
100+
func TestHealthChecker_Register_ImmediateUpdate_Unhealthy(t *testing.T) {
101+
hc := HealthCheckerConfig{Ver: "test", Sha: "abc1234"}.New()
102+
if err := hc.Start(); err != nil {
103+
t.Fatalf("Start: %v", err)
104+
}
105+
t.Cleanup(func() { _ = hc.Close() })
106+
107+
notReady := errors.New("service is starting")
108+
svc := &testHealthReporter{name: "svc2", ready: notReady, healthy: notReady}
109+
if err := hc.Register(svc); err != nil {
110+
t.Fatalf("Register: %v", err)
111+
}
112+
113+
ready, readyErrs := hc.IsReady()
114+
assert.False(t, ready, "expected not ready")
115+
assert.Equal(t, notReady, readyErrs["svc2"])
116+
117+
healthy, healthyErrs := hc.IsHealthy()
118+
assert.False(t, healthy, "expected not healthy")
119+
assert.Equal(t, notReady, healthyErrs["svc2"])
120+
}

0 commit comments

Comments
 (0)