Skip to content

Commit 67cf028

Browse files
authored
feat(admin): /admin/healthz/leader for load-balancer leader probes (#689)
## Summary - Add `/admin/healthz/leader` so an upstream load balancer (Caddy, HAProxy, ...) can route admin traffic only to the current Raft leader, skipping the follower→leader AdminForward proxy hop on writes. - Mirrors the `/healthz/leader` endpoints already shipped on the S3 (`adapter/s3.go:serveS3LeaderHealthz`) and DynamoDB (`adapter/dynamodb.go:serveDynamoLeaderHealthz`) adapters — same semantics, same body shape, same method allowlist — so a multi-protocol load balancer probing any of the three sees identical responses. - New `LeaderProbe` interface in `internal/admin`; `NewRouterWithLeaderProbe` constructor (`NewRouter` delegates with `nil` probe, preserving existing callers); `ServerDeps.LeaderProbe`; `newAdminLeaderProbe` in `main_admin.go` consults the local Raft group runtimes. ## Behaviour matrix | Path | Probe state | Response | |----------------------------|--------------------|----------------------------| | `/admin/healthz` | (n/a) | 200 `ok` | | `/admin/healthz/leader` | wired, leader | 200 `ok` | | `/admin/healthz/leader` | wired, follower | 503 `not leader` | | `/admin/healthz/leader` | not wired | 404 `not_found` (JSON) | | `/admin/healthz/leader` POST | (any) | 405 `method_not_allowed` | ## Self-review (CLAUDE.md 5 lenses) 1. **Data loss** — None. Read-only handler; no Raft proposals. 2. **Concurrency** — `VerifyLeader` is bounded at 2 s, called only on nodes that already claim leadership via `Status()`. No new shared state. 3. **Performance** — Cheap path: followers short-circuit via `Status()` (no Raft round-trip). Leader path: one ReadIndex per probe per group; load balancers typically probe every few seconds, so the overhead is bounded and well below the existing background HLC lease + ReadIndex traffic. 4. **Data consistency** — `VerifyLeader` is the same ReadIndex round-trip the lease-read path uses; a stale-leader follower mid-silent-leadership-change cannot return 200. 5. **Test coverage** — six router-level tests: 200-when-leader, 503-when-follower, HEAD on both branches, POST rejection, nil-probe → 404, classify ordering (path is not swallowed by the SPA fallback). ## Test plan - [x] `go test -race ./internal/admin/...` — passes (1.3s) - [x] `go test -race -run TestStartAdminServer .` — passes - [x] `golangci-lint run ./...` — clean - [ ] CI: full suite under `-race` @claude review
2 parents a3d312f + 689f2e4 commit 67cf028

5 files changed

Lines changed: 303 additions & 19 deletions

File tree

internal/admin/router.go

Lines changed: 114 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,49 @@ import (
2626
// versions (`/admin/api`, `/admin/api/v2`, ...) return a JSON 404
2727
// instead of being silently answered with the SPA HTML.
2828
const (
29-
pathPrefixAdmin = "/admin"
30-
pathAPIRoot = "/admin/api"
31-
pathPrefixAPI = pathAPIRoot + "/"
32-
pathAPIv1Root = "/admin/api/v1"
33-
pathPrefixAPIv1 = pathAPIv1Root + "/"
34-
pathHealthz = "/admin/healthz"
29+
pathPrefixAdmin = "/admin"
30+
pathAPIRoot = "/admin/api"
31+
pathPrefixAPI = pathAPIRoot + "/"
32+
pathAPIv1Root = "/admin/api/v1"
33+
pathPrefixAPIv1 = pathAPIv1Root + "/"
34+
pathHealthz = "/admin/healthz"
35+
// pathHealthzLeader is a sibling of pathHealthz for load balancers
36+
// that want to route admin traffic only to the current Raft leader
37+
// (so admin writes do not bounce through the AdminForward proxy on
38+
// followers). Mirrors the /healthz/leader endpoints already shipped
39+
// on the S3 and DynamoDB adapters: returns 200 only when this node
40+
// is the verified leader of the default Raft group; 503 otherwise.
41+
// classify() routes this path before the SPA catch-all so it cannot
42+
// be silently answered with index.html.
43+
pathHealthzLeader = "/admin/healthz/leader"
3544
pathAssetsRoot = "/admin/assets"
3645
pathPrefixAssets = pathAssetsRoot + "/"
3746
pathRootAssetsDir = "assets"
3847
pathIndexHTML = "index.html"
3948
)
4049

50+
// LeaderProbe is the cheap healthz contract used by /admin/healthz/leader.
51+
// Implementations should return true only when this node is the verified
52+
// Raft leader — a stale-leader follower returning true during a silent
53+
// leadership change defeats the load balancer's purpose. Production wires
54+
// this to the default group's coordinator IsLeader + VerifyLeader pair;
55+
// tests use a stub returning a fixed bool.
56+
//
57+
// A nil LeaderProbe makes /admin/healthz/leader unavailable — the router
58+
// answers it with the standard JSON 404, distinguishing "not enabled" from
59+
// the operational "503 not leader" state. Mirrors the S3/DynamoDB
60+
// /healthz/leader contract.
61+
type LeaderProbe interface {
62+
IsVerifiedLeader() bool
63+
}
64+
65+
// LeaderProbeFunc is a convenience adapter for wiring a plain function
66+
// without defining an interface implementation. Mirrors ClusterInfoFunc.
67+
type LeaderProbeFunc func() bool
68+
69+
// IsVerifiedLeader implements LeaderProbe.
70+
func (f LeaderProbeFunc) IsVerifiedLeader() bool { return f() }
71+
4172
// APIHandler is the bridge between the router and all JSON API endpoints.
4273
// Everything under /admin/api/v1/ resolves through it; individual endpoint
4374
// routing is the handler's responsibility (see apiMux below).
@@ -51,6 +82,7 @@ type APIHandler http.Handler
5182
type Router struct {
5283
api http.Handler
5384
static fs.FS
85+
leader LeaderProbe
5486
notFound http.Handler
5587
}
5688

@@ -62,10 +94,24 @@ type Router struct {
6294
// SPA catch-all (which always serves index.html). A nil static FS
6395
// causes 404s for asset and SPA routes, which is the expected state
6496
// while the SPA has not been built yet.
97+
//
98+
// /admin/healthz/leader is wired via NewRouterWithLeaderProbe; this
99+
// constructor leaves it unrouted (404) for callers that do not need the
100+
// leader healthz endpoint.
65101
func NewRouter(api http.Handler, static fs.FS) *Router {
102+
return NewRouterWithLeaderProbe(api, static, nil)
103+
}
104+
105+
// NewRouterWithLeaderProbe is the long-form constructor used by
106+
// production wiring (see ServerDeps.LeaderProbe). The probe drives
107+
// /admin/healthz/leader: 200 when probe.IsVerifiedLeader() is true, 503
108+
// otherwise. A nil probe behaves identically to NewRouter — the path
109+
// returns the standard JSON 404.
110+
func NewRouterWithLeaderProbe(api http.Handler, static fs.FS, leader LeaderProbe) *Router {
66111
return &Router{
67112
api: api,
68113
static: static,
114+
leader: leader,
69115
notFound: http.HandlerFunc(writeJSONNotFound),
70116
}
71117
}
@@ -90,6 +136,7 @@ const (
90136
routeAPIv1 routeKind = iota
91137
routeAPIOther
92138
routeHealthz
139+
routeHealthzLeader
93140
routeAssetsRoot
94141
routeAsset
95142
routeSPA
@@ -103,6 +150,14 @@ func (rt *Router) classify(p string) routeKind {
103150
if k, ok := classifyAssets(p); ok {
104151
return k
105152
}
153+
// Both /admin/healthz and /admin/healthz/leader are exact-match
154+
// equality checks — relative ordering between them does not affect
155+
// correctness, but BOTH must run before the SPA prefix branch
156+
// below; otherwise the catch-all "anything under /admin/" resolves
157+
// these paths to index.html.
158+
if p == pathHealthzLeader {
159+
return routeHealthzLeader
160+
}
106161
if p == pathHealthz {
107162
return routeHealthz
108163
}
@@ -146,6 +201,11 @@ func (rt *Router) dispatch(k routeKind) http.Handler {
146201
return rt.api
147202
case routeHealthz:
148203
return http.HandlerFunc(rt.serveHealth)
204+
case routeHealthzLeader:
205+
if rt.leader == nil {
206+
return rt.notFound
207+
}
208+
return http.HandlerFunc(rt.serveLeaderHealth)
149209
case routeAsset:
150210
return http.HandlerFunc(rt.serveAsset)
151211
case routeSPA:
@@ -156,9 +216,27 @@ func (rt *Router) dispatch(k routeKind) http.Handler {
156216
return rt.notFound
157217
}
158218

219+
// allowGetHead is the canonical Allow value for read-only handlers
220+
// (healthz / SPA / static assets). RFC 7231 §6.5.5 requires every 405
221+
// to advertise the supported methods; load balancers and synthetic-
222+
// monitor tools key off this header to discover the right verbs
223+
// without scraping the body. Mirrors the value the S3 and DynamoDB
224+
// /healthz/leader handlers already set
225+
// (adapter/s3.go:2404, adapter/dynamodb.go:399).
226+
const allowGetHead = "GET, HEAD"
227+
228+
// writeMethodNotAllowed centralises the read-only 405 response shape
229+
// for router-served paths. The Allow header is set BEFORE
230+
// writeJSONError because writeJSONError calls w.WriteHeader, after
231+
// which header mutations would silently no-op.
232+
func writeMethodNotAllowed(w http.ResponseWriter) {
233+
w.Header().Set("Allow", allowGetHead)
234+
writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported")
235+
}
236+
159237
func (rt *Router) serveHealth(w http.ResponseWriter, r *http.Request) {
160238
if r.Method != http.MethodGet && r.Method != http.MethodHead {
161-
writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported")
239+
writeMethodNotAllowed(w)
162240
return
163241
}
164242
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
@@ -169,6 +247,33 @@ func (rt *Router) serveHealth(w http.ResponseWriter, r *http.Request) {
169247
}
170248
}
171249

250+
// serveLeaderHealth answers /admin/healthz/leader. 200 + "ok" only when
251+
// the LeaderProbe reports a verified leader; 503 + "not leader"
252+
// otherwise. The body shape and method-allowlist mirror the S3 / Dynamo
253+
// /healthz/leader endpoints (adapter/s3.go:serveS3LeaderHealthz,
254+
// adapter/dynamodb.go:serveDynamoLeaderHealthz) so an upstream load
255+
// balancer that probes any of the three sees identical semantics.
256+
//
257+
// Precondition: rt.leader is non-nil. dispatch() short-circuits the
258+
// nil case to rt.notFound before this handler is ever invoked, so a
259+
// belt-and-braces nil check inside this body would be dead code.
260+
func (rt *Router) serveLeaderHealth(w http.ResponseWriter, r *http.Request) {
261+
if r.Method != http.MethodGet && r.Method != http.MethodHead {
262+
writeMethodNotAllowed(w)
263+
return
264+
}
265+
status, body := http.StatusOK, "ok\n"
266+
if !rt.leader.IsVerifiedLeader() {
267+
status, body = http.StatusServiceUnavailable, "not leader\n"
268+
}
269+
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
270+
w.Header().Set("Cache-Control", "no-store")
271+
w.WriteHeader(status)
272+
if r.Method == http.MethodGet {
273+
_, _ = w.Write([]byte(body))
274+
}
275+
}
276+
172277
func (rt *Router) serveAsset(w http.ResponseWriter, r *http.Request) {
173278
// Static assets are read-only resources; rejecting non-GET/HEAD
174279
// here keeps the response uniform with /admin/healthz and the
@@ -177,7 +282,7 @@ func (rt *Router) serveAsset(w http.ResponseWriter, r *http.Request) {
177282
// the file body for a write request) or surface as a confusing
178283
// 404 — neither matches the API contract for assets.
179284
if r.Method != http.MethodGet && r.Method != http.MethodHead {
180-
writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported")
285+
writeMethodNotAllowed(w)
181286
return
182287
}
183288
if rt.static == nil {
@@ -254,7 +359,7 @@ func (rt *Router) serveSPA(w http.ResponseWriter, r *http.Request) {
254359
// /admin/something returned a JSON 404 with a nil static and a
255360
// JSON 405 with a populated static — same URL, different answer.
256361
if r.Method != http.MethodGet && r.Method != http.MethodHead {
257-
writeJSONError(w, http.StatusMethodNotAllowed, "method_not_allowed", "only GET or HEAD supported")
362+
writeMethodNotAllowed(w)
258363
return
259364
}
260365
if rt.static == nil {

internal/admin/router_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,135 @@ func TestRouter_HealthzRejectsPost(t *testing.T) {
6565
require.Equal(t, http.StatusMethodNotAllowed, rec.Code)
6666
}
6767

68+
// TestRouter_HealthzLeader_ReturnsOKWhenLeader locks down the
69+
// happy path on /admin/healthz/leader: a verified leader probe
70+
// produces 200 + "ok\n" + text/plain. Mirrors the S3 / Dynamo
71+
// /healthz/leader contract so a multi-protocol load balancer
72+
// sees identical semantics.
73+
func TestRouter_HealthzLeader_ReturnsOKWhenLeader(t *testing.T) {
74+
r := NewRouterWithLeaderProbe(nil, nil, LeaderProbeFunc(func() bool { return true }))
75+
req := httptest.NewRequest(http.MethodGet, "/admin/healthz/leader", nil)
76+
rec := httptest.NewRecorder()
77+
r.ServeHTTP(rec, req)
78+
79+
require.Equal(t, http.StatusOK, rec.Code)
80+
require.Contains(t, rec.Header().Get("Content-Type"), "text/plain")
81+
require.Equal(t, "ok\n", rec.Body.String())
82+
require.Equal(t, "no-store", rec.Header().Get("Cache-Control"))
83+
}
84+
85+
// TestRouter_HealthzLeader_Returns503WhenNotLeader locks down the
86+
// non-leader contract: 503 Service Unavailable + "not leader\n".
87+
// A load balancer probing this endpoint takes the node out of
88+
// rotation when it loses leadership; the body string is informative
89+
// for operators reading curl output.
90+
func TestRouter_HealthzLeader_Returns503WhenNotLeader(t *testing.T) {
91+
r := NewRouterWithLeaderProbe(nil, nil, LeaderProbeFunc(func() bool { return false }))
92+
req := httptest.NewRequest(http.MethodGet, "/admin/healthz/leader", nil)
93+
rec := httptest.NewRecorder()
94+
r.ServeHTTP(rec, req)
95+
96+
require.Equal(t, http.StatusServiceUnavailable, rec.Code)
97+
require.Contains(t, rec.Header().Get("Content-Type"), "text/plain")
98+
require.Equal(t, "not leader\n", rec.Body.String())
99+
}
100+
101+
// TestRouter_HealthzLeader_HeadOmitsBody mirrors the existing
102+
// healthz HEAD test. The status code must still indicate the
103+
// leader state; only the body is suppressed.
104+
func TestRouter_HealthzLeader_HeadOmitsBody(t *testing.T) {
105+
rLeader := NewRouterWithLeaderProbe(nil, nil, LeaderProbeFunc(func() bool { return true }))
106+
req := httptest.NewRequest(http.MethodHead, "/admin/healthz/leader", nil)
107+
rec := httptest.NewRecorder()
108+
rLeader.ServeHTTP(rec, req)
109+
require.Equal(t, http.StatusOK, rec.Code)
110+
require.Equal(t, "", rec.Body.String())
111+
112+
rFollower := NewRouterWithLeaderProbe(nil, nil, LeaderProbeFunc(func() bool { return false }))
113+
req = httptest.NewRequest(http.MethodHead, "/admin/healthz/leader", nil)
114+
rec = httptest.NewRecorder()
115+
rFollower.ServeHTTP(rec, req)
116+
require.Equal(t, http.StatusServiceUnavailable, rec.Code)
117+
require.Equal(t, "", rec.Body.String())
118+
}
119+
120+
// TestRouter_HealthzLeader_RejectsPost guards the method allowlist:
121+
// only GET / HEAD are accepted. Mirrors TestRouter_HealthzRejectsPost.
122+
// Also asserts the Allow: GET, HEAD header required by RFC 7231
123+
// §6.5.5 — load balancers and synthetic-monitor tools key off this
124+
// header to discover supported verbs.
125+
func TestRouter_HealthzLeader_RejectsPost(t *testing.T) {
126+
r := NewRouterWithLeaderProbe(nil, nil, LeaderProbeFunc(func() bool { return true }))
127+
req := httptest.NewRequest(http.MethodPost, "/admin/healthz/leader", strings.NewReader(""))
128+
rec := httptest.NewRecorder()
129+
r.ServeHTTP(rec, req)
130+
require.Equal(t, http.StatusMethodNotAllowed, rec.Code)
131+
require.Equal(t, "GET, HEAD", rec.Header().Get("Allow"))
132+
}
133+
134+
// TestRouter_405_AllowHeader sweeps every router-served path that
135+
// rejects non-GET/HEAD verbs and asserts each emits the
136+
// RFC-required Allow header. Locks down the writeMethodNotAllowed
137+
// invariant — a future handler that bypasses the helper would
138+
// regress this test rather than silently shipping a non-compliant
139+
// 405.
140+
func TestRouter_405_AllowHeader(t *testing.T) {
141+
t.Parallel()
142+
cases := []struct {
143+
name string
144+
path string
145+
}{
146+
{"healthz", "/admin/healthz"},
147+
{"healthz_leader", "/admin/healthz/leader"},
148+
{"asset", "/admin/assets/app.js"},
149+
{"spa", "/admin/somewhere"},
150+
}
151+
r := NewRouterWithLeaderProbe(nil, newTestStatic(), LeaderProbeFunc(func() bool { return true }))
152+
for _, tc := range cases {
153+
t.Run(tc.name, func(t *testing.T) {
154+
t.Parallel()
155+
req := httptest.NewRequest(http.MethodPost, tc.path, strings.NewReader(""))
156+
rec := httptest.NewRecorder()
157+
r.ServeHTTP(rec, req)
158+
require.Equal(t, http.StatusMethodNotAllowed, rec.Code)
159+
require.Equal(t, "GET, HEAD", rec.Header().Get("Allow"),
160+
"path %s missing RFC 7231 Allow header on 405", tc.path)
161+
})
162+
}
163+
}
164+
165+
// TestRouter_HealthzLeader_NilProbeReturns404 locks down the
166+
// "feature off" pattern: when the LeaderProbe is not wired, the
167+
// router answers /admin/healthz/leader with the standard JSON 404
168+
// — distinct from the operational 503 (which means "wired, not
169+
// leader"). Single-node dev runs and admin builds without runtime
170+
// access fall here.
171+
func TestRouter_HealthzLeader_NilProbeReturns404(t *testing.T) {
172+
r := NewRouter(nil, newTestStatic()) // NewRouter passes nil probe to NewRouterWithLeaderProbe
173+
req := httptest.NewRequest(http.MethodGet, "/admin/healthz/leader", nil)
174+
rec := httptest.NewRecorder()
175+
r.ServeHTTP(rec, req)
176+
require.Equal(t, http.StatusNotFound, rec.Code)
177+
require.Contains(t, rec.Header().Get("Content-Type"), "application/json")
178+
require.Contains(t, rec.Body.String(), "not_found")
179+
}
180+
181+
// TestRouter_HealthzLeader_NotSwallowedBySPA locks down the
182+
// classify ordering: /admin/healthz/leader must reach the leader
183+
// handler (200/503) — not the SPA fallback that would otherwise
184+
// answer with index.html. A regression here would cause a load
185+
// balancer probing the path to see HTML 200 forever and never
186+
// detect a leadership change.
187+
func TestRouter_HealthzLeader_NotSwallowedBySPA(t *testing.T) {
188+
probe := LeaderProbeFunc(func() bool { return false })
189+
r := NewRouterWithLeaderProbe(nil, newTestStatic(), probe)
190+
req := httptest.NewRequest(http.MethodGet, "/admin/healthz/leader", nil)
191+
rec := httptest.NewRecorder()
192+
r.ServeHTTP(rec, req)
193+
require.Equal(t, http.StatusServiceUnavailable, rec.Code)
194+
require.NotContains(t, rec.Body.String(), "<html")
195+
}
196+
68197
func TestRouter_StaticAssetServed(t *testing.T) {
69198
r := NewRouter(nil, newTestStatic())
70199
req := httptest.NewRequest(http.MethodGet, "/admin/assets/app.js", nil)

internal/admin/server.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,15 @@ type ServerDeps struct {
8080
// /admin/assets/* and the SPA fallback in that case.
8181
StaticFS fs.FS
8282

83+
// LeaderProbe drives /admin/healthz/leader: 200 when probe.
84+
// IsVerifiedLeader() is true, 503 otherwise. A nil probe keeps the
85+
// path unrouted (returns the standard JSON 404), matching the
86+
// "feature off" pattern Tables / Buckets / Queues already use.
87+
// Production wires this to the default group's IsLeader +
88+
// VerifyLeader pair so a stale-leader follower in the middle of a
89+
// silent leadership change cannot return 200.
90+
LeaderProbe LeaderProbe
91+
8392
// AuthOpts configures cookie attributes and rate limiting. Zero
8493
// values pick production-appropriate defaults.
8594
AuthOpts AuthServiceOpts
@@ -129,7 +138,7 @@ func NewServer(deps ServerDeps) (*Server, error) {
129138
keyviz := NewKeyVizHandler(deps.KeyViz).WithLogger(logger).WithFanout(deps.KeyVizFanout)
130139
sqs := buildSqsHandlerForDeps(deps, logger)
131140
mux := buildAPIMux(auth, deps.Verifier, cluster, dynamo, s3, keyviz, sqs, logger)
132-
router := NewRouter(mux, deps.StaticFS)
141+
router := NewRouterWithLeaderProbe(mux, deps.StaticFS, deps.LeaderProbe)
133142
return &Server{deps: deps, router: router, auth: auth, mux: mux}, nil
134143
}
135144

0 commit comments

Comments
 (0)