Skip to content

Commit 46a1b1f

Browse files
JAORMXclaude
andauthored
Treat 401/403 from auth-configured backends as healthy (#4935)
Health probes run as background system checks with no user context, so they cannot carry the per-user upstream tokens that upstream_inject backends require. For OAuth-protected backends, a 401/403 challenge is the expected response to an unauthenticated probe and proves the backend is reachable, its auth layer works, and the network path is sound. Map auth errors to BackendHealthy when the backend has an outgoing auth strategy configured; keep BackendUnauthenticated only as a narrower misconfiguration signal (backend demands auth, none configured). This unblocks discovery from excluding OAuth-protected backends from capability aggregation (resolves the TODO at discovery/middleware.go). Drop the `|| BackendUnauthenticated` clause from /status since the state now represents misconfig rather than routability. Legacy PR #4866 routability semantics in monitor.go are left in place with a TODO for a follow-up revert; they touch operator controllers and warrant a separate PR. Closes #4920 Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b11bcf2 commit 46a1b1f

6 files changed

Lines changed: 289 additions & 42 deletions

File tree

pkg/vmcp/discovery/middleware.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,17 +135,21 @@ func Middleware(
135135
// or degraded. Backends that are unhealthy, unknown, or unauthenticated are excluded
136136
// from capability aggregation to prevent exposing tools from unavailable backends.
137137
//
138-
// TODO(#4920): Unauthenticated backends are treated as routable for phase determination
139-
// but are excluded here because discovery probes cannot carry user tokens. If health
140-
// probes could authenticate, these backends would be fully healthy and included here.
138+
// A note on BackendUnauthenticated: a 401/403 from a backend that has an outgoing
139+
// auth strategy configured is treated as BackendHealthy by the health checker
140+
// (health probes deliberately do not carry user credentials — the challenge proves
141+
// reachability). BackendUnauthenticated therefore indicates a misconfiguration:
142+
// the backend requires authentication but no outgoing auth strategy is configured
143+
// on the backend target. Excluding such backends from capability aggregation is
144+
// the correct behavior — their capabilities cannot be safely exposed.
141145
//
142146
// Health status filtering:
143147
// - healthy: included (fully operational)
144148
// - degraded: included (slow but working)
145149
// - empty/zero-value: included (assume healthy when health monitoring is disabled)
146150
// - unhealthy: excluded (not responding, circuit breaker may be open)
147151
// - unknown: excluded (status not yet determined)
148-
// - unauthenticated: excluded (authentication failed)
152+
// - unauthenticated: excluded (misconfiguration: backend requires auth but none configured)
149153
//
150154
// When healthStatusProvider is provided, the current health status from the health
151155
// monitor is used (respects circuit breaker state). When nil, falls back to the

pkg/vmcp/health/checker.go

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"time"
1616

1717
"github.com/stacklok/toolhive/pkg/vmcp"
18+
authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
1819
)
1920

2021
// healthChecker implements vmcp.HealthChecker using ListCapabilities as the health check.
@@ -59,7 +60,14 @@ func NewHealthChecker(
5960
// Health determination logic:
6061
// - Success with fast response: Backend is healthy (BackendHealthy)
6162
// - Success with slow response (> degradedThreshold): Backend is degraded (BackendDegraded)
62-
// - Authentication error: Backend is unauthenticated (BackendUnauthenticated)
63+
// - Authentication error (HTTP 401/403) AND backend has an outgoing auth strategy
64+
// configured: Backend is healthy (BackendHealthy). Health probes deliberately do
65+
// not carry user credentials, so the backend's auth challenge proves reachability
66+
// and a working auth layer — that is success for probe purposes.
67+
// - Authentication error AND backend has no outgoing auth strategy configured
68+
// (AuthConfig nil or StrategyTypeUnauthenticated): Backend is unauthenticated
69+
// (BackendUnauthenticated). This signals operator misconfiguration — the backend
70+
// requires authentication but none was configured on the backend target.
6371
// - Timeout or connection error: Backend is unhealthy (BackendUnhealthy)
6472
// - Other errors: Backend is unhealthy (BackendUnhealthy)
6573
//
@@ -92,8 +100,19 @@ func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTar
92100
responseDuration := time.Since(startTime)
93101

94102
if err != nil {
95-
// Categorize the error to determine health status
96-
status := categorizeError(err)
103+
// Categorize the error to determine health status. The target's outgoing
104+
// auth config is consulted: a 401/403 from a backend with an outgoing auth
105+
// strategy is the expected response to a no-credential probe and maps to
106+
// BackendHealthy. In that case we return a nil error so the monitor records
107+
// this as a successful check and does not open the circuit breaker.
108+
status := categorizeError(target, err)
109+
if status == vmcp.BackendHealthy {
110+
slog.Debug("health check received expected auth challenge — treating as healthy",
111+
"backend", target.WorkloadName,
112+
"error", err,
113+
"duration", responseDuration)
114+
return vmcp.BackendHealthy, nil
115+
}
97116
slog.Debug("health check failed for backend",
98117
"backend", target.WorkloadName,
99118
"error", err,
@@ -115,18 +134,24 @@ func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTar
115134
return vmcp.BackendHealthy, nil
116135
}
117136

118-
// categorizeError determines the appropriate health status based on the error type.
137+
// categorizeError determines the appropriate health status based on the error type
138+
// and the backend's outgoing auth configuration.
139+
//
119140
// This uses sentinel error checking with errors.Is() for type-safe error categorization.
120141
// Falls back to string-based detection for backwards compatibility with non-wrapped errors.
121-
func categorizeError(err error) vmcp.BackendHealthStatus {
142+
//
143+
// For auth errors (HTTP 401/403), the target's AuthConfig is consulted to distinguish
144+
// an expected auth challenge (backend has outgoing auth configured) from a misconfiguration
145+
// (backend has no outgoing auth strategy). See authErrorStatus for details.
146+
func categorizeError(target *vmcp.BackendTarget, err error) vmcp.BackendHealthStatus {
122147
if err == nil {
123148
return vmcp.BackendHealthy
124149
}
125150

126151
// 1. Type-safe detection: Check for sentinel errors using errors.Is()
127152
// BackendClient now wraps all errors with appropriate sentinel errors
128153
if errors.Is(err, vmcp.ErrAuthenticationFailed) || errors.Is(err, vmcp.ErrAuthorizationFailed) {
129-
return vmcp.BackendUnauthenticated
154+
return authErrorStatus(target)
130155
}
131156

132157
if errors.Is(err, vmcp.ErrTimeout) || errors.Is(err, vmcp.ErrCancelled) {
@@ -140,7 +165,7 @@ func categorizeError(err error) vmcp.BackendHealthStatus {
140165
// 2. String-based detection: Fallback for backwards compatibility
141166
// This handles errors from sources that don't wrap with sentinel errors
142167
if vmcp.IsAuthenticationError(err) {
143-
return vmcp.BackendUnauthenticated
168+
return authErrorStatus(target)
144169
}
145170

146171
if vmcp.IsTimeoutError(err) || vmcp.IsConnectionError(err) {
@@ -150,3 +175,24 @@ func categorizeError(err error) vmcp.BackendHealthStatus {
150175
// Default to unhealthy for unknown errors
151176
return vmcp.BackendUnhealthy
152177
}
178+
179+
// authErrorStatus maps an authentication error (HTTP 401/403) to a health status
180+
// using the backend's outgoing auth configuration.
181+
//
182+
// Health probes deliberately do not carry user credentials. If the backend is
183+
// configured with an outgoing auth strategy, a 401/403 from the backend proves
184+
// that the backend is alive, the auth layer works, and the network+TLS path is
185+
// healthy — this is the expected response to an unauthenticated probe and is
186+
// therefore treated as BackendHealthy.
187+
//
188+
// If the backend has no outgoing auth strategy configured (AuthConfig nil or
189+
// StrategyTypeUnauthenticated), a 401/403 indicates operator misconfiguration:
190+
// the backend requires authentication but none was configured on the backend
191+
// target. This is reported as BackendUnauthenticated so it surfaces in status.
192+
func authErrorStatus(target *vmcp.BackendTarget) vmcp.BackendHealthStatus {
193+
if target != nil && target.AuthConfig != nil &&
194+
target.AuthConfig.Type != authtypes.StrategyTypeUnauthenticated {
195+
return vmcp.BackendHealthy
196+
}
197+
return vmcp.BackendUnauthenticated
198+
}

0 commit comments

Comments
 (0)