Skip to content

Commit f93196d

Browse files
authored
Merge branch 'main' into worktree-vmcp-custom-datastorage-4928
2 parents 804df32 + 46a1b1f commit f93196d

9 files changed

Lines changed: 351 additions & 46 deletions

File tree

pkg/vmcp/discovery/middleware.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,17 +135,21 @@ func Middleware(
135135
// or degraded. Backends that are unhealthy, unknown, or unauthenticated are excluded
136136
// from capability aggregation to prevent exposing tools from unavailable backends.
137137
//
138-
// TODO(#4920): Unauthenticated backends are treated as routable for phase determination
139-
// but are excluded here because discovery probes cannot carry user tokens. If health
140-
// probes could authenticate, these backends would be fully healthy and included here.
138+
// A note on BackendUnauthenticated: a 401/403 from a backend that has an outgoing
139+
// auth strategy configured is treated as BackendHealthy by the health checker
140+
// (health probes deliberately do not carry user credentials — the challenge proves
141+
// reachability). BackendUnauthenticated therefore indicates a misconfiguration:
142+
// the backend requires authentication but no outgoing auth strategy is configured
143+
// on the backend target. Excluding such backends from capability aggregation is
144+
// the correct behavior — their capabilities cannot be safely exposed.
141145
//
142146
// Health status filtering:
143147
// - healthy: included (fully operational)
144148
// - degraded: included (slow but working)
145149
// - empty/zero-value: included (assume healthy when health monitoring is disabled)
146150
// - unhealthy: excluded (not responding, circuit breaker may be open)
147151
// - unknown: excluded (status not yet determined)
148-
// - unauthenticated: excluded (authentication failed)
152+
// - unauthenticated: excluded (misconfiguration: backend requires auth but none configured)
149153
//
150154
// When healthStatusProvider is provided, the current health status from the health
151155
// monitor is used (respects circuit breaker state). When nil, falls back to the

pkg/vmcp/health/checker.go

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"time"
1616

1717
"github.com/stacklok/toolhive/pkg/vmcp"
18+
authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
1819
)
1920

2021
// healthChecker implements vmcp.HealthChecker using ListCapabilities as the health check.
@@ -59,7 +60,14 @@ func NewHealthChecker(
5960
// Health determination logic:
6061
// - Success with fast response: Backend is healthy (BackendHealthy)
6162
// - Success with slow response (> degradedThreshold): Backend is degraded (BackendDegraded)
62-
// - Authentication error: Backend is unauthenticated (BackendUnauthenticated)
63+
// - Authentication error (HTTP 401/403) AND backend has an outgoing auth strategy
64+
// configured: Backend is healthy (BackendHealthy). Health probes deliberately do
65+
// not carry user credentials, so the backend's auth challenge proves reachability
66+
// and a working auth layer — that is success for probe purposes.
67+
// - Authentication error AND backend has no outgoing auth strategy configured
68+
// (AuthConfig nil or StrategyTypeUnauthenticated): Backend is unauthenticated
69+
// (BackendUnauthenticated). This signals operator misconfiguration — the backend
70+
// requires authentication but none was configured on the backend target.
6371
// - Timeout or connection error: Backend is unhealthy (BackendUnhealthy)
6472
// - Other errors: Backend is unhealthy (BackendUnhealthy)
6573
//
@@ -92,8 +100,19 @@ func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTar
92100
responseDuration := time.Since(startTime)
93101

94102
if err != nil {
95-
// Categorize the error to determine health status
96-
status := categorizeError(err)
103+
// Categorize the error to determine health status. The target's outgoing
104+
// auth config is consulted: a 401/403 from a backend with an outgoing auth
105+
// strategy is the expected response to a no-credential probe and maps to
106+
// BackendHealthy. In that case we return a nil error so the monitor records
107+
// this as a successful check and does not open the circuit breaker.
108+
status := categorizeError(target, err)
109+
if status == vmcp.BackendHealthy {
110+
slog.Debug("health check received expected auth challenge — treating as healthy",
111+
"backend", target.WorkloadName,
112+
"error", err,
113+
"duration", responseDuration)
114+
return vmcp.BackendHealthy, nil
115+
}
97116
slog.Debug("health check failed for backend",
98117
"backend", target.WorkloadName,
99118
"error", err,
@@ -115,18 +134,24 @@ func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTar
115134
return vmcp.BackendHealthy, nil
116135
}
117136

118-
// categorizeError determines the appropriate health status based on the error type.
137+
// categorizeError determines the appropriate health status based on the error type
138+
// and the backend's outgoing auth configuration.
139+
//
119140
// This uses sentinel error checking with errors.Is() for type-safe error categorization.
120141
// Falls back to string-based detection for backwards compatibility with non-wrapped errors.
121-
func categorizeError(err error) vmcp.BackendHealthStatus {
142+
//
143+
// For auth errors (HTTP 401/403), the target's AuthConfig is consulted to distinguish
144+
// an expected auth challenge (backend has outgoing auth configured) from a misconfiguration
145+
// (backend has no outgoing auth strategy). See authErrorStatus for details.
146+
func categorizeError(target *vmcp.BackendTarget, err error) vmcp.BackendHealthStatus {
122147
if err == nil {
123148
return vmcp.BackendHealthy
124149
}
125150

126151
// 1. Type-safe detection: Check for sentinel errors using errors.Is()
127152
// BackendClient now wraps all errors with appropriate sentinel errors
128153
if errors.Is(err, vmcp.ErrAuthenticationFailed) || errors.Is(err, vmcp.ErrAuthorizationFailed) {
129-
return vmcp.BackendUnauthenticated
154+
return authErrorStatus(target)
130155
}
131156

132157
if errors.Is(err, vmcp.ErrTimeout) || errors.Is(err, vmcp.ErrCancelled) {
@@ -140,7 +165,7 @@ func categorizeError(err error) vmcp.BackendHealthStatus {
140165
// 2. String-based detection: Fallback for backwards compatibility
141166
// This handles errors from sources that don't wrap with sentinel errors
142167
if vmcp.IsAuthenticationError(err) {
143-
return vmcp.BackendUnauthenticated
168+
return authErrorStatus(target)
144169
}
145170

146171
if vmcp.IsTimeoutError(err) || vmcp.IsConnectionError(err) {
@@ -150,3 +175,24 @@ func categorizeError(err error) vmcp.BackendHealthStatus {
150175
// Default to unhealthy for unknown errors
151176
return vmcp.BackendUnhealthy
152177
}
178+
179+
// authErrorStatus maps an authentication error (HTTP 401/403) to a health status
180+
// using the backend's outgoing auth configuration.
181+
//
182+
// Health probes deliberately do not carry user credentials. If the backend is
183+
// configured with an outgoing auth strategy, a 401/403 from the backend proves
184+
// that the backend is alive, the auth layer works, and the network+TLS path is
185+
// healthy — this is the expected response to an unauthenticated probe and is
186+
// therefore treated as BackendHealthy.
187+
//
188+
// If the backend has no outgoing auth strategy configured (AuthConfig nil or
189+
// StrategyTypeUnauthenticated), a 401/403 indicates operator misconfiguration:
190+
// the backend requires authentication but none was configured on the backend
191+
// target. This is reported as BackendUnauthenticated so it surfaces in status.
192+
func authErrorStatus(target *vmcp.BackendTarget) vmcp.BackendHealthStatus {
193+
if target != nil && target.AuthConfig != nil &&
194+
target.AuthConfig.Type != authtypes.StrategyTypeUnauthenticated {
195+
return vmcp.BackendHealthy
196+
}
197+
return vmcp.BackendUnauthenticated
198+
}

0 commit comments

Comments
 (0)