@@ -15,6 +15,7 @@ import (
1515 "time"
1616
1717 "github.com/stacklok/toolhive/pkg/vmcp"
18+ authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
1819)
1920
2021// healthChecker implements vmcp.HealthChecker using ListCapabilities as the health check.
@@ -59,7 +60,14 @@ func NewHealthChecker(
5960// Health determination logic:
6061// - Success with fast response: Backend is healthy (BackendHealthy)
6162// - Success with slow response (> degradedThreshold): Backend is degraded (BackendDegraded)
62- // - Authentication error: Backend is unauthenticated (BackendUnauthenticated)
63+ // - Authentication error (HTTP 401/403) AND backend has an outgoing auth strategy
64+ // configured: Backend is healthy (BackendHealthy). Health probes deliberately do
65+ // not carry user credentials, so the backend's auth challenge proves reachability
66+ // and a working auth layer — that is success for probe purposes.
67+ // - Authentication error AND backend has no outgoing auth strategy configured
68+ // (AuthConfig nil or StrategyTypeUnauthenticated): Backend is unauthenticated
69+ // (BackendUnauthenticated). This signals operator misconfiguration — the backend
70+ // requires authentication but none was configured on the backend target.
6371// - Timeout or connection error: Backend is unhealthy (BackendUnhealthy)
6472// - Other errors: Backend is unhealthy (BackendUnhealthy)
6573//
@@ -92,8 +100,19 @@ func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTar
92100 responseDuration := time .Since (startTime )
93101
94102 if err != nil {
95- // Categorize the error to determine health status
96- status := categorizeError (err )
103+ // Categorize the error to determine health status. The target's outgoing
104+ // auth config is consulted: a 401/403 from a backend with an outgoing auth
105+ // strategy is the expected response to a no-credential probe and maps to
106+ // BackendHealthy. In that case we return a nil error so the monitor records
107+ // this as a successful check and does not open the circuit breaker.
108+ status := categorizeError (target , err )
109+ if status == vmcp .BackendHealthy {
110+ slog .Debug ("health check received expected auth challenge — treating as healthy" ,
111+ "backend" , target .WorkloadName ,
112+ "error" , err ,
113+ "duration" , responseDuration )
114+ return vmcp .BackendHealthy , nil
115+ }
97116 slog .Debug ("health check failed for backend" ,
98117 "backend" , target .WorkloadName ,
99118 "error" , err ,
@@ -115,18 +134,24 @@ func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTar
115134 return vmcp .BackendHealthy , nil
116135}
117136
118- // categorizeError determines the appropriate health status based on the error type.
137+ // categorizeError determines the appropriate health status based on the error type
138+ // and the backend's outgoing auth configuration.
139+ //
119140// This uses sentinel error checking with errors.Is() for type-safe error categorization.
120141// Falls back to string-based detection for backwards compatibility with non-wrapped errors.
121- func categorizeError (err error ) vmcp.BackendHealthStatus {
142+ //
143+ // For auth errors (HTTP 401/403), the target's AuthConfig is consulted to distinguish
144+ // an expected auth challenge (backend has outgoing auth configured) from a misconfiguration
145+ // (backend has no outgoing auth strategy). See authErrorStatus for details.
146+ func categorizeError (target * vmcp.BackendTarget , err error ) vmcp.BackendHealthStatus {
122147 if err == nil {
123148 return vmcp .BackendHealthy
124149 }
125150
126151 // 1. Type-safe detection: Check for sentinel errors using errors.Is()
127152 // BackendClient now wraps all errors with appropriate sentinel errors
128153 if errors .Is (err , vmcp .ErrAuthenticationFailed ) || errors .Is (err , vmcp .ErrAuthorizationFailed ) {
129- return vmcp . BackendUnauthenticated
154+ return authErrorStatus ( target )
130155 }
131156
132157 if errors .Is (err , vmcp .ErrTimeout ) || errors .Is (err , vmcp .ErrCancelled ) {
@@ -140,7 +165,7 @@ func categorizeError(err error) vmcp.BackendHealthStatus {
140165 // 2. String-based detection: Fallback for backwards compatibility
141166 // This handles errors from sources that don't wrap with sentinel errors
142167 if vmcp .IsAuthenticationError (err ) {
143- return vmcp . BackendUnauthenticated
168+ return authErrorStatus ( target )
144169 }
145170
146171 if vmcp .IsTimeoutError (err ) || vmcp .IsConnectionError (err ) {
@@ -150,3 +175,24 @@ func categorizeError(err error) vmcp.BackendHealthStatus {
150175 // Default to unhealthy for unknown errors
151176 return vmcp .BackendUnhealthy
152177}
178+
179+ // authErrorStatus maps an authentication error (HTTP 401/403) to a health status
180+ // using the backend's outgoing auth configuration.
181+ //
182+ // Health probes deliberately do not carry user credentials. If the backend is
183+ // configured with an outgoing auth strategy, a 401/403 from the backend proves
184+ // that the backend is alive, the auth layer works, and the network+TLS path is
185+ // healthy — this is the expected response to an unauthenticated probe and is
186+ // therefore treated as BackendHealthy.
187+ //
188+ // If the backend has no outgoing auth strategy configured (AuthConfig nil or
189+ // StrategyTypeUnauthenticated), a 401/403 indicates operator misconfiguration:
190+ // the backend requires authentication but none was configured on the backend
191+ // target. This is reported as BackendUnauthenticated so it surfaces in status.
192+ func authErrorStatus (target * vmcp.BackendTarget ) vmcp.BackendHealthStatus {
193+ if target != nil && target .AuthConfig != nil &&
194+ target .AuthConfig .Type != authtypes .StrategyTypeUnauthenticated {
195+ return vmcp .BackendHealthy
196+ }
197+ return vmcp .BackendUnauthenticated
198+ }
0 commit comments