Skip to content

Commit 2136387

Browse files
lorr1 and claude authored
Treat unauthenticated backends as routable in vMCP health (#4866)
* Treat unauthenticated backends as routable in vMCP health monitoring

  Backends returning 401 due to missing user-level OAuth tokens (e.g., upstreamInject auth) are reachable and running — they just require per-request user auth. Health probes lack user tokens, so they correctly detect the unauthenticated state, but this should not cause PhaseFailed. Introduce Summary.Routable() (healthy + unauthenticated) and use it for phase determination, BackendCount, and status messages across the health monitor, operator controller, and status collector. Give BackendUnauthenticated its own CRD status "unauthenticated" instead of mapping to "unavailable".

  Fixes #4824

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Address review feedback on unauthenticated backends PR

  - Fix godoc: move pluralBackend() above formatBackendMessage doc comment
  - Fix grammar: add pluralRequire()/quantifyBackends() helpers for correct subject-verb agreement ("1 requires" not "1 require") and drop "All" prefix for singular counts
  - Fix /status endpoint: treat BackendUnauthenticated as routable, consistent with health monitor
  - Rename condition Reason from AllBackendsHealthy to AllBackendsRoutable (v1alpha1, no stability guarantee)
  - Add TestBuildConditions cases for unauthenticated branches
  - Regenerate CRD manifests

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Regenerate CRD docs for BackendCount description update

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Update TODO to reference follow-up issue #4920

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Rename NoHealthyBackends condition reason to NoRoutableBackends

  Aligns with the AllBackendsHealthy → AllBackendsRoutable rename already applied to the ready-phase reason string.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4ebcf3b commit 2136387

14 files changed

Lines changed: 218 additions & 72 deletions

File tree

cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -181,10 +181,11 @@ type BackendAuthConfig struct {
181181
// These are the user-facing values stored in VirtualMCPServer.Status.DiscoveredBackends.
182182
// Use BackendHealthStatus.ToCRDStatus() to convert from internal health status.
183183
const (
184-
BackendStatusReady = "ready"
185-
BackendStatusUnavailable = "unavailable"
186-
BackendStatusDegraded = "degraded"
187-
BackendStatusUnknown = "unknown"
184+
BackendStatusReady = "ready"
185+
BackendStatusUnavailable = "unavailable"
186+
BackendStatusDegraded = "degraded"
187+
BackendStatusUnknown = "unknown"
188+
BackendStatusUnauthenticated = "unauthenticated"
188189
)
189190

190191
// DiscoveredBackend is an alias to the canonical definition in pkg/vmcp/types.go
@@ -222,8 +223,8 @@ type VirtualMCPServerStatus struct {
222223
// +optional
223224
DiscoveredBackends []DiscoveredBackend `json:"discoveredBackends,omitempty"`
224225

225-
// BackendCount is the number of healthy/ready backends
226-
// (excludes unavailable, degraded, and unknown backends)
226+
// BackendCount is the number of routable backends (ready + unauthenticated).
227+
// Excludes unavailable, degraded, and unknown backends.
227228
// +optional
228229
BackendCount int32 `json:"backendCount,omitempty"`
229230

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1514,14 +1514,17 @@ type statusDecision struct {
15141514
conditionState metav1.ConditionStatus
15151515
}
15161516

1517-
// countBackendHealth counts ready and unhealthy backends
1518-
func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBackend) (ready, unhealthy int) {
1517+
// countBackendHealth counts routable and unhealthy backends.
1518+
// Unauthenticated backends are routable — they are reachable but require per-request
1519+
// user auth (e.g., upstream OAuth). Health probes lack user tokens, but real requests
1520+
// with valid OAuth tokens will be served.
1521+
func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBackend) (routable, unhealthy int) {
15191522
ctxLogger := log.FromContext(ctx)
15201523

15211524
for _, backend := range backends {
15221525
switch backend.Status {
1523-
case mcpv1alpha1.BackendStatusReady:
1524-
ready++
1526+
case mcpv1alpha1.BackendStatusReady, mcpv1alpha1.BackendStatusUnauthenticated:
1527+
routable++
15251528
case mcpv1alpha1.BackendStatusUnavailable,
15261529
mcpv1alpha1.BackendStatusDegraded,
15271530
mcpv1alpha1.BackendStatusUnknown:
@@ -1532,7 +1535,7 @@ func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBa
15321535
unhealthy++
15331536
}
15341537
}
1535-
return ready, unhealthy
1538+
return routable, unhealthy
15361539
}
15371540

15381541
// determineStatusFromBackends evaluates backend health to determine status
@@ -1542,11 +1545,11 @@ func (*VirtualMCPServerReconciler) determineStatusFromBackends(
15421545
) statusDecision {
15431546
ctxLogger := log.FromContext(ctx)
15441547

1545-
ready, unhealthy := countBackendHealth(ctx, vmcp.Status.DiscoveredBackends)
1546-
total := ready + unhealthy
1548+
routable, unhealthy := countBackendHealth(ctx, vmcp.Status.DiscoveredBackends)
1549+
total := routable + unhealthy
15471550

15481551
// All backends unhealthy
1549-
if ready == 0 && unhealthy > 0 {
1552+
if routable == 0 && unhealthy > 0 {
15501553
return statusDecision{
15511554
phase: mcpv1alpha1.VirtualMCPServerPhaseDegraded,
15521555
message: fmt.Sprintf("Virtual MCP server is running but all %d backends are unhealthy", unhealthy),
@@ -1560,15 +1563,15 @@ func (*VirtualMCPServerReconciler) determineStatusFromBackends(
15601563
if unhealthy > 0 {
15611564
return statusDecision{
15621565
phase: mcpv1alpha1.VirtualMCPServerPhaseDegraded,
1563-
message: fmt.Sprintf("Virtual MCP server is running with %d/%d backends available", ready, total),
1566+
message: fmt.Sprintf("Virtual MCP server is running with %d/%d backends available", routable, total),
15641567
reason: "BackendsDegraded",
15651568
conditionMsg: "Some backends are unhealthy",
15661569
conditionState: metav1.ConditionFalse,
15671570
}
15681571
}
15691572

1570-
// All backends ready
1571-
if ready > 0 {
1573+
// All backends routable
1574+
if routable > 0 {
15721575
return statusDecision{
15731576
phase: mcpv1alpha1.VirtualMCPServerPhaseReady,
15741577
message: "Virtual MCP server is running",

cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -211,14 +211,16 @@ func (s *StatusCollector) UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alp
211211
// Apply discovered backends change
212212
if s.discoveredBackends != nil {
213213
vmcpStatus.DiscoveredBackends = s.discoveredBackends
214-
// BackendCount represents the number of ready backends
215-
var readyCount int32
214+
// BackendCount represents the number of routable backends (ready + unauthenticated).
215+
// Unauthenticated backends are reachable but require per-request user auth.
216+
var routableCount int32
216217
for _, backend := range s.discoveredBackends {
217-
if backend.Status == mcpv1alpha1.BackendStatusReady {
218-
readyCount++
218+
if backend.Status == mcpv1alpha1.BackendStatusReady ||
219+
backend.Status == mcpv1alpha1.BackendStatusUnauthenticated {
220+
routableCount++
219221
}
220222
}
221-
vmcpStatus.BackendCount = readyCount
223+
vmcpStatus.BackendCount = routableCount
222224
}
223225

224226
ctxLogger.V(1).Info("Batched status update applied",

deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2142,8 +2142,8 @@ spec:
21422142
properties:
21432143
backendCount:
21442144
description: |-
2145-
BackendCount is the number of healthy/ready backends
2146-
(excludes unavailable, degraded, and unknown backends)
2145+
BackendCount is the number of routable backends (ready + unauthenticated).
2146+
Excludes unavailable, degraded, and unknown backends.
21472147
format: int32
21482148
type: integer
21492149
conditions:
@@ -2256,7 +2256,7 @@ spec:
22562256
type: string
22572257
status:
22582258
description: |-
2259-
Status is the current status of the backend (ready, degraded, unavailable, unknown).
2259+
Status is the current status of the backend (ready, degraded, unavailable, unauthenticated, unknown).
22602260
Use BackendHealthStatus.ToCRDStatus() to populate this field.
22612261
type: string
22622262
url:

deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2145,8 +2145,8 @@ spec:
21452145
properties:
21462146
backendCount:
21472147
description: |-
2148-
BackendCount is the number of healthy/ready backends
2149-
(excludes unavailable, degraded, and unknown backends)
2148+
BackendCount is the number of routable backends (ready + unauthenticated).
2149+
Excludes unavailable, degraded, and unknown backends.
21502150
format: int32
21512151
type: integer
21522152
conditions:
@@ -2259,7 +2259,7 @@ spec:
22592259
type: string
22602260
status:
22612261
description: |-
2262-
Status is the current status of the backend (ready, degraded, unavailable, unknown).
2262+
Status is the current status of the backend (ready, degraded, unavailable, unauthenticated, unknown).
22632263
Use BackendHealthStatus.ToCRDStatus() to populate this field.
22642264
type: string
22652265
url:

docs/operator/crd-api.md

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/vmcp/discovery/middleware.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,10 @@ func Middleware(
135135
// or degraded. Backends that are unhealthy, unknown, or unauthenticated are excluded
136136
// from capability aggregation to prevent exposing tools from unavailable backends.
137137
//
138+
// TODO(#4920): Unauthenticated backends are treated as routable for phase determination
139+
// but are excluded here because discovery probes cannot carry user tokens. If health
140+
// probes could authenticate, these backends would be fully healthy and included here.
141+
//
138142
// Health status filtering:
139143
// - healthy: included (fully operational)
140144
// - degraded: included (slow but working)

pkg/vmcp/health/monitor.go

Lines changed: 72 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -547,6 +547,13 @@ type Summary struct {
547547
Unauthenticated int
548548
}
549549

550+
// Routable returns the number of backends that can serve traffic.
551+
// This includes healthy backends and unauthenticated backends (which are
552+
// reachable but require per-request user auth, e.g., upstream OAuth).
553+
func (s Summary) Routable() int {
554+
return s.Healthy + s.Unauthenticated
555+
}
556+
550557
// String returns a human-readable summary.
551558
func (s Summary) String() string {
552559
return fmt.Sprintf("total=%d healthy=%d degraded=%d unhealthy=%d unknown=%d unauthenticated=%d",
@@ -557,11 +564,12 @@ func (s Summary) String() string {
557564
// This converts backend health information into the format needed for status reporting
558565
// to the Kubernetes API or CLI output.
559566
//
560-
// Phase determination:
561-
// - Ready: All backends healthy, or no backends configured (cold start)
567+
// Phase determination (unauthenticated backends are routable — they need per-request user auth
568+
// but are reachable and running):
569+
// - Ready: All backends healthy or unauthenticated, or no backends configured (cold start)
562570
// - Pending: Backends configured but no health check data yet (waiting for first check)
563-
// - Degraded: Some backends healthy, some degraded/unhealthy
564-
// - Failed: No healthy backends (and at least one backend exists)
571+
// - Degraded: Some backends routable (healthy/unauthenticated), some degraded/unhealthy/unknown
572+
// - Failed: No routable backends (and at least one backend exists)
565573
//
566574
// Returns a Status instance with current health information and discovered backends.
567575
//
@@ -592,16 +600,18 @@ func (m *Monitor) BuildStatus() *vmcp.Status {
592600
Message: message,
593601
Conditions: conditions,
594602
DiscoveredBackends: discoveredBackends,
595-
BackendCount: int32(summary.Healthy), //nolint:gosec // healthy count is bounded by backend list size
603+
BackendCount: int32(summary.Routable()), //nolint:gosec // routable count is bounded by backend list size
596604
Timestamp: time.Now(),
597605
}
598606
}
599607

600608
// determinePhase determines the overall phase based on backend health.
609+
// Unauthenticated backends are treated as routable — they are reachable and running,
610+
// they just require per-request user auth (e.g., upstream OAuth).
601611
// Takes both the health summary and the count of configured backends to distinguish:
602612
// - No backends configured (configuredCount==0): Ready (cold start)
603613
// - Backends configured but no health data (configuredCount>0 && summary.Total==0): Pending
604-
// - Has health data: Ready/Degraded/Failed based on health status
614+
// - Has health data: Ready/Degraded/Failed based on routable (healthy + unauthenticated) count
605615
func determinePhase(summary Summary, configuredBackendCount int) vmcp.Phase {
606616
if summary.Total == 0 {
607617
// No health data yet - distinguish cold start from waiting for first check
@@ -610,10 +620,11 @@ func determinePhase(summary Summary, configuredBackendCount int) vmcp.Phase {
610620
}
611621
return vmcp.PhasePending // Backends configured but health checks not complete
612622
}
613-
if summary.Healthy == summary.Total {
623+
624+
if summary.Routable() == summary.Total {
614625
return vmcp.PhaseReady
615626
}
616-
if summary.Healthy == 0 {
627+
if summary.Routable() == 0 {
617628
return vmcp.PhaseFailed
618629
}
619630
return vmcp.PhaseDegraded
@@ -629,18 +640,27 @@ func formatStatusMessage(summary Summary, phase vmcp.Phase, configuredBackendCou
629640
return fmt.Sprintf("Waiting for initial health checks (%d backends configured)", configuredBackendCount)
630641
}
631642
if phase == vmcp.PhaseReady {
632-
return fmt.Sprintf("All %d backends healthy", summary.Healthy)
643+
if summary.Unauthenticated == 0 {
644+
return fmt.Sprintf("All %d %s healthy", summary.Healthy, pluralBackend(summary.Healthy))
645+
}
646+
if summary.Healthy == 0 {
647+
return fmt.Sprintf("%s %s authentication",
648+
quantifyBackends(summary.Unauthenticated), pluralRequire(summary.Unauthenticated))
649+
}
650+
return fmt.Sprintf("%d %s healthy, %d %s authentication",
651+
summary.Healthy, pluralBackend(summary.Healthy),
652+
summary.Unauthenticated, pluralRequire(summary.Unauthenticated))
633653
}
634654

635-
// Format unhealthy backend counts (shared by Failed and Degraded)
636-
unhealthyDetails := fmt.Sprintf("%d degraded, %d unhealthy, %d unknown, %d unauthenticated",
637-
summary.Degraded, summary.Unhealthy, summary.Unknown, summary.Unauthenticated)
655+
// Format non-routable backend counts (shared by Failed and Degraded)
656+
nonRoutableDetails := fmt.Sprintf("%d degraded, %d unhealthy, %d unknown",
657+
summary.Degraded, summary.Unhealthy, summary.Unknown)
638658

639659
if phase == vmcp.PhaseFailed {
640-
return fmt.Sprintf("No healthy backends (%s)", unhealthyDetails)
660+
return fmt.Sprintf("No routable backends (%s)", nonRoutableDetails)
641661
}
642662
// Degraded
643-
return fmt.Sprintf("%d/%d backends healthy (%s)", summary.Healthy, summary.Total, unhealthyDetails)
663+
return fmt.Sprintf("%d/%d backends routable (%s)", summary.Routable(), summary.Total, nonRoutableDetails)
644664
}
645665

646666
// convertToDiscoveredBackends converts backend health states to DiscoveredBackend format.
@@ -712,6 +732,30 @@ func extractAuthInfo(backend vmcp.Backend) (authConfigRef, authType string) {
712732
return backend.AuthConfigRef, backend.AuthConfig.Type
713733
}
714734

735+
// pluralBackend returns "backend" or "backends" based on count.
736+
func pluralBackend(n int) string {
737+
if n == 1 {
738+
return "backend"
739+
}
740+
return "backends"
741+
}
742+
743+
// pluralRequire returns "requires" or "require" based on count for subject-verb agreement.
744+
func pluralRequire(n int) string {
745+
if n == 1 {
746+
return "requires"
747+
}
748+
return "require"
749+
}
750+
751+
// quantifyBackends returns "1 backend" when n == 1 and "All N backends" for any other count (including zero).
752+
func quantifyBackends(n int) string {
753+
if n == 1 {
754+
return fmt.Sprintf("%d backend", n)
755+
}
756+
return fmt.Sprintf("All %d backends", n)
757+
}
758+
715759
// formatBackendMessage creates a human-readable message for a backend's health state.
716760
// This returns generic error categories to avoid exposing sensitive error details in status.
717761
// Detailed errors are logged when they occur (in performHealthCheck) for debugging.
@@ -821,19 +865,27 @@ func buildConditions(summary Summary, phase vmcp.Phase, configuredBackendCount i
821865
switch phase {
822866
case vmcp.PhaseReady:
823867
readyCondition.Status = metav1.ConditionTrue
824-
readyCondition.Reason = "AllBackendsHealthy"
825-
// Distinguish cold start (no backends configured) from having healthy backends
868+
readyCondition.Reason = "AllBackendsRoutable"
869+
// Distinguish cold start (no backends configured) from having routable backends
826870
if summary.Total == 0 && configuredBackendCount == 0 {
827871
readyCondition.Message = "Ready, no backends configured"
872+
} else if summary.Unauthenticated == 0 {
873+
readyCondition.Message = fmt.Sprintf("All %d %s are healthy",
874+
summary.Healthy, pluralBackend(summary.Healthy))
875+
} else if summary.Healthy == 0 {
876+
readyCondition.Message = fmt.Sprintf("%s %s authentication",
877+
quantifyBackends(summary.Unauthenticated), pluralRequire(summary.Unauthenticated))
828878
} else {
829-
readyCondition.Message = fmt.Sprintf("All %d backends are healthy", summary.Healthy)
879+
readyCondition.Message = fmt.Sprintf("%d %s healthy, %d %s authentication",
880+
summary.Healthy, pluralBackend(summary.Healthy),
881+
summary.Unauthenticated, pluralRequire(summary.Unauthenticated))
830882
}
831883
case vmcp.PhaseDegraded:
832884
readyCondition.Reason = "SomeBackendsUnhealthy"
833-
readyCondition.Message = fmt.Sprintf("%d/%d backends healthy", summary.Healthy, summary.Total)
885+
readyCondition.Message = fmt.Sprintf("%d/%d backends routable", summary.Routable(), summary.Total)
834886
case vmcp.PhaseFailed:
835-
readyCondition.Reason = "NoHealthyBackends"
836-
readyCondition.Message = "No healthy backends available"
887+
readyCondition.Reason = "NoRoutableBackends"
888+
readyCondition.Message = "No routable backends available"
837889
case vmcp.PhasePending:
838890
readyCondition.Reason = "BackendsPending"
839891
readyCondition.Message = fmt.Sprintf("Waiting for initial health checks (%d backends configured)", configuredBackendCount)

0 commit comments

Comments
 (0)