Skip to content

Commit 88efbfd

Browse files
committed
Integrate health monitoring with VirtualMCPServer controller
Adds health monitoring integration to the Kubernetes operator controller, enabling real-time backend health status tracking and reporting in the VirtualMCPServer CRD status.
1 parent 97ddff8 commit 88efbfd

File tree

10 files changed

+584
-80
lines changed

10 files changed

+584
-80
lines changed

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 307 additions & 70 deletions
Large diffs are not rendered by default.

cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,16 @@ func (s *StatusCollector) SetDiscoveredBackends(backends []mcpv1alpha1.Discovere
9494
s.hasChanges = true
9595
}
9696

97+
// GetDiscoveredBackends returns the current discovered backends value.
98+
// If SetDiscoveredBackends has been called, returns the new value.
99+
// Otherwise, returns the existing value from the VirtualMCPServer status.
100+
func (s *StatusCollector) GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend {
101+
if s.discoveredBackends != nil {
102+
return s.discoveredBackends
103+
}
104+
return s.vmcp.Status.DiscoveredBackends
105+
}
106+
97107
// UpdateStatus applies all collected status changes in a single batch update.
98108
// Expects vmcpStatus to be freshly fetched from the cluster to ensure the update operates on the latest resource version.
99109
func (s *StatusCollector) UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool {

cmd/thv-operator/pkg/virtualmcpserverstatus/mocks/mock_collector.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/thv-operator/pkg/virtualmcpserverstatus/types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ type StatusManager interface {
4444
// SetDiscoveredBackends sets the discovered backends list
4545
SetDiscoveredBackends(backends []mcpv1alpha1.DiscoveredBackend)
4646

47+
// GetDiscoveredBackends returns the current discovered backends value.
48+
// If SetDiscoveredBackends has been called, returns the new value.
49+
// Otherwise, returns the existing value from the VirtualMCPServer status.
50+
GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend
51+
4752
// UpdateStatus applies all collected status changes in a single batch update.
4853
// Returns true if updates were applied, false if no changes were collected.
4954
UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool

cmd/thv-operator/pkg/vmcpconfig/converter.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ func (c *Converter) Convert(
124124
}
125125

126126
config.Telemetry = spectoconfig.ConvertTelemetryConfig(ctx, vmcp.Spec.Telemetry, vmcp.Name)
127-
config.Audit = spectoconfig.ConvertAuditConfig(ctx, vmcp.Spec.Audit, vmcp.Name)
128127

129128
// Apply operational defaults (fills missing values)
130129
config.EnsureOperationalDefaults()
@@ -863,7 +862,7 @@ func convertOutputProperty(
863862

864863
// convertOperational converts OperationalConfig from CRD to vmcp config
865864
func (*Converter) convertOperational(
866-
_ context.Context,
865+
ctx context.Context,
867866
vmcp *mcpv1alpha1.VirtualMCPServer,
868867
) *vmcpconfig.OperationalConfig {
869868
operational := &vmcpconfig.OperationalConfig{}
@@ -896,7 +895,12 @@ func (*Converter) convertOperational(
896895

897896
// Parse health check interval
898897
if vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
899-
if duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval); err == nil {
898+
duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
899+
if err != nil {
900+
ctxLogger := log.FromContext(ctx)
901+
ctxLogger.Error(err, "Failed to parse HealthCheckInterval, health monitoring will be disabled",
902+
"value", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
903+
} else {
900904
operational.FailureHandling.HealthCheckInterval = vmcpconfig.Duration(duration)
901905
}
902906
}
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# Example: VirtualMCPServer with Health Monitoring
2+
#
3+
# This example demonstrates health monitoring configuration for VirtualMCPServer.
4+
# Health monitoring enables:
5+
# - Periodic health checks on backend MCPServers using ListCapabilities calls
6+
# - Automatic detection of unhealthy backends
7+
# - Backend status tracking in VirtualMCPServer status
8+
# - Graceful handling of partial failures (best_effort mode)
9+
#
10+
# Health Status Values:
11+
# - ready: Backend is healthy and responding
12+
# - unavailable: Backend is not responding or workload is in Pending/Failed/Terminating phase
13+
# - degraded: Backend is responding but degraded (e.g., slow response times)
14+
# - unknown: Health status not yet determined
15+
#
16+
# This example creates:
17+
# 1. An MCPGroup to organize the backends
18+
# 2. Two healthy MCPServer backends (simulate an unhealthy one by deleting a backend)
19+
# 3. A VirtualMCPServer with health monitoring enabled
20+
#
21+
# Usage:
22+
# kubectl apply -f vmcp_health_monitoring.yaml
23+
#
24+
# Monitor health status:
25+
# kubectl get virtualmcpserver health-monitoring-vmcp -o jsonpath='{.status.discoveredBackends[*].status}'
26+
# kubectl get virtualmcpserver health-monitoring-vmcp -o jsonpath='{.status.discoveredBackends[*].lastHealthCheck}'
27+
28+
---
29+
# Step 1: Create MCPGroup
30+
apiVersion: toolhive.stacklok.dev/v1alpha1
31+
kind: MCPGroup
32+
metadata:
33+
name: health-monitoring-group
34+
namespace: default
35+
spec:
36+
description: Group demonstrating health monitoring for VirtualMCPServer
37+
38+
---
39+
# Step 2: Create healthy backend MCPServers
40+
apiVersion: toolhive.stacklok.dev/v1alpha1
41+
kind: MCPServer
42+
metadata:
43+
name: backend-1-healthy
44+
namespace: default
45+
spec:
46+
groupRef: health-monitoring-group
47+
image: ghcr.io/stackloklabs/yardstick/yardstick-server:0.0.2
48+
transport: streamable-http
49+
proxyPort: 8080
50+
mcpPort: 8080
51+
env:
52+
- name: TRANSPORT
53+
value: streamable-http
54+
- name: TOOL_PREFIX
55+
value: backend1
56+
resources:
57+
limits:
58+
cpu: "100m"
59+
memory: "128Mi"
60+
requests:
61+
cpu: "50m"
62+
memory: "64Mi"
63+
64+
---
65+
apiVersion: toolhive.stacklok.dev/v1alpha1
66+
kind: MCPServer
67+
metadata:
68+
name: backend-2-healthy
69+
namespace: default
70+
spec:
71+
groupRef: health-monitoring-group
72+
image: ghcr.io/stackloklabs/yardstick/yardstick-server:0.0.2
73+
transport: streamable-http
74+
proxyPort: 8080
75+
mcpPort: 8080
76+
env:
77+
- name: TRANSPORT
78+
value: streamable-http
79+
- name: TOOL_PREFIX
80+
value: backend2
81+
resources:
82+
limits:
83+
cpu: "100m"
84+
memory: "128Mi"
85+
requests:
86+
cpu: "50m"
87+
memory: "64Mi"
88+
89+
---
90+
# Step 3: Create VirtualMCPServer with health monitoring enabled
91+
apiVersion: toolhive.stacklok.dev/v1alpha1
92+
kind: VirtualMCPServer
93+
metadata:
94+
name: health-monitoring-vmcp
95+
namespace: default
96+
spec:
97+
# Reference to the MCPGroup containing backend MCPServers
98+
groupRef:
99+
name: health-monitoring-group
100+
101+
# Incoming authentication (client -> vMCP)
102+
incomingAuth:
103+
type: anonymous
104+
authzConfig:
105+
type: inline
106+
inline:
107+
policies:
108+
- 'permit(principal, action, resource);'
109+
110+
# Outgoing authentication (vMCP -> backends)
111+
outgoingAuth:
112+
source: discovered
113+
114+
# Aggregation configuration
115+
aggregation:
116+
conflictResolution: prefix
117+
conflictResolutionConfig:
118+
prefixFormat: "{workload}_"
119+
120+
# Service type - NodePort for external access
121+
serviceType: NodePort
122+
123+
# Operational settings with health monitoring
124+
operational:
125+
# Enable debug logging to see health check details
126+
logLevel: debug
127+
128+
# Timeout configuration
129+
timeouts:
130+
default: 30s
131+
132+
# Failure handling with health monitoring
133+
failureHandling:
134+
# Health check interval - how often to check backend health
135+
# Shorter intervals provide faster detection but more overhead
136+
# Recommended: 30s-60s for production, 5s-10s for testing
137+
healthCheckInterval: 30s
138+
139+
# Unhealthy threshold - consecutive failures before marking unhealthy
140+
# Higher values reduce false positives from transient failures
141+
# Recommended: 2-3 for production
142+
unhealthyThreshold: 3
143+
144+
# Partial failure mode - behavior when some backends are unavailable
145+
# - fail: Fail entire request if any backend is unavailable (strict)
146+
# - best_effort: Continue with available backends (resilient)
147+
# Recommended: best_effort for production to maintain availability
148+
partialFailureMode: best_effort
149+
150+
# Optional: Circuit breaker (not yet implemented in this release)
151+
# circuitBreaker:
152+
# enabled: true
153+
# failureThreshold: 5
154+
# timeout: 60s
155+
156+
---
157+
# Example: To test health monitoring behavior, you can:
158+
#
159+
# 1. Check initial status (all backends should be healthy):
160+
# kubectl get virtualmcpserver health-monitoring-vmcp -o yaml
161+
#
162+
# 2. View backend health status:
163+
# kubectl get virtualmcpserver health-monitoring-vmcp \
164+
# -o jsonpath='{range .status.discoveredBackends[*]}{.name}{"\t"}{.status}{"\t"}{.lastHealthCheck}{"\n"}{end}'
165+
#
166+
# 3. Simulate an unhealthy backend by deleting one:
167+
# kubectl delete mcpserver backend-2-healthy
168+
#
169+
# 4. Wait for health checks to detect the failure (up to healthCheckInterval * unhealthyThreshold):
170+
# watch kubectl get virtualmcpserver health-monitoring-vmcp \
171+
# -o jsonpath='{range .status.discoveredBackends[*]}{.name}{"\t"}{.status}{"\n"}{end}'
172+
#
173+
# 5. Observe the VirtualMCPServer phase changes to "Degraded" but continues serving:
174+
# kubectl get virtualmcpserver health-monitoring-vmcp
175+
#
176+
# 6. Recreate the backend to see recovery:
177+
# kubectl apply -f vmcp_health_monitoring.yaml
178+
#
179+
# 7. Health monitoring will detect recovery and update status to "ready"

pkg/vmcp/workloads/k8s.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,10 +331,14 @@ func (d *k8sDiscoverer) discoverAuthConfigFromRef(
331331
}
332332

333333
// mapK8SWorkloadPhaseToHealth converts a MCPServerPhase to a backend health status.
334+
// Note: Running phase maps to Unknown so health checks determine actual reachability.
335+
// A pod can be Running but the MCP server inside may be unreachable (wrong port, etc).
334336
func mapK8SWorkloadPhaseToHealth(phase mcpv1alpha1.MCPServerPhase) vmcp.BackendHealthStatus {
335337
switch phase {
336338
case mcpv1alpha1.MCPServerPhaseRunning:
337-
return vmcp.BackendHealthy
339+
// Use Unknown for Running phase - let health checks determine actual status
340+
// The pod being running doesn't guarantee the MCP server is reachable
341+
return vmcp.BackendUnknown
338342
case mcpv1alpha1.MCPServerPhaseFailed:
339343
return vmcp.BackendUnhealthy
340344
case mcpv1alpha1.MCPServerPhaseTerminating:

test/e2e/thv-operator/virtualmcp/helpers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func WaitForVirtualMCPServerDeployed(
9999

100100
// Check that pods are running (but not necessarily all backends healthy)
101101
labels := map[string]string{
102-
"app.kubernetes.io/name": "vmcp",
102+
"app.kubernetes.io/name": "virtualmcpserver",
103103
"app.kubernetes.io/instance": name,
104104
}
105105
if err := checkPodsReady(ctx, c, namespace, labels); err != nil {

test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
2121
healthyBackend1 = "healthy-backend-1"
2222
healthyBackend2 = "healthy-backend-2"
2323
unhealthyBackend = "unhealthy-backend"
24-
timeout = 5 * time.Minute
24+
timeout = 3 * time.Minute
2525
pollingInterval = 2 * time.Second
2626
healthCheckInterval = "5s" // Fast checks for e2e
2727
unhealthyThreshold = 2 // Mark unhealthy after 2 consecutive failures
@@ -159,15 +159,66 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
159159
})
160160

161161
It("should discover all backends including the unhealthy one", func() {
162+
// Wait for health checks to complete and update backend status
163+
// Health monitoring runs every 5s and marks backends unhealthy after 2 consecutive failures
164+
// So we need to wait at least 10-15 seconds for the unhealthy backend to be detected
165+
Eventually(func() error {
166+
vmcpServer := &mcpv1alpha1.VirtualMCPServer{}
167+
if err := k8sClient.Get(ctx, types.NamespacedName{
168+
Name: vmcpServerName,
169+
Namespace: testNamespace,
170+
}, vmcpServer); err != nil {
171+
return err
172+
}
173+
174+
// Should discover all 3 backends
175+
if len(vmcpServer.Status.DiscoveredBackends) != 3 {
176+
return fmt.Errorf("expected 3 discovered backends, got %d", len(vmcpServer.Status.DiscoveredBackends))
177+
}
178+
179+
// BackendCount should be 2 (only ready backends)
180+
if vmcpServer.Status.BackendCount != 2 {
181+
return fmt.Errorf("expected BackendCount=2 (only ready backends), got %d", vmcpServer.Status.BackendCount)
182+
}
183+
184+
// Verify unhealthy backend is marked as unavailable/degraded
185+
unhealthyFound := false
186+
healthyCount := 0
187+
for _, backend := range vmcpServer.Status.DiscoveredBackends {
188+
if backend.Name == unhealthyBackend {
189+
if backend.Status != mcpv1alpha1.BackendStatusUnavailable &&
190+
backend.Status != mcpv1alpha1.BackendStatusDegraded {
191+
return fmt.Errorf("unhealthy backend %s should be unavailable/degraded but is %s",
192+
backend.Name, backend.Status)
193+
}
194+
unhealthyFound = true
195+
} else {
196+
if backend.Status != mcpv1alpha1.BackendStatusReady {
197+
return fmt.Errorf("healthy backend %s should be ready but is %s",
198+
backend.Name, backend.Status)
199+
}
200+
healthyCount++
201+
}
202+
}
203+
204+
if !unhealthyFound {
205+
return fmt.Errorf("unhealthy backend not found in discovered backends")
206+
}
207+
208+
if healthyCount != 2 {
209+
return fmt.Errorf("expected 2 healthy backends, found %d", healthyCount)
210+
}
211+
212+
return nil
213+
}, 30*time.Second, 2*time.Second).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
214+
215+
// Verify all backends are present in discovery
162216
vmcpServer := &mcpv1alpha1.VirtualMCPServer{}
163217
err := k8sClient.Get(ctx, types.NamespacedName{
164218
Name: vmcpServerName,
165219
Namespace: testNamespace,
166220
}, vmcpServer)
167221
Expect(err).NotTo(HaveOccurred())
168-
169-
// Should discover all 3 backends
170-
Expect(vmcpServer.Status.BackendCount).To(Equal(3))
171222
Expect(vmcpServer.Status.DiscoveredBackends).To(HaveLen(3))
172223

173224
// Check that all backends are present in discovery

test/e2e/thv-operator/virtualmcp/virtualmcp_yardstick_base_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ var _ = Describe("VirtualMCPServer Yardstick Base", Ordered, func() {
319319
}, timeout, pollingInterval).Should(Equal(mcpv1alpha1.VirtualMCPServerPhaseDegraded),
320320
"VirtualMCPServer should enter Degraded phase when a backend is unavailable")
321321

322-
By("Verifying backend count reflects one ready backend")
322+
By("Verifying backend count reflects all discovered backends")
323323
// Re-fetch VirtualMCPServer to ensure we have the latest status
324324
Expect(k8sClient.Get(ctx, types.NamespacedName{
325325
Name: vmcpServerName,

0 commit comments

Comments
 (0)