Skip to content

Commit 88efbfd

Browse files
committed
Integrate health monitoring with VirtualMCPServer controller
Adds health monitoring integration to the Kubernetes operator controller, enabling real-time backend health status tracking and reporting in the VirtualMCPServer CRD status.
1 parent 97ddff8 commit 88efbfd

File tree

10 files changed

+584
-80
lines changed

10 files changed

+584
-80
lines changed

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 307 additions & 70 deletions
Large diffs are not rendered by default.

cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,16 @@ func (s *StatusCollector) SetDiscoveredBackends(backends []mcpv1alpha1.Discovere
9494
s.hasChanges = true
9595
}
9696

97+
// GetDiscoveredBackends returns the current discovered backends value.
98+
// If SetDiscoveredBackends has been called, returns the new value.
99+
// Otherwise, returns the existing value from the VirtualMCPServer status.
100+
func (s *StatusCollector) GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend {
101+
if s.discoveredBackends != nil {
102+
return s.discoveredBackends
103+
}
104+
return s.vmcp.Status.DiscoveredBackends
105+
}
106+
97107
// UpdateStatus applies all collected status changes in a single batch update.
98108
// Expects vmcpStatus to be freshly fetched from the cluster to ensure the update operates on the latest resource version.
99109
func (s *StatusCollector) UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool {

cmd/thv-operator/pkg/virtualmcpserverstatus/mocks/mock_collector.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/thv-operator/pkg/virtualmcpserverstatus/types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ type StatusManager interface {
4444
// SetDiscoveredBackends sets the discovered backends list
4545
SetDiscoveredBackends(backends []mcpv1alpha1.DiscoveredBackend)
4646

47+
// GetDiscoveredBackends returns the current discovered backends value.
48+
// If SetDiscoveredBackends has been called, returns the new value.
49+
// Otherwise, returns the existing value from the VirtualMCPServer status.
50+
GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend
51+
4752
// UpdateStatus applies all collected status changes in a single batch update.
4853
// Returns true if updates were applied, false if no changes were collected.
4954
UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool

cmd/thv-operator/pkg/vmcpconfig/converter.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ func (c *Converter) Convert(
124124
}
125125

126126
config.Telemetry = spectoconfig.ConvertTelemetryConfig(ctx, vmcp.Spec.Telemetry, vmcp.Name)
127-
config.Audit = spectoconfig.ConvertAuditConfig(ctx, vmcp.Spec.Audit, vmcp.Name)
128127

129128
// Apply operational defaults (fills missing values)
130129
config.EnsureOperationalDefaults()
@@ -863,7 +862,7 @@ func convertOutputProperty(
863862

864863
// convertOperational converts OperationalConfig from CRD to vmcp config
865864
func (*Converter) convertOperational(
866-
_ context.Context,
865+
ctx context.Context,
867866
vmcp *mcpv1alpha1.VirtualMCPServer,
868867
) *vmcpconfig.OperationalConfig {
869868
operational := &vmcpconfig.OperationalConfig{}
@@ -896,7 +895,12 @@ func (*Converter) convertOperational(
896895

897896
// Parse health check interval
898897
if vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
899-
if duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval); err == nil {
898+
duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
899+
if err != nil {
900+
ctxLogger := log.FromContext(ctx)
901+
ctxLogger.Error(err, "Failed to parse HealthCheckInterval, health monitoring will be disabled",
902+
"value", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
903+
} else {
900904
operational.FailureHandling.HealthCheckInterval = vmcpconfig.Duration(duration)
901905
}
902906
}
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# Example: VirtualMCPServer with Health Monitoring
2+
#
3+
# This example demonstrates health monitoring configuration for VirtualMCPServer.
4+
# Health monitoring enables:
5+
# - Periodic health checks on backend MCPServers using ListCapabilities calls
6+
# - Automatic detection of unhealthy backends
7+
# - Backend status tracking in VirtualMCPServer status
8+
# - Graceful handling of partial failures (best_effort mode)
9+
#
10+
# Health Status Values:
11+
# - ready: Backend is healthy and responding
12+
# - unavailable: Backend is not responding or workload is in Pending/Failed/Terminating phase
13+
# - degraded: Backend is responding but degraded (e.g., slow response times)
14+
# - unknown: Health status not yet determined
15+
#
16+
# This example creates:
17+
# 1. An MCPGroup to organize the backends
18+
# 2. Two healthy MCPServer backends (simulate an unhealthy one by deleting a backend)
19+
# 3. A VirtualMCPServer with health monitoring enabled
20+
#
21+
# Usage:
22+
# kubectl apply -f vmcp_health_monitoring.yaml
23+
#
24+
# Monitor health status:
25+
# kubectl get virtualmcpserver health-monitoring-vmcp -o jsonpath='{.status.discoveredBackends[*].status}'
26+
# kubectl get virtualmcpserver health-monitoring-vmcp -o jsonpath='{.status.discoveredBackends[*].lastHealthCheck}'
27+
28+
---
29+
# Step 1: Create MCPGroup
30+
apiVersion: toolhive.stacklok.dev/v1alpha1
31+
kind: MCPGroup
32+
metadata:
33+
name: health-monitoring-group
34+
namespace: default
35+
spec:
36+
description: Group demonstrating health monitoring for VirtualMCPServer
37+
38+
---
39+
# Step 2: Create healthy backend MCPServers
40+
apiVersion: toolhive.stacklok.dev/v1alpha1
41+
kind: MCPServer
42+
metadata:
43+
name: backend-1-healthy
44+
namespace: default
45+
spec:
46+
groupRef: health-monitoring-group
47+
image: ghcr.io/stackloklabs/yardstick/yardstick-server:0.0.2
48+
transport: streamable-http
49+
proxyPort: 8080
50+
mcpPort: 8080
51+
env:
52+
- name: TRANSPORT
53+
value: streamable-http
54+
- name: TOOL_PREFIX
55+
value: backend1
56+
resources:
57+
limits:
58+
cpu: "100m"
59+
memory: "128Mi"
60+
requests:
61+
cpu: "50m"
62+
memory: "64Mi"
63+
64+
---
65+
apiVersion: toolhive.stacklok.dev/v1alpha1
66+
kind: MCPServer
67+
metadata:
68+
name: backend-2-healthy
69+
namespace: default
70+
spec:
71+
groupRef: health-monitoring-group
72+
image: ghcr.io/stackloklabs/yardstick/yardstick-server:0.0.2
73+
transport: streamable-http
74+
proxyPort: 8080
75+
mcpPort: 8080
76+
env:
77+
- name: TRANSPORT
78+
value: streamable-http
79+
- name: TOOL_PREFIX
80+
value: backend2
81+
resources:
82+
limits:
83+
cpu: "100m"
84+
memory: "128Mi"
85+
requests:
86+
cpu: "50m"
87+
memory: "64Mi"
88+
89+
---
90+
# Step 3: Create VirtualMCPServer with health monitoring enabled
91+
apiVersion: toolhive.stacklok.dev/v1alpha1
92+
kind: VirtualMCPServer
93+
metadata:
94+
name: health-monitoring-vmcp
95+
namespace: default
96+
spec:
97+
# Reference to the MCPGroup containing backend MCPServers
98+
groupRef:
99+
name: health-monitoring-group
100+
101+
# Incoming authentication (client -> vMCP)
102+
incomingAuth:
103+
type: anonymous
104+
authzConfig:
105+
type: inline
106+
inline:
107+
policies:
108+
- 'permit(principal, action, resource);'
109+
110+
# Outgoing authentication (vMCP -> backends)
111+
outgoingAuth:
112+
source: discovered
113+
114+
# Aggregation configuration
115+
aggregation:
116+
conflictResolution: prefix
117+
conflictResolutionConfig:
118+
prefixFormat: "{workload}_"
119+
120+
# Service type - NodePort for external access
121+
serviceType: NodePort
122+
123+
# Operational settings with health monitoring
124+
operational:
125+
# Enable debug logging to see health check details
126+
logLevel: debug
127+
128+
# Timeout configuration
129+
timeouts:
130+
default: 30s
131+
132+
# Failure handling with health monitoring
133+
failureHandling:
134+
# Health check interval - how often to check backend health
135+
# Shorter intervals provide faster detection but more overhead
136+
# Recommended: 30s-60s for production, 5s-10s for testing
137+
healthCheckInterval: 30s
138+
139+
# Unhealthy threshold - consecutive failures before marking unhealthy
140+
# Higher values reduce false positives from transient failures
141+
# Recommended: 2-3 for production
142+
unhealthyThreshold: 3
143+
144+
# Partial failure mode - behavior when some backends are unavailable
145+
# - fail: Fail entire request if any backend is unavailable (strict)
146+
# - best_effort: Continue with available backends (resilient)
147+
# Recommended: best_effort for production to maintain availability
148+
partialFailureMode: best_effort
149+
150+
# Optional: Circuit breaker (not yet implemented in this release)
151+
# circuitBreaker:
152+
# enabled: true
153+
# failureThreshold: 5
154+
# timeout: 60s
155+
156+
---
157+
# Example: To test health monitoring behavior, you can:
158+
#
159+
# 1. Check initial status (all backends should be healthy):
160+
# kubectl get virtualmcpserver health-monitoring-vmcp -o yaml
161+
#
162+
# 2. View backend health status:
163+
# kubectl get virtualmcpserver health-monitoring-vmcp \
164+
# -o jsonpath='{range .status.discoveredBackends[*]}{.name}{"\t"}{.status}{"\t"}{.lastHealthCheck}{"\n"}{end}'
165+
#
166+
# 3. Simulate an unhealthy backend by deleting one:
167+
# kubectl delete mcpserver backend-2-healthy
168+
#
169+
# 4. Wait for health checks to detect the failure (up to healthCheckInterval * unhealthyThreshold):
170+
# watch kubectl get virtualmcpserver health-monitoring-vmcp \
171+
# -o jsonpath='{range .status.discoveredBackends[*]}{.name}{"\t"}{.status}{"\n"}{end}'
172+
#
173+
# 5. Observe the VirtualMCPServer phase changes to "Degraded" but continues serving:
174+
# kubectl get virtualmcpserver health-monitoring-vmcp
175+
#
176+
# 6. Recreate the backend to see recovery:
177+
# kubectl apply -f vmcp_health_monitoring.yaml
178+
#
179+
# 7. Health monitoring will detect recovery and update status to "ready"

pkg/vmcp/workloads/k8s.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,10 +331,14 @@ func (d *k8sDiscoverer) discoverAuthConfigFromRef(
331331
}
332332

333333
// mapK8SWorkloadPhaseToHealth converts a MCPServerPhase to a backend health status.
334+
// Note: Running phase maps to Unknown so health checks determine actual reachability.
335+
// A pod can be Running but the MCP server inside may be unreachable (wrong port, etc).
334336
func mapK8SWorkloadPhaseToHealth(phase mcpv1alpha1.MCPServerPhase) vmcp.BackendHealthStatus {
335337
switch phase {
336338
case mcpv1alpha1.MCPServerPhaseRunning:
337-
return vmcp.BackendHealthy
339+
// Use Unknown for Running phase - let health checks determine actual status
340+
// The pod being running doesn't guarantee the MCP server is reachable
341+
return vmcp.BackendUnknown
338342
case mcpv1alpha1.MCPServerPhaseFailed:
339343
return vmcp.BackendUnhealthy
340344
case mcpv1alpha1.MCPServerPhaseTerminating:

test/e2e/thv-operator/virtualmcp/helpers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func WaitForVirtualMCPServerDeployed(
9999

100100
// Check that pods are running (but not necessarily all backends healthy)
101101
labels := map[string]string{
102-
"app.kubernetes.io/name": "vmcp",
102+
"app.kubernetes.io/name": "virtualmcpserver",
103103
"app.kubernetes.io/instance": name,
104104
}
105105
if err := checkPodsReady(ctx, c, namespace, labels); err != nil {

test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
2121
healthyBackend1 = "healthy-backend-1"
2222
healthyBackend2 = "healthy-backend-2"
2323
unhealthyBackend = "unhealthy-backend"
24-
timeout = 5 * time.Minute
24+
timeout = 3 * time.Minute
2525
pollingInterval = 2 * time.Second
2626
healthCheckInterval = "5s" // Fast checks for e2e
2727
unhealthyThreshold = 2 // Mark unhealthy after 2 consecutive failures
@@ -159,15 +159,66 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
159159
})
160160

161161
It("should discover all backends including the unhealthy one", func() {
162+
// Wait for health checks to complete and update backend status
163+
// Health monitoring runs every 5s and marks backends unhealthy after 2 consecutive failures
164+
// So we need to wait at least 10-15 seconds for the unhealthy backend to be detected
165+
Eventually(func() error {
166+
vmcpServer := &mcpv1alpha1.VirtualMCPServer{}
167+
if err := k8sClient.Get(ctx, types.NamespacedName{
168+
Name: vmcpServerName,
169+
Namespace: testNamespace,
170+
}, vmcpServer); err != nil {
171+
return err
172+
}
173+
174+
// Should discover all 3 backends
175+
if len(vmcpServer.Status.DiscoveredBackends) != 3 {
176+
return fmt.Errorf("expected 3 discovered backends, got %d", len(vmcpServer.Status.DiscoveredBackends))
177+
}
178+
179+
// BackendCount should be 2 (only ready backends)
180+
if vmcpServer.Status.BackendCount != 2 {
181+
return fmt.Errorf("expected BackendCount=2 (only ready backends), got %d", vmcpServer.Status.BackendCount)
182+
}
183+
184+
// Verify unhealthy backend is marked as unavailable/degraded
185+
unhealthyFound := false
186+
healthyCount := 0
187+
for _, backend := range vmcpServer.Status.DiscoveredBackends {
188+
if backend.Name == unhealthyBackend {
189+
if backend.Status != mcpv1alpha1.BackendStatusUnavailable &&
190+
backend.Status != mcpv1alpha1.BackendStatusDegraded {
191+
return fmt.Errorf("unhealthy backend %s should be unavailable/degraded but is %s",
192+
backend.Name, backend.Status)
193+
}
194+
unhealthyFound = true
195+
} else {
196+
if backend.Status != mcpv1alpha1.BackendStatusReady {
197+
return fmt.Errorf("healthy backend %s should be ready but is %s",
198+
backend.Name, backend.Status)
199+
}
200+
healthyCount++
201+
}
202+
}
203+
204+
if !unhealthyFound {
205+
return fmt.Errorf("unhealthy backend not found in discovered backends")
206+
}
207+
208+
if healthyCount != 2 {
209+
return fmt.Errorf("expected 2 healthy backends, found %d", healthyCount)
210+
}
211+
212+
return nil
213+
}, 30*time.Second, 2*time.Second).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
214+
215+
// Verify all backends are present in discovery
162216
vmcpServer := &mcpv1alpha1.VirtualMCPServer{}
163217
err := k8sClient.Get(ctx, types.NamespacedName{
164218
Name: vmcpServerName,
165219
Namespace: testNamespace,
166220
}, vmcpServer)
167221
Expect(err).NotTo(HaveOccurred())
168-
169-
// Should discover all 3 backends
170-
Expect(vmcpServer.Status.BackendCount).To(Equal(3))
171222
Expect(vmcpServer.Status.DiscoveredBackends).To(HaveLen(3))
172223

173224
// Check that all backends are present in discovery

test/e2e/thv-operator/virtualmcp/virtualmcp_yardstick_base_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ var _ = Describe("VirtualMCPServer Yardstick Base", Ordered, func() {
319319
}, timeout, pollingInterval).Should(Equal(mcpv1alpha1.VirtualMCPServerPhaseDegraded),
320320
"VirtualMCPServer should enter Degraded phase when a backend is unavailable")
321321

322-
By("Verifying backend count reflects one ready backend")
322+
By("Verifying backend count reflects all discovered backends")
323323
// Re-fetch VirtualMCPServer to ensure we have the latest status
324324
Expect(k8sClient.Get(ctx, types.NamespacedName{
325325
Name: vmcpServerName,

0 commit comments

Comments
 (0)