Add periodic health monitoring for vMCP backends

taskbot · taskbot · commit c1baa2680fe0 · 2025-12-19T11:13:08.000+01:00
Implement a health check system that monitors backend MCP server availability through periodic ListCapabilities calls. This is the foundation for the health monitoring and circuit breaker system described in issue #3036, and it addresses the first part of that issue.
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller.go b/cmd/thv-operator/controllers/virtualmcpserver_controller.go
@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"maps"
+	"net/http"
 	"reflect"
 	"strings"
 	"time"
@@ -1656,6 +1657,53 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
 			"authConfigRef", authConfigRef)
 	}
 
+	// Query vmcp health status and update backend statuses if health monitoring is enabled
+	// This provides real MCP health check results instead of just Pod/Phase status
+	if vmcp.Status.URL != "" {
+		healthStatus := r.queryVMCPHealthStatus(ctx, vmcp.Status.URL)
+		if healthStatus != nil {
+			ctxLogger.V(1).Info("Updating backend status from vmcp health checks",
+				"vmcp_url", vmcp.Status.URL,
+				"backend_count", len(healthStatus))
+
+			for i := range discoveredBackends {
+				backend := &discoveredBackends[i]
+				if healthStat, found := healthStatus[backend.Name]; found {
+					// Map vmcp health status to CRD backend status
+					// vmcp statuses: healthy, unhealthy, degraded, unknown
+					// CRD statuses: ready, unavailable, degraded, unknown
+					var newStatus string
+					switch healthStat {
+					case "healthy":
+						newStatus = mcpv1alpha1.BackendStatusReady
+					case "unhealthy":
+						newStatus = mcpv1alpha1.BackendStatusUnavailable
+					case "degraded":
+						newStatus = mcpv1alpha1.BackendStatusDegraded
+					case "unknown":
+						newStatus = mcpv1alpha1.BackendStatusUnknown
+					default:
+						// Keep existing status if health status is unexpected
+						continue
+					}
+
+					// Only log if status changed
+					if newStatus != backend.Status {
+						ctxLogger.V(1).Info("Backend health check updated status",
+							"name", backend.Name,
+							"old_status", backend.Status,
+							"new_status", newStatus,
+							"health_status", healthStat)
+						backend.Status = newStatus
+					}
+				}
+			}
+		} else {
+			ctxLogger.V(1).Info("Health monitoring not enabled or failed to query vmcp health endpoint",
+				"vmcp_url", vmcp.Status.URL)
+		}
+	}
+
 	return discoveredBackends, nil
 }
 
@@ -2096,3 +2144,79 @@ func (*VirtualMCPServerReconciler) vmcpReferencesCompositeToolDefinition(
 
 	return false
 }
+
+// BackendHealthStatusResponse is the JSON body decoded from the vmcp server's /api/backends/health endpoint.
+type BackendHealthStatusResponse struct {
+	Backends []struct {
+		BackendID           string    `json:"backendId"`           // used as the map key by queryVMCPHealthStatus
+		Status              string    `json:"status"`              // used as the map value by queryVMCPHealthStatus
+		ConsecutiveFailures int       `json:"consecutiveFailures"` // NOTE(review): this and the fields below are decoded but not read here
+		LastCheckTime       time.Time `json:"lastCheckTime"`
+		LastError           string    `json:"lastError,omitempty"`
+		LastTransitionTime  time.Time `json:"lastTransitionTime"`
+	} `json:"backends"`
+}
+
+// queryVMCPHealthStatus fetches per-backend health from the vmcp server's
+// /api/backends/health endpoint and returns it as a map of backend ID to status string.
+//
+// vmcpURL is the base URL of the vmcp server; trailing slashes are trimmed so the
+// joined URL never contains a double slash. The request is bound to ctx and to a
+// 5-second client timeout.
+//
+// It returns nil when the request cannot be built or sent, when the server answers
+// with anything other than 200 OK (503 is logged as "health monitoring not enabled"),
+// or when the body is not valid JSON. Failures are logged here and never returned.
+func (*VirtualMCPServerReconciler) queryVMCPHealthStatus(
+	ctx context.Context,
+	vmcpURL string,
+) map[string]string {
+	ctxLogger := log.FromContext(ctx)
+	healthURL := strings.TrimRight(vmcpURL, "/") + "/api/backends/health"
+
+	// Short timeout so an unresponsive vmcp server cannot stall the caller for long.
+	httpClient := &http.Client{Timeout: 5 * time.Second}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil)
+	if err != nil {
+		ctxLogger.V(1).Error(err, "Failed to create health check request", "url", healthURL)
+		return nil
+	}
+
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		ctxLogger.V(1).Error(err, "Failed to query vmcp health endpoint", "url", healthURL)
+		return nil
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == http.StatusServiceUnavailable {
+		// This code treats 503 as "health monitoring is not enabled" on the vmcp server.
+		ctxLogger.V(1).Info("Health monitoring not enabled on vmcp server", "url", healthURL)
+		return nil
+	}
+	if resp.StatusCode != http.StatusOK {
+		ctxLogger.V(1).Info("Unexpected status code from vmcp health endpoint",
+			"url", healthURL,
+			"status_code", resp.StatusCode)
+		return nil
+	}
+
+	var healthResp BackendHealthStatusResponse
+	if err := json.NewDecoder(resp.Body).Decode(&healthResp); err != nil {
+		ctxLogger.V(1).Error(err, "Failed to decode health response", "url", healthURL)
+		return nil
+	}
+
+	// Flatten to backendID -> status; the size is known, so allocate the map once.
+	healthStatus := make(map[string]string, len(healthResp.Backends))
+	for _, backend := range healthResp.Backends {
+		healthStatus[backend.BackendID] = backend.Status
+	}
+
+	ctxLogger.V(1).Info("Retrieved health status from vmcp server",
+		"url", healthURL,
+		"backend_count", len(healthStatus))
+
+	return healthStatus
+}
diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go
@@ -358,6 +358,15 @@ func runServe(cmd *cobra.Command, _ []string) error {
 		HealthMonitorConfig: healthMonitorConfig,
 	}
 
+	// Configure health monitoring if enabled
+	if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
+		serverCfg.HealthMonitorConfig = &health.MonitorConfig{
+			CheckInterval:      time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval),
+			UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
+			Timeout:            10 * time.Second, // Default timeout
+		}
+	}
+
 	// Convert composite tool configurations to workflow definitions
 	workflowDefs, err := vmcpserver.ConvertConfigToWorkflowDefinitions(cfg.CompositeTools)
 	if err != nil {
diff --git a/test/e2e/thv-operator/virtualmcp/README.md b/test/e2e/thv-operator/virtualmcp/README.md
@@ -64,6 +64,7 @@ ginkgo -vv
 
 - `suite_test.go` - Ginkgo test suite setup with kubeconfig loading
 - `virtualmcp_discovered_mode_test.go` - Tests VirtualMCPServer with discovered mode aggregation
+- `virtualmcp_health_monitoring_test.go` - Tests VirtualMCPServer health monitoring functionality
 - `helpers.go` - Common helper functions for interacting with Kubernetes resources
 - `README.md` - This file
 
@@ -77,6 +78,16 @@ Comprehensive E2E tests for VirtualMCPServer in discovered mode, which automatic
 - Validates discovered mode configuration and backend discovery
 - Uses prefix conflict resolution strategy to namespace tools from different backends
 
+#### Health Monitoring Tests (`virtualmcp_health_monitoring_test.go`)
+End-to-end tests for VirtualMCPServer health monitoring of backend MCP servers:
+- Creates VirtualMCPServer with configured health check interval and unhealthy threshold
+- Creates multiple backend MCPServers (2 healthy, 1 initially unhealthy)
+- Verifies health monitoring correctly identifies healthy and unhealthy backends
+- Tests that health check timestamps are updated periodically
+- Validates backend recovery detection (unhealthy → healthy transitions)
+- Ensures health status is accurately reflected in VirtualMCPServer status
+- Uses fast health check intervals (5s) for quicker test execution
+
 ## Environment Variables
 
 | Variable | Description | Default |
diff --git a/test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go b/test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go