changes from review

taskbot · taskbot · commit 5d3dc03c047b · 2025-12-22T11:51:06.000+01:00
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller.go b/cmd/thv-operator/controllers/virtualmcpserver_controller.go
@@ -86,6 +86,7 @@ type VirtualMCPServerReconciler struct {
 	PlatformDetector *ctrlutil.SharedPlatformDetector
 
 	// healthStatusCache caches vmcp health endpoint responses to reduce HTTP overhead
+	// Initialized in SetupWithManager before reconciliation starts (controller-runtime contract)
 	healthStatusCache      map[string]*healthStatusCacheEntry
 	healthStatusCacheMutex sync.RWMutex
 }
@@ -205,14 +206,9 @@ func (r *VirtualMCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Req
 		return ctrl.Result{}, err
 	}
 
-	// Apply discovered backends to latestVMCP so updateVirtualMCPServerStatus can use them
-	// for phase determination. The statusManager has the updated backends from discoverBackends,
-	// but they haven't been applied to the CR yet.
-	if discoveredBackends != nil {
-		latestVMCP.Status.DiscoveredBackends = discoveredBackends
-	}
-
 	// Update status based on pod health using the latest Generation
+	// Note: updateVirtualMCPServerStatus uses statusManager.GetDiscoveredBackends()
+	// for phase determination, so discovered backends don't need to be applied here
 	if err := r.updateVirtualMCPServerStatus(ctx, latestVMCP, statusManager); err != nil {
 		ctxLogger.Error(err, "Failed to update VirtualMCPServer status")
 		return ctrl.Result{}, err
@@ -231,16 +227,13 @@ func (r *VirtualMCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	if vmcp.Spec.Operational != nil && vmcp.Spec.Operational.FailureHandling != nil &&
 		vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
 		// Parse health check interval to determine requeue time
-		// Note: We parse the duration string on each reconciliation rather than caching
-		// because time.ParseDuration is extremely fast (<1μs) and reconciliation frequency
-		// is already throttled (typically every 10s). Caching would add unnecessary complexity.
 		interval, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
 		if err != nil {
 			// Invalid duration format - log warning and fall through to event-driven reconciliation
 			// This should be caught by webhook validation, but we handle it gracefully here
-			ctxLogger.Error(err, "Invalid HealthCheckInterval format, health monitoring disabled",
+			ctxLogger.Error(err, "Invalid HealthCheckInterval format, falling back to event-driven reconciliation",
 				"health_check_interval", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
-			// Continue with event-driven reconciliation instead of periodic
+			// Continue with event-driven reconciliation instead of periodic polling
 		} else {
 			// Requeue at a multiple of the health check interval to ensure we catch updates
 			// without reconciling too frequently
@@ -1719,7 +1712,7 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
 	// Query vmcp health status and update backend statuses if health monitoring is enabled
 	// This provides real MCP health check results instead of just Pod/Phase status
 	//
-	// Performance: Health status responses are cached with a short TTL (10s) to reduce HTTP
+	// Performance: Health status responses are cached with healthStatusCacheTTL to reduce HTTP
 	// overhead from frequent reconciliations while maintaining relatively fresh health data.
 	// The vmcp health endpoint itself returns cached results from periodic health checks.
 	if vmcp.Status.URL != "" {
@@ -1768,7 +1761,14 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
 						}
 					}
 
+					// Update LastHealthCheck with the actual health check timestamp reported by vmcp.
+					// Do this BEFORE the shouldPreserveUnavailable check so the timestamp is refreshed even when the status is preserved.
+					if !healthInfo.LastCheckTime.IsZero() {
+						discoveredBackends[i].LastHealthCheck = metav1.NewTime(healthInfo.LastCheckTime)
+					}
+
 					if shouldPreserveUnavailable {
+						// Skip status update but keep timestamp fresh (already updated above)
 						continue
 					}
 
@@ -1781,11 +1781,6 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
 							"health_status", healthInfo.Status)
 						discoveredBackends[i].Status = newStatus
 					}
-
-					// Update LastHealthCheck with actual health check timestamp from vmcp
-					if !healthInfo.LastCheckTime.IsZero() {
-						discoveredBackends[i].LastHealthCheck = metav1.NewTime(healthInfo.LastCheckTime)
-					}
 				}
 			}
 		} else {
@@ -1865,6 +1860,9 @@ func (phaseChangePredicate) Update(e event.UpdateEvent) bool {
 		}
 	}
 
+	// Return false for any other type. This should never happen in practice because
+	// this predicate is only registered for MCPServer and MCPRemoteProxy watches
+	// in SetupWithManager(). The controller-runtime framework guarantees type safety.
 	return false
 }
 
@@ -2383,8 +2381,8 @@ func (r *VirtualMCPServerReconciler) queryVMCPHealthStatus(
 
 	// Create HTTP client with derived timeout
 	// Note: Uses default transport which validates TLS certificates.
-	// If the vmcp server uses self-signed certificates, ensure proper cert configuration
-	// (e.g., via cert-manager) or use HTTP for internal cluster communication.
+	// For self-signed certificates, use proper certificate management (e.g., cert-manager)
+	// to establish trust. Disabling TLS validation or using HTTP is not recommended.
 	httpClient := &http.Client{
 		Timeout: timeout,
 	}
diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go
@@ -369,15 +369,6 @@ func runServe(cmd *cobra.Command, _ []string) error {
 		HealthMonitorConfig: healthMonitorConfig,
 	}
 
-	// Configure health monitoring if enabled
-	if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
-		serverCfg.HealthMonitorConfig = &health.MonitorConfig{
-			CheckInterval:      time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval),
-			UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
-			Timeout:            10 * time.Second, // Default timeout
-		}
-	}
-
 	// Convert composite tool configurations to workflow definitions
 	workflowDefs, err := vmcpserver.ConvertConfigToWorkflowDefinitions(cfg.CompositeTools)
 	if err != nil {
diff --git a/examples/operator/virtual-mcps/vmcp_health_monitoring.yaml b/examples/operator/virtual-mcps/vmcp_health_monitoring.yaml
@@ -147,12 +147,6 @@ spec:
       # Recommended: best_effort for production to maintain availability
       partialFailureMode: best_effort
 
-      # Optional: Circuit breaker (not yet implemented in this release)
-      # circuitBreaker:
-      #   enabled: true
-      #   failureThreshold: 5
-      #   timeout: 60s
-
 ---
 # Example: To test health monitoring behavior, you can:
 #
diff --git a/test/e2e/thv-operator/virtualmcp/helpers.go b/test/e2e/thv-operator/virtualmcp/helpers.go
@@ -72,9 +72,9 @@ func WaitForVirtualMCPServerReady(
 	}, timeout, pollingInterval).Should(gomega.Succeed())
 }
 
-// WaitForVirtualMCPServerDeployed waits for a VirtualMCPServer deployment to be running
+// WaitForVirtualMCPServerDeployed waits for a VirtualMCPServer to have pods running and a URL assigned,
 // without requiring the Ready condition to be True. This is useful for health monitoring tests
-// where some backends may intentionally be unhealthy.
+// where some backends may intentionally be unhealthy, causing the Ready condition to be False.
 func WaitForVirtualMCPServerDeployed(
 	ctx context.Context,
 	c client.Client,
diff --git a/test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go b/test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go
@@ -15,16 +15,17 @@ import (
 
 var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
 	var (
-		testNamespace       = "default"
-		mcpGroupName        = "test-health-group"
-		vmcpServerName      = "test-vmcp-health"
-		healthyBackend1     = "healthy-backend-1"
-		healthyBackend2     = "healthy-backend-2"
-		unhealthyBackend    = "unhealthy-backend"
-		timeout             = 3 * time.Minute
-		pollingInterval     = 2 * time.Second
-		healthCheckInterval = "5s" // Fast checks for e2e
-		unhealthyThreshold  = 2    // Mark unhealthy after 2 consecutive failures
+		testNamespace               = "default"
+		mcpGroupName                = "test-health-group"
+		vmcpServerName              = "test-vmcp-health"
+		healthyBackend1             = "healthy-backend-1"
+		healthyBackend2             = "healthy-backend-2"
+		unhealthyBackend            = "unhealthy-backend"
+		timeout                     = 3 * time.Minute
+		pollingInterval             = 2 * time.Second
+		healthCheckInterval         = "5s"             // Fast checks for e2e
+		unhealthyThreshold          = 2                // Mark unhealthy after 2 consecutive failures
+		healthCheckStabilizeTimeout = 30 * time.Second // Time for health checks to stabilize and detect failures
 	)
 
 	BeforeAll(func() {
@@ -210,7 +211,7 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
 			}
 
 			return nil
-		}, 30*time.Second, 2*time.Second).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
+		}, healthCheckStabilizeTimeout, pollingInterval).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
 
 		// Verify all backends are present in discovery
 		vmcpServer := &mcpv1alpha1.VirtualMCPServer{}