Skip to content

Commit e063ed5

Browse files
committed
changes from review
1 parent ee8622c commit e063ed5

File tree

5 files changed

+32
-48
lines changed

5 files changed

+32
-48
lines changed

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ type VirtualMCPServerReconciler struct {
8686
PlatformDetector *ctrlutil.SharedPlatformDetector
8787

8888
// healthStatusCache caches vmcp health endpoint responses to reduce HTTP overhead
89+
// Initialized in SetupWithManager before reconciliation starts (controller-runtime contract)
8990
healthStatusCache map[string]*healthStatusCacheEntry
9091
healthStatusCacheMutex sync.RWMutex
9192
}
@@ -205,14 +206,9 @@ func (r *VirtualMCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Req
205206
return ctrl.Result{}, err
206207
}
207208

208-
// Apply discovered backends to latestVMCP so updateVirtualMCPServerStatus can use them
209-
// for phase determination. The statusManager has the updated backends from discoverBackends,
210-
// but they haven't been applied to the CR yet.
211-
if discoveredBackends != nil {
212-
latestVMCP.Status.DiscoveredBackends = discoveredBackends
213-
}
214-
215209
// Update status based on pod health using the latest Generation
210+
// Note: updateVirtualMCPServerStatus uses statusManager.GetDiscoveredBackends()
211+
// for phase determination, so discovered backends don't need to be applied here
216212
if err := r.updateVirtualMCPServerStatus(ctx, latestVMCP, statusManager); err != nil {
217213
ctxLogger.Error(err, "Failed to update VirtualMCPServer status")
218214
return ctrl.Result{}, err
@@ -231,16 +227,13 @@ func (r *VirtualMCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Req
231227
if vmcp.Spec.Operational != nil && vmcp.Spec.Operational.FailureHandling != nil &&
232228
vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
233229
// Parse health check interval to determine requeue time
234-
// Note: We parse the duration string on each reconciliation rather than caching
235-
// because time.ParseDuration is extremely fast (<1μs) and reconciliation frequency
236-
// is already throttled (typically every 10s). Caching would add unnecessary complexity.
237230
interval, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
238231
if err != nil {
239232
// Invalid duration format - log warning and fall through to event-driven reconciliation
240233
// This should be caught by webhook validation, but we handle it gracefully here
241-
ctxLogger.Error(err, "Invalid HealthCheckInterval format, health monitoring disabled",
234+
ctxLogger.Error(err, "Invalid HealthCheckInterval format, falling back to event-driven reconciliation",
242235
"health_check_interval", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
243-
// Continue with event-driven reconciliation instead of periodic
236+
// Continue with event-driven reconciliation instead of periodic polling
244237
} else {
245238
// Requeue at a multiple of the health check interval to ensure we catch updates
246239
// without reconciling too frequently
@@ -1715,7 +1708,7 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17151708
// Query vmcp health status and update backend statuses if health monitoring is enabled
17161709
// This provides real MCP health check results instead of just Pod/Phase status
17171710
//
1718-
// Performance: Health status responses are cached with a short TTL (10s) to reduce HTTP
1711+
// Performance: Health status responses are cached with healthStatusCacheTTL to reduce HTTP
17191712
// overhead from frequent reconciliations while maintaining relatively fresh health data.
17201713
// The vmcp health endpoint itself returns cached results from periodic health checks.
17211714
if vmcp.Status.URL != "" {
@@ -1764,7 +1757,14 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17641757
}
17651758
}
17661759

1760+
// Update LastHealthCheck with actual health check timestamp from vmcp
1761+
// Do this BEFORE the shouldPreserveUnavailable check so timestamp is always fresh
1762+
if !healthInfo.LastCheckTime.IsZero() {
1763+
discoveredBackends[i].LastHealthCheck = metav1.NewTime(healthInfo.LastCheckTime)
1764+
}
1765+
17671766
if shouldPreserveUnavailable {
1767+
// Skip status update but keep timestamp fresh (already updated above)
17681768
continue
17691769
}
17701770

@@ -1777,11 +1777,6 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17771777
"health_status", healthInfo.Status)
17781778
discoveredBackends[i].Status = newStatus
17791779
}
1780-
1781-
// Update LastHealthCheck with actual health check timestamp from vmcp
1782-
if !healthInfo.LastCheckTime.IsZero() {
1783-
discoveredBackends[i].LastHealthCheck = metav1.NewTime(healthInfo.LastCheckTime)
1784-
}
17851780
}
17861781
}
17871782
} else {
@@ -1861,6 +1856,9 @@ func (phaseChangePredicate) Update(e event.UpdateEvent) bool {
18611856
}
18621857
}
18631858

1859+
// Return false for any other type. This should never happen in practice because
1860+
// this predicate is only registered for MCPServer and MCPRemoteProxy watches
1861+
// in SetupWithManager(). The controller-runtime framework guarantees type safety.
18641862
return false
18651863
}
18661864

@@ -2379,8 +2377,8 @@ func (r *VirtualMCPServerReconciler) queryVMCPHealthStatus(
23792377

23802378
// Create HTTP client with derived timeout
23812379
// Note: Uses default transport which validates TLS certificates.
2382-
// If the vmcp server uses self-signed certificates, ensure proper cert configuration
2383-
// (e.g., via cert-manager) or use HTTP for internal cluster communication.
2380+
// For self-signed certificates, use proper certificate management (e.g., cert-manager)
2381+
// to establish trust. Disabling TLS validation or using HTTP is not recommended.
23842382
httpClient := &http.Client{
23852383
Timeout: timeout,
23862384
}

cmd/vmcp/app/commands.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -358,15 +358,6 @@ func runServe(cmd *cobra.Command, _ []string) error {
358358
HealthMonitorConfig: healthMonitorConfig,
359359
}
360360

361-
// Configure health monitoring if enabled
362-
if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
363-
serverCfg.HealthMonitorConfig = &health.MonitorConfig{
364-
CheckInterval: time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval),
365-
UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
366-
Timeout: 10 * time.Second, // Default timeout
367-
}
368-
}
369-
370361
// Convert composite tool configurations to workflow definitions
371362
workflowDefs, err := vmcpserver.ConvertConfigToWorkflowDefinitions(cfg.CompositeTools)
372363
if err != nil {

examples/operator/virtual-mcps/vmcp_health_monitoring.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,6 @@ spec:
147147
# Recommended: best_effort for production to maintain availability
148148
partialFailureMode: best_effort
149149

150-
# Optional: Circuit breaker (not yet implemented in this release)
151-
# circuitBreaker:
152-
# enabled: true
153-
# failureThreshold: 5
154-
# timeout: 60s
155-
156150
---
157151
# Example: To test health monitoring behavior, you can:
158152
#

test/e2e/thv-operator/virtualmcp/helpers.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ func WaitForVirtualMCPServerReady(
7272
}, timeout, pollingInterval).Should(gomega.Succeed())
7373
}
7474

75-
// WaitForVirtualMCPServerDeployed waits for a VirtualMCPServer deployment to be running
75+
// WaitForVirtualMCPServerDeployed waits for a VirtualMCPServer to have pods running and a URL assigned,
7676
// without requiring the Ready condition to be True. This is useful for health monitoring tests
77-
// where some backends may intentionally be unhealthy.
77+
// where some backends may intentionally be unhealthy, causing Ready condition to be False.
7878
func WaitForVirtualMCPServerDeployed(
7979
ctx context.Context,
8080
c client.Client,

test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,17 @@ import (
1515

1616
var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
1717
var (
18-
testNamespace = "default"
19-
mcpGroupName = "test-health-group"
20-
vmcpServerName = "test-vmcp-health"
21-
healthyBackend1 = "healthy-backend-1"
22-
healthyBackend2 = "healthy-backend-2"
23-
unhealthyBackend = "unhealthy-backend"
24-
timeout = 3 * time.Minute
25-
pollingInterval = 2 * time.Second
26-
healthCheckInterval = "5s" // Fast checks for e2e
27-
unhealthyThreshold = 2 // Mark unhealthy after 2 consecutive failures
18+
testNamespace = "default"
19+
mcpGroupName = "test-health-group"
20+
vmcpServerName = "test-vmcp-health"
21+
healthyBackend1 = "healthy-backend-1"
22+
healthyBackend2 = "healthy-backend-2"
23+
unhealthyBackend = "unhealthy-backend"
24+
timeout = 3 * time.Minute
25+
pollingInterval = 2 * time.Second
26+
healthCheckInterval = "5s" // Fast checks for e2e
27+
unhealthyThreshold = 2 // Mark unhealthy after 2 consecutive failures
28+
healthCheckStabilizeTimeout = 30 * time.Second // Time for health checks to stabilize and detect failures
2829
)
2930

3031
BeforeAll(func() {
@@ -210,7 +211,7 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
210211
}
211212

212213
return nil
213-
}, 30*time.Second, 2*time.Second).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
214+
}, healthCheckStabilizeTimeout, pollingInterval).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
214215

215216
// Verify all backends are present in discovery
216217
vmcpServer := &mcpv1alpha1.VirtualMCPServer{}

0 commit comments

Comments
 (0)