Skip to content

Commit 5d3dc03

Browse files
committed
changes from review
1 parent 4344092 commit 5d3dc03

File tree

5 files changed

+32
-48
lines changed

5 files changed

+32
-48
lines changed

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ type VirtualMCPServerReconciler struct {
8686
PlatformDetector *ctrlutil.SharedPlatformDetector
8787

8888
// healthStatusCache caches vmcp health endpoint responses to reduce HTTP overhead
89+
// Initialized in SetupWithManager before reconciliation starts (controller-runtime contract)
8990
healthStatusCache map[string]*healthStatusCacheEntry
9091
healthStatusCacheMutex sync.RWMutex
9192
}
@@ -205,14 +206,9 @@ func (r *VirtualMCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Req
205206
return ctrl.Result{}, err
206207
}
207208

208-
// Apply discovered backends to latestVMCP so updateVirtualMCPServerStatus can use them
209-
// for phase determination. The statusManager has the updated backends from discoverBackends,
210-
// but they haven't been applied to the CR yet.
211-
if discoveredBackends != nil {
212-
latestVMCP.Status.DiscoveredBackends = discoveredBackends
213-
}
214-
215209
// Update status based on pod health using the latest Generation
210+
// Note: updateVirtualMCPServerStatus uses statusManager.GetDiscoveredBackends()
211+
// for phase determination, so discovered backends don't need to be applied here
216212
if err := r.updateVirtualMCPServerStatus(ctx, latestVMCP, statusManager); err != nil {
217213
ctxLogger.Error(err, "Failed to update VirtualMCPServer status")
218214
return ctrl.Result{}, err
@@ -231,16 +227,13 @@ func (r *VirtualMCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Req
231227
if vmcp.Spec.Operational != nil && vmcp.Spec.Operational.FailureHandling != nil &&
232228
vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
233229
// Parse health check interval to determine requeue time
234-
// Note: We parse the duration string on each reconciliation rather than caching
235-
// because time.ParseDuration is extremely fast (<1μs) and reconciliation frequency
236-
// is already throttled (typically every 10s). Caching would add unnecessary complexity.
237230
interval, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
238231
if err != nil {
239232
// Invalid duration format - log warning and fall through to event-driven reconciliation
240233
// This should be caught by webhook validation, but we handle it gracefully here
241-
ctxLogger.Error(err, "Invalid HealthCheckInterval format, health monitoring disabled",
234+
ctxLogger.Error(err, "Invalid HealthCheckInterval format, falling back to event-driven reconciliation",
242235
"health_check_interval", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
243-
// Continue with event-driven reconciliation instead of periodic
236+
// Continue with event-driven reconciliation instead of periodic polling
244237
} else {
245238
// Requeue at a multiple of the health check interval to ensure we catch updates
246239
// without reconciling too frequently
@@ -1719,7 +1712,7 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17191712
// Query vmcp health status and update backend statuses if health monitoring is enabled
17201713
// This provides real MCP health check results instead of just Pod/Phase status
17211714
//
1722-
// Performance: Health status responses are cached with a short TTL (10s) to reduce HTTP
1715+
// Performance: Health status responses are cached with healthStatusCacheTTL to reduce HTTP
17231716
// overhead from frequent reconciliations while maintaining relatively fresh health data.
17241717
// The vmcp health endpoint itself returns cached results from periodic health checks.
17251718
if vmcp.Status.URL != "" {
@@ -1768,7 +1761,14 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17681761
}
17691762
}
17701763

1764+
// Update LastHealthCheck with actual health check timestamp from vmcp
1765+
// Do this BEFORE the shouldPreserveUnavailable check so timestamp is always fresh
1766+
if !healthInfo.LastCheckTime.IsZero() {
1767+
discoveredBackends[i].LastHealthCheck = metav1.NewTime(healthInfo.LastCheckTime)
1768+
}
1769+
17711770
if shouldPreserveUnavailable {
1771+
// Skip status update but keep timestamp fresh (already updated above)
17721772
continue
17731773
}
17741774

@@ -1781,11 +1781,6 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17811781
"health_status", healthInfo.Status)
17821782
discoveredBackends[i].Status = newStatus
17831783
}
1784-
1785-
// Update LastHealthCheck with actual health check timestamp from vmcp
1786-
if !healthInfo.LastCheckTime.IsZero() {
1787-
discoveredBackends[i].LastHealthCheck = metav1.NewTime(healthInfo.LastCheckTime)
1788-
}
17891784
}
17901785
}
17911786
} else {
@@ -1865,6 +1860,9 @@ func (phaseChangePredicate) Update(e event.UpdateEvent) bool {
18651860
}
18661861
}
18671862

1863+
// Return false for any other type. This should never happen in practice because
1864+
// this predicate is only registered for MCPServer and MCPRemoteProxy watches
1865+
// in SetupWithManager(). The controller-runtime framework guarantees type safety.
18681866
return false
18691867
}
18701868

@@ -2383,8 +2381,8 @@ func (r *VirtualMCPServerReconciler) queryVMCPHealthStatus(
23832381

23842382
// Create HTTP client with derived timeout
23852383
// Note: Uses default transport which validates TLS certificates.
2386-
// If the vmcp server uses self-signed certificates, ensure proper cert configuration
2387-
// (e.g., via cert-manager) or use HTTP for internal cluster communication.
2384+
// For self-signed certificates, use proper certificate management (e.g., cert-manager)
2385+
// to establish trust. Disabling TLS validation or using HTTP is not recommended.
23882386
httpClient := &http.Client{
23892387
Timeout: timeout,
23902388
}

cmd/vmcp/app/commands.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -369,15 +369,6 @@ func runServe(cmd *cobra.Command, _ []string) error {
369369
HealthMonitorConfig: healthMonitorConfig,
370370
}
371371

372-
// Configure health monitoring if enabled
373-
if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
374-
serverCfg.HealthMonitorConfig = &health.MonitorConfig{
375-
CheckInterval: time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval),
376-
UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
377-
Timeout: 10 * time.Second, // Default timeout
378-
}
379-
}
380-
381372
// Convert composite tool configurations to workflow definitions
382373
workflowDefs, err := vmcpserver.ConvertConfigToWorkflowDefinitions(cfg.CompositeTools)
383374
if err != nil {

examples/operator/virtual-mcps/vmcp_health_monitoring.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,6 @@ spec:
147147
# Recommended: best_effort for production to maintain availability
148148
partialFailureMode: best_effort
149149

150-
# Optional: Circuit breaker (not yet implemented in this release)
151-
# circuitBreaker:
152-
# enabled: true
153-
# failureThreshold: 5
154-
# timeout: 60s
155-
156150
---
157151
# Example: To test health monitoring behavior, you can:
158152
#

test/e2e/thv-operator/virtualmcp/helpers.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ func WaitForVirtualMCPServerReady(
7272
}, timeout, pollingInterval).Should(gomega.Succeed())
7373
}
7474

75-
// WaitForVirtualMCPServerDeployed waits for a VirtualMCPServer deployment to be running
75+
// WaitForVirtualMCPServerDeployed waits for a VirtualMCPServer to have pods running and a URL assigned,
7676
// without requiring the Ready condition to be True. This is useful for health monitoring tests
77-
// where some backends may intentionally be unhealthy.
77+
// where some backends may intentionally be unhealthy, causing Ready condition to be False.
7878
func WaitForVirtualMCPServerDeployed(
7979
ctx context.Context,
8080
c client.Client,

test/e2e/thv-operator/virtualmcp/virtualmcp_health_monitoring_test.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,17 @@ import (
1515

1616
var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
1717
var (
18-
testNamespace = "default"
19-
mcpGroupName = "test-health-group"
20-
vmcpServerName = "test-vmcp-health"
21-
healthyBackend1 = "healthy-backend-1"
22-
healthyBackend2 = "healthy-backend-2"
23-
unhealthyBackend = "unhealthy-backend"
24-
timeout = 3 * time.Minute
25-
pollingInterval = 2 * time.Second
26-
healthCheckInterval = "5s" // Fast checks for e2e
27-
unhealthyThreshold = 2 // Mark unhealthy after 2 consecutive failures
18+
testNamespace = "default"
19+
mcpGroupName = "test-health-group"
20+
vmcpServerName = "test-vmcp-health"
21+
healthyBackend1 = "healthy-backend-1"
22+
healthyBackend2 = "healthy-backend-2"
23+
unhealthyBackend = "unhealthy-backend"
24+
timeout = 3 * time.Minute
25+
pollingInterval = 2 * time.Second
26+
healthCheckInterval = "5s" // Fast checks for e2e
27+
unhealthyThreshold = 2 // Mark unhealthy after 2 consecutive failures
28+
healthCheckStabilizeTimeout = 30 * time.Second // Time for health checks to stabilize and detect failures
2829
)
2930

3031
BeforeAll(func() {
@@ -210,7 +211,7 @@ var _ = Describe("VirtualMCPServer Health Monitoring", Ordered, func() {
210211
}
211212

212213
return nil
213-
}, 30*time.Second, 2*time.Second).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
214+
}, healthCheckStabilizeTimeout, pollingInterval).Should(Succeed(), "Health checks should mark unhealthy backend as unavailable")
214215

215216
// Verify all backends are present in discovery
216217
vmcpServer := &mcpv1alpha1.VirtualMCPServer{}

0 commit comments

Comments
 (0)