Skip to content

Commit c1baa26

Browse files
committed
Add periodic health monitoring for vMCP backends
Implement a health check system that monitors backend MCP server availability through periodic ListCapabilities calls. This is the foundation for the health monitoring and circuit breaker system described in issue #3036, and it addresses the first part of that issue.
1 parent 6a3fcae commit c1baa26

4 files changed

Lines changed: 515 additions & 0 deletions

File tree

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"encoding/json"
88
"fmt"
99
"maps"
10+
"net/http"
1011
"reflect"
1112
"strings"
1213
"time"
@@ -1656,6 +1657,53 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
16561657
"authConfigRef", authConfigRef)
16571658
}
16581659

1660+
// Query vmcp health status and update backend statuses if health monitoring is enabled
1661+
// This provides real MCP health check results instead of just Pod/Phase status
1662+
if vmcp.Status.URL != "" {
1663+
healthStatus := r.queryVMCPHealthStatus(ctx, vmcp.Status.URL)
1664+
if healthStatus != nil {
1665+
ctxLogger.V(1).Info("Updating backend status from vmcp health checks",
1666+
"vmcp_url", vmcp.Status.URL,
1667+
"backend_count", len(healthStatus))
1668+
1669+
for i := range discoveredBackends {
1670+
backend := &discoveredBackends[i]
1671+
if healthStat, found := healthStatus[backend.Name]; found {
1672+
// Map vmcp health status to CRD backend status
1673+
// vmcp statuses: healthy, unhealthy, degraded, unknown
1674+
// CRD statuses: ready, unavailable, degraded, unknown
1675+
var newStatus string
1676+
switch healthStat {
1677+
case "healthy":
1678+
newStatus = mcpv1alpha1.BackendStatusReady
1679+
case "unhealthy":
1680+
newStatus = mcpv1alpha1.BackendStatusUnavailable
1681+
case "degraded":
1682+
newStatus = mcpv1alpha1.BackendStatusDegraded
1683+
case "unknown":
1684+
newStatus = mcpv1alpha1.BackendStatusUnknown
1685+
default:
1686+
// Keep existing status if health status is unexpected
1687+
continue
1688+
}
1689+
1690+
// Only log if status changed
1691+
if newStatus != backend.Status {
1692+
ctxLogger.V(1).Info("Backend health check updated status",
1693+
"name", backend.Name,
1694+
"old_status", backend.Status,
1695+
"new_status", newStatus,
1696+
"health_status", healthStat)
1697+
backend.Status = newStatus
1698+
}
1699+
}
1700+
}
1701+
} else {
1702+
ctxLogger.V(1).Info("Health monitoring not enabled or failed to query vmcp health endpoint",
1703+
"vmcp_url", vmcp.Status.URL)
1704+
}
1705+
}
1706+
16591707
return discoveredBackends, nil
16601708
}
16611709

@@ -2096,3 +2144,79 @@ func (*VirtualMCPServerReconciler) vmcpReferencesCompositeToolDefinition(
20962144

20972145
return false
20982146
}
2147+
2148+
// BackendHealthStatusResponse is the decoded form of the JSON document served
// by the vmcp health API. The document carries a single "backends" array with
// one record per backend.
type BackendHealthStatusResponse struct {
	// Backends holds the per-backend health records from the "backends" key.
	Backends []struct {
		// BackendID is read from "backendId"; the reconciler uses it as the
		// lookup key when matching records to discovered backends.
		BackendID string `json:"backendId"`
		// Status is read from "status". The reconciler's mapping recognizes
		// "healthy", "unhealthy", "degraded" and "unknown".
		Status string `json:"status"`
		// ConsecutiveFailures is read from "consecutiveFailures".
		ConsecutiveFailures int `json:"consecutiveFailures"`
		// LastCheckTime is read from "lastCheckTime".
		LastCheckTime time.Time `json:"lastCheckTime"`
		// LastError is read from "lastError"; the key is optional.
		LastError string `json:"lastError,omitempty"`
		// LastTransitionTime is read from "lastTransitionTime".
		LastTransitionTime time.Time `json:"lastTransitionTime"`
	} `json:"backends"`
}
2159+
2160+
// queryVMCPHealthStatus queries the vmcp health endpoint and returns backend health status.
2161+
// Returns nil if health monitoring is not enabled or if there's an error.
2162+
func (*VirtualMCPServerReconciler) queryVMCPHealthStatus(
2163+
ctx context.Context,
2164+
vmcpURL string,
2165+
) map[string]string {
2166+
ctxLogger := log.FromContext(ctx)
2167+
2168+
// Construct health endpoint URL
2169+
healthURL := fmt.Sprintf("%s/api/backends/health", vmcpURL)
2170+
2171+
// Create HTTP client with timeout
2172+
httpClient := &http.Client{
2173+
Timeout: 5 * time.Second,
2174+
}
2175+
2176+
// Create and execute request
2177+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil)
2178+
if err != nil {
2179+
ctxLogger.V(1).Error(err, "Failed to create health check request", "url", healthURL)
2180+
return nil
2181+
}
2182+
2183+
resp, err := httpClient.Do(req)
2184+
if err != nil {
2185+
ctxLogger.V(1).Error(err, "Failed to query vmcp health endpoint", "url", healthURL)
2186+
return nil
2187+
}
2188+
defer resp.Body.Close()
2189+
2190+
// Check status code
2191+
if resp.StatusCode == http.StatusServiceUnavailable {
2192+
// Health monitoring is not enabled on the vmcp server
2193+
ctxLogger.V(1).Info("Health monitoring not enabled on vmcp server", "url", healthURL)
2194+
return nil
2195+
}
2196+
2197+
if resp.StatusCode != http.StatusOK {
2198+
ctxLogger.V(1).Info("Unexpected status code from vmcp health endpoint",
2199+
"url", healthURL,
2200+
"status_code", resp.StatusCode)
2201+
return nil
2202+
}
2203+
2204+
// Parse response
2205+
var healthResp BackendHealthStatusResponse
2206+
if err := json.NewDecoder(resp.Body).Decode(&healthResp); err != nil {
2207+
ctxLogger.V(1).Error(err, "Failed to decode health response", "url", healthURL)
2208+
return nil
2209+
}
2210+
2211+
// Convert to map of backendID -> status
2212+
healthStatus := make(map[string]string)
2213+
for _, backend := range healthResp.Backends {
2214+
healthStatus[backend.BackendID] = backend.Status
2215+
}
2216+
2217+
ctxLogger.V(1).Info("Retrieved health status from vmcp server",
2218+
"url", healthURL,
2219+
"backend_count", len(healthStatus))
2220+
2221+
return healthStatus
2222+
}

cmd/vmcp/app/commands.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,15 @@ func runServe(cmd *cobra.Command, _ []string) error {
358358
HealthMonitorConfig: healthMonitorConfig,
359359
}
360360

361+
// Configure health monitoring if enabled
362+
if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
363+
serverCfg.HealthMonitorConfig = &health.MonitorConfig{
364+
CheckInterval: time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval),
365+
UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
366+
Timeout: 10 * time.Second, // Default timeout
367+
}
368+
}
369+
361370
// Convert composite tool configurations to workflow definitions
362371
workflowDefs, err := vmcpserver.ConvertConfigToWorkflowDefinitions(cfg.CompositeTools)
363372
if err != nil {

test/e2e/thv-operator/virtualmcp/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ ginkgo -vv
6464

6565
- `suite_test.go` - Ginkgo test suite setup with kubeconfig loading
6666
- `virtualmcp_discovered_mode_test.go` - Tests VirtualMCPServer with discovered mode aggregation
67+
- `virtualmcp_health_monitoring_test.go` - Tests VirtualMCPServer health monitoring functionality
6768
- `helpers.go` - Common helper functions for interacting with Kubernetes resources
6869
- `README.md` - This file
6970

@@ -77,6 +78,16 @@ Comprehensive E2E tests for VirtualMCPServer in discovered mode, which automatic
7778
- Validates discovered mode configuration and backend discovery
7879
- Uses prefix conflict resolution strategy to namespace tools from different backends
7980

81+
#### Health Monitoring Tests (`virtualmcp_health_monitoring_test.go`)
82+
End-to-end tests for VirtualMCPServer health monitoring of backend MCP servers:
83+
- Creates VirtualMCPServer with configured health check interval and unhealthy threshold
84+
- Creates multiple backend MCPServers (2 healthy, 1 initially unhealthy)
85+
- Verifies health monitoring correctly identifies healthy and unhealthy backends
86+
- Tests that health check timestamps are updated periodically
87+
- Validates backend recovery detection (unhealthy → healthy transitions)
88+
- Ensures health status is accurately reflected in VirtualMCPServer status
89+
- Uses fast health check intervals (5s) for quicker test execution
90+
8091
## Environment Variables
8192

8293
| Variable | Description | Default |

0 commit comments

Comments
 (0)