Skip to content

Commit fb475a8

Browse files
yrobla and taskbot authored
Add core health monitoring infrastructure for vmcp backends (#3100)
* Add core health monitoring infrastructure for vmcp backends Implement health checking and status tracking for virtual MCP server backends. This provides the foundation for monitoring backend availability and categorizing failure modes (unhealthy, degraded, unauthenticated). Related-to: #3036 * changes from review * fixes from review --------- Co-authored-by: taskbot <taskbot@users.noreply.github.com>
1 parent 039b19d commit fb475a8

10 files changed

Lines changed: 2401 additions & 13 deletions

File tree

pkg/vmcp/client/client.go

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ package client
77
import (
88
"context"
99
"encoding/base64"
10+
"errors"
1011
"fmt"
1112
"io"
13+
"net"
1214
"net/http"
1315

1416
"github.com/mark3labs/mcp-go/client"
@@ -239,6 +241,69 @@ func (h *httpBackendClient) defaultClientFactory(ctx context.Context, target *vm
239241
return c, nil
240242
}
241243

244+
// wrapBackendError wraps an error with the appropriate sentinel error based on error type.
245+
// This enables type-safe error checking with errors.Is() instead of string matching.
246+
//
247+
// Error detection strategy (in order of preference):
248+
// 1. Check for standard Go error types (context errors, net.Error, url.Error)
249+
// 2. Fall back to string pattern matching for library-specific errors (MCP SDK, HTTP libs)
250+
//
251+
// Error chain preservation:
252+
// The returned error wraps the sentinel error (ErrTimeout, ErrBackendUnavailable, etc.) with %w
253+
// and formats the original error with %v. This means:
254+
// - errors.Is() works for checking the sentinel error (e.g., errors.Is(err, vmcp.ErrTimeout))
255+
// - errors.As() cannot access the underlying original error type
256+
// This is a deliberate trade-off due to Go's limitation of one %w per fmt.Errorf call.
257+
// If access to the underlying error type is needed in the future, consider implementing
258+
// a custom error type with multiple Unwrap() methods (Go 1.20+).
259+
func wrapBackendError(err error, backendID string, operation string) error {
260+
if err == nil {
261+
return nil
262+
}
263+
264+
// 1. Type-based detection: Check for context deadline/cancellation
265+
if errors.Is(err, context.DeadlineExceeded) {
266+
return fmt.Errorf("%w: failed to %s for backend %s (timeout): %v",
267+
vmcp.ErrTimeout, operation, backendID, err)
268+
}
269+
if errors.Is(err, context.Canceled) {
270+
return fmt.Errorf("%w: failed to %s for backend %s (cancelled): %v",
271+
vmcp.ErrCancelled, operation, backendID, err)
272+
}
273+
274+
// 2. Type-based detection: Check for net.Error with Timeout() method
275+
// This handles network timeouts from the standard library
276+
var netErr net.Error
277+
if errors.As(err, &netErr) && netErr.Timeout() {
278+
return fmt.Errorf("%w: failed to %s for backend %s (timeout): %v",
279+
vmcp.ErrTimeout, operation, backendID, err)
280+
}
281+
282+
// 3. String-based detection: Fall back to pattern matching for cases where
283+
// we don't have structured error types (MCP SDK, HTTP libraries with embedded status codes)
284+
// Authentication errors (401, 403, auth failures)
285+
if vmcp.IsAuthenticationError(err) {
286+
return fmt.Errorf("%w: failed to %s for backend %s: %v",
287+
vmcp.ErrAuthenticationFailed, operation, backendID, err)
288+
}
289+
290+
// Timeout errors (deadline exceeded, timeout messages)
291+
if vmcp.IsTimeoutError(err) {
292+
return fmt.Errorf("%w: failed to %s for backend %s (timeout): %v",
293+
vmcp.ErrTimeout, operation, backendID, err)
294+
}
295+
296+
// Connection errors (refused, reset, unreachable)
297+
if vmcp.IsConnectionError(err) {
298+
return fmt.Errorf("%w: failed to %s for backend %s (connection error): %v",
299+
vmcp.ErrBackendUnavailable, operation, backendID, err)
300+
}
301+
302+
// Default to backend unavailable for unknown errors
303+
return fmt.Errorf("%w: failed to %s for backend %s: %v",
304+
vmcp.ErrBackendUnavailable, operation, backendID, err)
305+
}
306+
242307
// initializeClient performs MCP protocol initialization handshake and returns server capabilities.
243308
// This allows the caller to determine which optional features the server supports.
244309
func initializeClient(ctx context.Context, c *client.Client) (*mcp.ServerCapabilities, error) {
@@ -313,14 +378,14 @@ func (h *httpBackendClient) ListCapabilities(ctx context.Context, target *vmcp.B
313378
// Create a client for this backend (not yet initialized)
314379
c, err := h.clientFactory(ctx, target)
315380
if err != nil {
316-
return nil, fmt.Errorf("failed to create client for backend %s: %w", target.WorkloadID, err)
381+
return nil, wrapBackendError(err, target.WorkloadID, "create client")
317382
}
318383
defer c.Close()
319384

320385
// Initialize the client and get server capabilities
321386
serverCaps, err := initializeClient(ctx, c)
322387
if err != nil {
323-
return nil, fmt.Errorf("failed to initialize client for backend %s: %w", target.WorkloadID, err)
388+
return nil, wrapBackendError(err, target.WorkloadID, "initialize client")
324389
}
325390

326391
logger.Debugf("Backend %s capabilities: tools=%v, resources=%v, prompts=%v",
@@ -330,17 +395,17 @@ func (h *httpBackendClient) ListCapabilities(ctx context.Context, target *vmcp.B
330395
// Check for nil BEFORE passing to functions to avoid interface{} nil pointer issues
331396
toolsResp, err := queryTools(ctx, c, serverCaps.Tools != nil, target.WorkloadID)
332397
if err != nil {
333-
return nil, err
398+
return nil, wrapBackendError(err, target.WorkloadID, "list tools")
334399
}
335400

336401
resourcesResp, err := queryResources(ctx, c, serverCaps.Resources != nil, target.WorkloadID)
337402
if err != nil {
338-
return nil, err
403+
return nil, wrapBackendError(err, target.WorkloadID, "list resources")
339404
}
340405

341406
promptsResp, err := queryPrompts(ctx, c, serverCaps.Prompts != nil, target.WorkloadID)
342407
if err != nil {
343-
return nil, err
408+
return nil, wrapBackendError(err, target.WorkloadID, "list prompts")
344409
}
345410

346411
// Convert MCP types to vmcp types
@@ -428,13 +493,13 @@ func (h *httpBackendClient) CallTool(
428493
// Create a client for this backend
429494
c, err := h.clientFactory(ctx, target)
430495
if err != nil {
431-
return nil, fmt.Errorf("failed to create client for backend %s: %w", target.WorkloadID, err)
496+
return nil, wrapBackendError(err, target.WorkloadID, "create client")
432497
}
433498
defer c.Close()
434499

435500
// Initialize the client
436501
if _, err := initializeClient(ctx, c); err != nil {
437-
return nil, fmt.Errorf("failed to initialize client for backend %s: %w", target.WorkloadID, err)
502+
return nil, wrapBackendError(err, target.WorkloadID, "initialize client")
438503
}
439504

440505
// Call the tool using the original capability name from the backend's perspective.
@@ -525,13 +590,13 @@ func (h *httpBackendClient) ReadResource(ctx context.Context, target *vmcp.Backe
525590
// Create a client for this backend
526591
c, err := h.clientFactory(ctx, target)
527592
if err != nil {
528-
return nil, fmt.Errorf("failed to create client for backend %s: %w", target.WorkloadID, err)
593+
return nil, wrapBackendError(err, target.WorkloadID, "create client")
529594
}
530595
defer c.Close()
531596

532597
// Initialize the client
533598
if _, err := initializeClient(ctx, c); err != nil {
534-
return nil, fmt.Errorf("failed to initialize client for backend %s: %w", target.WorkloadID, err)
599+
return nil, wrapBackendError(err, target.WorkloadID, "initialize client")
535600
}
536601

537602
// Read the resource using the original URI from the backend's perspective.
@@ -586,13 +651,13 @@ func (h *httpBackendClient) GetPrompt(
586651
// Create a client for this backend
587652
c, err := h.clientFactory(ctx, target)
588653
if err != nil {
589-
return "", fmt.Errorf("failed to create client for backend %s: %w", target.WorkloadID, err)
654+
return "", wrapBackendError(err, target.WorkloadID, "create client")
590655
}
591656
defer c.Close()
592657

593658
// Initialize the client
594659
if _, err := initializeClient(ctx, c); err != nil {
595-
return "", fmt.Errorf("failed to initialize client for backend %s: %w", target.WorkloadID, err)
660+
return "", wrapBackendError(err, target.WorkloadID, "initialize client")
596661
}
597662

598663
// Get the prompt using the original prompt name from the backend's perspective.

pkg/vmcp/errors.go

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package vmcp
22

3-
import "errors"
3+
import (
4+
"errors"
5+
"strings"
6+
)
47

58
// Common domain errors used across vmcp subpackages.
69
// Following DDD principles, domain errors are defined at the package root.
@@ -61,3 +64,82 @@ var (
6164
// Wrapping errors should list the conflicting tool names.
6265
ErrToolNameConflict = errors.New("tool name conflict")
6366
)
67+
68+
// Error Categorization Helpers
69+
//
70+
// These functions categorize errors by examining error message strings.
71+
// They serve as a fallback mechanism for error detection when:
72+
//
73+
// 1. Errors come from external libraries that use their own error types and formats
74+
// 2. Legacy code paths don't wrap errors with sentinel errors
75+
// 3. Backwards compatibility is needed for error detection
76+
//
77+
// Note: BackendClient now wraps all errors with appropriate sentinel errors
78+
// (ErrAuthenticationFailed, ErrTimeout, ErrBackendUnavailable). Health monitoring
79+
// code should prefer errors.Is() checks over these string-based functions.
80+
// These functions remain for backwards compatibility and as a fallback mechanism.
81+
82+
// IsAuthenticationError reports whether err's message looks like an
// authentication or authorization failure.
//
// The message is lowercased and searched for a fixed set of substrings:
// explicit "authentication failed/error" phrases, HTTP 401/403 status
// mentions that carry context ("401 unauthorized", "http 403",
// "status code 401", ...), and "request unauthenticated",
// "request unauthorized" or "access denied". A bare "401" or "unauthorized"
// without that context does not match. A nil err returns false.
func IsAuthenticationError(err error) bool {
	if err == nil {
		return false
	}

	// All markers are lowercase because the message is lowercased before matching.
	authMarkers := [...]string{
		// Explicit authentication failure messages.
		"authentication failed",
		"authentication error",
		// HTTP 401/403 status codes with surrounding context.
		"401 unauthorized",
		"403 forbidden",
		"http 401",
		"http 403",
		"status code 401",
		"status code 403",
		// Explicit unauthenticated/unauthorized phrasing.
		"request unauthenticated",
		"request unauthorized",
		"access denied",
	}

	msg := strings.ToLower(err.Error())
	for _, marker := range authMarkers {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}
118+
119+
// IsTimeoutError reports whether err's message indicates a timeout.
//
// The message is lowercased and matched against two substrings: "timeout"
// (e.g. "i/o timeout") and "deadline exceeded". The latter also covers
// "context deadline exceeded", so no separate check for that longer phrase
// is needed. A nil err returns false.
func IsTimeoutError(err error) bool {
	if err == nil {
		return false
	}

	msg := strings.ToLower(err.Error())
	return strings.Contains(msg, "timeout") ||
		strings.Contains(msg, "deadline exceeded")
}
132+
133+
// IsConnectionError reports whether err's message indicates a network-level
// connection failure.
//
// The message is lowercased and searched for "connection refused",
// "connection reset", "no route to host" and "network is unreachable".
// A nil err returns false.
func IsConnectionError(err error) bool {
	if err == nil {
		return false
	}

	// All markers are lowercase because the message is lowercased before matching.
	connMarkers := [...]string{
		"connection refused",
		"connection reset",
		"no route to host",
		"network is unreachable",
	}

	msg := strings.ToLower(err.Error())
	for _, marker := range connMarkers {
		if strings.Contains(msg, marker) {
			return true
		}
	}
	return false
}

pkg/vmcp/health/checker.go

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// Package health provides health monitoring for vMCP backend MCP servers.
2+
//
3+
// This package implements the HealthChecker interface and provides periodic
4+
// health monitoring with configurable intervals and failure thresholds.
5+
package health
6+
7+
import (
8+
"context"
9+
"errors"
10+
"fmt"
11+
"time"
12+
13+
"github.com/stacklok/toolhive/pkg/logger"
14+
"github.com/stacklok/toolhive/pkg/vmcp"
15+
)
16+
17+
// healthChecker implements vmcp.HealthChecker using ListCapabilities as the health check.
18+
type healthChecker struct {
19+
// client is the backend client used to communicate with backends.
20+
client vmcp.BackendClient
21+
22+
// timeout is the timeout for health check operations.
23+
timeout time.Duration
24+
25+
// degradedThreshold is the response time threshold for marking a backend as degraded.
26+
// If a health check succeeds but takes longer than this duration, the backend is marked degraded.
27+
// Zero means disabled (backends will never be marked degraded based on response time alone).
28+
degradedThreshold time.Duration
29+
}
30+
31+
// NewHealthChecker creates a new health checker that uses BackendClient.ListCapabilities
32+
// as the health check mechanism. This validates the full MCP communication stack:
33+
// network connectivity, MCP protocol compliance, authentication, and responsiveness.
34+
//
35+
// Parameters:
36+
// - client: BackendClient for communicating with backend MCP servers
37+
// - timeout: Maximum duration for health check operations (0 = no timeout)
38+
// - degradedThreshold: Response time threshold for marking backend as degraded (0 = disabled)
39+
//
40+
// Returns a new HealthChecker implementation.
41+
func NewHealthChecker(client vmcp.BackendClient, timeout time.Duration, degradedThreshold time.Duration) vmcp.HealthChecker {
42+
return &healthChecker{
43+
client: client,
44+
timeout: timeout,
45+
degradedThreshold: degradedThreshold,
46+
}
47+
}
48+
49+
// CheckHealth performs a health check on a backend by calling ListCapabilities.
50+
// This validates the full MCP communication stack and returns the backend's health status.
51+
//
52+
// Health determination logic:
53+
// - Success with fast response: Backend is healthy (BackendHealthy)
54+
// - Success with slow response (> degradedThreshold): Backend is degraded (BackendDegraded)
55+
// - Authentication error: Backend is unauthenticated (BackendUnauthenticated)
56+
// - Timeout or connection error: Backend is unhealthy (BackendUnhealthy)
57+
// - Other errors: Backend is unhealthy (BackendUnhealthy)
58+
//
59+
// The error return is informational and provides context about what failed.
60+
// The BackendHealthStatus return indicates the categorized health state.
61+
func (h *healthChecker) CheckHealth(ctx context.Context, target *vmcp.BackendTarget) (vmcp.BackendHealthStatus, error) {
62+
// Apply timeout if configured
63+
checkCtx := ctx
64+
var cancel context.CancelFunc
65+
if h.timeout > 0 {
66+
checkCtx, cancel = context.WithTimeout(ctx, h.timeout)
67+
defer cancel()
68+
}
69+
70+
logger.Debugf("Performing health check for backend %s (%s)", target.WorkloadName, target.BaseURL)
71+
72+
// Track response time for degraded detection
73+
startTime := time.Now()
74+
75+
// Use ListCapabilities as the health check - it performs:
76+
// 1. Client creation with transport setup
77+
// 2. MCP protocol initialization handshake
78+
// 3. Capabilities query (tools, resources, prompts)
79+
// This validates the full communication stack
80+
_, err := h.client.ListCapabilities(checkCtx, target)
81+
responseDuration := time.Since(startTime)
82+
83+
if err != nil {
84+
// Categorize the error to determine health status
85+
status := categorizeError(err)
86+
logger.Debugf("Health check failed for backend %s: %v (status: %s, duration: %v)",
87+
target.WorkloadName, err, status, responseDuration)
88+
return status, fmt.Errorf("health check failed: %w", err)
89+
}
90+
91+
// Check if response time indicates degraded performance
92+
if h.degradedThreshold > 0 && responseDuration > h.degradedThreshold {
93+
logger.Warnf("Health check succeeded for backend %s but response was slow: %v (threshold: %v) - marking as degraded",
94+
target.WorkloadName, responseDuration, h.degradedThreshold)
95+
return vmcp.BackendDegraded, nil
96+
}
97+
98+
logger.Debugf("Health check succeeded for backend %s (duration: %v)", target.WorkloadName, responseDuration)
99+
return vmcp.BackendHealthy, nil
100+
}
101+
102+
// categorizeError determines the appropriate health status based on the error type.
103+
// This uses sentinel error checking with errors.Is() for type-safe error categorization.
104+
// Falls back to string-based detection for backwards compatibility with non-wrapped errors.
105+
func categorizeError(err error) vmcp.BackendHealthStatus {
106+
if err == nil {
107+
return vmcp.BackendHealthy
108+
}
109+
110+
// 1. Type-safe detection: Check for sentinel errors using errors.Is()
111+
// BackendClient now wraps all errors with appropriate sentinel errors
112+
if errors.Is(err, vmcp.ErrAuthenticationFailed) || errors.Is(err, vmcp.ErrAuthorizationFailed) {
113+
return vmcp.BackendUnauthenticated
114+
}
115+
116+
if errors.Is(err, vmcp.ErrTimeout) || errors.Is(err, vmcp.ErrCancelled) {
117+
return vmcp.BackendUnhealthy
118+
}
119+
120+
if errors.Is(err, vmcp.ErrBackendUnavailable) {
121+
return vmcp.BackendUnhealthy
122+
}
123+
124+
// 2. String-based detection: Fallback for backwards compatibility
125+
// This handles errors from sources that don't wrap with sentinel errors
126+
if vmcp.IsAuthenticationError(err) {
127+
return vmcp.BackendUnauthenticated
128+
}
129+
130+
if vmcp.IsTimeoutError(err) || vmcp.IsConnectionError(err) {
131+
return vmcp.BackendUnhealthy
132+
}
133+
134+
// Default to unhealthy for unknown errors
135+
return vmcp.BackendUnhealthy
136+
}

0 commit comments

Comments
 (0)