From d7a51c9c7c5968ba8f4d5c2957c67ccf6e4749b7 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 10:45:20 -0800 Subject: [PATCH 01/22] vMCP: Wrap /mcp in Telemetry Middleware Signed-off-by: Jeremy Drouillard --- cmd/vmcp/app/commands.go | 25 ++++++++++++++++++------- pkg/telemetry/middleware.go | 2 +- pkg/vmcp/config/config.go | 4 ++++ pkg/vmcp/server/server.go | 9 +++++++++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go index eb87d87185..3ddd9d276f 100644 --- a/cmd/vmcp/app/commands.go +++ b/cmd/vmcp/app/commands.go @@ -4,6 +4,7 @@ package app import ( "context" "fmt" + "net/http" "time" "github.com/spf13/cobra" @@ -12,6 +13,7 @@ import ( "github.com/stacklok/toolhive/pkg/env" "github.com/stacklok/toolhive/pkg/groups" "github.com/stacklok/toolhive/pkg/logger" + "github.com/stacklok/toolhive/pkg/telemetry" "github.com/stacklok/toolhive/pkg/vmcp" "github.com/stacklok/toolhive/pkg/vmcp/aggregator" "github.com/stacklok/toolhive/pkg/vmcp/auth/factory" @@ -288,7 +290,6 @@ func runServe(cmd *cobra.Command, _ []string) error { // Create router rtr := vmcprouter.NewDefaultRouter() - // Setup authentication middleware logger.Infof("Setting up incoming authentication (type: %s)", cfg.IncomingAuth.Type) authMiddleware, authInfoHandler, err := factory.NewIncomingAuthMiddleware(ctx, cfg.IncomingAuth) @@ -303,13 +304,23 @@ func runServe(cmd *cobra.Command, _ []string) error { host, _ := cmd.Flags().GetString("host") port, _ := cmd.Flags().GetInt("port") + var telemetryMiddleware func(http.Handler) http.Handler + if cfg.Telemetry != nil { + provider, err := telemetry.NewProvider(ctx, *cfg.Telemetry) + if err != nil { + return fmt.Errorf("failed to create telemetry provider: %w", err) + } + telemetryMiddleware = provider.Middleware(cfg.Name, "streamable-http") + } + serverCfg := &vmcpserver.Config{ - Name: cfg.Name, - Version: getVersion(), - Host: host, - Port: port, - AuthMiddleware: authMiddleware, - AuthInfoHandler: authInfoHandler, + Name: cfg.Name, + Version: getVersion(), + Host: host, + Port: port, + AuthMiddleware: authMiddleware, + AuthInfoHandler: authInfoHandler, + TelemetryMiddleware: telemetryMiddleware, } // Convert composite tool configurations to workflow definitions diff --git a/pkg/telemetry/middleware.go b/pkg/telemetry/middleware.go index 1dcdc61cf5..f59c095167 100644 --- a/pkg/telemetry/middleware.go +++ b/pkg/telemetry/middleware.go @@ -46,7 +46,7 @@ type HTTPMiddleware struct { // NewHTTPMiddleware creates a new HTTP middleware for OpenTelemetry instrumentation. // serverName is the name of the MCP server (e.g., "github", "fetch") -// transport is the backend transport type ("stdio" or "sse") +// transport is the backend transport type ("stdio", "sse", or "streamable-http"). func NewHTTPMiddleware( config Config, tracerProvider trace.TracerProvider, diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go index 8af95847e2..a1c3013bff 100644 --- a/pkg/vmcp/config/config.go +++ b/pkg/vmcp/config/config.go @@ -10,6 +10,7 @@ import ( "fmt" "time" + "github.com/stacklok/toolhive/pkg/telemetry" "github.com/stacklok/toolhive/pkg/vmcp" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" ) @@ -87,6 +88,9 @@ type Config struct { // Metadata stores additional configuration metadata. Metadata map[string]string `json:"metadata,omitempty" yaml:"metadata,omitempty"` + + // Telemetry configures telemetry settings. + Telemetry *telemetry.Config `json:"telemetry,omitempty" yaml:"telemetry,omitempty"` } // IncomingAuthConfig configures client authentication to the virtual MCP server. diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index 63d1bbafd9..c3a983ac4c 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -82,6 +82,10 @@ type Config struct { // AuthInfoHandler is the optional handler for /.well-known/oauth-protected-resource endpoint. // Exposes OIDC discovery information about the protected resource. AuthInfoHandler http.Handler + + // TelemetryMiddleware is the optional telemetry middleware to apply to MCP routes. + // If nil, no telemetry is recorded. + TelemetryMiddleware func(http.Handler) http.Handler } // Server is the Virtual MCP Server that aggregates multiple backends. @@ -344,6 +348,11 @@ func (s *Server) Start(ctx context.Context) error { // MCP endpoint - apply middleware chain: auth → discovery var mcpHandler http.Handler = streamableServer + if s.config.TelemetryMiddleware != nil { + mcpHandler = s.config.TelemetryMiddleware(mcpHandler) + logger.Info("Telemetry middleware enabled for MCP endpoints") + } + // Apply discovery middleware (runs after auth middleware) // Discovery middleware performs per-request capability aggregation with user context // Pass sessionManager to enable session-based capability retrieval for subsequent requests From a9a2dd741dea59ae55b28be1bf38127734febf04 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 12:07:19 -0800 Subject: [PATCH 02/22] Emit Metrics per Backend Signed-off-by: Jeremy Drouillard --- cmd/vmcp/app/commands.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go index 3ddd9d276f..2e1b6bea13 100644 --- a/cmd/vmcp/app/commands.go +++ b/cmd/vmcp/app/commands.go @@ -304,6 +304,7 @@ func runServe(cmd *cobra.Command, _ []string) error { host, _ := cmd.Flags().GetString("host") port, _ := cmd.Flags().GetInt("port") + // If telemetry is configured, we need to setup the middleware and monitor the backends. var telemetryMiddleware func(http.Handler) http.Handler if cfg.Telemetry != nil { provider, err := telemetry.NewProvider(ctx, *cfg.Telemetry) @@ -311,6 +312,11 @@ func runServe(cmd *cobra.Command, _ []string) error { return fmt.Errorf("failed to create telemetry provider: %w", err) } telemetryMiddleware = provider.Middleware(cfg.Name, "streamable-http") + + backendClient, err = vmcp.MonitorBackends(ctx, provider.MeterProvider(), backends, backendClient) + if err != nil { + return fmt.Errorf("failed to monitor backends: %w", err) + } } serverCfg := &vmcpserver.Config{ From c07d3c5b0c375e919ad034a5daf8bae9d4f94802 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 13:33:57 -0800 Subject: [PATCH 03/22] Refactor to plumb telemetry provider into config Signed-off-by: Jeremy Drouillard --- cmd/vmcp/app/commands.go | 25 +++++----- pkg/telemetry/config.go | 2 +- pkg/vmcp/server/server.go | 9 ++-- pkg/vmcp/telemetry.go | 102 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+), 18 deletions(-) create mode 100644 pkg/vmcp/telemetry.go diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go index 2e1b6bea13..7d49b743dc 100644 --- a/cmd/vmcp/app/commands.go +++ b/cmd/vmcp/app/commands.go @@ -4,7 +4,6 @@ package app import ( "context" "fmt" - "net/http" "time" "github.com/spf13/cobra" @@ -304,29 +303,29 @@ func runServe(cmd *cobra.Command, _ []string) error { host, _ := cmd.Flags().GetString("host") port, _ := cmd.Flags().GetInt("port") - // If telemetry is configured, we need to setup the middleware and monitor the backends. - var telemetryMiddleware func(http.Handler) http.Handler + // If telemetry is configured, create the provider and monitor the backends. + var telemetryProvider *telemetry.Provider if cfg.Telemetry != nil { - provider, err := telemetry.NewProvider(ctx, *cfg.Telemetry) + var err error + telemetryProvider, err = telemetry.NewProvider(ctx, *cfg.Telemetry) if err != nil { return fmt.Errorf("failed to create telemetry provider: %w", err) } - telemetryMiddleware = provider.Middleware(cfg.Name, "streamable-http") - backendClient, err = vmcp.MonitorBackends(ctx, provider.MeterProvider(), backends, backendClient) + backendClient, err = vmcp.MonitorBackends(ctx, telemetryProvider.MeterProvider(), backends, backendClient) if err != nil { return fmt.Errorf("failed to monitor backends: %w", err) } } serverCfg := &vmcpserver.Config{ - Name: cfg.Name, - Version: getVersion(), - Host: host, - Port: port, - AuthMiddleware: authMiddleware, - AuthInfoHandler: authInfoHandler, - TelemetryMiddleware: telemetryMiddleware, + Name: cfg.Name, + Version: getVersion(), + Host: host, + Port: port, + AuthMiddleware: authMiddleware, + AuthInfoHandler: authInfoHandler, + TelemetryProvider: telemetryProvider, } // Convert composite tool configurations to workflow definitions diff --git a/pkg/telemetry/config.go b/pkg/telemetry/config.go index 8b6c7d276a..1220d4b049 100644 --- a/pkg/telemetry/config.go +++ b/pkg/telemetry/config.go @@ -200,7 +200,7 @@ func setGlobalProvidersAndReturn(telemetryProviders *providers.CompositeProvider // Middleware returns an HTTP middleware that instruments requests with OpenTelemetry. // serverName is the name of the MCP server (e.g., "github", "fetch") -// transport is the backend transport type ("stdio" or "sse") +// transport is the backend transport type ("stdio", "sse", or "streamable-http"). func (p *Provider) Middleware(serverName, transport string) types.MiddlewareFunction { return NewHTTPMiddleware(p.config, p.tracerProvider, p.meterProvider, serverName, transport) } diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index c3a983ac4c..679da02a7e 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -19,6 +19,7 @@ import ( "github.com/stacklok/toolhive/pkg/auth" "github.com/stacklok/toolhive/pkg/logger" + "github.com/stacklok/toolhive/pkg/telemetry" transportsession "github.com/stacklok/toolhive/pkg/transport/session" "github.com/stacklok/toolhive/pkg/vmcp" "github.com/stacklok/toolhive/pkg/vmcp/aggregator" @@ -83,9 +84,9 @@ type Config struct { // Exposes OIDC discovery information about the protected resource. AuthInfoHandler http.Handler - // TelemetryMiddleware is the optional telemetry middleware to apply to MCP routes. + // TelemetryProvider is the optional telemetry provider. // If nil, no telemetry is recorded. - TelemetryMiddleware func(http.Handler) http.Handler + TelemetryProvider *telemetry.Provider } // Server is the Virtual MCP Server that aggregates multiple backends. @@ -348,8 +349,8 @@ func (s *Server) Start(ctx context.Context) error { // MCP endpoint - apply middleware chain: auth → discovery var mcpHandler http.Handler = streamableServer - if s.config.TelemetryMiddleware != nil { - mcpHandler = s.config.TelemetryMiddleware(mcpHandler) + if s.config.TelemetryProvider != nil { + mcpHandler = s.config.TelemetryProvider.Middleware(s.config.Name, "streamable-http")(mcpHandler) logger.Info("Telemetry middleware enabled for MCP endpoints") } diff --git a/pkg/vmcp/telemetry.go b/pkg/vmcp/telemetry.go new file mode 100644 index 0000000000..a5e458e731 --- /dev/null +++ b/pkg/vmcp/telemetry.go @@ -0,0 +1,102 @@ +package vmcp + +import ( + "context" + "fmt" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +const ( + instrumentationName = "github.com/stacklok/toolhive/pkg/vmcp" +) + +// MonitorBackends decorate the backend client so it records telemetry on each method call. +// It also emits a gauge for the number of backends discovered once, since the number of backends is static. +func MonitorBackends(ctx context.Context, meterProvider metric.MeterProvider, backends []Backend, backendClient BackendClient) (BackendClient, error) { + meter := meterProvider.Meter(instrumentationName) + + backendCount, err := meter.Int64Gauge( + "toolhive_vmcp_backends_discovered", + metric.WithDescription("Number of backends discovered"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create backend count gauge: %w", err) + } + backendCount.Record(ctx, int64(len(backends))) + + requestsTotal, err := meter.Int64Counter("toolhive_vmcp_requests_total", metric.WithDescription("Total number of requests per backend")) + if err != nil { + return nil, fmt.Errorf("failed to create requests total counter: %w", err) + } + errorsTotal, err := meter.Int64Counter("toolhive_vmcp_errors_total", metric.WithDescription("Total number of errors per backend")) + if err != nil { + return nil, fmt.Errorf("failed to create errors total counter: %w", err) + } + requestsDuration, err := meter.Float64Histogram("toolhive_vmcp_requests_duration", metric.WithDescription("Duration of requests in seconds per backend")) + if err != nil { + return nil, fmt.Errorf("failed to create requests duration histogram: %w", err) + } + + return telemetryBackendClient{ + backendClient: backendClient, + requestsTotal: requestsTotal, + errorsTotal: errorsTotal, + requestsDuration: requestsDuration, + }, nil + +} + +type telemetryBackendClient struct { + backendClient BackendClient + + requestsTotal metric.Int64Counter + errorsTotal metric.Int64Counter + requestsDuration metric.Float64Histogram +} + +var _ BackendClient = telemetryBackendClient{} + +// record updates the telemetry metrics for each method on the BackendClient interface. +// It returns a function that should be deferred to record the duration and error. +func (t telemetryBackendClient) record(ctx context.Context, target *BackendTarget, action string, err *error) func() { + attrs := metric.WithAttributes( + attribute.String("target.workload_id", target.WorkloadID), + attribute.String("target.workload_name", target.WorkloadName), + attribute.String("target.base_url", target.BaseURL), + attribute.String("target.transport_type", target.TransportType), + attribute.String("action", action), + ) + start := time.Now() + t.requestsTotal.Add(ctx, 1, attrs) + + return func() { + duration := time.Since(start) + t.requestsDuration.Record(ctx, duration.Seconds(), attrs) + if err != nil { + t.errorsTotal.Add(ctx, 1, attrs) + } + } +} + +func (t telemetryBackendClient) CallTool(ctx context.Context, target *BackendTarget, toolName string, arguments map[string]any) (_ map[string]any, retErr error) { + defer t.record(ctx, target, "call_tool", &retErr)() + return t.backendClient.CallTool(ctx, target, toolName, arguments) +} + +func (t telemetryBackendClient) ReadResource(ctx context.Context, target *BackendTarget, uri string) (_ []byte, retErr error) { + defer t.record(ctx, target, "read_resource", &retErr)() + return t.backendClient.ReadResource(ctx, target, uri) +} + +func (t telemetryBackendClient) GetPrompt(ctx context.Context, target *BackendTarget, name string, arguments map[string]any) (_ string, retErr error) { + defer t.record(ctx, target, "get_prompt", &retErr)() + return t.backendClient.GetPrompt(ctx, target, name, arguments) +} + +func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *BackendTarget) (_ *CapabilityList, retErr error) { + defer t.record(ctx, target, "list_capabilities", &retErr)() + return t.backendClient.ListCapabilities(ctx, target) +} From f6f8c9bd22c0f5e0ab7c3024292a4f33aaf622bc Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 13:45:12 -0800 Subject: [PATCH 04/22] Move telemetry.go to vmcp/server.go Signed-off-by: Jeremy Drouillard --- cmd/vmcp/app/commands.go | 2 +- pkg/vmcp/{ => server}/telemetry.go | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) rename pkg/vmcp/{ => server}/telemetry.go (83%) diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go index 7d49b743dc..400dd51856 100644 --- a/cmd/vmcp/app/commands.go +++ b/cmd/vmcp/app/commands.go @@ -312,7 +312,7 @@ func runServe(cmd *cobra.Command, _ []string) error { return fmt.Errorf("failed to create telemetry provider: %w", err) } - backendClient, err = vmcp.MonitorBackends(ctx, telemetryProvider.MeterProvider(), backends, backendClient) + backendClient, err = vmcpserver.MonitorBackends(ctx, telemetryProvider.MeterProvider(), backends, backendClient) if err != nil { return fmt.Errorf("failed to monitor backends: %w", err) } diff --git a/pkg/vmcp/telemetry.go b/pkg/vmcp/server/telemetry.go similarity index 83% rename from pkg/vmcp/telemetry.go rename to pkg/vmcp/server/telemetry.go index a5e458e731..64a6478b78 100644 --- a/pkg/vmcp/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -1,4 +1,4 @@ -package vmcp +package server import ( "context" @@ -7,6 +7,8 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" + + "github.com/stacklok/toolhive/pkg/vmcp" ) const ( @@ -15,7 +17,7 @@ const ( // MonitorBackends decorate the backend client so it records telemetry on each method call. // It also emits a gauge for the number of backends discovered once, since the number of backends is static. -func MonitorBackends(ctx context.Context, meterProvider metric.MeterProvider, backends []Backend, backendClient BackendClient) (BackendClient, error) { +func MonitorBackends(ctx context.Context, meterProvider metric.MeterProvider, backends []vmcp.Backend, backendClient vmcp.BackendClient) (vmcp.BackendClient, error) { meter := meterProvider.Meter(instrumentationName) backendCount, err := meter.Int64Gauge( @@ -50,18 +52,18 @@ func MonitorBackends(ctx context.Context, meterProvider metric.MeterProvider, ba } type telemetryBackendClient struct { - backendClient BackendClient + backendClient vmcp.BackendClient requestsTotal metric.Int64Counter errorsTotal metric.Int64Counter requestsDuration metric.Float64Histogram } -var _ BackendClient = telemetryBackendClient{} +var _ vmcp.BackendClient = telemetryBackendClient{} // record updates the telemetry metrics for each method on the BackendClient interface. // It returns a function that should be deferred to record the duration and error. -func (t telemetryBackendClient) record(ctx context.Context, target *BackendTarget, action string, err *error) func() { +func (t telemetryBackendClient) record(ctx context.Context, target *vmcp.BackendTarget, action string, err *error) func() { attrs := metric.WithAttributes( attribute.String("target.workload_id", target.WorkloadID), attribute.String("target.workload_name", target.WorkloadName), @@ -81,22 +83,22 @@ func (t telemetryBackendClient) record(ctx context.Context, target *BackendTarge } } -func (t telemetryBackendClient) CallTool(ctx context.Context, target *BackendTarget, toolName string, arguments map[string]any) (_ map[string]any, retErr error) { +func (t telemetryBackendClient) CallTool(ctx context.Context, target *vmcp.BackendTarget, toolName string, arguments map[string]any) (_ map[string]any, retErr error) { defer t.record(ctx, target, "call_tool", &retErr)() return t.backendClient.CallTool(ctx, target, toolName, arguments) } -func (t telemetryBackendClient) ReadResource(ctx context.Context, target *BackendTarget, uri string) (_ []byte, retErr error) { +func (t telemetryBackendClient) ReadResource(ctx context.Context, target *vmcp.BackendTarget, uri string) (_ []byte, retErr error) { defer t.record(ctx, target, "read_resource", &retErr)() return t.backendClient.ReadResource(ctx, target, uri) } -func (t telemetryBackendClient) GetPrompt(ctx context.Context, target *BackendTarget, name string, arguments map[string]any) (_ string, retErr error) { +func (t telemetryBackendClient) GetPrompt(ctx context.Context, target *vmcp.BackendTarget, name string, arguments map[string]any) (_ string, retErr error) { defer t.record(ctx, target, "get_prompt", &retErr)() return t.backendClient.GetPrompt(ctx, target, name, arguments) } -func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *BackendTarget) (_ *CapabilityList, retErr error) { +func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *vmcp.BackendTarget) (_ *vmcp.CapabilityList, retErr error) { defer t.record(ctx, target, "list_capabilities", &retErr)() return t.backendClient.ListCapabilities(ctx, target) } From 3b3a87efeb6b4c6f220a136120b2fdb4ab29fab7 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 13:51:01 -0800 Subject: [PATCH 05/22] Decorate WorkflowExecutors with Telemetry Signed-off-by: Jeremy Drouillard --- pkg/vmcp/server/server.go | 8 ++++ pkg/vmcp/server/telemetry.go | 83 ++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index 679da02a7e..b364f72c1a 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -211,6 +211,14 @@ func New( return nil, fmt.Errorf("workflow validation failed: %w", err) } + // Decorate workflow executors with telemetry if provider is configured + if cfg.TelemetryProvider != nil { + workflowExecutors, err = MonitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), workflowExecutors) + if err != nil { + return nil, fmt.Errorf("failed to monitor workflow executors: %w", err) + } + } + // Create session manager with VMCPSession factory // This enables type-safe access to routing tables while maintaining session lifecycle management sessionManager := transportsession.NewManager(cfg.SessionTTL, vmcpsession.VMCPSessionFactory()) diff --git a/pkg/vmcp/server/telemetry.go b/pkg/vmcp/server/telemetry.go index 64a6478b78..77d1f49492 100644 --- a/pkg/vmcp/server/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -9,6 +9,7 @@ import ( "go.opentelemetry.io/otel/metric" "github.com/stacklok/toolhive/pkg/vmcp" + "github.com/stacklok/toolhive/pkg/vmcp/server/adapter" ) const ( @@ -102,3 +103,85 @@ func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *vm defer t.record(ctx, target, "list_capabilities", &retErr)() return t.backendClient.ListCapabilities(ctx, target) } + +// MonitorWorkflowExecutors decorates workflow executors with telemetry recording. +// It wraps each executor to emit metrics for execution count, duration, and errors. +func MonitorWorkflowExecutors( + meterProvider metric.MeterProvider, + executors map[string]adapter.WorkflowExecutor, +) (map[string]adapter.WorkflowExecutor, error) { + if len(executors) == 0 { + return executors, nil + } + + meter := meterProvider.Meter(instrumentationName) + + executionsTotal, err := meter.Int64Counter( + "toolhive_vmcp_workflow_executions_total", + metric.WithDescription("Total number of workflow executions"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create workflow executions counter: %w", err) + } + + errorsTotal, err := meter.Int64Counter( + "toolhive_vmcp_workflow_errors_total", + metric.WithDescription("Total number of workflow execution errors"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create workflow errors counter: %w", err) + } + + executionDuration, err := meter.Float64Histogram( + "toolhive_vmcp_workflow_duration_seconds", + metric.WithDescription("Duration of workflow executions in seconds"), + ) + if err != nil { + return nil, fmt.Errorf("failed to create workflow duration histogram: %w", err) + } + + monitored := make(map[string]adapter.WorkflowExecutor, len(executors)) + for name, executor := range executors { + monitored[name] = &telemetryWorkflowExecutor{ + name: name, + executor: executor, + executionsTotal: executionsTotal, + errorsTotal: errorsTotal, + executionDuration: executionDuration, + } + } + + return monitored, nil +} + +// telemetryWorkflowExecutor wraps a WorkflowExecutor with telemetry recording. +type telemetryWorkflowExecutor struct { + name string + executor adapter.WorkflowExecutor + executionsTotal metric.Int64Counter + errorsTotal metric.Int64Counter + executionDuration metric.Float64Histogram +} + +var _ adapter.WorkflowExecutor = (*telemetryWorkflowExecutor)(nil) + +// ExecuteWorkflow executes the workflow and records telemetry metrics. +func (t *telemetryWorkflowExecutor) ExecuteWorkflow(ctx context.Context, params map[string]any) (*adapter.WorkflowResult, error) { + attrs := metric.WithAttributes( + attribute.String("workflow.name", t.name), + ) + + start := time.Now() + t.executionsTotal.Add(ctx, 1, attrs) + + result, err := t.executor.ExecuteWorkflow(ctx, params) + + duration := time.Since(start) + t.executionDuration.Record(ctx, duration.Seconds(), attrs) + + if err != nil { + t.errorsTotal.Add(ctx, 1, attrs) + } + + return result, err +} From 44d9dd6f802c9f07d2ba8985311f3fda6dad2f98 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 13:55:22 -0800 Subject: [PATCH 06/22] Move all decoration within vmpcserver Signed-off-by: Jeremy Drouillard --- cmd/vmcp/app/commands.go | 7 +------ pkg/vmcp/server/server.go | 8 ++++++-- pkg/vmcp/server/telemetry.go | 8 ++++---- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go index 400dd51856..4098f8a6a0 100644 --- a/cmd/vmcp/app/commands.go +++ b/cmd/vmcp/app/commands.go @@ -303,7 +303,7 @@ func runServe(cmd *cobra.Command, _ []string) error { host, _ := cmd.Flags().GetString("host") port, _ := cmd.Flags().GetInt("port") - // If telemetry is configured, create the provider and monitor the backends. + // If telemetry is configured, create the provider. var telemetryProvider *telemetry.Provider if cfg.Telemetry != nil { var err error @@ -311,11 +311,6 @@ func runServe(cmd *cobra.Command, _ []string) error { if err != nil { return fmt.Errorf("failed to create telemetry provider: %w", err) } - - backendClient, err = vmcpserver.MonitorBackends(ctx, telemetryProvider.MeterProvider(), backends, backendClient) - if err != nil { - return fmt.Errorf("failed to monitor backends: %w", err) - } } serverCfg := &vmcpserver.Config{ diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index b364f72c1a..3401fef9c4 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -211,9 +211,13 @@ func New( return nil, fmt.Errorf("workflow validation failed: %w", err) } - // Decorate workflow executors with telemetry if provider is configured + // Decorate backend client and workflow executors with telemetry if provider is configured if cfg.TelemetryProvider != nil { - workflowExecutors, err = MonitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), workflowExecutors) + backendClient, err = monitorBackends(context.Background(), cfg.TelemetryProvider.MeterProvider(), backends, backendClient) + if err != nil { + return nil, fmt.Errorf("failed to monitor backends: %w", err) + } + workflowExecutors, err = monitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), workflowExecutors) if err != nil { return nil, fmt.Errorf("failed to monitor workflow executors: %w", err) } diff --git a/pkg/vmcp/server/telemetry.go b/pkg/vmcp/server/telemetry.go index 77d1f49492..9f748a518a 100644 --- a/pkg/vmcp/server/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -16,9 +16,9 @@ const ( instrumentationName = "github.com/stacklok/toolhive/pkg/vmcp" ) -// MonitorBackends decorate the backend client so it records telemetry on each method call. +// monitorBackends decorates the backend client so it records telemetry on each method call. // It also emits a gauge for the number of backends discovered once, since the number of backends is static. -func MonitorBackends(ctx context.Context, meterProvider metric.MeterProvider, backends []vmcp.Backend, backendClient vmcp.BackendClient) (vmcp.BackendClient, error) { +func monitorBackends(ctx context.Context, meterProvider metric.MeterProvider, backends []vmcp.Backend, backendClient vmcp.BackendClient) (vmcp.BackendClient, error) { meter := meterProvider.Meter(instrumentationName) backendCount, err := meter.Int64Gauge( @@ -104,9 +104,9 @@ func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *vm return t.backendClient.ListCapabilities(ctx, target) } -// MonitorWorkflowExecutors decorates workflow executors with telemetry recording. +// monitorWorkflowExecutors decorates workflow executors with telemetry recording. // It wraps each executor to emit metrics for execution count, duration, and errors. -func MonitorWorkflowExecutors( +func monitorWorkflowExecutors( meterProvider metric.MeterProvider, executors map[string]adapter.WorkflowExecutor, ) (map[string]adapter.WorkflowExecutor, error) { From 9313a8a8df4d4598d3817c1a9ddb4d7226fb80ef Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 13:57:53 -0800 Subject: [PATCH 07/22] Expose Metrics on /metrics Signed-off-by: Jeremy Drouillard --- pkg/vmcp/server/server.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index 3401fef9c4..f21a78182a 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -351,6 +351,16 @@ func (s *Server) Start(ctx context.Context) error { mux.HandleFunc("/health", s.handleHealth) mux.HandleFunc("/ping", s.handleHealth) + // Optional Prometheus metrics endpoint (unauthenticated) + if s.config.TelemetryProvider != nil { + if prometheusHandler := s.config.TelemetryProvider.PrometheusHandler(); prometheusHandler != nil { + mux.Handle("/metrics", prometheusHandler) + logger.Info("Prometheus metrics endpoint enabled at /metrics") + } else { + logger.Warn("Prometheus metrics endpoint is not enabled, but telemetry provider is configured") + } + } + // Optional .well-known discovery endpoints (unauthenticated, RFC 9728 compliant) // Handles /.well-known/oauth-protected-resource and subpaths (e.g., /mcp) if wellKnownHandler := auth.NewWellKnownHandler(s.config.AuthInfoHandler); wellKnownHandler != nil { From 2630bc1f7909855896c254b1c45a55c396730783 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Wed, 3 Dec 2025 15:06:40 -0800 Subject: [PATCH 08/22] integration test for metrics Signed-off-by: Jeremy Drouillard --- pkg/vmcp/server/server.go | 17 ++- pkg/vmcp/server/telemetry.go | 6 +- test/integration/vmcp/helpers/vmcp_server.go | 35 +++-- .../integration/vmcp/vmcp_integration_test.go | 127 ++++++++++++++++++ 4 files changed, 169 insertions(+), 16 deletions(-) diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index f21a78182a..6199321fe9 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -199,6 +199,17 @@ func New( // This provides SDK-agnostic elicitation with security validation elicitationHandler := composer.NewDefaultElicitationHandler(sdkElicitationRequester) + // Decorate backend client with telemetry if provider is configured + // This must happen BEFORE creating the workflow engine so that workflow + // backend calls are instrumented when they occur during workflow execution. + if cfg.TelemetryProvider != nil { + var err error + backendClient, err = monitorBackends(context.Background(), cfg.TelemetryProvider.MeterProvider(), backends, backendClient) + if err != nil { + return nil, fmt.Errorf("failed to monitor backends: %w", err) + } + } + // Create workflow engine (composer) for executing composite tools // The composer orchestrates multi-step workflows across backends // Use in-memory state store with 5-minute cleanup interval and 1-hour max age for completed workflows @@ -211,12 +222,8 @@ func New( return nil, fmt.Errorf("workflow validation failed: %w", err) } - // Decorate backend client and workflow executors with telemetry if provider is configured + // Decorate workflow executors with telemetry if provider is configured if cfg.TelemetryProvider != nil { - backendClient, err = monitorBackends(context.Background(), cfg.TelemetryProvider.MeterProvider(), backends, backendClient) - if err != nil { - return nil, fmt.Errorf("failed to monitor backends: %w", err) - } workflowExecutors, err = monitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), workflowExecutors) if err != nil { return nil, fmt.Errorf("failed to monitor workflow executors: %w", err) diff --git a/pkg/vmcp/server/telemetry.go b/pkg/vmcp/server/telemetry.go index 9f748a518a..7a80021ba7 100644 --- a/pkg/vmcp/server/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -30,15 +30,15 @@ func monitorBackends(ctx context.Context, meterProvider metric.MeterProvider, ba } backendCount.Record(ctx, int64(len(backends))) - requestsTotal, err := meter.Int64Counter("toolhive_vmcp_requests_total", metric.WithDescription("Total number of requests per backend")) + requestsTotal, err := meter.Int64Counter("toolhive_vmcp_backend_requests_total", metric.WithDescription("Total number of requests per backend")) if err != nil { return nil, fmt.Errorf("failed to create requests total counter: %w", err) } - errorsTotal, err := meter.Int64Counter("toolhive_vmcp_errors_total", metric.WithDescription("Total number of errors per backend")) + errorsTotal, err := meter.Int64Counter("toolhive_vmcp_backend_errors_total", metric.WithDescription("Total number of errors per backend")) if err != nil { return nil, fmt.Errorf("failed to create errors total counter: %w", err) } - requestsDuration, err := meter.Float64Histogram("toolhive_vmcp_requests_duration", metric.WithDescription("Duration of requests in seconds per backend")) + requestsDuration, err := meter.Float64Histogram("toolhive_vmcp_backend_requests_duration", metric.WithDescription("Duration of requests in seconds per backend")) if err != nil { return nil, fmt.Errorf("failed to create requests duration histogram: %w", err) } diff --git a/test/integration/vmcp/helpers/vmcp_server.go b/test/integration/vmcp/helpers/vmcp_server.go index f001a575ce..93007a5a5a 100644 --- a/test/integration/vmcp/helpers/vmcp_server.go +++ b/test/integration/vmcp/helpers/vmcp_server.go @@ -10,11 +10,13 @@ import ( "github.com/stacklok/toolhive/pkg/auth" "github.com/stacklok/toolhive/pkg/env" + "github.com/stacklok/toolhive/pkg/telemetry" vmcptypes "github.com/stacklok/toolhive/pkg/vmcp" "github.com/stacklok/toolhive/pkg/vmcp/aggregator" "github.com/stacklok/toolhive/pkg/vmcp/auth/factory" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" vmcpclient "github.com/stacklok/toolhive/pkg/vmcp/client" + "github.com/stacklok/toolhive/pkg/vmcp/composer" "github.com/stacklok/toolhive/pkg/vmcp/discovery" "github.com/stacklok/toolhive/pkg/vmcp/router" vmcpserver "github.com/stacklok/toolhive/pkg/vmcp/server" @@ -63,8 +65,10 @@ type VMCPServerOption func(*vmcpServerConfig) // vmcpServerConfig holds configuration for creating a test vMCP server. type vmcpServerConfig struct { - conflictStrategy string - prefixFormat string + conflictStrategy string + prefixFormat string + workflowDefs map[string]*composer.WorkflowDefinition + telemetryProvider *telemetry.Provider } // WithPrefixConflictResolution configures prefix-based conflict resolution. @@ -75,6 +79,20 @@ func WithPrefixConflictResolution(format string) VMCPServerOption { } } +// WithWorkflowDefinitions configures composite tool workflow definitions. +func WithWorkflowDefinitions(defs map[string]*composer.WorkflowDefinition) VMCPServerOption { + return func(c *vmcpServerConfig) { + c.workflowDefs = defs + } +} + +// WithTelemetryProvider configures the telemetry provider. +func WithTelemetryProvider(provider *telemetry.Provider) VMCPServerOption { + return func(c *vmcpServerConfig) { + c.telemetryProvider = provider + } +} + // getFreePort returns an available TCP port on localhost. // This is used for parallel test execution to avoid port conflicts. func getFreePort(tb testing.TB) int { @@ -148,12 +166,13 @@ func NewVMCPServer( // Create vMCP server with test-specific defaults vmcpServer, err := vmcpserver.New(&vmcpserver.Config{ - Name: "test-vmcp", - Version: "1.0.0", - Host: "127.0.0.1", - Port: getFreePort(tb), // Get a random available port for parallel test execution - AuthMiddleware: auth.AnonymousMiddleware, - }, rtr, backendClient, discoveryMgr, backends, nil) // nil for workflowDefs in tests + Name: "test-vmcp", + Version: "1.0.0", + Host: "127.0.0.1", + Port: getFreePort(tb), // Get a random available port for parallel test execution + AuthMiddleware: auth.AnonymousMiddleware, + TelemetryProvider: config.telemetryProvider, + }, rtr, backendClient, discoveryMgr, backends, config.workflowDefs) require.NoError(tb, err, "failed to create vMCP server") // Start server automatically diff --git a/test/integration/vmcp/vmcp_integration_test.go b/test/integration/vmcp/vmcp_integration_test.go index 57dd164e5e..74716ae682 100644 --- a/test/integration/vmcp/vmcp_integration_test.go +++ b/test/integration/vmcp/vmcp_integration_test.go @@ -2,12 +2,19 @@ package vmcp_test import ( "context" + "io" + "net/http" + "strings" "testing" + "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/stacklok/toolhive/pkg/telemetry" "github.com/stacklok/toolhive/pkg/vmcp" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" + "github.com/stacklok/toolhive/pkg/vmcp/composer" "github.com/stacklok/toolhive/test/integration/vmcp/helpers" ) @@ -225,3 +232,123 @@ func TestVMCPServer_TwoBoundaryAuth_HeaderInjection(t *testing.T) { helpers.AssertTextNotContains(t, text, "error", "failed", "leakage") }) } + +// TestVMCPServer_Telemetry_CompositeToolMetrics verifies that vMCP exposes +// Prometheus metrics for composite tool workflow executions and backend requests on /metrics. +// This test creates a composite tool, executes it, and verifies the metrics +// for both the workflow and the backend subtool calls are correctly exposed. +func TestVMCPServer_Telemetry_CompositeToolMetrics(t *testing.T) { + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Setup: Create a synthetic MCP backend server with a simple tool + echoServer := helpers.CreateBackendServer(t, []helpers.BackendTool{ + helpers.NewBackendTool("echo", "Echo the input message", + func(_ context.Context, args map[string]any) string { + msg, _ := args["message"].(string) + return `{"echoed": "` + msg + `"}` + }), + }, helpers.WithBackendName("echo-mcp")) + defer echoServer.Close() + + // Configure backend pointing to test server + backends := []vmcp.Backend{ + helpers.NewBackend("echo", + helpers.WithURL(echoServer.URL+"/mcp"), + helpers.WithMetadata("group", "test-group"), + ), + } + + // Create composite tool workflow definition that calls the echo tool + workflowDefs := map[string]*composer.WorkflowDefinition{ + "echo_workflow": { + Name: "echo_workflow", + Description: "A composite tool that echoes a message", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{ + "message": map[string]any{ + "type": "string", + "description": "The message to echo", + }, + }, + "required": []string{"message"}, + }, + Steps: []composer.WorkflowStep{ + { + ID: "echo_step", + Type: "tool", + Tool: "echo_echo", // prefixed with backend name + Arguments: map[string]any{ + "message": "{{.params.message}}", + }, + }, + }, + Timeout: 30 * time.Second, + }, + } + + // Create telemetry provider with Prometheus enabled + telemetryConfig := telemetry.Config{ + ServiceName: "vmcp-telemetry-test", + ServiceVersion: "1.0.0", + EnablePrometheusMetricsPath: true, + } + telemetryProvider, err := telemetry.NewProvider(ctx, telemetryConfig) + require.NoError(t, err, "failed to create telemetry provider") + defer telemetryProvider.Shutdown(ctx) + + // Create vMCP server with composite tool and telemetry + vmcpServer := helpers.NewVMCPServer(ctx, t, backends, + helpers.WithPrefixConflictResolution("{workload}_"), + helpers.WithWorkflowDefinitions(workflowDefs), + helpers.WithTelemetryProvider(telemetryProvider), + ) + + // Create and initialize MCP client + vmcpURL := "http://" + vmcpServer.Address() + "/mcp" + client := helpers.NewMCPClient(ctx, t, vmcpURL) + defer client.Close() + + // Call the composite tool + resp := client.CallTool(ctx, "echo_workflow", map[string]any{"message": "hello world"}) + text := helpers.AssertToolCallSuccess(t, resp) + helpers.AssertTextContains(t, text, "echoed", "hello world") + + // Fetch metrics from /metrics endpoint + metricsURL := "http://" + vmcpServer.Address() + "/metrics" + httpClient := &http.Client{Timeout: 5 * time.Second} + metricsResp, err := httpClient.Get(metricsURL) + require.NoError(t, err, "failed to fetch metrics") + defer metricsResp.Body.Close() + + require.Equal(t, http.StatusOK, metricsResp.StatusCode, "metrics endpoint should return 200") + + body, err := io.ReadAll(metricsResp.Body) + require.NoError(t, err, "failed to read metrics body") + metricsContent := string(body) + + // Log metrics for debugging + t.Logf("Metrics content:\n%s", metricsContent) + + // Verify workflow execution metrics are present (composite tool). + assert.True(t, strings.Contains(metricsContent, "toolhive_vmcp_workflow_executions_total"), + "Should contain workflow executions total metric") + assert.True(t, strings.Contains(metricsContent, "toolhive_vmcp_workflow_duration_seconds"), + "Should contain workflow duration metric") + assert.True(t, strings.Contains(metricsContent, `workflow_name="echo_workflow"`), + "Should contain workflow name label") + + // Verify backend metrics are present. + assert.True(t, strings.Contains(metricsContent, "toolhive_vmcp_backend_requests_total"), + "Should contain backend requests total metric") + assert.True(t, strings.Contains(metricsContent, "toolhive_vmcp_backend_requests_duration"), + "Should contain backend requests duration metric") + + // Verify HTTP middleware metrics are present (incoming MCP requests). + assert.True(t, strings.Contains(metricsContent, "toolhive_mcp_requests_total"), + "Should contain HTTP middleware requests total metric") + assert.True(t, strings.Contains(metricsContent, "toolhive_mcp_request_duration_seconds"), + "Should contain HTTP middleware request duration metric") +} From d7c0583f16bd47c5467128c3fc7ca7587c069702 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 10:07:30 -0800 Subject: [PATCH 09/22] Plumb telemetry config from CRD to vMCP Server Signed-off-by: Jeremy Drouillard --- cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go | 6 ++++++ cmd/thv-operator/pkg/vmcpconfig/converter.go | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go index 801d18f62a..1166a19ae5 100644 --- a/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go +++ b/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go @@ -1,6 +1,7 @@ package v1alpha1 import ( + "github.com/stacklok/toolhive/pkg/telemetry" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -54,6 +55,11 @@ type VirtualMCPServerSpec struct { // +kubebuilder:pruning:PreserveUnknownFields // +kubebuilder:validation:Type=object PodTemplateSpec *runtime.RawExtension `json:"podTemplateSpec,omitempty"` + + // Telemetry configures OpenTelemetry-based observability for the Virtual MCP server + // including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint + // +optional + Telemetry *telemetry.Config `json:"telemetry,omitempty"` } // GroupRef references an MCPGroup resource diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go index 438b59b4f7..ff3fc0f2f1 100644 --- a/cmd/thv-operator/pkg/vmcpconfig/converter.go +++ b/cmd/thv-operator/pkg/vmcpconfig/converter.go @@ -110,6 +110,10 @@ func (c *Converter) Convert( config.Operational = c.convertOperational(ctx, vmcp) } + // Convert Telemetry - pass through directly as it's the same type + + config.Telemetry = vmcp.Spec.Telemetry + // Apply operational defaults (fills missing values) config.EnsureOperationalDefaults() From 928f70c9efff76699ceb93904c9f5f733949fd42 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 10:31:09 -0800 Subject: [PATCH 10/22] Trace BackendClient and Workflow calls Signed-off-by: Jeremy Drouillard --- cmd/thv-operator/pkg/vmcpconfig/converter.go | 4 +- pkg/vmcp/server/server.go | 4 +- pkg/vmcp/server/telemetry.go | 80 +++++++++++++++----- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go index ff3fc0f2f1..14736d654e 100644 --- a/cmd/thv-operator/pkg/vmcpconfig/converter.go +++ b/cmd/thv-operator/pkg/vmcpconfig/converter.go @@ -110,8 +110,8 @@ func (c *Converter) Convert( config.Operational = c.convertOperational(ctx, vmcp) } - // Convert Telemetry - pass through directly as it's the same type - + // Convert Telemetry - pass through directly as it's the same type. + // Eventually, we may want to decouple the two, but for now this is fine. config.Telemetry = vmcp.Spec.Telemetry // Apply operational defaults (fills missing values) diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index 6199321fe9..fd7685da36 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -204,7 +204,7 @@ func New( // backend calls are instrumented when they occur during workflow execution. if cfg.TelemetryProvider != nil { var err error - backendClient, err = monitorBackends(context.Background(), cfg.TelemetryProvider.MeterProvider(), backends, backendClient) + backendClient, err = monitorBackends(context.Background(), cfg.TelemetryProvider.MeterProvider(), cfg.TelemetryProvider.TracerProvider(), backends, backendClient) if err != nil { return nil, fmt.Errorf("failed to monitor backends: %w", err) } @@ -224,7 +224,7 @@ func New( // Decorate workflow executors with telemetry if provider is configured if cfg.TelemetryProvider != nil { - workflowExecutors, err = monitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), workflowExecutors) + workflowExecutors, err = monitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), cfg.TelemetryProvider.TracerProvider(), workflowExecutors) if err != nil { return nil, fmt.Errorf("failed to monitor workflow executors: %w", err) } diff --git a/pkg/vmcp/server/telemetry.go b/pkg/vmcp/server/telemetry.go index 7a80021ba7..b79d12494a 100644 --- a/pkg/vmcp/server/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -6,7 +6,9 @@ import ( "time" "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/trace" "github.com/stacklok/toolhive/pkg/vmcp" "github.com/stacklok/toolhive/pkg/vmcp/server/adapter" @@ -18,7 +20,13 @@ const ( // monitorBackends decorates the backend client so it records telemetry on each method call. // It also emits a gauge for the number of backends discovered once, since the number of backends is static. -func monitorBackends(ctx context.Context, meterProvider metric.MeterProvider, backends []vmcp.Backend, backendClient vmcp.BackendClient) (vmcp.BackendClient, error) { +func monitorBackends( + ctx context.Context, + meterProvider metric.MeterProvider, + tracerProvider trace.TracerProvider, + backends []vmcp.Backend, + backendClient vmcp.BackendClient, +) (vmcp.BackendClient, error) { meter := meterProvider.Meter(instrumentationName) backendCount, err := meter.Int64Gauge( @@ -45,15 +53,16 @@ func monitorBackends(ctx context.Context, meterProvider metric.MeterProvider, ba return telemetryBackendClient{ backendClient: backendClient, + tracer: tracerProvider.Tracer(instrumentationName), requestsTotal: requestsTotal, errorsTotal: errorsTotal, requestsDuration: requestsDuration, }, nil - } type telemetryBackendClient struct { backendClient vmcp.BackendClient + tracer trace.Tracer requestsTotal metric.Int64Counter errorsTotal metric.Int64Counter @@ -62,52 +71,68 @@ type telemetryBackendClient struct { var _ vmcp.BackendClient = telemetryBackendClient{} -// record updates the telemetry metrics for each method on the BackendClient interface. -// It returns a function that should be deferred to record the duration and error. -func (t telemetryBackendClient) record(ctx context.Context, target *vmcp.BackendTarget, action string, err *error) func() { - attrs := metric.WithAttributes( +// record updates the metrics and creates a span for each method on the BackendClient interface. +// It returns a function that should be deferred to record the duration, error, and end the span. +func (t telemetryBackendClient) record(ctx context.Context, target *vmcp.BackendTarget, action string, err *error) (context.Context, func()) { + // Create span attributes + commonAttrs := []attribute.KeyValue{ attribute.String("target.workload_id", target.WorkloadID), attribute.String("target.workload_name", target.WorkloadName), attribute.String("target.base_url", target.BaseURL), attribute.String("target.transport_type", target.TransportType), attribute.String("action", action), + } + + ctx, span := t.tracer.Start(ctx, "telemetryBackendClient."+action, + // TODO: Add params and results to the span once we have reusable sanitization functions. + trace.WithAttributes(commonAttrs...), ) + + metricAttrs := metric.WithAttributes(commonAttrs...) start := time.Now() - t.requestsTotal.Add(ctx, 1, attrs) + t.requestsTotal.Add(ctx, 1, metricAttrs) - return func() { + return ctx, func() { duration := time.Since(start) - t.requestsDuration.Record(ctx, duration.Seconds(), attrs) - if err != nil { - t.errorsTotal.Add(ctx, 1, attrs) + t.requestsDuration.Record(ctx, duration.Seconds(), metricAttrs) + if err != nil && *err != nil { + t.errorsTotal.Add(ctx, 1, metricAttrs) + span.RecordError(*err) + span.SetStatus(codes.Error, (*err).Error()) } + span.End() } } func (t telemetryBackendClient) CallTool(ctx context.Context, target *vmcp.BackendTarget, toolName string, arguments map[string]any) (_ map[string]any, retErr error) { - defer t.record(ctx, target, "call_tool", &retErr)() + ctx, done := t.record(ctx, target, "call_tool", &retErr) + defer done() return t.backendClient.CallTool(ctx, target, toolName, arguments) } func (t telemetryBackendClient) ReadResource(ctx context.Context, target *vmcp.BackendTarget, uri string) (_ []byte, retErr error) { - defer t.record(ctx, target, "read_resource", &retErr)() + ctx, done := t.record(ctx, target, "read_resource", &retErr) + defer done() return t.backendClient.ReadResource(ctx, target, uri) } func (t telemetryBackendClient) GetPrompt(ctx context.Context, target *vmcp.BackendTarget, name string, arguments map[string]any) (_ string, retErr error) { - defer t.record(ctx, target, "get_prompt", &retErr)() + ctx, done := t.record(ctx, target, "get_prompt", &retErr) + defer done() return t.backendClient.GetPrompt(ctx, target, name, arguments) } func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *vmcp.BackendTarget) (_ *vmcp.CapabilityList, retErr error) { - defer t.record(ctx, target, "list_capabilities", &retErr)() + ctx, done := t.record(ctx, target, "list_capabilities", &retErr) + defer done() return t.backendClient.ListCapabilities(ctx, target) } // monitorWorkflowExecutors decorates workflow executors with telemetry recording. -// It wraps each executor to emit metrics for execution count, duration, and errors. +// It wraps each executor to emit metrics and traces for execution count, duration, and errors. func monitorWorkflowExecutors( meterProvider metric.MeterProvider, + tracerProvider trace.TracerProvider, executors map[string]adapter.WorkflowExecutor, ) (map[string]adapter.WorkflowExecutor, error) { if len(executors) == 0 { @@ -140,11 +165,14 @@ func monitorWorkflowExecutors( return nil, fmt.Errorf("failed to create workflow duration histogram: %w", err) } + tracer := tracerProvider.Tracer(instrumentationName) + monitored := make(map[string]adapter.WorkflowExecutor, len(executors)) for name, executor := range executors { monitored[name] = &telemetryWorkflowExecutor{ name: name, executor: executor, + tracer: tracer, executionsTotal: executionsTotal, errorsTotal: errorsTotal, executionDuration: executionDuration, @@ -158,6 +186,7 @@ func monitorWorkflowExecutors( type telemetryWorkflowExecutor struct { name string executor adapter.WorkflowExecutor + tracer trace.Tracer executionsTotal metric.Int64Counter errorsTotal metric.Int64Counter executionDuration metric.Float64Histogram @@ -165,22 +194,31 @@ type telemetryWorkflowExecutor struct { var _ adapter.WorkflowExecutor = (*telemetryWorkflowExecutor)(nil) -// ExecuteWorkflow executes the workflow and records telemetry metrics. +// ExecuteWorkflow executes the workflow and records telemetry metrics and traces. func (t *telemetryWorkflowExecutor) ExecuteWorkflow(ctx context.Context, params map[string]any) (*adapter.WorkflowResult, error) { - attrs := metric.WithAttributes( + commonAttrs := []attribute.KeyValue{ attribute.String("workflow.name", t.name), + } + + ctx, span := t.tracer.Start(ctx, "telemetryWorkflowExecutor.ExecuteWorkflow", + // TODO: Add params and results to the span once we have reusable sanitization functions. + trace.WithAttributes(commonAttrs...), ) + defer span.End() + metricAttrs := metric.WithAttributes(commonAttrs...) start := time.Now() - t.executionsTotal.Add(ctx, 1, attrs) + t.executionsTotal.Add(ctx, 1, metricAttrs) result, err := t.executor.ExecuteWorkflow(ctx, params) duration := time.Since(start) - t.executionDuration.Record(ctx, duration.Seconds(), attrs) + t.executionDuration.Record(ctx, duration.Seconds(), metricAttrs) if err != nil { - t.errorsTotal.Add(ctx, 1, attrs) + t.errorsTotal.Add(ctx, 1, metricAttrs) + span.RecordError(err) + span.SetStatus(codes.Error, err.Error()) } return result, err From 526223fbc49619f698eed1e2dfd50a93b8966b1a Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 11:16:59 -0800 Subject: [PATCH 11/22] Document o11y for vMCP Signed-off-by: Jeremy Drouillard --- cmd/vmcp/README.md | 2 + docs/observability.md | 7 ++ docs/operator/virtualmcpserver-api.md | 43 +++++++++++ .../virtualmcpserver-observability.md | 74 +++++++++++++++++++ examples/vmcp-config.yaml | 11 +++ 5 files changed, 137 insertions(+) create mode 100644 docs/operator/virtualmcpserver-observability.md diff --git a/cmd/vmcp/README.md b/cmd/vmcp/README.md index 0868e426ac..2e8d077ea3 100644 --- a/cmd/vmcp/README.md +++ b/cmd/vmcp/README.md @@ -14,6 +14,7 @@ The Virtual MCP Server (vmcp) is a standalone binary that aggregates multiple MC - ✅ **Session Management**: MCP protocol session tracking with TTL-based cleanup - ✅ **Health Endpoints**: `/health` and `/ping` for service monitoring - ✅ **Configuration Validation**: `vmcp validate` command for config verification +- ✅ **Observability**: OpenTelemetry metrics and traces for backend operations and workflow executions ### In Progress - 🚧 **Incoming Authentication** (Issue #165): OIDC, local, anonymous authentication @@ -121,6 +122,7 @@ vmcp uses a YAML configuration file to define: 3. **Outgoing Authentication**: Virtual MCP → Backend API token exchange 4. **Tool Aggregation**: Conflict resolution and filtering strategies 5. **Operational Settings**: Timeouts, health checks, circuit breakers +6. **Telemetry**: OpenTelemetry metrics/tracing and Prometheus endpoint See [examples/vmcp-config.yaml](../../examples/vmcp-config.yaml) for a complete example. diff --git a/docs/observability.md b/docs/observability.md index 83b632516d..c765dccc44 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -84,3 +84,10 @@ The telemetry middleware: This provides end-to-end visibility across the entire request lifecycle while maintaining the modular architecture of ToolHive's middleware system. + +## Virtual MCP Server Telemetry + +For observability in the Virtual MCP Server (vMCP), including backend request +metrics, workflow execution telemetry, and distributed tracing, see the +dedicated [Virtual MCP Server Observability](./operator/virtualmcpserver-observability.md) +documentation. diff --git a/docs/operator/virtualmcpserver-api.md b/docs/operator/virtualmcpserver-api.md index 59582d9d05..0ef955bc7b 100644 --- a/docs/operator/virtualmcpserver-api.md +++ b/docs/operator/virtualmcpserver-api.md @@ -290,6 +290,40 @@ spec: cpu: "1000m" ``` +### `.spec.telemetry` (optional) + +Configures OpenTelemetry-based observability for the Virtual MCP server, including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint. + +**Type**: `telemetry.Config` + +**Fields**: +- `endpoint` (string): OTLP collector endpoint (host:port format) +- `serviceName` (string): Service name for telemetry +- `serviceVersion` (string): Service version for telemetry +- `tracingEnabled` (boolean): Enable distributed tracing +- `metricsEnabled` (boolean): Enable OTLP metrics export +- `samplingRate` (float64): Trace sampling rate (0.0-1.0) +- `headers` (map[string]string): Authentication headers for OTLP endpoint +- `insecure` (boolean): Use HTTP instead of HTTPS +- `enablePrometheusMetricsPath` (boolean): Expose Prometheus /metrics endpoint +- `environmentVariables` ([]string): Environment variable names to include as span attributes +- `customAttributes` (map[string]string): Custom resource attributes for all telemetry signals + +**Example**: +```yaml +spec: + telemetry: + endpoint: "otel-collector:4317" + serviceName: "my-vmcp" + tracingEnabled: true + metricsEnabled: true + samplingRate: 0.1 + insecure: true + enablePrometheusMetricsPath: true +``` + +For details on what metrics and traces are emitted, see the [Virtual MCP Server Observability](./virtualmcpserver-observability.md) documentation. + ## Status Fields ### `.status.conditions` @@ -451,6 +485,14 @@ spec: failureThreshold: 5 timeout: 60s + # Observability + telemetry: + endpoint: "otel-collector:4317" + tracingEnabled: true + metricsEnabled: true + samplingRate: 0.1 + enablePrometheusMetricsPath: true + status: phase: Ready message: "Virtual MCP serving 3 backends with 15 tools" @@ -518,4 +560,5 @@ The VirtualMCPServer CRD includes comprehensive validation: - [MCPServer](./mcpserver-api.md): Individual MCP server instances - [MCPExternalAuthConfig](./mcpexternalauthconfig-api.md): External authentication configuration - [MCPToolConfig](./toolconfig-api.md): Tool filtering and renaming configuration +- [Virtual MCP Server Observability](./virtualmcpserver-observability.md): Telemetry and metrics documentation - [Virtual MCP Proposal](../proposals/THV-2106-virtual-mcp-server.md): Complete design proposal diff --git a/docs/operator/virtualmcpserver-observability.md b/docs/operator/virtualmcpserver-observability.md new file mode 100644 index 0000000000..8e6bdd791b --- /dev/null +++ b/docs/operator/virtualmcpserver-observability.md @@ -0,0 +1,74 @@ +# Virtual MCP Server Observability + +This document describes the observability for the Virtual MCP +Server (vMCP), which aggregates multiple backend MCP servers into a unified +interface. The vMCP provides OpenTelemetry-based instrumentation for monitoring +backend operations and composite tool workflow executions. + +For general ToolHive observability concepts and proxy runner telemetry, see the +main [Observability and Telemetry](../observability.md) documentation. + +## Overview + +The vMCP telemetry provides visibility into: + +1. **Backend operations**: Track requests to individual backend MCP servers + including tool calls, resource reads, prompt retrieval, and capability listing +2. **Workflow executions**: Monitor composite tool workflow performance and errors +3. **Distributed tracing**: Correlate requests across the vMCP and its backends + +The vMCP uses a decorator pattern to wrap backend clients and workflow executors +with telemetry instrumentation. This approach provides consistent metrics and +tracing without modifying the core business logic. + +The implementation of both metrics and traces can be found in `pkg/vmcp/server/telemetry.go`. + +## Metrics + +The vMCP emits metrics for backend operations and workflow executions. All +metrics use the `toolhive_vmcp_` prefix. + +**Backend metrics** track requests to individual backend MCP servers, including +request counts, error counts, and request duration histograms. These metrics +include attributes identifying the target backend (workload ID, name, URL, +transport type) and the action being performed (tool call, resource read, etc.). + +**Workflow metrics** track composite tool workflow executions, including +execution counts, error counts, and duration histograms. These metrics include +the workflow name as an attribute. + +## Distributed Tracing + +The vMCP creates spans for each individual backend operation as well as workflow executions, enabling the attribution of workflow exection errors or latency to specific tool calls. + + +## Configuration + +Configure telemetry in the `VirtualMCPServer` resource using the `spec.telemetry` +field: + +```yaml +apiVersion: toolhive.stacklok.dev/v1alpha1 +kind: VirtualMCPServer +metadata: + name: my-vmcp +spec: + groupRef: + name: my-group + telemetry: + endpoint: "otel-collector:4317" + serviceName: "my-vmcp" + tracingEnabled: true + metricsEnabled: true + samplingRate: 0.1 + insecure: true + enablePrometheusMetricsPath: true +``` + +See the [VirtualMCPServer API reference](./virtualmcpserver-api.md) for complete +CRD documentation. + +## Related Documentation + +- [Observability and Telemetry](../observability.md) - Main ToolHive observability documentation +- [VirtualMCPServer API Reference](./virtualmcpserver-api.md) - Complete CRD specification diff --git a/examples/vmcp-config.yaml b/examples/vmcp-config.yaml index a450aacfb5..1ccf162cca 100644 --- a/examples/vmcp-config.yaml +++ b/examples/vmcp-config.yaml @@ -187,3 +187,14 @@ operational: # environment: "{{.steps.confirm_deploy.content.environment}}" # depends_on: ["confirm_deploy"] # condition: "{{.steps.confirm_deploy.action == 'accept'}}" + +# ===== OBSERVABILITY ===== +# OpenTelemetry-based metrics and tracing for backend operations and workflows +telemetry: + endpoint: "localhost:4317" # OTLP collector endpoint + serviceName: "engineering-vmcp" + tracingEnabled: true + metricsEnabled: true + samplingRate: 0.1 # 10% sampling + insecure: true # Use HTTP instead of HTTPS + enablePrometheusMetricsPath: true # Expose /metrics endpoint From 402e87294598e936288ecfceacdde11d97bcf14a Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 12:19:36 -0800 Subject: [PATCH 12/22] fix lint Signed-off-by: Jeremy Drouillard --- .../api/v1alpha1/virtualmcpserver_types.go | 3 +- pkg/vmcp/server/server.go | 14 ++++++-- pkg/vmcp/server/telemetry.go | 35 ++++++++++++++----- .../integration/vmcp/vmcp_integration_test.go | 3 ++ 4 files changed, 44 insertions(+), 11 deletions(-) diff --git a/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go index 1166a19ae5..139bb97e94 100644 --- a/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go +++ b/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go @@ -1,9 +1,10 @@ package v1alpha1 import ( - "github.com/stacklok/toolhive/pkg/telemetry" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + + "github.com/stacklok/toolhive/pkg/telemetry" ) // VirtualMCPServerSpec defines the desired state of VirtualMCPServer diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index fd7685da36..3ad77730e0 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -204,7 +204,13 @@ func New( // backend calls are instrumented when they occur during workflow execution. if cfg.TelemetryProvider != nil { var err error - backendClient, err = monitorBackends(context.Background(), cfg.TelemetryProvider.MeterProvider(), cfg.TelemetryProvider.TracerProvider(), backends, backendClient) + backendClient, err = monitorBackends( + context.Background(), + cfg.TelemetryProvider.MeterProvider(), + cfg.TelemetryProvider.TracerProvider(), + backends, + backendClient, + ) if err != nil { return nil, fmt.Errorf("failed to monitor backends: %w", err) } @@ -224,7 +230,11 @@ func New( // Decorate workflow executors with telemetry if provider is configured if cfg.TelemetryProvider != nil { - workflowExecutors, err = monitorWorkflowExecutors(cfg.TelemetryProvider.MeterProvider(), cfg.TelemetryProvider.TracerProvider(), workflowExecutors) + workflowExecutors, err = monitorWorkflowExecutors( + cfg.TelemetryProvider.MeterProvider(), + cfg.TelemetryProvider.TracerProvider(), + workflowExecutors, + ) if err != nil { return nil, fmt.Errorf("failed to monitor workflow executors: %w", err) } diff --git a/pkg/vmcp/server/telemetry.go b/pkg/vmcp/server/telemetry.go index b79d12494a..0573a63171 100644 --- a/pkg/vmcp/server/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -38,15 +38,24 @@ func monitorBackends( } backendCount.Record(ctx, int64(len(backends))) - requestsTotal, err := meter.Int64Counter("toolhive_vmcp_backend_requests_total", metric.WithDescription("Total number of requests per backend")) + requestsTotal, err := meter.Int64Counter( + "toolhive_vmcp_backend_requests_total", + metric.WithDescription("Total number of requests per backend"), + ) if err != nil { return nil, fmt.Errorf("failed to create requests total counter: %w", err) } - errorsTotal, err := meter.Int64Counter("toolhive_vmcp_backend_errors_total", metric.WithDescription("Total number of errors per backend")) + errorsTotal, err := meter.Int64Counter( + "toolhive_vmcp_backend_errors_total", + metric.WithDescription("Total number of errors per backend"), + ) if err != nil { return nil, fmt.Errorf("failed to create errors total counter: %w", err) } - requestsDuration, err := meter.Float64Histogram("toolhive_vmcp_backend_requests_duration", metric.WithDescription("Duration of requests in seconds per backend")) + requestsDuration, err := meter.Float64Histogram( + "toolhive_vmcp_backend_requests_duration", + metric.WithDescription("Duration of requests in seconds per backend"), + ) if err != nil { return nil, fmt.Errorf("failed to create requests duration histogram: %w", err) } @@ -73,7 +82,9 @@ var _ vmcp.BackendClient = telemetryBackendClient{} // record updates the metrics and creates a span for each method on the BackendClient interface. // It returns a function that should be deferred to record the duration, error, and end the span. -func (t telemetryBackendClient) record(ctx context.Context, target *vmcp.BackendTarget, action string, err *error) (context.Context, func()) { +func (t telemetryBackendClient) record( + ctx context.Context, target *vmcp.BackendTarget, action string, err *error, +) (context.Context, func()) { // Create span attributes commonAttrs := []attribute.KeyValue{ attribute.String("target.workload_id", target.WorkloadID), @@ -104,25 +115,33 @@ func (t telemetryBackendClient) record(ctx context.Context, target *vmcp.Backend } } -func (t telemetryBackendClient) CallTool(ctx context.Context, target *vmcp.BackendTarget, toolName string, arguments map[string]any) (_ map[string]any, retErr error) { +func (t telemetryBackendClient) CallTool( + ctx context.Context, target *vmcp.BackendTarget, toolName string, arguments map[string]any, +) (_ map[string]any, retErr error) { ctx, done := t.record(ctx, target, "call_tool", &retErr) defer done() return t.backendClient.CallTool(ctx, target, toolName, arguments) } -func (t telemetryBackendClient) ReadResource(ctx context.Context, target *vmcp.BackendTarget, uri string) (_ []byte, retErr error) { +func (t telemetryBackendClient) ReadResource( + ctx context.Context, target *vmcp.BackendTarget, uri string, +) (_ []byte, retErr error) { ctx, done := t.record(ctx, target, "read_resource", &retErr) defer done() return t.backendClient.ReadResource(ctx, target, uri) } -func (t telemetryBackendClient) GetPrompt(ctx context.Context, target *vmcp.BackendTarget, name string, arguments map[string]any) (_ string, retErr error) { +func (t telemetryBackendClient) GetPrompt( + ctx context.Context, target *vmcp.BackendTarget, name string, arguments map[string]any, +) (_ string, retErr error) { ctx, done := t.record(ctx, target, "get_prompt", &retErr) defer done() return t.backendClient.GetPrompt(ctx, target, name, arguments) } -func (t telemetryBackendClient) ListCapabilities(ctx context.Context, target *vmcp.BackendTarget) (_ *vmcp.CapabilityList, retErr error) { +func (t telemetryBackendClient) ListCapabilities( + ctx context.Context, target *vmcp.BackendTarget, +) (_ *vmcp.CapabilityList, retErr error) { ctx, done := t.record(ctx, target, "list_capabilities", &retErr) defer done() return t.backendClient.ListCapabilities(ctx, target) diff --git a/test/integration/vmcp/vmcp_integration_test.go b/test/integration/vmcp/vmcp_integration_test.go index 74716ae682..b60e9fba34 100644 --- a/test/integration/vmcp/vmcp_integration_test.go +++ b/test/integration/vmcp/vmcp_integration_test.go @@ -237,7 +237,10 @@ func TestVMCPServer_TwoBoundaryAuth_HeaderInjection(t *testing.T) { // Prometheus metrics for composite tool workflow executions and backend requests on /metrics. // This test creates a composite tool, executes it, and verifies the metrics // for both the workflow and the backend subtool calls are correctly exposed. +// +//nolint:paralleltest // safe to run in parallel with other tests func TestVMCPServer_Telemetry_CompositeToolMetrics(t *testing.T) { + t.Parallel() ctx, cancel := context.WithCancel(context.Background()) defer cancel() From 9593ff93c78243ed239b96bd02889541c2c3bad8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 19:40:32 +0000 Subject: [PATCH 13/22] Initial plan From 0827db0381385745237a2e5053f7b8250a7bfee9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 4 Dec 2025 19:46:33 +0000 Subject: [PATCH 14/22] Fix metric naming inconsistencies and typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove _total suffix from counter names (auto-added by Prometheus exporter) - Add metric.WithUnit("s") for histogram durations instead of _seconds suffix - Fix spelling: "exection" → "execution" in docs - Update middleware chain comment to include telemetry Co-authored-by: jerm-dro <10532181+jerm-dro@users.noreply.github.com> --- .../operator/virtualmcpserver-observability.md | 2 +- pkg/vmcp/server/server.go | 2 +- pkg/vmcp/server/telemetry.go | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/operator/virtualmcpserver-observability.md b/docs/operator/virtualmcpserver-observability.md index 8e6bdd791b..80235abe14 100644 --- a/docs/operator/virtualmcpserver-observability.md +++ b/docs/operator/virtualmcpserver-observability.md @@ -39,7 +39,7 @@ the workflow name as an attribute. ## Distributed Tracing -The vMCP creates spans for each individual backend operation as well as workflow executions, enabling the attribution of workflow exection errors or latency to specific tool calls. +The vMCP creates spans for each individual backend operation as well as workflow executions, enabling the attribution of workflow execution errors or latency to specific tool calls. ## Configuration diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index 3ad77730e0..682ba00d83 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -385,7 +385,7 @@ func (s *Server) Start(ctx context.Context) error { logger.Info("RFC 9728 OAuth discovery endpoints enabled at /.well-known/") } - // MCP endpoint - apply middleware chain: auth → discovery + // MCP endpoint - apply middleware chain: auth → discovery → telemetry var mcpHandler http.Handler = streamableServer if s.config.TelemetryProvider != nil { diff --git a/pkg/vmcp/server/telemetry.go b/pkg/vmcp/server/telemetry.go index 0573a63171..64dbe218b4 100644 --- a/pkg/vmcp/server/telemetry.go +++ b/pkg/vmcp/server/telemetry.go @@ -39,22 +39,21 @@ func monitorBackends( backendCount.Record(ctx, int64(len(backends))) requestsTotal, err := meter.Int64Counter( - "toolhive_vmcp_backend_requests_total", - metric.WithDescription("Total number of requests per backend"), - ) + "toolhive_vmcp_backend_requests", + metric.WithDescription("Total number of requests per backend")) if err != nil { return nil, fmt.Errorf("failed to create requests total counter: %w", err) } errorsTotal, err := meter.Int64Counter( - "toolhive_vmcp_backend_errors_total", - metric.WithDescription("Total number of errors per backend"), - ) + "toolhive_vmcp_backend_errors", + metric.WithDescription("Total number of errors per backend")) if err != nil { return nil, fmt.Errorf("failed to create errors total counter: %w", err) } requestsDuration, err := meter.Float64Histogram( "toolhive_vmcp_backend_requests_duration", metric.WithDescription("Duration of requests in seconds per backend"), + metric.WithUnit("s"), ) if err != nil { return nil, fmt.Errorf("failed to create requests duration histogram: %w", err) @@ -161,7 +160,7 @@ func monitorWorkflowExecutors( meter := meterProvider.Meter(instrumentationName) executionsTotal, err := meter.Int64Counter( - "toolhive_vmcp_workflow_executions_total", + "toolhive_vmcp_workflow_executions", metric.WithDescription("Total number of workflow executions"), ) if err != nil { @@ -169,7 +168,7 @@ func monitorWorkflowExecutors( } errorsTotal, err := meter.Int64Counter( - "toolhive_vmcp_workflow_errors_total", + "toolhive_vmcp_workflow_errors", metric.WithDescription("Total number of workflow execution errors"), ) if err != nil { @@ -177,8 +176,9 @@ func monitorWorkflowExecutors( } executionDuration, err := meter.Float64Histogram( - "toolhive_vmcp_workflow_duration_seconds", + "toolhive_vmcp_workflow_duration", metric.WithDescription("Duration of workflow executions in seconds"), + metric.WithUnit("s"), ) if err != nil { return nil, fmt.Errorf("failed to create workflow duration histogram: %w", err) From 67acde4b08e8dcdd7628b6b043fd2db47e43f43e Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 12:39:00 -0800 Subject: [PATCH 15/22] Generate docs Signed-off-by: Jeremy Drouillard --- docs/operator/crd-api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md index 25323a7054..d1d3ca6942 100644 --- a/docs/operator/crd-api.md +++ b/docs/operator/crd-api.md @@ -1975,6 +1975,7 @@ _Appears in:_ | `operational` _[OperationalConfig](#operationalconfig)_ | Operational defines operational settings like timeouts and health checks | | | | `serviceType` _string_ | ServiceType specifies the Kubernetes service type for the Virtual MCP server | ClusterIP | Enum: [ClusterIP NodePort LoadBalancer]
| | `podTemplateSpec` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#rawextension-runtime-pkg)_ | PodTemplateSpec defines the pod template to use for the Virtual MCP server
This allows for customizing the pod configuration beyond what is provided by the other fields.
Note that to modify the specific container the Virtual MCP server runs in, you must specify
the 'vmcp' container name in the PodTemplateSpec.
This field accepts a PodTemplateSpec object as JSON/YAML. | | Type: object
| +| `telemetry` _[Config](#config)_ | Telemetry configures OpenTelemetry-based observability for the Virtual MCP server
including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint | | | #### VirtualMCPServerStatus From 11127e2a627d2dcfc49088c7aa928dd930a0c5c9 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 15:21:34 -0800 Subject: [PATCH 16/22] vMCP CRD uses common spec telemetry type Signed-off-by: Jeremy Drouillard --- .../api/v1alpha1/virtualmcpserver_types.go | 4 +- .../api/v1alpha1/zz_generated.deepcopy.go | 5 ++ cmd/thv-operator/pkg/vmcpconfig/converter.go | 5 +- ...olhive.stacklok.dev_virtualmcpservers.yaml | 67 +++++++++++++++++++ docs/operator/crd-api.md | 3 +- docs/operator/virtualmcpserver-api.md | 62 ++++++++++------- .../virtualmcpserver-observability.md | 24 ++++--- 7 files changed, 130 insertions(+), 40 deletions(-) diff --git a/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go index 139bb97e94..70d9aa9e4f 100644 --- a/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go +++ b/cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go @@ -3,8 +3,6 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - - "github.com/stacklok/toolhive/pkg/telemetry" ) // VirtualMCPServerSpec defines the desired state of VirtualMCPServer @@ -60,7 +58,7 @@ type VirtualMCPServerSpec struct { // Telemetry configures OpenTelemetry-based observability for the Virtual MCP server // including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint // +optional - Telemetry *telemetry.Config `json:"telemetry,omitempty"` + Telemetry *TelemetryConfig `json:"telemetry,omitempty"` } // GroupRef references an MCPGroup resource diff --git a/cmd/thv-operator/api/v1alpha1/zz_generated.deepcopy.go b/cmd/thv-operator/api/v1alpha1/zz_generated.deepcopy.go index 23ced678d5..a0adc2cf92 100644 --- a/cmd/thv-operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/cmd/thv-operator/api/v1alpha1/zz_generated.deepcopy.go @@ -2255,6 +2255,11 @@ func (in *VirtualMCPServerSpec) DeepCopyInto(out *VirtualMCPServerSpec) { *out = new(runtime.RawExtension) (*in).DeepCopyInto(*out) } + if in.Telemetry != nil { + in, out := &in.Telemetry, &out.Telemetry + *out = new(TelemetryConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMCPServerSpec. diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go index 14736d654e..792631b268 100644 --- a/cmd/thv-operator/pkg/vmcpconfig/converter.go +++ b/cmd/thv-operator/pkg/vmcpconfig/converter.go @@ -15,6 +15,7 @@ import ( mcpv1alpha1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1alpha1" "github.com/stacklok/toolhive/cmd/thv-operator/pkg/oidc" + "github.com/stacklok/toolhive/cmd/thv-operator/pkg/spectoconfig" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config" ) @@ -110,9 +111,7 @@ func (c *Converter) Convert( config.Operational = c.convertOperational(ctx, vmcp) } - // Convert Telemetry - pass through directly as it's the same type. - // Eventually, we may want to decouple the two, but for now this is fine. - config.Telemetry = vmcp.Spec.Telemetry + config.Telemetry = spectoconfig.ConvertTelemetryConfig(ctx, vmcp.Spec.Telemetry, vmcp.Name) // Apply operational defaults (fills missing values) config.EnsureOperationalDefaults() diff --git a/deploy/charts/operator-crds/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/crds/toolhive.stacklok.dev_virtualmcpservers.yaml index 0d22818536..d8ff946527 100644 --- a/deploy/charts/operator-crds/crds/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/crds/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -733,6 +733,73 @@ spec: - NodePort - LoadBalancer type: string + telemetry: + description: |- + Telemetry configures OpenTelemetry-based observability for the Virtual MCP server + including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint + properties: + openTelemetry: + description: OpenTelemetry defines OpenTelemetry configuration + properties: + enabled: + default: false + description: Enabled controls whether OpenTelemetry is enabled + type: boolean + endpoint: + description: Endpoint is the OTLP endpoint URL for tracing + and metrics + type: string + headers: + description: |- + Headers contains authentication headers for the OTLP endpoint + Specified as key=value pairs + items: + type: string + type: array + insecure: + default: false + description: Insecure indicates whether to use HTTP instead + of HTTPS for the OTLP endpoint + type: boolean + metrics: + description: Metrics defines OpenTelemetry metrics-specific + configuration + properties: + enabled: + default: false + description: Enabled controls whether OTLP metrics are + sent + type: boolean + type: object + serviceName: + description: |- + ServiceName is the service name for telemetry + If not specified, defaults to the MCPServer name + type: string + tracing: + description: Tracing defines OpenTelemetry tracing configuration + properties: + enabled: + default: false + description: Enabled controls whether OTLP tracing is + sent + type: boolean + samplingRate: + default: "0.05" + description: SamplingRate is the trace sampling rate (0.0-1.0) + type: string + type: object + type: object + prometheus: + description: Prometheus defines Prometheus-specific configuration + properties: + enabled: + default: false + description: Enabled controls whether Prometheus metrics endpoint + is exposed + type: boolean + type: object + type: object required: - groupRef - incomingAuth diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md index d1d3ca6942..e9d5af6b8f 100644 --- a/docs/operator/crd-api.md +++ b/docs/operator/crd-api.md @@ -1697,6 +1697,7 @@ TelemetryConfig defines observability configuration for the MCP server _Appears in:_ - [MCPRemoteProxySpec](#mcpremoteproxyspec) - [MCPServerSpec](#mcpserverspec) +- [VirtualMCPServerSpec](#virtualmcpserverspec) | Field | Description | Default | Validation | | --- | --- | --- | --- | @@ -1975,7 +1976,7 @@ _Appears in:_ | `operational` _[OperationalConfig](#operationalconfig)_ | Operational defines operational settings like timeouts and health checks | | | | `serviceType` _string_ | ServiceType specifies the Kubernetes service type for the Virtual MCP server | ClusterIP | Enum: [ClusterIP NodePort LoadBalancer]
| | `podTemplateSpec` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#rawextension-runtime-pkg)_ | PodTemplateSpec defines the pod template to use for the Virtual MCP server
This allows for customizing the pod configuration beyond what is provided by the other fields.
Note that to modify the specific container the Virtual MCP server runs in, you must specify
the 'vmcp' container name in the PodTemplateSpec.
This field accepts a PodTemplateSpec object as JSON/YAML. | | Type: object
| -| `telemetry` _[Config](#config)_ | Telemetry configures OpenTelemetry-based observability for the Virtual MCP server
including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint | | | +| `telemetry` _[TelemetryConfig](#telemetryconfig)_ | Telemetry configures OpenTelemetry-based observability for the Virtual MCP server
including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint | | | #### VirtualMCPServerStatus diff --git a/docs/operator/virtualmcpserver-api.md b/docs/operator/virtualmcpserver-api.md index 0ef955bc7b..14a70f2397 100644 --- a/docs/operator/virtualmcpserver-api.md +++ b/docs/operator/virtualmcpserver-api.md @@ -292,34 +292,41 @@ spec: ### `.spec.telemetry` (optional) -Configures OpenTelemetry-based observability for the Virtual MCP server, including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint. +Configures OpenTelemetry-based observability for the Virtual MCP server, including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint. Uses the same configuration structure as `MCPServer.spec.telemetry`. -**Type**: `telemetry.Config` +**Type**: `TelemetryConfig` **Fields**: -- `endpoint` (string): OTLP collector endpoint (host:port format) -- `serviceName` (string): Service name for telemetry -- `serviceVersion` (string): Service version for telemetry -- `tracingEnabled` (boolean): Enable distributed tracing -- `metricsEnabled` (boolean): Enable OTLP metrics export -- `samplingRate` (float64): Trace sampling rate (0.0-1.0) -- `headers` (map[string]string): Authentication headers for OTLP endpoint -- `insecure` (boolean): Use HTTP instead of HTTPS -- `enablePrometheusMetricsPath` (boolean): Expose Prometheus /metrics endpoint -- `environmentVariables` ([]string): Environment variable names to include as span attributes -- `customAttributes` (map[string]string): Custom resource attributes for all telemetry signals +- `openTelemetry` (OpenTelemetryConfig, optional): OpenTelemetry configuration + - `enabled` (boolean): Controls whether OpenTelemetry is enabled + - `endpoint` (string): OTLP endpoint URL for tracing and metrics + - `serviceName` (string): Service name for telemetry (defaults to VirtualMCPServer name) + - `headers` ([]string): Authentication headers for OTLP endpoint (key=value format) + - `insecure` (boolean): Use HTTP instead of HTTPS for the OTLP endpoint + - `metrics` (OpenTelemetryMetricsConfig, optional): Metrics-specific configuration + - `enabled` (boolean): Controls whether OTLP metrics are sent + - `tracing` (OpenTelemetryTracingConfig, optional): Tracing-specific configuration + - `enabled` (boolean): Controls whether OTLP tracing is sent + - `samplingRate` (string): Trace sampling rate (0.0-1.0, default: "0.05") +- `prometheus` (PrometheusConfig, optional): Prometheus-specific configuration + - `enabled` (boolean): Controls whether Prometheus metrics endpoint is exposed at /metrics **Example**: ```yaml spec: telemetry: - endpoint: "otel-collector:4317" - serviceName: "my-vmcp" - tracingEnabled: true - metricsEnabled: true - samplingRate: 0.1 - insecure: true - enablePrometheusMetricsPath: true + openTelemetry: + enabled: true + endpoint: "otel-collector:4317" + serviceName: "my-vmcp" + insecure: true + tracing: + enabled: true + samplingRate: "0.1" + metrics: + enabled: true + prometheus: + enabled: true ``` For details on what metrics and traces are emitted, see the [Virtual MCP Server Observability](./virtualmcpserver-observability.md) documentation. @@ -487,11 +494,16 @@ spec: # Observability telemetry: - endpoint: "otel-collector:4317" - tracingEnabled: true - metricsEnabled: true - samplingRate: 0.1 - enablePrometheusMetricsPath: true + openTelemetry: + enabled: true + endpoint: "otel-collector:4317" + tracing: + enabled: true + samplingRate: "0.1" + metrics: + enabled: true + prometheus: + enabled: true status: phase: Ready diff --git a/docs/operator/virtualmcpserver-observability.md b/docs/operator/virtualmcpserver-observability.md index 80235abe14..048c1a7d6d 100644 --- a/docs/operator/virtualmcpserver-observability.md +++ b/docs/operator/virtualmcpserver-observability.md @@ -45,7 +45,8 @@ The vMCP creates spans for each individual backend operation as well as workflow ## Configuration Configure telemetry in the `VirtualMCPServer` resource using the `spec.telemetry` -field: +field. The telemetry configuration uses the same `TelemetryConfig` type as +`MCPServer`, providing a consistent configuration experience across resources. ```yaml apiVersion: toolhive.stacklok.dev/v1alpha1 @@ -55,14 +56,21 @@ metadata: spec: groupRef: name: my-group + incomingAuth: + type: anonymous telemetry: - endpoint: "otel-collector:4317" - serviceName: "my-vmcp" - tracingEnabled: true - metricsEnabled: true - samplingRate: 0.1 - insecure: true - enablePrometheusMetricsPath: true + openTelemetry: + enabled: true + endpoint: "otel-collector:4317" + serviceName: "my-vmcp" + insecure: true + tracing: + enabled: true + samplingRate: "0.1" + metrics: + enabled: true + prometheus: + enabled: true ``` See the [VirtualMCPServer API reference](./virtualmcpserver-api.md) for complete From e9c018b6ce97eaf67aa6f674d139a08c7a05110b Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Thu, 4 Dec 2025 15:56:55 -0800 Subject: [PATCH 17/22] plumb context to start Signed-off-by: Jeremy Drouillard --- cmd/vmcp/app/commands.go | 8 +++++++- pkg/vmcp/server/health_test.go | 11 ++++++----- pkg/vmcp/server/integration_test.go | 4 ++-- pkg/vmcp/server/server.go | 3 ++- pkg/vmcp/server/server_test.go | 6 +++--- test/integration/vmcp/helpers/vmcp_server.go | 2 +- 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go index 4098f8a6a0..9fc2fc9242 100644 --- a/cmd/vmcp/app/commands.go +++ b/cmd/vmcp/app/commands.go @@ -311,6 +311,12 @@ func runServe(cmd *cobra.Command, _ []string) error { if err != nil { return fmt.Errorf("failed to create telemetry provider: %w", err) } + defer func() { + err := telemetryProvider.Shutdown(ctx) + if err != nil { + logger.Errorf("failed to shutdown telemetry provider: %v", err) + } + }() } serverCfg := &vmcpserver.Config{ @@ -333,7 +339,7 @@ func runServe(cmd *cobra.Command, _ []string) error { } // Create server with discovery manager, backends, and workflow definitions - srv, err := vmcpserver.New(serverCfg, rtr, backendClient, discoveryMgr, backends, workflowDefs) + srv, err := vmcpserver.New(ctx, serverCfg, rtr, backendClient, discoveryMgr, backends, workflowDefs) if err != nil { return fmt.Errorf("failed to create Virtual MCP Server: %w", err) } diff --git a/pkg/vmcp/server/health_test.go b/pkg/vmcp/server/health_test.go index 812b11b038..50ec721bf7 100644 --- a/pkg/vmcp/server/health_test.go +++ b/pkg/vmcp/server/health_test.go @@ -59,7 +59,10 @@ func createTestServer(t *testing.T) *server.Server { // Mock Stop to be called during server shutdown mockDiscoveryMgr.EXPECT().Stop().AnyTimes() - srv, err := server.New(&server.Config{ + // Create context for server + ctx, cancel := context.WithCancel(t.Context()) + + srv, err := server.New(ctx, &server.Config{ Name: "test-vmcp", Version: "1.0.0", Host: "127.0.0.1", @@ -68,9 +71,7 @@ func createTestServer(t *testing.T) *server.Server { require.NoError(t, err) // Start server in background - ctx, cancel := context.WithCancel(t.Context()) t.Cleanup(cancel) - errCh := make(chan error, 1) go func() { if err := srv.Start(ctx); err != nil { @@ -175,7 +176,7 @@ func TestServer_SessionManager(t *testing.T) { mockDiscoveryMgr := discoveryMocks.NewMockManager(ctrl) rt := router.NewDefaultRouter() - srv, err := server.New(&server.Config{ + srv, err := server.New(context.Background(), &server.Config{ Name: "test-vmcp", Version: "1.0.0", SessionTTL: 10 * time.Minute, @@ -198,7 +199,7 @@ func TestServer_SessionManager(t *testing.T) { rt := router.NewDefaultRouter() customTTL := 15 * time.Minute - srv, err := server.New(&server.Config{ + srv, err := server.New(context.Background(), &server.Config{ Name: "test-vmcp", Version: "1.0.0", SessionTTL: customTTL, diff --git a/pkg/vmcp/server/integration_test.go b/pkg/vmcp/server/integration_test.go index f96f185a97..7d8af3e989 100644 --- a/pkg/vmcp/server/integration_test.go +++ b/pkg/vmcp/server/integration_test.go @@ -192,7 +192,7 @@ func TestIntegration_AggregatorToRouterToServer(t *testing.T) { // Mock Stop to be called during server shutdown mockDiscoveryMgr.EXPECT().Stop().Times(1) - srv, err := server.New(&server.Config{ + srv, err := server.New(ctx, &server.Config{ Name: "test-vmcp", Version: "1.0.0", Host: "127.0.0.1", @@ -322,7 +322,7 @@ func TestIntegration_HTTPRequestFlowWithRoutingTable(t *testing.T) { } // Create and start server - srv, err := server.New(&server.Config{ + srv, err := server.New(ctx, &server.Config{ Name: "test-vmcp", Version: "1.0.0", Host: "127.0.0.1", diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go index 682ba00d83..6ce684cc87 100644 --- a/pkg/vmcp/server/server.go +++ b/pkg/vmcp/server/server.go @@ -152,6 +152,7 @@ type Server struct { // //nolint:gocyclo // Complexity from hook logic is acceptable func New( + ctx context.Context, cfg *Config, rt router.Router, backendClient vmcp.BackendClient, @@ -205,7 +206,7 @@ func New( if cfg.TelemetryProvider != nil { var err error backendClient, err = monitorBackends( - context.Background(), + ctx, cfg.TelemetryProvider.MeterProvider(), cfg.TelemetryProvider.TracerProvider(), backends, diff --git a/pkg/vmcp/server/server_test.go b/pkg/vmcp/server/server_test.go index 0a0a74422e..e17ee43450 100644 --- a/pkg/vmcp/server/server_test.go +++ b/pkg/vmcp/server/server_test.go @@ -76,7 +76,7 @@ func TestNew(t *testing.T) { mockBackendClient := mocks.NewMockBackendClient(ctrl) mockDiscoveryMgr := discoveryMocks.NewMockManager(ctrl) - s, err := server.New(tt.config, mockRouter, mockBackendClient, mockDiscoveryMgr, []vmcp.Backend{}, nil) + s, err := server.New(context.Background(), tt.config, mockRouter, mockBackendClient, mockDiscoveryMgr, []vmcp.Backend{}, nil) require.NoError(t, err) require.NotNil(t, s) @@ -133,7 +133,7 @@ func TestServer_Address(t *testing.T) { mockBackendClient := mocks.NewMockBackendClient(ctrl) mockDiscoveryMgr := discoveryMocks.NewMockManager(ctrl) - s, err := server.New(tt.config, mockRouter, mockBackendClient, mockDiscoveryMgr, []vmcp.Backend{}, nil) + s, err := server.New(context.Background(), tt.config, mockRouter, mockBackendClient, mockDiscoveryMgr, []vmcp.Backend{}, nil) require.NoError(t, err) addr := s.Address() assert.Equal(t, tt.expected, addr) @@ -155,7 +155,7 @@ func TestServer_Stop(t *testing.T) { mockDiscoveryMgr := discoveryMocks.NewMockManager(ctrl) mockDiscoveryMgr.EXPECT().Stop().Times(1) - s, err := server.New(&server.Config{}, mockRouter, mockBackendClient, mockDiscoveryMgr, []vmcp.Backend{}, nil) + s, err := server.New(context.Background(), &server.Config{}, mockRouter, mockBackendClient, mockDiscoveryMgr, []vmcp.Backend{}, nil) require.NoError(t, err) err = s.Stop(context.Background()) require.NoError(t, err) diff --git a/test/integration/vmcp/helpers/vmcp_server.go b/test/integration/vmcp/helpers/vmcp_server.go index 93007a5a5a..8e9d559f0f 100644 --- a/test/integration/vmcp/helpers/vmcp_server.go +++ b/test/integration/vmcp/helpers/vmcp_server.go @@ -165,7 +165,7 @@ func NewVMCPServer( rtr := router.NewDefaultRouter() // Create vMCP server with test-specific defaults - vmcpServer, err := vmcpserver.New(&vmcpserver.Config{ + vmcpServer, err := vmcpserver.New(ctx, &vmcpserver.Config{ Name: "test-vmcp", Version: "1.0.0", Host: "127.0.0.1", From 931bbdee1c4ee38c6fa72209f48e65aac39d4807 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Mon, 8 Dec 2025 09:34:01 -0800 Subject: [PATCH 18/22] bump chart version Signed-off-by: Jeremy Drouillard --- deploy/charts/operator-crds/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/charts/operator-crds/Chart.yaml b/deploy/charts/operator-crds/Chart.yaml index c5f8fd6a55..87e93efd0e 100644 --- a/deploy/charts/operator-crds/Chart.yaml +++ b/deploy/charts/operator-crds/Chart.yaml @@ -2,5 +2,5 @@ apiVersion: v2 name: toolhive-operator-crds description: A Helm chart for installing the ToolHive Operator CRDs into Kubernetes. type: application -version: 0.0.75 +version: 0.0.76 appVersion: "0.0.1" From 36b85553e3ab6cf99152d7a8253e0fd2886b72dd Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Mon, 8 Dec 2025 09:38:48 -0800 Subject: [PATCH 19/22] bump readme version Signed-off-by: Jeremy Drouillard --- deploy/charts/operator-crds/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/charts/operator-crds/README.md b/deploy/charts/operator-crds/README.md index 4fd2a51f48..4d787fb561 100644 --- a/deploy/charts/operator-crds/README.md +++ b/deploy/charts/operator-crds/README.md @@ -1,6 +1,6 @@ # ToolHive Operator CRDs Helm Chart -![Version: 0.0.75](https://img.shields.io/badge/Version-0.0.75-informational?style=flat-square) +![Version: 0.0.76](https://img.shields.io/badge/Version-0.0.75-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) A Helm chart for installing the ToolHive Operator CRDs into Kubernetes. From b0bedaade9223033a8021f2073c11810f4410ca6 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Mon, 8 Dec 2025 09:42:13 -0800 Subject: [PATCH 20/22] actually bump readme Signed-off-by: Jeremy Drouillard --- deploy/charts/operator-crds/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/charts/operator-crds/README.md b/deploy/charts/operator-crds/README.md index 4d787fb561..8d77792f66 100644 --- a/deploy/charts/operator-crds/README.md +++ b/deploy/charts/operator-crds/README.md @@ -1,6 +1,6 @@ # ToolHive Operator CRDs Helm Chart -![Version: 0.0.76](https://img.shields.io/badge/Version-0.0.75-informational?style=flat-square) +![Version: 0.0.76](https://img.shields.io/badge/Version-0.0.76-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) A Helm chart for installing the ToolHive Operator CRDs into Kubernetes. From 01f085348e9d8db39d8d14d233b95b15df3f674a Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Mon, 8 Dec 2025 11:18:58 -0800 Subject: [PATCH 21/22] Include telemetry in raw config Signed-off-by: Jeremy Drouillard --- pkg/vmcp/config/yaml_loader.go | 5 ++ pkg/vmcp/config/yaml_loader_transform_test.go | 64 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/pkg/vmcp/config/yaml_loader.go b/pkg/vmcp/config/yaml_loader.go index ee6830b4cc..ee51a5efc5 100644 --- a/pkg/vmcp/config/yaml_loader.go +++ b/pkg/vmcp/config/yaml_loader.go @@ -8,6 +8,7 @@ import ( "gopkg.in/yaml.v3" "github.com/stacklok/toolhive/pkg/env" + "github.com/stacklok/toolhive/pkg/telemetry" "github.com/stacklok/toolhive/pkg/vmcp" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" ) @@ -58,6 +59,8 @@ type rawConfig struct { Operational *OperationalConfig `yaml:"operational"` CompositeTools []*rawCompositeTool `yaml:"composite_tools"` + + Telemetry *telemetry.Config `yaml:"telemetry"` } type rawIncomingAuth struct { @@ -214,6 +217,8 @@ func (l *YAMLLoader) transformToConfig(raw *rawConfig) (*Config, error) { cfg.CompositeTools = compositeTools } + cfg.Telemetry = raw.Telemetry + // Apply operational defaults (fills missing values) cfg.EnsureOperationalDefaults() diff --git a/pkg/vmcp/config/yaml_loader_transform_test.go b/pkg/vmcp/config/yaml_loader_transform_test.go index ba401fc517..8b1a579c04 100644 --- a/pkg/vmcp/config/yaml_loader_transform_test.go +++ b/pkg/vmcp/config/yaml_loader_transform_test.go @@ -1,6 +1,7 @@ package config import ( + "os" "testing" "time" @@ -9,6 +10,7 @@ import ( "go.uber.org/mock/gomock" "github.com/stacklok/toolhive/pkg/env/mocks" + "github.com/stacklok/toolhive/pkg/telemetry" authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types" ) @@ -768,3 +770,65 @@ func TestYAMLLoader_transformCompositeTools_WithOutputConfig(t *testing.T) { }) } } + +// TestYAMLLoader_transformTelemetryConfig tests that telemetry configuration is preserved +// when transforming from raw YAML to the final Config struct. +func TestYAMLLoader_transformTelemetryConfig(t *testing.T) { + t.Parallel() + + // Note: yaml.v3 uses lowercase field names by default (no yaml tags on telemetry.Config) + yamlContent := ` +name: telemetry-test +telemetry: + endpoint: "localhost:4318" + servicename: "test-service" + serviceversion: "1.2.3" + tracingenabled: true + metricsenabled: true + samplingrate: 0.75 + insecure: true + enableprometheusmetricspath: true + headers: + Authorization: "Bearer token123" + X-Custom-Header: "custom-value" + environmentvariables: + - "NODE_ENV" + - "DEPLOYMENT_ENV" +` + + // Write temp file + tmpFile, err := os.CreateTemp("", "telemetry-test-*.yaml") + require.NoError(t, err) + defer os.Remove(tmpFile.Name()) + + _, err = tmpFile.WriteString(yamlContent) + require.NoError(t, err) + require.NoError(t, tmpFile.Close()) + + // Load config + ctrl := gomock.NewController(t) + mockEnv := mocks.NewMockReader(ctrl) + mockEnv.EXPECT().Getenv(gomock.Any()).Return("").AnyTimes() + + loader := NewYAMLLoader(tmpFile.Name(), mockEnv) + cfg, err := loader.Load() + require.NoError(t, err) + + // Verify telemetry config is fully preserved + require.NotNil(t, cfg.Telemetry, "Telemetry config should not be nil") + + require.Equal(t, telemetry.Config{ + Endpoint: "localhost:4318", + ServiceName: "test-service", + ServiceVersion: "1.2.3", + TracingEnabled: true, + MetricsEnabled: true, + SamplingRate: 0.75, + Insecure: true, + EnablePrometheusMetricsPath: true, + Headers: map[string]string{"Authorization": "Bearer token123", "X-Custom-Header": "custom-value"}, + EnvironmentVariables: []string{"NODE_ENV", "DEPLOYMENT_ENV"}, + CustomAttributes: nil, + }, *cfg.Telemetry) + +} From 105cd926f83089adb1e1fca087a76a5495c70f08 Mon Sep 17 00:00:00 2001 From: Jeremy Drouillard Date: Mon, 8 Dec 2025 11:28:06 -0800 Subject: [PATCH 22/22] update example Signed-off-by: Jeremy Drouillard --- examples/vmcp-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/vmcp-config.yaml b/examples/vmcp-config.yaml index 1ccf162cca..43a332a193 100644 --- a/examples/vmcp-config.yaml +++ b/examples/vmcp-config.yaml @@ -192,9 +192,9 @@ operational: # OpenTelemetry-based metrics and tracing for backend operations and workflows telemetry: endpoint: "localhost:4317" # OTLP collector endpoint - serviceName: "engineering-vmcp" - tracingEnabled: true - metricsEnabled: true - samplingRate: 0.1 # 10% sampling + servicename: "engineering-vmcp" + tracingenabled: true + metricsenabled: true + samplingrate: 0.1 # 10% sampling insecure: true # Use HTTP instead of HTTPS - enablePrometheusMetricsPath: true # Expose /metrics endpoint + enableprometheusmetricspath: true # Expose /metrics endpoint