Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ type VirtualMCPServerSpec struct {
// +kubebuilder:pruning:PreserveUnknownFields
// +kubebuilder:validation:Type=object
PodTemplateSpec *runtime.RawExtension `json:"podTemplateSpec,omitempty"`

// Telemetry configures OpenTelemetry-based observability for the Virtual MCP server
// including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint
// +optional
Telemetry *TelemetryConfig `json:"telemetry,omitempty"`
}

// GroupRef references an MCPGroup resource
Expand Down
5 changes: 5 additions & 0 deletions cmd/thv-operator/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions cmd/thv-operator/pkg/vmcpconfig/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

mcpv1alpha1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1alpha1"
"github.com/stacklok/toolhive/cmd/thv-operator/pkg/oidc"
"github.com/stacklok/toolhive/cmd/thv-operator/pkg/spectoconfig"
authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
)
Expand Down Expand Up @@ -110,6 +111,8 @@ func (c *Converter) Convert(
config.Operational = c.convertOperational(ctx, vmcp)
}

config.Telemetry = spectoconfig.ConvertTelemetryConfig(ctx, vmcp.Spec.Telemetry, vmcp.Name)

// Apply operational defaults (fills missing values)
config.EnsureOperationalDefaults()

Expand Down
2 changes: 2 additions & 0 deletions cmd/vmcp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The Virtual MCP Server (vmcp) is a standalone binary that aggregates multiple MC
- ✅ **Session Management**: MCP protocol session tracking with TTL-based cleanup
- ✅ **Health Endpoints**: `/health` and `/ping` for service monitoring
- ✅ **Configuration Validation**: `vmcp validate` command for config verification
- ✅ **Observability**: OpenTelemetry metrics and traces for backend operations and workflow executions

### In Progress
- 🚧 **Incoming Authentication** (Issue #165): OIDC, local, anonymous authentication
Expand Down Expand Up @@ -121,6 +122,7 @@ vmcp uses a YAML configuration file to define:
3. **Outgoing Authentication**: Virtual MCP → Backend API token exchange
4. **Tool Aggregation**: Conflict resolution and filtering strategies
5. **Operational Settings**: Timeouts, health checks, circuit breakers
6. **Telemetry**: OpenTelemetry metrics/tracing and Prometheus endpoint

See [examples/vmcp-config.yaml](../../examples/vmcp-config.yaml) for a complete example.

Expand Down
33 changes: 25 additions & 8 deletions cmd/vmcp/app/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/stacklok/toolhive/pkg/env"
"github.com/stacklok/toolhive/pkg/groups"
"github.com/stacklok/toolhive/pkg/logger"
"github.com/stacklok/toolhive/pkg/telemetry"
"github.com/stacklok/toolhive/pkg/vmcp"
"github.com/stacklok/toolhive/pkg/vmcp/aggregator"
"github.com/stacklok/toolhive/pkg/vmcp/auth/factory"
Expand Down Expand Up @@ -288,7 +289,6 @@ func runServe(cmd *cobra.Command, _ []string) error {
// Create router
rtr := vmcprouter.NewDefaultRouter()

// Setup authentication middleware
logger.Infof("Setting up incoming authentication (type: %s)", cfg.IncomingAuth.Type)

authMiddleware, authInfoHandler, err := factory.NewIncomingAuthMiddleware(ctx, cfg.IncomingAuth)
Expand All @@ -303,13 +303,30 @@ func runServe(cmd *cobra.Command, _ []string) error {
host, _ := cmd.Flags().GetString("host")
port, _ := cmd.Flags().GetInt("port")

// If telemetry is configured, create the provider.
var telemetryProvider *telemetry.Provider
if cfg.Telemetry != nil {
var err error
telemetryProvider, err = telemetry.NewProvider(ctx, *cfg.Telemetry)
if err != nil {
return fmt.Errorf("failed to create telemetry provider: %w", err)
}
Comment thread
jerm-dro marked this conversation as resolved.
defer func() {
err := telemetryProvider.Shutdown(ctx)
if err != nil {
logger.Errorf("failed to shutdown telemetry provider: %v", err)
}
}()
}

serverCfg := &vmcpserver.Config{
Name: cfg.Name,
Version: getVersion(),
Host: host,
Port: port,
AuthMiddleware: authMiddleware,
AuthInfoHandler: authInfoHandler,
Name: cfg.Name,
Version: getVersion(),
Host: host,
Port: port,
AuthMiddleware: authMiddleware,
AuthInfoHandler: authInfoHandler,
TelemetryProvider: telemetryProvider,
}

// Convert composite tool configurations to workflow definitions
Expand All @@ -322,7 +339,7 @@ func runServe(cmd *cobra.Command, _ []string) error {
}

// Create server with discovery manager, backends, and workflow definitions
srv, err := vmcpserver.New(serverCfg, rtr, backendClient, discoveryMgr, backends, workflowDefs)
srv, err := vmcpserver.New(ctx, serverCfg, rtr, backendClient, discoveryMgr, backends, workflowDefs)
if err != nil {
return fmt.Errorf("failed to create Virtual MCP Server: %w", err)
}
Expand Down
2 changes: 1 addition & 1 deletion deploy/charts/operator-crds/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: toolhive-operator-crds
description: A Helm chart for installing the ToolHive Operator CRDs into Kubernetes.
type: application
version: 0.0.75
version: 0.0.76
appVersion: "0.0.1"
2 changes: 1 addition & 1 deletion deploy/charts/operator-crds/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# ToolHive Operator CRDs Helm Chart

![Version: 0.0.75](https://img.shields.io/badge/Version-0.0.75-informational?style=flat-square)
![Version: 0.0.76](https://img.shields.io/badge/Version-0.0.76-informational?style=flat-square)
![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)

A Helm chart for installing the ToolHive Operator CRDs into Kubernetes.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,73 @@ spec:
- NodePort
- LoadBalancer
type: string
telemetry:
description: |-
Telemetry configures OpenTelemetry-based observability for the Virtual MCP server
including distributed tracing, OTLP metrics export, and Prometheus metrics endpoint
properties:
openTelemetry:
description: OpenTelemetry defines OpenTelemetry configuration
properties:
enabled:
default: false
description: Enabled controls whether OpenTelemetry is enabled
type: boolean
endpoint:
description: Endpoint is the OTLP endpoint URL for tracing
and metrics
type: string
headers:
description: |-
Headers contains authentication headers for the OTLP endpoint
Specified as key=value pairs
items:
type: string
type: array
insecure:
default: false
description: Insecure indicates whether to use HTTP instead
of HTTPS for the OTLP endpoint
type: boolean
metrics:
description: Metrics defines OpenTelemetry metrics-specific
configuration
properties:
enabled:
default: false
description: Enabled controls whether OTLP metrics are
sent
type: boolean
type: object
serviceName:
description: |-
ServiceName is the service name for telemetry
If not specified, defaults to the MCPServer name
type: string
tracing:
description: Tracing defines OpenTelemetry tracing configuration
properties:
enabled:
default: false
description: Enabled controls whether OTLP tracing is
sent
type: boolean
samplingRate:
default: "0.05"
description: SamplingRate is the trace sampling rate (0.0-1.0)
type: string
type: object
type: object
prometheus:
description: Prometheus defines Prometheus-specific configuration
properties:
enabled:
default: false
description: Enabled controls whether Prometheus metrics endpoint
is exposed
type: boolean
type: object
type: object
required:
- groupRef
- incomingAuth
Expand Down
7 changes: 7 additions & 0 deletions docs/observability.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,10 @@ The telemetry middleware:

This provides end-to-end visibility across the entire request lifecycle while
maintaining the modular architecture of ToolHive's middleware system.

## Virtual MCP Server Telemetry

For observability in the Virtual MCP Server (vMCP), including backend request
metrics, workflow execution telemetry, and distributed tracing, see the
dedicated [Virtual MCP Server Observability](./operator/virtualmcpserver-observability.md)
documentation.
Comment thread
jerm-dro marked this conversation as resolved.
2 changes: 2 additions & 0 deletions docs/operator/crd-api.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 55 additions & 0 deletions docs/operator/virtualmcpserver-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,47 @@ spec:
cpu: "1000m"
```

### `.spec.telemetry` (optional)

Configures OpenTelemetry-based observability for the Virtual MCP server, including distributed tracing, OTLP metrics export, and a Prometheus metrics endpoint. Uses the same configuration structure as `MCPServer.spec.telemetry`.

**Type**: `TelemetryConfig`

**Fields**:
- `openTelemetry` (OpenTelemetryConfig, optional): OpenTelemetry configuration
- `enabled` (boolean): Controls whether OpenTelemetry is enabled
- `endpoint` (string): OTLP endpoint URL for tracing and metrics
- `serviceName` (string): Service name for telemetry (defaults to VirtualMCPServer name)
- `headers` ([]string): Authentication headers for the OTLP endpoint, specified as `key=value` pairs
- `insecure` (boolean): Use HTTP instead of HTTPS for the OTLP endpoint
- `metrics` (OpenTelemetryMetricsConfig, optional): Metrics-specific configuration
- `enabled` (boolean): Controls whether OTLP metrics are sent
- `tracing` (OpenTelemetryTracingConfig, optional): Tracing-specific configuration
- `enabled` (boolean): Controls whether OTLP tracing is sent
- `samplingRate` (string): Trace sampling rate (0.0-1.0, default: "0.05")
- `prometheus` (PrometheusConfig, optional): Prometheus-specific configuration
- `enabled` (boolean): Controls whether the Prometheus metrics endpoint is exposed at `/metrics`

**Example**:
```yaml
spec:
telemetry:
openTelemetry:
enabled: true
endpoint: "otel-collector:4317"
serviceName: "my-vmcp"
insecure: true
tracing:
enabled: true
samplingRate: "0.1"
metrics:
enabled: true
prometheus:
enabled: true
```

For details on what metrics and traces are emitted, see the [Virtual MCP Server Observability](./virtualmcpserver-observability.md) documentation.

## Status Fields

### `.status.conditions`
Expand Down Expand Up @@ -451,6 +492,19 @@ spec:
failureThreshold: 5
timeout: 60s

# Observability
telemetry:
openTelemetry:
enabled: true
endpoint: "otel-collector:4317"
tracing:
enabled: true
samplingRate: "0.1"
metrics:
enabled: true
prometheus:
enabled: true

status:
phase: Ready
message: "Virtual MCP serving 3 backends with 15 tools"
Expand Down Expand Up @@ -518,4 +572,5 @@ The VirtualMCPServer CRD includes comprehensive validation:
- [MCPServer](./mcpserver-api.md): Individual MCP server instances
- [MCPExternalAuthConfig](./mcpexternalauthconfig-api.md): External authentication configuration
- [MCPToolConfig](./toolconfig-api.md): Tool filtering and renaming configuration
- [Virtual MCP Server Observability](./virtualmcpserver-observability.md): Telemetry and metrics documentation
- [Virtual MCP Proposal](../proposals/THV-2106-virtual-mcp-server.md): Complete design proposal
82 changes: 82 additions & 0 deletions docs/operator/virtualmcpserver-observability.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Virtual MCP Server Observability

This document describes the observability features of the Virtual MCP
Server (vMCP), which aggregates multiple backend MCP servers into a unified
interface. The vMCP provides OpenTelemetry-based instrumentation for monitoring
backend operations and composite tool workflow executions.

For general ToolHive observability concepts and proxy runner telemetry, see the
main [Observability and Telemetry](../observability.md) documentation.

## Overview

The vMCP telemetry provides visibility into:

1. **Backend operations**: Track requests to individual backend MCP servers
including tool calls, resource reads, prompt retrieval, and capability listing
2. **Workflow executions**: Monitor composite tool workflow performance and errors
3. **Distributed tracing**: Correlate requests across the vMCP and its backends

The vMCP uses a decorator pattern to wrap backend clients and workflow executors
with telemetry instrumentation. This approach provides consistent metrics and
tracing without modifying the core business logic.

The implementation of both metrics and traces can be found in `pkg/vmcp/server/telemetry.go`.

## Metrics

The vMCP emits metrics for backend operations and workflow executions. All
metrics use the `toolhive_vmcp_` prefix.

**Backend metrics** track requests to individual backend MCP servers, including
request counts, error counts, and request duration histograms. These metrics
include attributes identifying the target backend (workload ID, name, URL,
transport type) and the action being performed (tool call, resource read, etc.).

**Workflow metrics** track composite tool workflow executions, including
execution counts, error counts, and duration histograms. These metrics include
the workflow name as an attribute.

## Distributed Tracing

The vMCP creates a span for each backend operation and for each workflow
execution, so that workflow errors or latency can be attributed to the specific
tool calls that caused them.

## Configuration

Configure telemetry in the `VirtualMCPServer` resource using the `spec.telemetry`
field. The telemetry configuration uses the same `TelemetryConfig` type as
`MCPServer`, providing a consistent configuration experience across resources.

```yaml
apiVersion: toolhive.stacklok.dev/v1alpha1
kind: VirtualMCPServer
metadata:
name: my-vmcp
spec:
groupRef:
name: my-group
incomingAuth:
type: anonymous
telemetry:
openTelemetry:
enabled: true
endpoint: "otel-collector:4317"
serviceName: "my-vmcp"
insecure: true
tracing:
enabled: true
samplingRate: "0.1"
metrics:
enabled: true
prometheus:
enabled: true
```

See the [VirtualMCPServer API reference](./virtualmcpserver-api.md) for complete
CRD documentation.

## Related Documentation

- [Observability and Telemetry](../observability.md) - Main ToolHive observability documentation
- [VirtualMCPServer API Reference](./virtualmcpserver-api.md) - Complete CRD specification
Loading
Loading