Skip to content

Commit a78d9bf

Browse files
ChrisJBurnsclaude
andauthored
Restructure MCPTelemetryConfig CRD to nested spec per RFC THV-0023 (#4487)
* Restructure MCPTelemetryConfig CRD to nested spec per RFC THV-0023 The MCPTelemetryConfig spec was using a flat inline embedding of telemetry.Config which exposed CLI-only fields and didn't match the nested structure specified in RFC THV-0023. Restructure to use openTelemetry/prometheus sub-objects: - spec.openTelemetry: endpoint, headers, sensitiveHeaders, resourceAttributes, tracing, metrics, useLegacyAttributes - spec.prometheus: enabled - status.referencingWorkloads: structured WorkloadReference (kind/namespace/name) replacing bare string list Update the conversion layer (spectoconfig, controllerutil) to map the nested CRD types to the flat telemetry.Config at runtime. Add operator example YAML files for MCPTelemetryConfig and MCPServer with telemetryConfigRef. Closes #4262 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Consolidate telemetry examples into single multi-document YAML Combine MCPTelemetryConfig and MCPServer-with-ref examples into mcpserver_fetch_otel.yaml as a single apply-able file. Remove the separate telemetry-configs directory. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Address review: gate OTel on Enabled flag, add endpoint CEL validation Fix two issues from PR review: - NormalizeMCPTelemetryConfig now gates OTel config on Enabled flag, matching ConvertTelemetryConfig behavior. Setting enabled: false correctly disables OTLP even if sub-fields are populated. - Add CEL validation rejecting endpoint when neither tracing nor metrics is enabled, catching the misconfiguration at admission time instead of failing at proxy runner startup. Add test cases covering both scenarios. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Fix CEL rule YAML quoting in CRD manifests Use single-quoted YAML for the endpoint validation CEL rule to match the quoting style of other CEL rules in the chart. Double-quoted YAML caused the rule value to include literal quote characters, failing CRD installation with a CEL syntax error. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Fix OTel collector endpoint in telemetry example Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Move endpoint validation to Validate(), fix smart quote corruption The gci linter was replacing ASCII quotes with Unicode smart quotes (U+201C/U+201D) in kubebuilder annotations, breaking controller-gen CRD generation. Additionally, the CEL rule for endpoint validation generated double-quoted YAML that the Helm wrapper mishandled. Move the endpoint-requires-signals validation from a CEL rule to the programmatic Validate() method, which avoids both issues and provides clearer error messages. Add test case for this validation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Reuse shared WorkloadReference type from MCPOIDCConfig Remove duplicate WorkloadReference definition from mcptelemetryconfig_types.go and use the shared type already defined in mcpoidcconfig_types.go. Drop the Namespace field (cross-namespace refs are not supported) to match the shared type's schema. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 862f9b8 commit a78d9bf

15 files changed

Lines changed: 735 additions & 449 deletions

File tree

cmd/thv-operator/api/v1alpha1/mcptelemetryconfig_types.go

Lines changed: 89 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import (
77
"fmt"
88

99
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10-
11-
"github.com/stacklok/toolhive/pkg/telemetry"
1210
)
1311

1412
// SensitiveHeader represents a header whose value is stored in a Kubernetes Secret.
@@ -25,24 +23,76 @@ type SensitiveHeader struct {
2523
SecretKeyRef SecretKeyRef `json:"secretKeyRef"`
2624
}
2725

28-
// MCPTelemetryConfigSpec defines the desired state of MCPTelemetryConfig.
29-
// It embeds telemetry.Config from pkg/telemetry to eliminate the conversion
30-
// layer between CRD and application types. The environmentVariables field is
31-
// CLI-only and rejected by CEL validation; customAttributes is allowed for
32-
// setting shared OTel resource attributes (e.g., deployment.environment).
26+
// MCPTelemetryOTelConfig defines OpenTelemetry configuration for shared MCPTelemetryConfig resources.
27+
// Unlike OpenTelemetryConfig (used by inline MCPServer telemetry), this type:
28+
// - Omits ServiceName (per-server field set via MCPTelemetryConfigReference)
29+
// - Uses map[string]string for Headers (not []string)
30+
// - Adds SensitiveHeaders for Kubernetes Secret-backed credentials
31+
// - Adds ResourceAttributes for shared OTel resource attributes
3332
//
34-
// +kubebuilder:validation:XValidation:rule="!has(self.environmentVariables)",message="environmentVariables is a CLI-only field and cannot be set in MCPTelemetryConfig; use customAttributes for resource attributes"
3533
// +kubebuilder:validation:XValidation:rule="!has(self.headers) || !has(self.sensitiveHeaders) || self.sensitiveHeaders.all(sh, !(sh.name in self.headers))",message="a header name cannot appear in both headers and sensitiveHeaders"
3634
//
3735
//nolint:lll // CEL validation rules exceed line length limit
38-
type MCPTelemetryConfigSpec struct {
39-
telemetry.Config `json:",inline"` // nolint:revive
36+
type MCPTelemetryOTelConfig struct {
37+
// Enabled controls whether OpenTelemetry is enabled
38+
// +kubebuilder:default=false
39+
// +optional
40+
Enabled bool `json:"enabled,omitempty"`
41+
42+
// Endpoint is the OTLP endpoint URL for tracing and metrics
43+
// +optional
44+
Endpoint string `json:"endpoint,omitempty"`
45+
46+
// Insecure indicates whether to use HTTP instead of HTTPS for the OTLP endpoint
47+
// +kubebuilder:default=false
48+
// +optional
49+
Insecure bool `json:"insecure,omitempty"`
50+
51+
// Headers contains authentication headers for the OTLP endpoint.
52+
// For secret-backed credentials, use sensitiveHeaders instead.
53+
// +optional
54+
Headers map[string]string `json:"headers,omitempty"`
4055

4156
// SensitiveHeaders contains headers whose values are stored in Kubernetes Secrets.
4257
// Use this for credential headers (e.g., API keys, bearer tokens) instead of
4358
// embedding secrets in the headers field.
4459
// +optional
4560
SensitiveHeaders []SensitiveHeader `json:"sensitiveHeaders,omitempty"`
61+
62+
// ResourceAttributes contains custom resource attributes to be added to all telemetry signals.
63+
// These become OTel resource attributes (e.g., deployment.environment, service.namespace).
64+
// Note: service.name is intentionally excluded — it is set per-server via
65+
// MCPTelemetryConfigReference.ServiceName.
66+
// +optional
67+
ResourceAttributes map[string]string `json:"resourceAttributes,omitempty"`
68+
69+
// Metrics defines OpenTelemetry metrics-specific configuration
70+
// +optional
71+
Metrics *OpenTelemetryMetricsConfig `json:"metrics,omitempty"`
72+
73+
// Tracing defines OpenTelemetry tracing configuration
74+
// +optional
75+
Tracing *OpenTelemetryTracingConfig `json:"tracing,omitempty"`
76+
77+
// UseLegacyAttributes controls whether legacy attribute names are emitted alongside
78+
// the new MCP OTEL semantic convention names. Defaults to true for backward compatibility.
79+
// This will change to false in a future release and eventually be removed.
80+
// +kubebuilder:default=true
81+
// +optional
82+
UseLegacyAttributes bool `json:"useLegacyAttributes"`
83+
}
84+
85+
// MCPTelemetryConfigSpec defines the desired state of MCPTelemetryConfig.
86+
// The spec uses a nested structure with openTelemetry and prometheus sub-objects
87+
// for clear separation of concerns.
88+
type MCPTelemetryConfigSpec struct {
89+
// OpenTelemetry defines OpenTelemetry configuration (OTLP endpoint, tracing, metrics)
90+
// +optional
91+
OpenTelemetry *MCPTelemetryOTelConfig `json:"openTelemetry,omitempty"`
92+
93+
// Prometheus defines Prometheus-specific configuration
94+
// +optional
95+
Prometheus *PrometheusConfig `json:"prometheus,omitempty"`
4696
}
4797

4898
// MCPTelemetryConfigStatus defines the observed state of MCPTelemetryConfig
@@ -59,17 +109,18 @@ type MCPTelemetryConfigStatus struct {
59109
// +optional
60110
ConfigHash string `json:"configHash,omitempty"`
61111

62-
// ReferencingServers is a list of MCPServer resources that reference this MCPTelemetryConfig
112+
// ReferencingWorkloads lists workloads that reference this MCPTelemetryConfig
63113
// +optional
64-
ReferencingServers []string `json:"referencingServers,omitempty"`
114+
ReferencingWorkloads []WorkloadReference `json:"referencingWorkloads,omitempty"`
65115
}
66116

67117
// +kubebuilder:object:root=true
68118
// +kubebuilder:subresource:status
69119
// +kubebuilder:resource:shortName=mcpotel,categories=toolhive
70-
// +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.endpoint`
120+
// +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.openTelemetry.endpoint`
71121
// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status`
72-
// +kubebuilder:printcolumn:name="References",type=string,JSONPath=`.status.referencingServers`
122+
// +kubebuilder:printcolumn:name="Tracing",type=boolean,JSONPath=`.spec.openTelemetry.tracing.enabled`
123+
// +kubebuilder:printcolumn:name="Metrics",type=boolean,JSONPath=`.spec.openTelemetry.metrics.enabled`
73124
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
74125

75126
// MCPTelemetryConfig is the Schema for the mcptelemetryconfigs API.
@@ -115,33 +166,47 @@ type MCPTelemetryConfigReference struct {
115166
// CEL catches issues at API admission time, but this method also validates
116167
// stored objects to catch any that bypassed CEL or were stored before CEL rules were added.
117168
func (r *MCPTelemetryConfig) Validate() error {
118-
if err := r.validateCLIOnlyFields(); err != nil {
169+
if err := r.validateEndpointRequiresSignals(); err != nil {
119170
return err
120171
}
121172
return r.validateSensitiveHeaders()
122173
}
123174

124-
// validateCLIOnlyFields rejects CLI-only fields that are not applicable to CRD-managed telemetry.
125-
func (r *MCPTelemetryConfig) validateCLIOnlyFields() error {
126-
if len(r.Spec.EnvironmentVariables) > 0 {
127-
return fmt.Errorf("environmentVariables is a CLI-only field and cannot be set in MCPTelemetryConfig")
175+
// validateEndpointRequiresSignals rejects an endpoint when neither tracing nor metrics is enabled.
176+
// Without this check the config would pass CRD validation but fail at runtime in telemetry.NewProvider.
177+
func (r *MCPTelemetryConfig) validateEndpointRequiresSignals() error {
178+
if r.Spec.OpenTelemetry == nil {
179+
return nil
180+
}
181+
otel := r.Spec.OpenTelemetry
182+
if otel.Endpoint == "" {
183+
return nil
184+
}
185+
tracingEnabled := otel.Tracing != nil && otel.Tracing.Enabled
186+
metricsEnabled := otel.Metrics != nil && otel.Metrics.Enabled
187+
if !tracingEnabled && !metricsEnabled {
188+
return fmt.Errorf("endpoint requires at least one of tracing or metrics to be enabled")
128189
}
129190
return nil
130191
}
131192

132193
// validateSensitiveHeaders validates sensitive header entries and checks for overlap with plaintext headers.
133194
func (r *MCPTelemetryConfig) validateSensitiveHeaders() error {
134-
for i, sh := range r.Spec.SensitiveHeaders {
195+
if r.Spec.OpenTelemetry == nil {
196+
return nil
197+
}
198+
otel := r.Spec.OpenTelemetry
199+
for i, sh := range otel.SensitiveHeaders {
135200
if sh.Name == "" {
136-
return fmt.Errorf("sensitiveHeaders[%d].name must not be empty", i)
201+
return fmt.Errorf("openTelemetry.sensitiveHeaders[%d].name must not be empty", i)
137202
}
138203
if sh.SecretKeyRef.Name == "" {
139-
return fmt.Errorf("sensitiveHeaders[%d].secretKeyRef.name must not be empty", i)
204+
return fmt.Errorf("openTelemetry.sensitiveHeaders[%d].secretKeyRef.name must not be empty", i)
140205
}
141206
if sh.SecretKeyRef.Key == "" {
142-
return fmt.Errorf("sensitiveHeaders[%d].secretKeyRef.key must not be empty", i)
207+
return fmt.Errorf("openTelemetry.sensitiveHeaders[%d].secretKeyRef.key must not be empty", i)
143208
}
144-
if _, exists := r.Spec.Headers[sh.Name]; exists {
209+
if _, exists := otel.Headers[sh.Name]; exists {
145210
return fmt.Errorf("header %q appears in both headers and sensitiveHeaders", sh.Name)
146211
}
147212
}

cmd/thv-operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 56 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/thv-operator/controllers/mcptelemetryconfig_controller.go

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"context"
88
"fmt"
99
"slices"
10-
"sort"
1110
"time"
1211

1312
"k8s.io/apimachinery/pkg/api/errors"
@@ -108,16 +107,16 @@ func (r *MCPTelemetryConfigReconciler) Reconcile(ctx context.Context, req ctrl.R
108107
// Calculate the hash of the current configuration
109108
configHash := r.calculateConfigHash(telemetryConfig.Spec)
110109

111-
// Track referencing MCPServers
112-
referencingServers, err := r.findReferencingServers(ctx, telemetryConfig)
110+
// Track referencing workloads
111+
referencingWorkloads, err := r.findReferencingWorkloads(ctx, telemetryConfig)
113112
if err != nil {
114-
logger.Error(err, "Failed to find referencing MCPServers")
113+
logger.Error(err, "Failed to find referencing workloads")
115114
return ctrl.Result{}, err
116115
}
117116

118117
// Check what changed
119118
hashChanged := telemetryConfig.Status.ConfigHash != configHash
120-
refsChanged := !slices.Equal(telemetryConfig.Status.ReferencingServers, referencingServers)
119+
refsChanged := !workloadRefsEqual(telemetryConfig.Status.ReferencingWorkloads, referencingWorkloads)
121120
needsUpdate := hashChanged || refsChanged || conditionChanged
122121

123122
if hashChanged {
@@ -129,7 +128,7 @@ func (r *MCPTelemetryConfigReconciler) Reconcile(ctx context.Context, req ctrl.R
129128
if needsUpdate {
130129
telemetryConfig.Status.ConfigHash = configHash
131130
telemetryConfig.Status.ObservedGeneration = telemetryConfig.Generation
132-
telemetryConfig.Status.ReferencingServers = referencingServers
131+
telemetryConfig.Status.ReferencingWorkloads = referencingWorkloads
133132

134133
if err := r.Status().Update(ctx, telemetryConfig); err != nil {
135134
logger.Error(err, "Failed to update MCPTelemetryConfig status")
@@ -142,7 +141,7 @@ func (r *MCPTelemetryConfigReconciler) Reconcile(ctx context.Context, req ctrl.R
142141

143142
// SetupWithManager sets up the controller with the Manager.
144143
func (r *MCPTelemetryConfigReconciler) SetupWithManager(mgr ctrl.Manager) error {
145-
// Watch MCPServer changes to update ReferencingServers status
144+
// Watch MCPServer changes to update ReferencingWorkloads status
146145
mcpServerHandler := handler.EnqueueRequestsFromMapFunc(
147146
func(_ context.Context, obj client.Object) []reconcile.Request {
148147
mcpServer, ok := obj.(*mcpv1alpha1.MCPServer)
@@ -187,15 +186,19 @@ func (r *MCPTelemetryConfigReconciler) handleDeletion(
187186
return ctrl.Result{}, nil
188187
}
189188

190-
// Check for referencing servers before allowing deletion
191-
referencingServers, err := r.findReferencingServers(ctx, telemetryConfig)
189+
// Check for referencing workloads before allowing deletion
190+
referencingWorkloads, err := r.findReferencingWorkloads(ctx, telemetryConfig)
192191
if err != nil {
193-
logger.Error(err, "Failed to check referencing servers during deletion")
192+
logger.Error(err, "Failed to check referencing workloads during deletion")
194193
return ctrl.Result{}, err
195194
}
196195

197-
if len(referencingServers) > 0 {
198-
msg := fmt.Sprintf("cannot delete: still referenced by MCPServer(s): %v", referencingServers)
196+
if len(referencingWorkloads) > 0 {
197+
names := make([]string, 0, len(referencingWorkloads))
198+
for _, ref := range referencingWorkloads {
199+
names = append(names, fmt.Sprintf("%s/%s", ref.Kind, ref.Name))
200+
}
201+
msg := fmt.Sprintf("cannot delete: still referenced by MCPServer(s): %v", names)
199202
logger.Info(msg, "telemetryConfig", telemetryConfig.Name)
200203
meta.SetStatusCondition(&telemetryConfig.Status.Conditions, metav1.Condition{
201204
Type: "DeletionBlocked",
@@ -220,24 +223,42 @@ func (r *MCPTelemetryConfigReconciler) handleDeletion(
220223
return ctrl.Result{}, nil
221224
}
222225

223-
// findReferencingServers returns a sorted list of MCPServer names in the same namespace
226+
// findReferencingWorkloads returns a sorted list of workload references in the same namespace
224227
// that reference this MCPTelemetryConfig via TelemetryConfigRef.
225-
func (r *MCPTelemetryConfigReconciler) findReferencingServers(
228+
func (r *MCPTelemetryConfigReconciler) findReferencingWorkloads(
226229
ctx context.Context,
227230
telemetryConfig *mcpv1alpha1.MCPTelemetryConfig,
228-
) ([]string, error) {
231+
) ([]mcpv1alpha1.WorkloadReference, error) {
229232
mcpServerList := &mcpv1alpha1.MCPServerList{}
230233
if err := r.List(ctx, mcpServerList, client.InNamespace(telemetryConfig.Namespace)); err != nil {
231234
return nil, fmt.Errorf("failed to list MCPServers: %w", err)
232235
}
233236

234-
var refs []string
237+
var refs []mcpv1alpha1.WorkloadReference
235238
for _, server := range mcpServerList.Items {
236239
if server.Spec.TelemetryConfigRef != nil &&
237240
server.Spec.TelemetryConfigRef.Name == telemetryConfig.Name {
238-
refs = append(refs, server.Name)
241+
refs = append(refs, mcpv1alpha1.WorkloadReference{
242+
Kind: "MCPServer",
243+
Name: server.Name,
244+
})
239245
}
240246
}
241-
sort.Strings(refs)
247+
slices.SortFunc(refs, func(a, b mcpv1alpha1.WorkloadReference) int {
248+
if a.Name < b.Name {
249+
return -1
250+
}
251+
if a.Name > b.Name {
252+
return 1
253+
}
254+
return 0
255+
})
242256
return refs, nil
243257
}
258+
259+
// workloadRefsEqual compares two WorkloadReference slices for equality.
260+
func workloadRefsEqual(a, b []mcpv1alpha1.WorkloadReference) bool {
261+
return slices.EqualFunc(a, b, func(x, y mcpv1alpha1.WorkloadReference) bool {
262+
return x.Kind == y.Kind && x.Name == y.Name
263+
})
264+
}

0 commit comments

Comments
 (0)