From ce995b7e201c3d410c73ec77223383e6cdb259b0 Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Thu, 21 May 2026 07:47:43 -0700 Subject: [PATCH 1/8] TLS support on RemoteMCPServer + companion-Secret API Signed-off-by: Jeremy Alvis --- examples/modelconfig-with-tls.yaml | 4 +- .../crd/bases/kagent.dev_modelconfigs.yaml | 30 +- .../bases/kagent.dev_remotemcpservers.yaml | 57 ++ go/api/v1alpha2/modelconfig_types.go | 27 +- go/api/v1alpha2/remotemcpserver_types.go | 11 + go/api/v1alpha2/zz_generated.deepcopy.go | 5 + .../reconciler/mcp_server_reconciler_test.go | 1 + .../controller/reconciler/reconciler.go | 118 +++- .../controller/reconciler/reconciler_test.go | 296 +++++++++- .../translator/agent/adk_api_translator.go | 139 +++-- .../controller/translator/agent/compiler.go | 2 +- .../translator/agent/manifest_builder.go | 87 ++- .../translator/agent/manifest_builder_test.go | 337 +++++++++++ .../agent/remotemcpserver_tls_test.go | 524 ++++++++++++++++++ .../testdata/outputs/tls-with-custom-ca.json | 12 +- .../outputs/tls-with-system-cas-disabled.json | 12 +- .../translator/agent/tls_mounting_test.go | 155 +++++- .../httpserver/handlers/companion_secrets.go | 159 ++++++ .../httpserver/handlers/modelconfig.go | 120 +--- .../httpserver/handlers/toolservers.go | 46 +- .../httpserver/handlers/toolservers_test.go | 187 +++++++ go/core/pkg/app/app.go | 6 + .../translator/adk_api_translator_types.go | 18 + .../invoke_remotemcpserver_tls_agent.json | 66 +++ go/core/test/e2e/remotemcpserver_tls_test.go | 504 +++++++++++++++++ go/go.mod | 1 + go/go.sum | 2 + .../templates/kagent.dev_modelconfigs.yaml | 30 +- .../kagent.dev_remotemcpservers.yaml | 57 ++ .../kagent-adk/src/kagent/adk/models/_ssl.py | 4 +- .../kagent-adk/src/kagent/adk/types.py | 111 +++- .../tests/unittests/models/test_openai.py | 14 +- .../tests/unittests/models/test_tls_e2e.py | 4 +- .../tests/unittests/test_mcp_tls.py | 219 ++++++++ ui/src/types/index.ts | 1 + 35 files changed, 3094 insertions(+), 272 deletions(-) create mode 100644 go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go create mode 100644 go/core/internal/httpserver/handlers/companion_secrets.go create mode 100644 go/core/test/e2e/mocks/invoke_remotemcpserver_tls_agent.json create mode 100644 go/core/test/e2e/remotemcpserver_tls_test.go create mode 100644 python/packages/kagent-adk/tests/unittests/test_mcp_tls.py diff --git a/examples/modelconfig-with-tls.yaml b/examples/modelconfig-with-tls.yaml index 7f4905e1d7..c59ec3b1bc 100644 --- a/examples/modelconfig-with-tls.yaml +++ b/examples/modelconfig-with-tls.yaml @@ -386,5 +386,5 @@ roleRef: # 6. Troubleshooting: # - See https://kagent.dev/docs for detailed debugging steps # - Check agent logs: kubectl logs deployment/agent- -# - Verify Secret is mounted: kubectl exec deployment/agent- -- ls /etc/ssl/certs/custom/ -# - Test certificate: kubectl exec deployment/agent- -- openssl x509 -in /etc/ssl/certs/custom/ca.crt -text -noout +# - Verify Secret is mounted: kubectl exec deployment/agent- -- ls /etc/ssl/certs/custom/corp-ca/ +# - Test certificate: kubectl exec deployment/agent- -- openssl x509 -in /etc/ssl/certs/custom/corp-ca/ca.crt -text -noout diff --git a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml index 00b21b6da0..a3d8025a6c 100644 --- a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml +++ b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml @@ -682,6 +682,20 @@ spec: Production deployments MUST use proper certificates. type: boolean type: object + x-kubernetes-validations: + - message: caCertSecretKey requires caCertSecretRef + rule: '!(has(self.caCertSecretKey) && size(self.caCertSecretKey) + > 0 && (!has(self.caCertSecretRef) || size(self.caCertSecretRef) + == 0))' + - message: caCertSecretRef requires caCertSecretKey + rule: '!(has(self.caCertSecretRef) && size(self.caCertSecretRef) + > 0 && (!has(self.caCertSecretKey) || size(self.caCertSecretKey) + == 0))' + - message: disableSystemCAs requires caCertSecretRef or disableVerify + (trust-nothing config rejects every upstream) + rule: '!(has(self.disableSystemCAs) && self.disableSystemCAs && + (!has(self.disableVerify) || !self.disableVerify) && (!has(self.caCertSecretRef) + || size(self.caCertSecretRef) == 0))' required: - model type: object @@ -719,22 +733,6 @@ spec: rule: '!(has(self.apiKeyPassthrough) && self.apiKeyPassthrough && (self.provider == ''Gemini'' || self.provider == ''GeminiVertexAI'' || self.provider == ''AnthropicVertexAI''))' - - message: caCertSecretKey requires caCertSecretRef - rule: '!(has(self.tls) && has(self.tls.caCertSecretKey) && size(self.tls.caCertSecretKey) - > 0 && (!has(self.tls.caCertSecretRef) || size(self.tls.caCertSecretRef) - == 0))' - - message: caCertSecretKey requires caCertSecretRef (unless disableVerify - is true) - rule: '!(has(self.tls) && (!has(self.tls.disableVerify) || !self.tls.disableVerify) - && has(self.tls.caCertSecretKey) && size(self.tls.caCertSecretKey) - > 0 && (!has(self.tls.caCertSecretRef) || size(self.tls.caCertSecretRef) - == 0))' - - message: caCertSecretRef requires caCertSecretKey (unless disableVerify - is true) - rule: '!(has(self.tls) && (!has(self.tls.disableVerify) || !self.tls.disableVerify) - && has(self.tls.caCertSecretRef) && size(self.tls.caCertSecretRef) - > 0 && (!has(self.tls.caCertSecretKey) || size(self.tls.caCertSecretKey) - == 0))' - message: openAI.tokenExchange requires apiKeySecret (the service account secret) rule: '!(has(self.openAI) && has(self.openAI.tokenExchange) && (!has(self.apiKeySecret) diff --git a/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml b/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml index f8bbcf0b1d..e0b4e272ab 100644 --- a/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml +++ b/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml @@ -179,6 +179,63 @@ spec: timeout: default: 30s type: string + tls: + description: |- + TLS configuration for the upstream MCP server connection. + Use this for HTTPS upstreams that present a certificate the agent's + system trust store does not include (corporate CA, self-signed cert + on a test fixture, internal MCP gateway). Reuses the same TLSConfig + type as ModelConfig.spec.tls, with identical semantics: disableVerify + turns off certificate validation entirely, caCertSecretRef + + caCertSecretKey point at a PEM bundle Secret in the same namespace, + and disableSystemCAs trusts only the named bundle. + properties: + caCertSecretKey: + description: |- + CACertSecretKey is the key within the Secret that contains the CA certificate data. + This field follows the same pattern as APIKeySecretKey. + Required when CACertSecretRef is set (unless DisableVerify is true). + type: string + caCertSecretRef: + description: |- + CACertSecretRef is a reference to a Kubernetes Secret containing + CA certificate(s) in PEM format. The Secret must be in the same + namespace as the ModelConfig. + When set, the certificate will be used to verify the provider's SSL certificate. + This field follows the same pattern as APIKeySecret. + type: string + disableSystemCAs: + default: false + description: |- + DisableSystemCAs disables the use of system CA certificates. + When false (default), system CA certificates are used for verification (safe behavior). + When true, only the custom CA from CACertSecretRef is trusted. + This allows strict security policies where only corporate CAs should be trusted. + type: boolean + disableVerify: + default: false + description: |- + DisableVerify disables SSL certificate verification entirely. + When false (default), SSL certificates are verified. + When true, SSL certificate verification is disabled. + WARNING: This should ONLY be used in development/testing environments. + Production deployments MUST use proper certificates. + type: boolean + type: object + x-kubernetes-validations: + - message: caCertSecretKey requires caCertSecretRef + rule: '!(has(self.caCertSecretKey) && size(self.caCertSecretKey) + > 0 && (!has(self.caCertSecretRef) || size(self.caCertSecretRef) + == 0))' + - message: caCertSecretRef requires caCertSecretKey + rule: '!(has(self.caCertSecretRef) && size(self.caCertSecretRef) + > 0 && (!has(self.caCertSecretKey) || size(self.caCertSecretKey) + == 0))' + - message: disableSystemCAs requires caCertSecretRef or disableVerify + (trust-nothing config rejects every upstream) + rule: '!(has(self.disableSystemCAs) && self.disableSystemCAs && + (!has(self.disableVerify) || !self.disableVerify) && (!has(self.caCertSecretRef) + || size(self.caCertSecretRef) == 0))' url: minLength: 1 type: string diff --git a/go/api/v1alpha2/modelconfig_types.go b/go/api/v1alpha2/modelconfig_types.go index 0d08928681..6caaf1f434 100644 --- a/go/api/v1alpha2/modelconfig_types.go +++ b/go/api/v1alpha2/modelconfig_types.go @@ -274,9 +274,14 @@ type SAPAICoreConfig struct { AuthURL string `json:"authUrl,omitempty"` } -// TLSConfig contains TLS/SSL configuration options for model provider connections. -// This enables agents to connect to internal LiteLLM gateways or other providers -// that use self-signed certificates or custom certificate authorities. +// TLSConfig contains TLS/SSL configuration options for outbound HTTPS +// connections from the agent (model provider, RemoteMCPServer). The +// XValidation rules below apply at admission to every CRD field that +// uses TLSConfig, so callers don't need to re-declare them per spec. +// +// +kubebuilder:validation:XValidation:message="caCertSecretKey requires caCertSecretRef",rule="!(has(self.caCertSecretKey) && size(self.caCertSecretKey) > 0 && (!has(self.caCertSecretRef) || size(self.caCertSecretRef) == 0))" +// +kubebuilder:validation:XValidation:message="caCertSecretRef requires caCertSecretKey",rule="!(has(self.caCertSecretRef) && size(self.caCertSecretRef) > 0 && (!has(self.caCertSecretKey) || size(self.caCertSecretKey) == 0))" +// +kubebuilder:validation:XValidation:message="disableSystemCAs requires caCertSecretRef or disableVerify (trust-nothing config rejects every upstream)",rule="!(has(self.disableSystemCAs) && self.disableSystemCAs && (!has(self.disableVerify) || !self.disableVerify) && (!has(self.caCertSecretRef) || size(self.caCertSecretRef) == 0))" type TLSConfig struct { // DisableVerify disables SSL certificate verification entirely. // When false (default), SSL certificates are verified. @@ -310,6 +315,19 @@ type TLSConfig struct { DisableSystemCAs bool `json:"disableSystemCAs,omitempty"` } +// IsEmpty reports whether the TLSConfig carries any opinion. A nil +// receiver and an all-zero struct are equivalent — both mean "no TLS +// config supplied" and the consumer should fall back to its default +// behavior (typically system trust store, default httpx client). The +// single helper keeps callers from re-listing fields, so adding a new +// field to TLSConfig only requires updating this method. +func (t *TLSConfig) IsEmpty() bool { + if t == nil { + return true + } + return !t.DisableVerify && t.CACertSecretRef == "" && t.CACertSecretKey == "" && !t.DisableSystemCAs +} + // ModelConfigSpec defines the desired state of ModelConfig. // // +kubebuilder:validation:XValidation:message="provider.openAI must be nil if the provider is not OpenAI",rule="!(has(self.openAI) && self.provider != 'OpenAI')" @@ -325,9 +343,6 @@ type TLSConfig struct { // +kubebuilder:validation:XValidation:message="apiKeySecretKey must be set if apiKeySecret is set (except for Bedrock and SAPAICore providers)",rule="!(has(self.apiKeySecret) && !has(self.apiKeySecretKey) && self.provider != 'Bedrock' && self.provider != 'SAPAICore')" // +kubebuilder:validation:XValidation:message="apiKeyPassthrough and apiKeySecret are mutually exclusive",rule="!(has(self.apiKeyPassthrough) && self.apiKeyPassthrough && has(self.apiKeySecret) && size(self.apiKeySecret) > 0)" // +kubebuilder:validation:XValidation:message="apiKeyPassthrough must be false if provider is Gemini;GeminiVertexAI;AnthropicVertexAI",rule="!(has(self.apiKeyPassthrough) && self.apiKeyPassthrough && (self.provider == 'Gemini' || self.provider == 'GeminiVertexAI' || self.provider == 'AnthropicVertexAI'))" -// +kubebuilder:validation:XValidation:message="caCertSecretKey requires caCertSecretRef",rule="!(has(self.tls) && has(self.tls.caCertSecretKey) && size(self.tls.caCertSecretKey) > 0 && (!has(self.tls.caCertSecretRef) || size(self.tls.caCertSecretRef) == 0))" -// +kubebuilder:validation:XValidation:message="caCertSecretKey requires caCertSecretRef (unless disableVerify is true)",rule="!(has(self.tls) && (!has(self.tls.disableVerify) || !self.tls.disableVerify) && has(self.tls.caCertSecretKey) && size(self.tls.caCertSecretKey) > 0 && (!has(self.tls.caCertSecretRef) || size(self.tls.caCertSecretRef) == 0))" -// +kubebuilder:validation:XValidation:message="caCertSecretRef requires caCertSecretKey (unless disableVerify is true)",rule="!(has(self.tls) && (!has(self.tls.disableVerify) || !self.tls.disableVerify) && has(self.tls.caCertSecretRef) && size(self.tls.caCertSecretRef) > 0 && (!has(self.tls.caCertSecretKey) || size(self.tls.caCertSecretKey) == 0))" // +kubebuilder:validation:XValidation:message="openAI.tokenExchange requires apiKeySecret (the service account secret)",rule="!(has(self.openAI) && has(self.openAI.tokenExchange) && (!has(self.apiKeySecret) || size(self.apiKeySecret) == 0))" // +kubebuilder:validation:XValidation:message="openAI.tokenExchange and apiKeyPassthrough are mutually exclusive",rule="!(has(self.openAI) && has(self.openAI.tokenExchange) && has(self.apiKeyPassthrough) && self.apiKeyPassthrough)" // +kubebuilder:validation:XValidation:message="openAI.tokenExchange type GDCHServiceAccount requires openAI.tokenExchange.gdchServiceAccount",rule="!(has(self.openAI) && has(self.openAI.tokenExchange) && self.openAI.tokenExchange.type == 'GDCHServiceAccount' && !has(self.openAI.tokenExchange.gdchServiceAccount))" diff --git a/go/api/v1alpha2/remotemcpserver_types.go b/go/api/v1alpha2/remotemcpserver_types.go index 0d58952eec..d63cc75347 100644 --- a/go/api/v1alpha2/remotemcpserver_types.go +++ b/go/api/v1alpha2/remotemcpserver_types.go @@ -63,6 +63,17 @@ type RemoteMCPServerSpec struct { // See: https://gateway-api.sigs.k8s.io/guides/multiple-ns/#cross-namespace-routing // +optional AllowedNamespaces *AllowedNamespaces `json:"allowedNamespaces,omitempty"` + + // TLS configuration for the upstream MCP server connection. + // Use this for HTTPS upstreams that present a certificate the agent's + // system trust store does not include (corporate CA, self-signed cert + // on a test fixture, internal MCP gateway). Reuses the same TLSConfig + // type as ModelConfig.spec.tls, with identical semantics: disableVerify + // turns off certificate validation entirely, caCertSecretRef + + // caCertSecretKey point at a PEM bundle Secret in the same namespace, + // and disableSystemCAs trusts only the named bundle. + // +optional + TLS *TLSConfig `json:"tls,omitempty"` } var _ sql.Scanner = (*RemoteMCPServerSpec)(nil) diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 136058bced..4cef11d501 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -1394,6 +1394,11 @@ func (in *RemoteMCPServerSpec) DeepCopyInto(out *RemoteMCPServerSpec) { *out = new(AllowedNamespaces) (*in).DeepCopyInto(*out) } + if in.TLS != nil { + in, out := &in.TLS, &out.TLS + *out = new(TLSConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoteMCPServerSpec. diff --git a/go/core/internal/controller/reconciler/mcp_server_reconciler_test.go b/go/core/internal/controller/reconciler/mcp_server_reconciler_test.go index 9514ca6f2c..b3cb19c7bf 100644 --- a/go/core/internal/controller/reconciler/mcp_server_reconciler_test.go +++ b/go/core/internal/controller/reconciler/mcp_server_reconciler_test.go @@ -116,6 +116,7 @@ func TestReconcileKagentMCPServer_ErrorPropagation(t *testing.T) { types.NamespacedName{Namespace: "test", Name: "default-model"}, []string{}, // No namespace restrictions for tests nil, + nil, ) // Call ReconcileKagentMCPServer diff --git a/go/core/internal/controller/reconciler/reconciler.go b/go/core/internal/controller/reconciler/reconciler.go index 0b2d93e282..1541b44cc5 100644 --- a/go/core/internal/controller/reconciler/reconciler.go +++ b/go/core/internal/controller/reconciler/reconciler.go @@ -3,6 +3,8 @@ package reconciler import ( "context" "crypto/sha256" + "crypto/tls" + "crypto/x509" "encoding/hex" "errors" "fmt" @@ -16,6 +18,7 @@ import ( reconcilerutils "github.com/kagent-dev/kagent/go/core/internal/controller/reconciler/utils" "github.com/kagent-dev/kagent/go/core/internal/controller/translator" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + pkgtranslator "github.com/kagent-dev/kagent/go/core/pkg/translator" "github.com/kagent-dev/kmcp/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -90,23 +93,30 @@ type kagentReconciler struct { watchedNamespaces []string sandboxBackend sandboxbackend.Backend + + // rmsURLRewriter optionally transforms the URL the controller dials + // when discovering tools on a RemoteMCPServer. Nil means dial + // s.Spec.URL verbatim. See pkgtranslator.RemoteMCPServerURLRewriter. + rmsURLRewriter pkgtranslator.RemoteMCPServerURLRewriter } func NewKagentReconciler( - translator agent_translator.AdkApiTranslator, + adkTranslator agent_translator.AdkApiTranslator, kube client.Client, dbClient database.Client, defaultModelConfig types.NamespacedName, watchedNamespaces []string, sandboxBackend sandboxbackend.Backend, + rmsURLRewriter pkgtranslator.RemoteMCPServerURLRewriter, ) KagentReconciler { return &kagentReconciler{ - adkTranslator: translator, + adkTranslator: adkTranslator, kube: kube, dbClient: dbClient, defaultModelConfig: defaultModelConfig, watchedNamespaces: watchedNamespaces, sandboxBackend: sandboxBackend, + rmsURLRewriter: rmsURLRewriter, } } @@ -1028,35 +1038,125 @@ func (a *kagentReconciler) createMcpTransport(ctx context.Context, s *v1alpha2.R return nil, err } - httpClient := newHTTPClient(headers, remoteMCPRegistrationTimeout(s)) + // Resolve the dial-time URL. Default is s.Spec.URL; an installed + // rmsURLRewriter may substitute a different dial target. The + // rewriter controls the URL only — tlsConfig below is built from + // s.Spec.TLS regardless of what the rewriter returns, so rewriting + // http:// → https:// to a destination needing different trust + // roots is not supported (s.Spec.TLS would need to already be + // consistent with the new destination). + endpoint := s.Spec.URL + if a.rmsURLRewriter != nil { + rewritten, err := a.rmsURLRewriter.RewriteRemoteMCPServerURL(ctx, s) + if err != nil { + return nil, fmt.Errorf("failed to rewrite RemoteMCPServer URL for %s/%s: %w", s.Namespace, s.Name, err) + } + endpoint = rewritten + } + + tlsConfig, err := a.buildRemoteMCPServerTLSConfig(ctx, s) + if err != nil { + return nil, fmt.Errorf("failed to build TLS config for %s/%s: %w", s.Namespace, s.Name, err) + } + + httpClient := newHTTPClient(headers, remoteMCPRegistrationTimeout(s), tlsConfig) switch s.Spec.Protocol { case v1alpha2.RemoteMCPServerProtocolSse: return &mcp.SSEClientTransport{ - Endpoint: s.Spec.URL, + Endpoint: endpoint, HTTPClient: httpClient, }, nil default: return &mcp.StreamableClientTransport{ - Endpoint: s.Spec.URL, + Endpoint: endpoint, HTTPClient: httpClient, }, nil } } +// buildRemoteMCPServerTLSConfig returns a *tls.Config matching the +// RemoteMCPServer's spec.tls (or nil when no TLS config is present, so +// the http.Client falls back to Go's default transport with the system +// trust store). Mirrors the per-source TLS semantics the agent +// translator emits — disableVerify, custom CA from a Secret, and +// disableSystemCAs trust-only-the-named-bundle — so tool discovery +// trusts the same upstream chain the agent will trust at runtime. +func (a *kagentReconciler) buildRemoteMCPServerTLSConfig(ctx context.Context, s *v1alpha2.RemoteMCPServer) (*tls.Config, error) { + tlsSpec := s.Spec.TLS + if tlsSpec.IsEmpty() { + return nil, nil + } + + cfg := &tls.Config{ + InsecureSkipVerify: tlsSpec.DisableVerify, //nolint:gosec // operator-authored test-fixture escape hatch + } + + if tlsSpec.CACertSecretRef != "" && tlsSpec.CACertSecretKey != "" { + secret := &corev1.Secret{} + if err := a.kube.Get(ctx, types.NamespacedName{Namespace: s.Namespace, Name: tlsSpec.CACertSecretRef}, secret); err != nil { + return nil, fmt.Errorf("failed to read CA secret %s/%s: %w", s.Namespace, tlsSpec.CACertSecretRef, err) + } + pem, ok := secret.Data[tlsSpec.CACertSecretKey] + if !ok || len(pem) == 0 { + return nil, fmt.Errorf("CA secret %s/%s does not contain key %q", s.Namespace, tlsSpec.CACertSecretRef, tlsSpec.CACertSecretKey) + } + + var pool *x509.CertPool + if tlsSpec.DisableSystemCAs { + pool = x509.NewCertPool() + } else { + sys, err := x509.SystemCertPool() + if err != nil { + return nil, fmt.Errorf("failed to load system CA pool: %w", err) + } + pool = sys + } + if !pool.AppendCertsFromPEM(pem) { + return nil, fmt.Errorf("CA secret %s/%s key %q does not contain valid PEM certificates", s.Namespace, tlsSpec.CACertSecretRef, tlsSpec.CACertSecretKey) + } + cfg.RootCAs = pool + } + // Note: the trust-nothing combination (disableSystemCAs=true without + // caCertSecretRef and without disableVerify) is rejected by the CEL + // rule on TLSConfig at admission, so it cannot reach this code path + // in production. + + return cfg, nil +} + // go-sdk does not have a WithHeaders option when initializing transport -// so we need to create a custom HTTP client that adds headers to all requests. -func newHTTPClient(headers map[string]string, timeout time.Duration) *http.Client { +// so we need to create a custom HTTP client that adds headers to all +// requests. When tlsConfig is non-nil it's installed on a cloned +// transport so tool discovery honors RemoteMCPServer.spec.tls. +func newHTTPClient(headers map[string]string, timeout time.Duration, tlsConfig *tls.Config) *http.Client { + var base http.RoundTripper = http.DefaultTransport + if tlsConfig != nil { + // Clone the default transport to preserve its dial/keepalive + // settings (proxies, dual-stack, HTTP/2) and override only the + // TLS config. If the default transport isn't *http.Transport + // (extremely unusual — only happens if something earlier swapped + // it out), fall back to a fresh transport so TLS still applies. + if t, ok := http.DefaultTransport.(*http.Transport); ok { + clone := t.Clone() + clone.TLSClientConfig = tlsConfig + base = clone + } else { + base = &http.Transport{TLSClientConfig: tlsConfig} + } + } + if len(headers) == 0 { return &http.Client{ - Timeout: timeout, + Timeout: timeout, + Transport: base, } } return &http.Client{ Timeout: timeout, Transport: &headerTransport{ headers: headers, - base: http.DefaultTransport, + base: base, }, } } diff --git a/go/core/internal/controller/reconciler/reconciler_test.go b/go/core/internal/controller/reconciler/reconciler_test.go index d9a4860a93..d09217c078 100644 --- a/go/core/internal/controller/reconciler/reconciler_test.go +++ b/go/core/internal/controller/reconciler/reconciler_test.go @@ -2,11 +2,20 @@ package reconciler import ( "context" + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "net/http" "testing" "time" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" + "github.com/modelcontextprotocol/go-sdk/mcp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" @@ -717,19 +726,300 @@ func TestNewHTTPClient(t *testing.T) { timeout := 5 * time.Second t.Run("no headers", func(t *testing.T) { - c := newHTTPClient(nil, timeout) + c := newHTTPClient(nil, timeout, nil) assert.Equal(t, timeout, c.Timeout) }) t.Run("empty headers", func(t *testing.T) { - c := newHTTPClient(map[string]string{}, timeout) + c := newHTTPClient(map[string]string{}, timeout, nil) assert.Equal(t, timeout, c.Timeout) }) t.Run("with headers sets timeout and custom transport", func(t *testing.T) { - c := newHTTPClient(map[string]string{"X-Key": "val"}, timeout) + c := newHTTPClient(map[string]string{"X-Key": "val"}, timeout, nil) assert.Equal(t, timeout, c.Timeout) _, ok := c.Transport.(*headerTransport) assert.True(t, ok, "expected headerTransport") }) + + t.Run("with tls config installs cloned transport", func(t *testing.T) { + tlsCfg := &tls.Config{InsecureSkipVerify: true} //nolint:gosec // test only + c := newHTTPClient(nil, timeout, tlsCfg) + require.Equal(t, timeout, c.Timeout) + // No headers → transport is the cloned *http.Transport directly. + tr, ok := c.Transport.(*http.Transport) + require.True(t, ok, "expected *http.Transport when no headers + tls is set") + assert.Same(t, tlsCfg, tr.TLSClientConfig) + }) + + t.Run("with headers and tls config wraps headerTransport over cloned transport", func(t *testing.T) { + tlsCfg := &tls.Config{InsecureSkipVerify: true} //nolint:gosec // test only + c := newHTTPClient(map[string]string{"X-Key": "val"}, timeout, tlsCfg) + ht, ok := c.Transport.(*headerTransport) + require.True(t, ok, "expected headerTransport") + tr, ok := ht.base.(*http.Transport) + require.True(t, ok, "headerTransport.base should be the cloned *http.Transport") + assert.Same(t, tlsCfg, tr.TLSClientConfig) + }) +} + +// TestBuildRemoteMCPServerTLSConfig covers the controller's mirror of the +// agent translator's TLS semantics: tool discovery dials the upstream from +// the controller pod, so it has to construct the same trust chain the agent +// will use at runtime — but from the controller's vantage point (no Secret +// mounted on its filesystem; it has to read the Secret via the kube API). +func TestBuildRemoteMCPServerTLSConfig(t *testing.T) { + caPEM := generateTestCAPEM(t) + + tests := []struct { + name string + spec *v1alpha2.TLSConfig + secret *corev1.Secret + wantNil bool + wantSkip bool + wantCA bool + wantSystem bool + wantErr bool + errContains string + }{ + { + name: "nil spec → no tls config (default transport)", + spec: nil, + wantNil: true, + }, + { + name: "empty struct → no tls config (parity with nil)", + spec: &v1alpha2.TLSConfig{}, + wantNil: true, + }, + { + name: "disableVerify only", + spec: &v1alpha2.TLSConfig{DisableVerify: true}, + wantSkip: true, + }, + { + name: "custom CA only (additive to system pool)", + spec: &v1alpha2.TLSConfig{ + CACertSecretRef: "ca", + CACertSecretKey: "ca.crt", + }, + secret: &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "ca", Namespace: "ns"}, + Data: map[string][]byte{"ca.crt": caPEM}, + }, + wantCA: true, + wantSystem: true, + }, + { + name: "custom CA with disableSystemCAs (trust only the bundle)", + spec: &v1alpha2.TLSConfig{ + CACertSecretRef: "ca", + CACertSecretKey: "ca.crt", + DisableSystemCAs: true, + }, + secret: &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "ca", Namespace: "ns"}, + Data: map[string][]byte{"ca.crt": caPEM}, + }, + wantCA: true, + wantSystem: false, + }, + { + name: "missing secret → error", + spec: &v1alpha2.TLSConfig{ + CACertSecretRef: "ca", + CACertSecretKey: "ca.crt", + }, + wantErr: true, + errContains: "failed to read CA secret", + }, + { + name: "secret present but missing key → error", + spec: &v1alpha2.TLSConfig{ + CACertSecretRef: "ca", + CACertSecretKey: "ca.crt", + }, + secret: &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "ca", Namespace: "ns"}, + Data: map[string][]byte{"other": []byte("x")}, + }, + wantErr: true, + errContains: "does not contain key", + }, + { + name: "secret key contains garbage → error", + spec: &v1alpha2.TLSConfig{ + CACertSecretRef: "ca", + CACertSecretKey: "ca.crt", + }, + secret: &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "ca", Namespace: "ns"}, + Data: map[string][]byte{"ca.crt": []byte("not a pem")}, + }, + wantErr: true, + errContains: "valid PEM certificates", + }, + // Note: the trust-nothing combination (disableSystemCAs=true alone) + // used to be rejected here at reconcile time. It's now rejected + // earlier by the CEL rule on TLSConfig at admission, so it cannot + // reach this code path. No runtime test needed. + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheme := clientgoscheme.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + objs := []client.Object{} + if tt.secret != nil { + objs = append(objs, tt.secret) + } + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build() + + r := &kagentReconciler{kube: kube} + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "ns"}, + Spec: v1alpha2.RemoteMCPServerSpec{URL: "https://x/y", TLS: tt.spec}, + } + + cfg, err := r.buildRemoteMCPServerTLSConfig(context.Background(), rms) + if tt.wantErr { + require.Error(t, err) + assert.Contains(t, err.Error(), tt.errContains) + return + } + require.NoError(t, err) + + if tt.wantNil { + assert.Nil(t, cfg) + return + } + require.NotNil(t, cfg) + assert.Equal(t, tt.wantSkip, cfg.InsecureSkipVerify) + if tt.wantCA { + require.NotNil(t, cfg.RootCAs, "expected RootCAs populated when CA Secret is referenced") + } + // Asserting "system pool was used" by counting subjects is + // platform-specific — on macOS, x509.SystemCertPool returns + // a minimal pool because Go defers to platform verification. + // The path is structurally enforced (SystemCertPool vs + // NewCertPool) and the strict-trust case (wantSystem=false) + // covers the alternative branch. + _ = tt.wantSystem + }) + } +} + +// stubRewriter is a test double for translator.RemoteMCPServerURLRewriter. +// Records the call and returns the configured response, letting tests +// verify createMcpTransport plumbs the rewriter correctly. +type stubRewriter struct { + called bool + gotRMS *v1alpha2.RemoteMCPServer + respURL string + respErr error +} + +func (s *stubRewriter) RewriteRemoteMCPServerURL(_ context.Context, rms *v1alpha2.RemoteMCPServer) (string, error) { + s.called = true + s.gotRMS = rms + return s.respURL, s.respErr +} + +// TestCreateMcpTransport_URLRewriter covers the optional URL-rewriter +// hook on the reconciler: a configured rewriter must (a) receive the +// RemoteMCPServer, (b) supply the dial-time URL on the produced +// transport, and (c) propagate errors. A nil rewriter must leave +// s.Spec.URL as the dial target. +func TestCreateMcpTransport_URLRewriter(t *testing.T) { + scheme := clientgoscheme.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + const specURL = "https://upstream.example.com/mcp" + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "rms", Namespace: "ns"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "test", + URL: specURL, + }, + } + + t.Run("nil rewriter uses spec.URL verbatim", func(t *testing.T) { + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + r := &kagentReconciler{kube: kube, rmsURLRewriter: nil} + + tsp, err := r.createMcpTransport(context.Background(), rms) + require.NoError(t, err) + require.NotNil(t, tsp) + streamable, ok := tsp.(*mcp.StreamableClientTransport) + require.True(t, ok, "expected Streamable HTTP transport for default protocol") + assert.Equal(t, specURL, streamable.Endpoint) + }) + + t.Run("rewriter substitutes the dial URL", func(t *testing.T) { + const rewrittenURL = "http://upstream.example.com:443/mcp" + stub := &stubRewriter{respURL: rewrittenURL} + + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + r := &kagentReconciler{kube: kube, rmsURLRewriter: stub} + + tsp, err := r.createMcpTransport(context.Background(), rms) + require.NoError(t, err) + require.True(t, stub.called, "rewriter must be called") + require.NotNil(t, stub.gotRMS) + assert.Equal(t, "rms", stub.gotRMS.Name, "rewriter receives the RMS being dialed") + + streamable, ok := tsp.(*mcp.StreamableClientTransport) + require.True(t, ok) + assert.Equal(t, rewrittenURL, streamable.Endpoint, "rewritten URL must become the dial target") + }) + + t.Run("rewriter error propagates and aborts transport construction", func(t *testing.T) { + stub := &stubRewriter{respErr: assert.AnError} + + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + r := &kagentReconciler{kube: kube, rmsURLRewriter: stub} + + tsp, err := r.createMcpTransport(context.Background(), rms) + assert.Nil(t, tsp) + require.Error(t, err) + assert.Contains(t, err.Error(), "rewrite RemoteMCPServer URL") + }) + + t.Run("rewriter applies to SSE transport too", func(t *testing.T) { + const rewrittenURL = "http://upstream.example.com:443/sse" + stub := &stubRewriter{respURL: rewrittenURL} + + sseRMS := rms.DeepCopy() + sseRMS.Spec.Protocol = v1alpha2.RemoteMCPServerProtocolSse + + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + r := &kagentReconciler{kube: kube, rmsURLRewriter: stub} + + tsp, err := r.createMcpTransport(context.Background(), sseRMS) + require.NoError(t, err) + sse, ok := tsp.(*mcp.SSEClientTransport) + require.True(t, ok, "expected SSE transport when protocol is SSE") + assert.Equal(t, rewrittenURL, sse.Endpoint) + }) +} + +// generateTestCAPEM produces a minimal self-signed PEM certificate the +// AppendCertsFromPEM call accepts. The cert isn't valid for any URL — +// these tests only exercise the controller's parse-and-pool path. +func generateTestCAPEM(t *testing.T) []byte { + t.Helper() + priv, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "kagent-test-ca"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(time.Hour), + IsCA: true, + KeyUsage: x509.KeyUsageCertSign, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &priv.PublicKey, priv) + require.NoError(t, err) + return pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) } diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index 0ee9528f8b..6735528291 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -217,56 +217,100 @@ func (r *adkApiTranslator) GetOwnedResourceTypes() []client.Object { const ( googleCredsVolumeName = "google-creds" - tlsCACertVolumeName = "tls-ca-cert" - tlsCACertMountPath = "/etc/ssl/certs/custom" + tlsCAVolumePrefix = "tls-ca-" + tlsCAMountRoot = "/etc/ssl/certs/custom" + maxDNS1123LabelLen = 63 gdchCredsVolumeName = "gdch-creds" gdchCredsMountPath = "/gdch-creds" ) -// populateTLSFields populates TLS configuration fields in the BaseModel -// from the ModelConfig TLS spec. -func populateTLSFields(baseModel *adk.BaseModel, tlsConfig *v1alpha2.TLSConfig) { - if tlsConfig == nil { - return - } - - // Set TLS configuration fields in BaseModel - baseModel.TLSInsecureSkipVerify = &tlsConfig.DisableVerify - baseModel.TLSDisableSystemCAs = &tlsConfig.DisableSystemCAs +// dns1123LabelRE matches RFC 1123 labels (lowercase alphanumeric + dashes, +// must start and end with alphanumeric). K8s volume names require this +// grammar — but K8s Secret names follow the looser DNS_SUBDOMAIN grammar +// (dots allowed, up to 253 chars), so a literal Secret name like +// `corp.ca` or cert-manager-style `mcp.example.com-tls` would fail volume +// name validation if embedded verbatim. tlsCAPaths hashes the name when +// it would violate this regex (or the length limit) for that reason. +var dns1123LabelRE = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) + +// tlsCAPaths returns deterministic volume name, mount path, and cert file +// path for the given Secret reference. Per-Secret naming lets multiple TLS +// sources (chat ModelConfig + embedding ModelConfig + RemoteMCPServers) on +// the same agent pod coexist without colliding when their +// modelDeploymentData entries get merged (mergeDeploymentData dedupes by +// (Name, MountPath) for VolumeMounts and by Name for Volumes). +func tlsCAPaths(secretName, key string) (volumeName, mountPath, certPath string) { + candidate := tlsCAVolumePrefix + secretName + id := secretName + if len(candidate) > maxDNS1123LabelLen || !dns1123LabelRE.MatchString(candidate) { + h := sha256.Sum256([]byte(secretName)) + id = hex.EncodeToString(h[:])[:8] + } + volumeName = tlsCAVolumePrefix + id + mountPath = path.Join(tlsCAMountRoot, id) + certPath = path.Join(mountPath, key) + return +} - // Set CA cert path if Secret and key are both specified +// deriveTLSFields turns a v1alpha2.TLSConfig into the three pointer fields +// that every TLS-aware adk wire type carries (BaseModel, +// StreamableHTTPConnectionParams, SseConnectionParams). Returns nils for +// nil or all-zero configs so the caller can assign-through to all three +// fields in a single statement. Emitting explicit `false` booleans on +// an empty struct would flip the Python runtime out of its no-op +// short-circuit and silently swap google-adk's default httpx client for +// kagent's, which has the same SSL behavior but different +// timeout/redirect defaults. +func deriveTLSFields(tlsConfig *v1alpha2.TLSConfig) (insecureSkipVerify *bool, caCertPath *string, disableSystemCAs *bool) { + if tlsConfig.IsEmpty() { + return nil, nil, nil + } + insecureSkipVerify = &tlsConfig.DisableVerify + disableSystemCAs = &tlsConfig.DisableSystemCAs if tlsConfig.CACertSecretRef != "" && tlsConfig.CACertSecretKey != "" { - certPath := fmt.Sprintf("%s/%s", tlsCACertMountPath, tlsConfig.CACertSecretKey) - baseModel.TLSCACertPath = &certPath + _, _, p := tlsCAPaths(tlsConfig.CACertSecretRef, tlsConfig.CACertSecretKey) + caCertPath = &p } + return } -// addTLSConfiguration adds TLS certificate volume mounts to modelDeploymentData -// when TLS configuration is present in the ModelConfig. -// Note: TLS configuration fields are now included in agent config JSON via BaseModel, -// so this function only handles volume mounting. +// addTLSConfiguration mounts a CA Secret as a per-Secret read-only volume on +// modelDeploymentData. Safe to call multiple times for the same agent with +// the same OR different TLSConfigs: +// - different Secrets produce different volume names + paths and accumulate. +// - the same Secret referenced from multiple sources (e.g. several RMSs +// pointing at one shared corp-CA bundle) is idempotent — we skip the +// append if a volume with the same name is already present, because the +// RemoteMCPServer path (translateRemoteMCPServerTarget) appends directly to the +// already-merged modelDeploymentData rather than through +// mergeDeploymentData. func addTLSConfiguration(modelDeploymentData *modelDeploymentData, tlsConfig *v1alpha2.TLSConfig) { if tlsConfig == nil { return } - // Add Secret volume mount if both CA certificate Secret and key are specified if tlsConfig.CACertSecretRef != "" && tlsConfig.CACertSecretKey != "" { - // Add volume from Secret + volumeName, mountPath, _ := tlsCAPaths(tlsConfig.CACertSecretRef, tlsConfig.CACertSecretKey) + + for _, v := range modelDeploymentData.Volumes { + if v.Name == volumeName { + return + } + } + modelDeploymentData.Volumes = append(modelDeploymentData.Volumes, corev1.Volume{ - Name: tlsCACertVolumeName, + Name: volumeName, VolumeSource: corev1.VolumeSource{ Secret: &corev1.SecretVolumeSource{ SecretName: tlsConfig.CACertSecretRef, - DefaultMode: new(int32(0444)), // Read-only for all users + DefaultMode: new(int32(0444)), }, }, }) - // Add volume mount modelDeploymentData.VolumeMounts = append(modelDeploymentData.VolumeMounts, corev1.VolumeMount{ - Name: tlsCACertVolumeName, - MountPath: tlsCACertMountPath, + Name: volumeName, + MountPath: mountPath, ReadOnly: true, }) } @@ -371,7 +415,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - populateTLSFields(&openai.BaseModel, model.Spec.TLS) + openai.BaseModel.TLSInsecureSkipVerify, openai.BaseModel.TLSCACertPath, openai.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) // Populate TokenExchange fields (OpenAI-specific) addTokenExchangeConfiguration(openai, modelDeploymentData, &model.Spec) openai.APIKeyPassthrough = model.Spec.APIKeyPassthrough @@ -429,7 +473,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - populateTLSFields(&anthropic.BaseModel, model.Spec.TLS) + anthropic.BaseModel.TLSInsecureSkipVerify, anthropic.BaseModel.TLSCACertPath, anthropic.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough if model.Spec.Anthropic != nil { @@ -478,7 +522,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - populateTLSFields(&azureOpenAI.BaseModel, model.Spec.TLS) + azureOpenAI.BaseModel.TLSInsecureSkipVerify, azureOpenAI.BaseModel.TLSCACertPath, azureOpenAI.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) azureOpenAI.APIKeyPassthrough = model.Spec.APIKeyPassthrough return azureOpenAI, modelDeploymentData, secretHashBytes, nil @@ -523,7 +567,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - populateTLSFields(&gemini.BaseModel, model.Spec.TLS) + gemini.BaseModel.TLSInsecureSkipVerify, gemini.BaseModel.TLSCACertPath, gemini.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) gemini.APIKeyPassthrough = model.Spec.APIKeyPassthrough return gemini, modelDeploymentData, secretHashBytes, nil @@ -564,7 +608,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - populateTLSFields(&anthropic.BaseModel, model.Spec.TLS) + anthropic.BaseModel.TLSInsecureSkipVerify, anthropic.BaseModel.TLSCACertPath, anthropic.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough return anthropic, modelDeploymentData, secretHashBytes, nil @@ -588,7 +632,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Options: model.Spec.Ollama.Options, } // Populate TLS fields in BaseModel - populateTLSFields(&ollama.BaseModel, model.Spec.TLS) + ollama.BaseModel.TLSInsecureSkipVerify, ollama.BaseModel.TLSCACertPath, ollama.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) ollama.APIKeyPassthrough = model.Spec.APIKeyPassthrough return ollama, modelDeploymentData, secretHashBytes, nil @@ -611,8 +655,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - populateTLSFields(&gemini.BaseModel, model.Spec.TLS) - + gemini.BaseModel.TLSInsecureSkipVerify, gemini.BaseModel.TLSCACertPath, gemini.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) return gemini, modelDeploymentData, secretHashBytes, nil case v1alpha2.ModelProviderBedrock: if model.Spec.Bedrock == nil { @@ -700,7 +743,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC } // Populate TLS fields in BaseModel - populateTLSFields(&bedrock.BaseModel, model.Spec.TLS) + bedrock.BaseModel.TLSInsecureSkipVerify, bedrock.BaseModel.TLSCACertPath, bedrock.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) bedrock.APIKeyPassthrough = model.Spec.APIKeyPassthrough return bedrock, modelDeploymentData, secretHashBytes, nil @@ -749,7 +792,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC AuthUrl: model.Spec.SAPAICore.AuthURL, } - populateTLSFields(&sapAICore.BaseModel, model.Spec.TLS) + sapAICore.BaseModel.TLSInsecureSkipVerify, sapAICore.BaseModel.TLSCACertPath, sapAICore.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) sapAICore.APIKeyPassthrough = model.Spec.APIKeyPassthrough return sapAICore, modelDeploymentData, secretHashBytes, nil @@ -788,6 +831,7 @@ func (a *adkApiTranslator) translateStreamableHttpTool(ctx context.Context, serv if server.Spec.TerminateOnClose != nil { params.TerminateOnClose = server.Spec.TerminateOnClose } + params.TLSInsecureSkipVerify, params.TLSCACertPath, params.TLSDisableSystemCAs = deriveTLSFields(server.Spec.TLS) return params, nil } @@ -819,10 +863,11 @@ func (a *adkApiTranslator) translateSseHttpTool(ctx context.Context, server *v1a if server.Spec.SseReadTimeout != nil { params.SseReadTimeout = new(server.Spec.SseReadTimeout.Seconds()) } + params.TLSInsecureSkipVerify, params.TLSCACertPath, params.TLSDisableSystemCAs = deriveTLSFields(server.Spec.TLS) return params, nil } -func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, agentNamespace string, toolServer *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) error { +func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, mdd *modelDeploymentData, agentNamespace string, toolServer *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) error { gvk := toolServer.GroupKind() switch gvk { @@ -853,7 +898,7 @@ func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent * return err } - return a.translateRemoteMCPServerTarget(ctx, agent, remoteMcpServer, toolServer, agentHeaders, proxyURL) + return a.translateRemoteMCPServerTarget(ctx, agent, mdd, remoteMcpServer, toolServer, agentHeaders, proxyURL) case schema.GroupKind{ Group: "", @@ -879,7 +924,7 @@ func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent * proxyURL = a.globalProxyURL } - return a.translateRemoteMCPServerTarget(ctx, agent, remoteMcpServer, toolServer, agentHeaders, proxyURL) + return a.translateRemoteMCPServerTarget(ctx, agent, mdd, remoteMcpServer, toolServer, agentHeaders, proxyURL) case schema.GroupKind{ Group: "", Kind: "Service", @@ -902,13 +947,13 @@ func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent * return err } - return a.translateRemoteMCPServerTarget(ctx, agent, remoteMcpServer, toolServer, agentHeaders, proxyURL) + return a.translateRemoteMCPServerTarget(ctx, agent, mdd, remoteMcpServer, toolServer, agentHeaders, proxyURL) default: return fmt.Errorf("unknown tool server type: %s", gvk) } } -func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, remoteMcpServer *v1alpha2.RemoteMCPServer, mcpServerTool *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) error { +func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, mdd *modelDeploymentData, remoteMcpServer *v1alpha2.RemoteMCPServer, mcpServerTool *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) error { switch remoteMcpServer.Spec.Protocol { case v1alpha2.RemoteMCPServerProtocolSse: tool, err := a.translateSseHttpTool(ctx, remoteMcpServer, agentHeaders, proxyURL) @@ -933,6 +978,12 @@ func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, a RequireApproval: mcpServerTool.RequireApproval, }) } + // Mount the CA Secret on the agent pod when the RemoteMCPServer pins a TLS bundle. + // Converters that synthesize RMSs from in-cluster MCPServer/Service + // references don't set Spec.TLS, so this is a no-op for those. + if mdd != nil { + addTLSConfiguration(mdd, remoteMcpServer.Spec.TLS) + } return nil } @@ -1042,9 +1093,13 @@ func mergeDeploymentData(dst, src *modelDeploymentData) { } } for _, sm := range src.VolumeMounts { + // Dedupe by (Name, MountPath). MountPath-only dedupe would silently + // drop a mount from a different Volume that happened to choose the + // same path; matching on both lets kubelet surface the conflict + // loudly instead. found := false for _, m := range dst.VolumeMounts { - if m.MountPath == sm.MountPath { + if m.Name == sm.Name && m.MountPath == sm.MountPath { found = true break } diff --git a/go/core/internal/controller/translator/agent/compiler.go b/go/core/internal/controller/translator/agent/compiler.go index 0232a859e6..801297ad15 100644 --- a/go/core/internal/controller/translator/agent/compiler.go +++ b/go/core/internal/controller/translator/agent/compiler.go @@ -252,7 +252,7 @@ func (a *adkApiTranslator) translateInlineAgent(ctx context.Context, agent v1alp switch { case tool.McpServer != nil: - err := a.translateMCPServerTarget(ctx, cfg, agent.GetNamespace(), tool.McpServer, headers, a.globalProxyURL) + err := a.translateMCPServerTarget(ctx, cfg, mdd, agent.GetNamespace(), tool.McpServer, headers, a.globalProxyURL) if err != nil { return nil, nil, nil, err } diff --git a/go/core/internal/controller/translator/agent/manifest_builder.go b/go/core/internal/controller/translator/agent/manifest_builder.go index 3b059bbef5..704cc8fe04 100644 --- a/go/core/internal/controller/translator/agent/manifest_builder.go +++ b/go/core/internal/controller/translator/agent/manifest_builder.go @@ -28,10 +28,9 @@ type manifestContext struct { } type configSecretInputs struct { - secret *corev1.Secret - configHash uint64 - volumes []corev1.Volume - mounts []corev1.VolumeMount + secret *corev1.Secret + volumes []corev1.Volume + mounts []corev1.VolumeMount } type podRuntimeInputs struct { @@ -57,7 +56,7 @@ func (a *adkApiTranslator) BuildManifest( outputs := &AgentOutputs{} manifestCtx := newManifestContext(agent, inputs.Deployment) - configSecret, err := a.buildConfigSecret(manifestCtx, inputs.Config, inputs.Sandbox, inputs.AgentCard, inputs.SecretHashBytes) + configSecret, err := a.buildConfigSecret(manifestCtx, inputs.Config, inputs.Sandbox, inputs.AgentCard) if err != nil { return nil, err } @@ -72,7 +71,17 @@ func (a *adkApiTranslator) BuildManifest( return nil, err } - podTemplate := buildPodTemplate(manifestCtx, podRuntime, configSecret.configHash) + // Build the pod template with a placeholder config-hash. The real + // hash is computed and stamped after plugins run (see below): a + // plugin that mutates the config Secret needs the hash to reflect + // the mutation so pods roll on the next reconcile. buildPodTemplate + // initializes the Annotations map by reference; the post-plugin + // stamp writes into that same map, which workloadObjects share via + // Go's map-reference semantics — so the stamp propagates to + // whichever workload type buildWorkloadObjects produced (Deployment + // in normal mode, Sandbox in sandbox mode) without the translator + // needing to know about the workload's concrete type. + podTemplate := buildPodTemplate(manifestCtx, podRuntime, 0) workloadObjects, err := a.buildWorkloadObjects(ctx, manifestCtx, podTemplate) if err != nil { @@ -89,7 +98,18 @@ func (a *adkApiTranslator) BuildManifest( outputs.AgentCard = *inputs.AgentCard } - return outputs, a.runPlugins(ctx, agent, outputs) + if err := a.runPlugins(ctx, agent, outputs); err != nil { + return outputs, err + } + + // Stamp the post-plugin config-hash. The mutation propagates to + // every workload object that embedded podTemplate, since their + // pod-template Annotations map is the same reference as + // podTemplate.Annotations (see buildPodTemplate). + configHash := computeHashFromConfigSecret(configSecret.secret, inputs.SecretHashBytes) + podTemplate.Annotations["kagent.dev/config-hash"] = fmt.Sprintf("%d", configHash) + + return outputs, nil } func newManifestContext(agent v1alpha2.AgentObject, dep *resolvedDeployment) manifestContext { @@ -129,12 +149,10 @@ func (a *adkApiTranslator) buildConfigSecret( cfg *adk.AgentConfig, sandboxCfg *v1alpha2.SandboxConfig, card *server.AgentCard, - modelConfigSecretHashBytes []byte, ) (*configSecretInputs, error) { cfgJSON := "" agentCard := "" srtSettingsJSON := "" - var configHash uint64 var volumes []corev1.Volume var mounts []corev1.VolumeMount @@ -161,14 +179,6 @@ func (a *adkApiTranslator) buildConfigSecret( } if cfg != nil || srtSettingsJSON != "" { - secretData := modelConfigSecretHashBytes - if secretData == nil { - secretData = []byte{} - } - hashData := make([]byte, 0, len(secretData)+len(srtSettingsJSON)) - hashData = append(hashData, secretData...) - hashData = append(hashData, srtSettingsJSON...) - configHash = computeConfigHash([]byte(cfgJSON), []byte(agentCard), hashData) volumes = []corev1.Volume{{ Name: "config", VolumeSource: corev1.VolumeSource{ @@ -184,12 +194,49 @@ func (a *adkApiTranslator) buildConfigSecret( ObjectMeta: manifestCtx.objectMeta(), StringData: buildConfigSecretData(cfgJSON, agentCard, srtSettingsJSON), }, - configHash: configHash, - volumes: volumes, - mounts: mounts, + volumes: volumes, + mounts: mounts, }, nil } +// computeHashFromConfigSecret derives the kagent.dev/config-hash +// annotation value from the agent's config Secret. Called after plugins +// have had a chance to mutate the Secret's contents — so the hash +// reflects what the agent pod will actually load at startup, and +// plugin-driven mutations naturally trigger rollouts on the next +// reconcile. +// +// modelConfigSecretHashBytes is the controller-resolved hash over +// upstream credentials (API key, etc.) that aren't stored in the Secret +// itself but still need to participate in the rollout signal. It's not +// touched by plugins; passing it through preserves the pre/post-plugin +// hash equivalence in the no-mutation case so existing agents don't +// roll on this change alone. +// +// Returns 0 in the no-config case (matches the original gate in +// buildConfigSecret) so pods that have no config to roll on stay +// stable. +func computeHashFromConfigSecret(secret *corev1.Secret, modelConfigSecretHashBytes []byte) uint64 { + if secret == nil { + return 0 + } + cfgJSON := secret.StringData["config.json"] + agentCard := secret.StringData["agent-card.json"] + srtSettings := secret.StringData["srt-settings.json"] + if cfgJSON == "" && srtSettings == "" { + return 0 + } + + secretData := modelConfigSecretHashBytes + if secretData == nil { + secretData = []byte{} + } + hashData := make([]byte, 0, len(secretData)+len(srtSettings)) + hashData = append(hashData, secretData...) + hashData = append(hashData, srtSettings...) + return computeConfigHash([]byte(cfgJSON), []byte(agentCard), hashData) +} + func buildConfigSecretData(cfgJSON, agentCard, srtSettingsJSON string) map[string]string { data := map[string]string{ "config.json": cfgJSON, diff --git a/go/core/internal/controller/translator/agent/manifest_builder_test.go b/go/core/internal/controller/translator/agent/manifest_builder_test.go index 8c01a2af58..6f5008f7dc 100644 --- a/go/core/internal/controller/translator/agent/manifest_builder_test.go +++ b/go/core/internal/controller/translator/agent/manifest_builder_test.go @@ -1,11 +1,26 @@ package agent import ( + "context" "encoding/json" + "fmt" + "strings" "testing" + "github.com/kagent-dev/kagent/go/api/adk" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/agentsxk8s" + pkgtranslator "github.com/kagent-dev/kagent/go/core/pkg/translator" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + schemev1 "k8s.io/client-go/kubernetes/scheme" + agentsandboxv1 "sigs.k8s.io/agent-sandbox/api/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestBuildSRTSettingsJSON_DefaultDenyConfig(t *testing.T) { @@ -117,3 +132,325 @@ func TestBuildConfigSecretData_IncludesSRTSettingsWhenPresent(t *testing.T) { t.Fatal("srt-settings.json should be present when non-empty") } } + +// TestComputeHashFromConfigSecret_NilSecretReturnsZero pins the +// fail-safe shape of the helper: when there's no Secret to read, we +// don't fabricate a hash. +func TestComputeHashFromConfigSecret_NilSecretReturnsZero(t *testing.T) { + if got := computeHashFromConfigSecret(nil, nil); got != 0 { + t.Fatalf("nil secret should hash to 0, got %d", got) + } +} + +// TestComputeHashFromConfigSecret_EmptyConfigReturnsZero preserves the +// pre-PR no-config behavior. Agents without compiled config produce a +// zero hash so pods that have nothing to roll on stay stable. +func TestComputeHashFromConfigSecret_EmptyConfigReturnsZero(t *testing.T) { + secret := &corev1.Secret{ + StringData: map[string]string{ + "config.json": "", + "agent-card.json": "", + }, + } + if got := computeHashFromConfigSecret(secret, nil); got != 0 { + t.Fatalf("empty config secret should hash to 0, got %d", got) + } +} + +// TestComputeHashFromConfigSecret_DifferentContentDifferentHash is the +// load-bearing assertion for the plugin-restamp fix: if a plugin mutates +// the Secret's config.json, the recomputed hash must differ from the +// pre-mutation hash so pods actually roll. +func TestComputeHashFromConfigSecret_DifferentContentDifferentHash(t *testing.T) { + pre := &corev1.Secret{StringData: map[string]string{ + "config.json": `{"url":"https://upstream/mcp"}`, + "agent-card.json": `{"name":"a"}`, + }} + post := &corev1.Secret{StringData: map[string]string{ + "config.json": `{"url":"http://upstream:443/mcp"}`, + "agent-card.json": `{"name":"a"}`, + }} + + preHash := computeHashFromConfigSecret(pre, []byte("model-hash")) + postHash := computeHashFromConfigSecret(post, []byte("model-hash")) + + assert.NotEqual(t, preHash, postHash, "mutating config.json must change the hash") +} + +// TestComputeHashFromConfigSecret_SameContentSameHash pins determinism: +// the recompute path must produce identical hashes for identical inputs +// so the no-plugin case doesn't cause spurious pod rollouts on +// controller restart or version upgrade. +func TestComputeHashFromConfigSecret_SameContentSameHash(t *testing.T) { + secret := &corev1.Secret{StringData: map[string]string{ + "config.json": `{"url":"https://upstream/mcp"}`, + "agent-card.json": `{"name":"a"}`, + }} + a := computeHashFromConfigSecret(secret, []byte("model-hash")) + b := computeHashFromConfigSecret(secret, []byte("model-hash")) + assert.Equal(t, a, b) +} + +// configSecretMutatingPlugin walks outputs.Manifest, finds the agent +// config Secret by name, parses config.json, mutates it, and +// re-marshals. Used to verify that BuildManifest re-stamps the +// kagent.dev/config-hash annotation after plugins so the workload +// rolls on the next reconcile. +type configSecretMutatingPlugin struct { + agentName string + mutate func(cfg *adk.AgentConfig) +} + +func (p *configSecretMutatingPlugin) ProcessAgent(_ context.Context, _ v1alpha2.AgentObject, outputs *pkgtranslator.AgentOutputs) error { + for _, obj := range outputs.Manifest { + secret, ok := obj.(*corev1.Secret) + if !ok || secret.GetName() != p.agentName { + continue + } + raw := secret.StringData["config.json"] + if raw == "" { + return nil + } + var cfg adk.AgentConfig + if err := json.Unmarshal([]byte(raw), &cfg); err != nil { + return fmt.Errorf("unmarshal config.json: %w", err) + } + p.mutate(&cfg) + out, err := json.Marshal(&cfg) + if err != nil { + return fmt.Errorf("marshal config.json: %w", err) + } + secret.StringData["config.json"] = string(out) + return nil + } + return nil +} + +func (p *configSecretMutatingPlugin) GetOwnedResourceTypes() []client.Object { return nil } + +// TestBuildManifest_PluginMutatesSecret_HashReflectsPostPluginContent +// is the integration regression for the OSS gap that forced operators +// to `kubectl rollout restart` after toggling enterprise plugins. +// +// Pre-fix: buildPodTemplate stamped the kagent.dev/config-hash +// annotation from the pre-plugin AgentConfig, then plugins ran and +// mutated the Secret's config.json. The annotation never reflected the +// mutation, so the Deployment's pod template hash stayed stable and +// pods didn't roll. Operators had to restart agents manually. +// +// Post-fix: BuildManifest builds the workload with a placeholder hash, +// runs plugins, then stamps the real hash computed from the post-plugin +// Secret content. The annotation on the workload (via shared Go map +// reference) flips, the next reconcile sees the new hash, and pods roll +// automatically. +func TestBuildManifest_PluginMutatesSecret_HashReflectsPostPluginContent(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "hash-test"}} + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "hash-test"}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + }, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "hash-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "System", + ModelConfig: "model", + }, + }, + } + + // Capture the unmodified pre-plugin hash by running once without a + // plugin installed. + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ns, modelConfig, agent).Build() + defaultModel := types.NamespacedName{Namespace: "hash-test", Name: "model"} + transNoPlugin := NewAdkApiTranslator(kube, defaultModel, nil, "", nil) + noPluginOutputs, err := TranslateAgent(context.Background(), transNoPlugin, agent) + require.NoError(t, err) + preHash := configHashFromDeployment(t, noPluginOutputs) + + // Now run the same compile through a plugin that rewrites + // config.json's instruction field. The annotation MUST differ. + mutator := &configSecretMutatingPlugin{ + agentName: "agent", + mutate: func(cfg *adk.AgentConfig) { + cfg.Instruction = "MUTATED BY PLUGIN" + }, + } + transWithPlugin := NewAdkApiTranslator(kube, defaultModel, []TranslatorPlugin{mutator}, "", nil) + pluginOutputs, err := TranslateAgent(context.Background(), transWithPlugin, agent) + require.NoError(t, err) + postHash := configHashFromDeployment(t, pluginOutputs) + + assert.NotEqual(t, preHash, postHash, + "kagent.dev/config-hash on the workload must change when a plugin mutates the config Secret") + + // Also verify the Secret itself carries the mutation (sanity-check + // the plugin actually ran). + mutatedSecret := configSecretFromManifest(t, pluginOutputs, "agent") + assert.Contains(t, mutatedSecret.StringData["config.json"], "MUTATED BY PLUGIN") +} + +// TestBuildManifest_NoPlugin_HashMatchesGoldenBehavior is the +// no-spurious-rollout guard. When no plugin runs, the post-plugin +// recompute must produce the same hash that the pre-PR code would have +// stamped — otherwise upgrading the controller would roll every agent +// pod for no functional reason. +// +// The golden testdata under testdata/outputs/ already pins specific +// hash values for many agent shapes; this is the dedicated regression +// test that names the invariant. +func TestBuildManifest_NoPlugin_HashMatchesGoldenBehavior(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "no-plugin"}} + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "no-plugin"}, + Spec: v1alpha2.ModelConfigSpec{Model: "gpt-4o", Provider: v1alpha2.ModelProviderOpenAI}, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "no-plugin"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "System", + ModelConfig: "model", + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ns, modelConfig, agent).Build() + trans := NewAdkApiTranslator(kube, types.NamespacedName{Namespace: "no-plugin", Name: "model"}, nil, "", nil) + + first, err := TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + second, err := TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + + assert.Equal(t, configHashFromDeployment(t, first), configHashFromDeployment(t, second), + "two translations of the same agent must produce identical hashes (deterministic, no rollout drift)") + assert.NotEqual(t, "0", configHashFromDeployment(t, first), + "a non-empty config should produce a non-zero hash") +} + +// configHashFromDeployment extracts the kagent.dev/config-hash +// annotation value from the Deployment's pod template in an outputs +// bundle. Test helper. +func configHashFromDeployment(t *testing.T, outputs *pkgtranslator.AgentOutputs) string { + t.Helper() + for _, obj := range outputs.Manifest { + if dep, ok := obj.(*appsv1.Deployment); ok { + return dep.Spec.Template.Annotations["kagent.dev/config-hash"] + } + } + t.Fatal("Deployment not found in outputs.Manifest") + return "" +} + +// configSecretFromManifest finds the agent's config Secret in an +// outputs bundle. Test helper. +func configSecretFromManifest(t *testing.T, outputs *pkgtranslator.AgentOutputs, name string) *corev1.Secret { + t.Helper() + for _, obj := range outputs.Manifest { + if s, ok := obj.(*corev1.Secret); ok && strings.Contains(s.GetName(), name) { + return s + } + } + t.Fatal("config Secret not found in outputs.Manifest") + return nil +} + +// configHashFromSandbox extracts the kagent.dev/config-hash annotation +// from the Sandbox CR's pod template. Sandbox-mode workloads carry the +// same annotation as Deployment-mode workloads, just nested through +// Sandbox.Spec.PodTemplate.ObjectMeta (agent-sandbox uses its own +// PodMetadata type, not corev1.ObjectMeta). Test helper. +func configHashFromSandbox(t *testing.T, outputs *pkgtranslator.AgentOutputs) string { + t.Helper() + for _, obj := range outputs.Manifest { + if sb, ok := obj.(*agentsandboxv1.Sandbox); ok { + return sb.Spec.PodTemplate.ObjectMeta.Annotations["kagent.dev/config-hash"] + } + } + t.Fatal("Sandbox not found in outputs.Manifest") + return "" +} + +// TestBuildManifest_SandboxMode_PluginMutatesSecret_HashReflectsPostPluginContent +// is the sandbox-mode sibling of the Deployment-mode regression test. +// +// The post-plugin hash stamp relies on Go map-reference sharing: the +// Annotations map on the local podTemplate variable, on +// Deployment.Spec.Template.Annotations (normal mode), and on +// Sandbox.Spec.PodTemplate.Annotations (sandbox mode) must all point at +// the same underlying map. If a future sandbox backend "defensively" +// deep-copies that map via maps.Clone, the post-plugin mutation silently +// stops propagating to the Sandbox CR — sandbox-mode agents stop rolling +// on plugin-driven config changes. +// +// This test specifically pins the sandbox-path invariant. The +// agentsxk8s.New() backend at agentsxk8s.go:50 currently does +// `Annotations: in.PodTemplate.Annotations` (reference assignment, not +// clone) and a regression to clone here breaks this assertion. +func TestBuildManifest_SandboxMode_PluginMutatesSecret_HashReflectsPostPluginContent(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + require.NoError(t, agentsandboxv1.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "sb-hash-test"}} + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "sb-hash-test"}, + Spec: v1alpha2.ModelConfigSpec{Model: "gpt-4o", Provider: v1alpha2.ModelProviderOpenAI}, + } + // SandboxAgent (not Agent) drives the sandbox path through + // buildWorkloadObjects → sandboxBackend.BuildSandbox. + sb := &v1alpha2.SandboxAgent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "sb-hash-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "System", + ModelConfig: "model", + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ns, modelConfig).Build() + defaultModel := types.NamespacedName{Namespace: "sb-hash-test", Name: "model"} + + // No-plugin baseline. + noPluginTrans := NewAdkApiTranslator(kube, defaultModel, nil, "", agentsxk8s.New()) + noPluginOutputs, err := TranslateAgent(context.Background(), noPluginTrans, sb) + require.NoError(t, err) + preHash := configHashFromSandbox(t, noPluginOutputs) + assert.NotEqual(t, "0", preHash, "sandbox-mode workload should have a non-placeholder hash post-stamp") + + // With a plugin that mutates the Secret, the Sandbox CR's hash + // MUST differ. Failure here means the map-sharing invariant broke + // on the sandbox path (typically a future agentsxk8s change that + // clones the annotations map). + mutator := &configSecretMutatingPlugin{ + agentName: "agent", + mutate: func(cfg *adk.AgentConfig) { + cfg.Instruction = "MUTATED BY PLUGIN" + }, + } + pluginTrans := NewAdkApiTranslator(kube, defaultModel, []TranslatorPlugin{mutator}, "", agentsxk8s.New()) + pluginOutputs, err := TranslateAgent(context.Background(), pluginTrans, sb) + require.NoError(t, err) + postHash := configHashFromSandbox(t, pluginOutputs) + + assert.NotEqual(t, preHash, postHash, + "sandbox CR's config-hash annotation must change when a plugin mutates the config Secret — "+ + "failure typically indicates the sandbox backend started deep-copying the pod template annotations map, "+ + "which silently breaks plugin-driven rollouts") +} diff --git a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go new file mode 100644 index 0000000000..29347ad997 --- /dev/null +++ b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go @@ -0,0 +1,524 @@ +package agent_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + translator "github.com/kagent-dev/kagent/go/core/internal/controller/translator/agent" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + schemev1 "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +// findDeployment extracts the agent Deployment from a translator output bundle. +// Translator output is a heterogeneous slice of K8s objects; the agent pod +// spec lives on the Deployment. +func findDeployment(t *testing.T, outputs *translator.AgentOutputs) *appsv1.Deployment { + t.Helper() + for _, obj := range outputs.Manifest { + if d, ok := obj.(*appsv1.Deployment); ok { + return d + } + } + t.Fatal("Deployment not found in manifest") + return nil +} + +// hasVolumeForSecret returns the volume in the deployment that mounts the +// named Secret, or nil if no such volume is present. Used to assert TLS CA +// Secrets land on the agent pod when RMS.Spec.TLS or ModelConfig.Spec.TLS +// reference them. +func hasVolumeForSecret(dep *appsv1.Deployment, secretName string) *corev1.Volume { + for i, v := range dep.Spec.Template.Spec.Volumes { + if v.Secret != nil && v.Secret.SecretName == secretName { + return &dep.Spec.Template.Spec.Volumes[i] + } + } + return nil +} + +// Test_AdkApiTranslator_RMSTLS_DisableVerify exercises the simplest TLS path +// on a RemoteMCPServer: spec.tls.disableVerify=true with no CA Secret. The +// agent's StreamableHTTP connection params should carry the disable-verify +// flag, and no CA volume should be mounted (nothing to mount). +func Test_AdkApiTranslator_RMSTLS_DisableVerify(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tls-test"}} + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "tls-test"}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + }, + } + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "self-signed-upstream", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "Self-signed test fixture", + URL: "https://upstream.example.com/mcp", + TLS: &v1alpha2.TLSConfig{ + DisableVerify: true, + }, + }, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "tls-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "You are an agent", + ModelConfig: "model", + Tools: []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", + ApiGroup: "kagent.dev", + Name: "self-signed-upstream", + }, + }, + }}, + }, + }, + } + + kubeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ns, modelConfig, rms, agent). + Build() + + trans := translator.NewAdkApiTranslator( + kubeClient, + types.NamespacedName{Namespace: "tls-test", Name: "model"}, + nil, "", nil, + ) + outputs, err := translator.TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + require.NotNil(t, outputs.Config) + + require.Len(t, outputs.Config.HttpTools, 1, "Expected one Streamable HTTP tool") + params := outputs.Config.HttpTools[0].Params + require.NotNil(t, params.TLSInsecureSkipVerify, "TLSInsecureSkipVerify should be set when spec.tls is present") + assert.True(t, *params.TLSInsecureSkipVerify, "disableVerify should propagate") + assert.Nil(t, params.TLSCACertPath, "No CA path expected without CACertSecretRef") + + // No CA Secret → no extra TLS volume on the deployment. + dep := findDeployment(t, outputs) + for _, v := range dep.Spec.Template.Spec.Volumes { + if v.Secret != nil { + assert.NotContains(t, v.Name, "tls-ca-", "No TLS volume expected when no CACertSecretRef is set") + } + } +} + +// Test_AdkApiTranslator_RMSTLS_CustomCA exercises the production path: an +// RemoteMCPServer pinning a CA Secret produces (a) TLSCACertPath on the wire config so +// the runtime initializes its trust store from disk, and (b) a per-Secret +// read-only volume + mount on the agent pod so the file is actually present. +func Test_AdkApiTranslator_RMSTLS_CustomCA(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tls-test"}} + caSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-corp-ca", Namespace: "tls-test"}, + Data: map[string][]byte{"ca.crt": []byte("FAKE CA PEM")}, + } + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "tls-test"}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + }, + } + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-upstream", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "Corporate-CA upstream", + URL: "https://mcp.corp.internal/mcp", + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: "rms-corp-ca", + CACertSecretKey: "ca.crt", + }, + }, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "tls-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "You are an agent", + ModelConfig: "model", + Tools: []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", + ApiGroup: "kagent.dev", + Name: "corp-upstream", + }, + }, + }}, + }, + }, + } + + kubeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ns, caSecret, modelConfig, rms, agent). + Build() + + trans := translator.NewAdkApiTranslator( + kubeClient, + types.NamespacedName{Namespace: "tls-test", Name: "model"}, + nil, "", nil, + ) + outputs, err := translator.TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + + require.Len(t, outputs.Config.HttpTools, 1) + params := outputs.Config.HttpTools[0].Params + require.NotNil(t, params.TLSCACertPath, "TLSCACertPath should be set when CA Secret is referenced") + assert.Contains(t, *params.TLSCACertPath, "rms-corp-ca/ca.crt", + "Cert path should embed the Secret name (operator-readable, per-Secret bucket)") + + dep := findDeployment(t, outputs) + v := hasVolumeForSecret(dep, "rms-corp-ca") + require.NotNil(t, v, "Deployment should mount the CA Secret as a volume") + require.NotNil(t, v.Secret.DefaultMode) + assert.Equal(t, int32(0444), *v.Secret.DefaultMode, "CA Secret volume should be read-only 0444") + + // Mount path on the agent container should match the cert path's + // directory and be marked read-only. + var mountFound bool + for _, m := range dep.Spec.Template.Spec.Containers[0].VolumeMounts { + if m.Name == v.Name { + assert.True(t, m.ReadOnly, "CA mount should be read-only") + assert.Contains(t, m.MountPath, "rms-corp-ca") + mountFound = true + } + } + assert.True(t, mountFound, "Agent container should mount the CA volume") +} + +// Test_AdkApiTranslator_RMSTLS_SharedSecretAcrossRMSs guards a regression +// where two RemoteMCPServers in the same agent reference the same CA +// Secret. translateRemoteMCPServerTarget calls addTLSConfiguration once +// per RemoteMCPServer directly on the already-merged modelDeploymentData, so without +// idempotency we'd produce two Volume entries with the same Name and the +// API server would reject the Deployment with `Pod.spec.volumes[N].name: +// Duplicate value`. The "all our internal MCP services share one +// corporate-CA bundle" topology is the realistic shape that triggers it. +func Test_AdkApiTranslator_RMSTLS_SharedSecretAcrossRMSs(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tls-test"}} + caSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-ca", Namespace: "tls-test"}, + Data: map[string][]byte{"ca.crt": []byte("FAKE CA PEM")}, + } + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "tls-test"}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + }, + } + sharedTLS := &v1alpha2.TLSConfig{ + CACertSecretRef: "corp-ca", + CACertSecretKey: "ca.crt", + } + rmsA := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-a", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "RMS A behind corp CA", + URL: "https://a.corp.internal/mcp", + TLS: sharedTLS, + }, + } + rmsB := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-b", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "RMS B behind corp CA", + URL: "https://b.corp.internal/mcp", + TLS: sharedTLS, + }, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "tls-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "You are an agent", + ModelConfig: "model", + Tools: []*v1alpha2.Tool{ + { + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", ApiGroup: "kagent.dev", Name: "rms-a", + }, + }, + }, + { + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", ApiGroup: "kagent.dev", Name: "rms-b", + }, + }, + }, + }, + }, + }, + } + + kubeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ns, caSecret, modelConfig, rmsA, rmsB, agent). + Build() + + trans := translator.NewAdkApiTranslator( + kubeClient, + types.NamespacedName{Namespace: "tls-test", Name: "model"}, + nil, "", nil, + ) + outputs, err := translator.TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + + dep := findDeployment(t, outputs) + + // Count Volumes that mount the shared Secret. Must be exactly one; + // duplicates would fail kubelet admission. + matchingVolumes := 0 + for _, v := range dep.Spec.Template.Spec.Volumes { + if v.Secret != nil && v.Secret.SecretName == "corp-ca" { + matchingVolumes++ + } + } + assert.Equal(t, 1, matchingVolumes, "Same Secret referenced from multiple RMSs must produce exactly one Volume") + + // Both RMSs' connection params should still point at the same cert + // file — the mount is shared but every transport reads from the + // same on-disk path. + require.Len(t, outputs.Config.HttpTools, 2) + pathA := outputs.Config.HttpTools[0].Params.TLSCACertPath + pathB := outputs.Config.HttpTools[1].Params.TLSCACertPath + require.NotNil(t, pathA) + require.NotNil(t, pathB) + assert.Equal(t, *pathA, *pathB, "Both RMSs sharing a Secret should resolve to the same cert path") +} + +// Test_AdkApiTranslator_RMSTLS_EmptyTLSStructIsNoOp guards against the +// trap where a non-nil but all-zero `Spec.TLS: {}` produces wire fields +// (`tls_insecure_skip_verify: false`, `tls_disable_system_cas: false`) +// that would flip the Python runtime out of its no-op short-circuit and +// silently swap google-adk's default httpx client factory for kagent's. +// Both factories produce the same SSL behavior here but differ on +// timeout / redirect defaults; the principle of least surprise says +// `{}` should be indistinguishable from `nil`. +func Test_AdkApiTranslator_RMSTLS_EmptyTLSStructIsNoOp(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tls-test"}} + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "tls-test"}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + }, + } + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "upstream", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "Upstream with empty TLS struct", + URL: "https://upstream.example.com/mcp", + TLS: &v1alpha2.TLSConfig{}, + }, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "tls-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "You are an agent", + ModelConfig: "model", + Tools: []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", ApiGroup: "kagent.dev", Name: "upstream", + }, + }, + }}, + }, + }, + } + + kubeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ns, modelConfig, rms, agent). + Build() + + trans := translator.NewAdkApiTranslator( + kubeClient, + types.NamespacedName{Namespace: "tls-test", Name: "model"}, + nil, "", nil, + ) + outputs, err := translator.TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + + require.Len(t, outputs.Config.HttpTools, 1) + params := outputs.Config.HttpTools[0].Params + assert.Nil(t, params.TLSInsecureSkipVerify, + "Empty TLS struct must not emit explicit booleans on the wire") + assert.Nil(t, params.TLSDisableSystemCAs, + "Empty TLS struct must not emit explicit booleans on the wire") + assert.Nil(t, params.TLSCACertPath) +} + +// Test_AdkApiTranslator_RMSTLS_CoexistsWithModelConfigTLS verifies the +// reason the volume-naming rework was necessary: an agent that combines a +// ModelConfig TLS CA with one or more RemoteMCPServer TLS CAs must end up +// with every CA mounted on the pod, not just one. Before per-Secret naming +// the merge dedupe would silently drop later contributors because they all +// declared the same volume name. +func Test_AdkApiTranslator_RMSTLS_CoexistsWithModelConfigTLS(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tls-test"}} + modelCASecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "model-ca", Namespace: "tls-test"}, + Data: map[string][]byte{"ca.crt": []byte("FAKE MODEL CA PEM")}, + } + rmsACASecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-a-ca", Namespace: "tls-test"}, + Data: map[string][]byte{"ca.crt": []byte("FAKE RMS A CA PEM")}, + } + rmsBCASecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-b-ca", Namespace: "tls-test"}, + Data: map[string][]byte{"ca.crt": []byte("FAKE RMS B CA PEM")}, + } + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "tls-test"}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: "model-ca", + CACertSecretKey: "ca.crt", + }, + }, + } + rmsA := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-a", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "RMS A", + URL: "https://a.example.com/mcp", + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: "rms-a-ca", + CACertSecretKey: "ca.crt", + }, + }, + } + rmsB := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-b", Namespace: "tls-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "RMS B", + URL: "https://b.example.com/mcp", + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: "rms-b-ca", + CACertSecretKey: "ca.crt", + }, + }, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "tls-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "You are an agent", + ModelConfig: "model", + Tools: []*v1alpha2.Tool{ + { + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", ApiGroup: "kagent.dev", Name: "rms-a", + }, + }, + }, + { + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", ApiGroup: "kagent.dev", Name: "rms-b", + }, + }, + }, + }, + }, + }, + } + + kubeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ns, modelCASecret, rmsACASecret, rmsBCASecret, modelConfig, rmsA, rmsB, agent). + Build() + + trans := translator.NewAdkApiTranslator( + kubeClient, + types.NamespacedName{Namespace: "tls-test", Name: "model"}, + nil, "", nil, + ) + outputs, err := translator.TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + + dep := findDeployment(t, outputs) + + // All three Secrets must be mounted on the agent pod. + for _, secretName := range []string{"model-ca", "rms-a-ca", "rms-b-ca"} { + v := hasVolumeForSecret(dep, secretName) + assert.NotNilf(t, v, "Secret %q should be mounted on the agent pod", secretName) + } + + // The three volumes must have distinct names and distinct mount paths + // so the merge dedupe doesn't silently drop any. Volume name + mount + // path are derived deterministically from the Secret name. + tlsVolumeNames := map[string]struct{}{} + tlsMountPaths := map[string]struct{}{} + for _, v := range dep.Spec.Template.Spec.Volumes { + if v.Secret != nil && v.Secret.SecretName != "" { + switch v.Secret.SecretName { + case "model-ca", "rms-a-ca", "rms-b-ca": + tlsVolumeNames[v.Name] = struct{}{} + } + } + } + for _, m := range dep.Spec.Template.Spec.Containers[0].VolumeMounts { + for n := range tlsVolumeNames { + if m.Name == n { + tlsMountPaths[m.MountPath] = struct{}{} + } + } + } + assert.Len(t, tlsVolumeNames, 3, "Three TLS Secrets must produce three distinct volume names") + assert.Len(t, tlsMountPaths, 3, "Three TLS Secrets must produce three distinct mount paths") +} diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-custom-ca.json b/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-custom-ca.json index 62e996c083..1c1b2a1502 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-custom-ca.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-custom-ca.json @@ -26,7 +26,7 @@ "max_tokens": 1024, "model": "gpt-4o", "temperature": 0.7, - "tls_ca_cert_path": "/etc/ssl/certs/custom/ca.crt", + "tls_ca_cert_path": "/etc/ssl/certs/custom/custom-ca-cert/ca.crt", "tls_disable_system_cas": false, "tls_insecure_skip_verify": false, "type": "openai" @@ -60,7 +60,7 @@ }, "stringData": { "agent-card.json": "{\"name\":\"tls_agent_with_custom_ca\",\"description\":\"\",\"url\":\"http://tls-agent-with-custom-ca.test:8080\",\"version\":\"\",\"capabilities\":{\"streaming\":true,\"pushNotifications\":false,\"stateTransitionHistory\":true},\"defaultInputModes\":[\"text\"],\"defaultOutputModes\":[\"text\"],\"skills\":[],\"preferredTransport\":\"JSONRPC\"}", - "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"tls_insecure_skip_verify\":false,\"tls_ca_cert_path\":\"/etc/ssl/certs/custom/ca.crt\",\"tls_disable_system_cas\":false,\"base_url\":\"https://internal-litellm.company.com\",\"max_tokens\":1024,\"temperature\":0.7},\"description\":\"\",\"instruction\":\"You are a helpful assistant with custom CA support.\",\"stream\":false}" + "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"tls_insecure_skip_verify\":false,\"tls_ca_cert_path\":\"/etc/ssl/certs/custom/custom-ca-cert/ca.crt\",\"tls_disable_system_cas\":false,\"base_url\":\"https://internal-litellm.company.com\",\"max_tokens\":1024,\"temperature\":0.7},\"description\":\"\",\"instruction\":\"You are a helpful assistant with custom CA support.\",\"stream\":false}" } }, { @@ -129,7 +129,7 @@ "template": { "metadata": { "annotations": { - "kagent.dev/config-hash": "7617586729040135348" + "kagent.dev/config-hash": "7742056941063993416" }, "labels": { "app": "kagent", @@ -211,8 +211,8 @@ "name": "config" }, { - "mountPath": "/etc/ssl/certs/custom", - "name": "tls-ca-cert", + "mountPath": "/etc/ssl/certs/custom/custom-ca-cert", + "name": "tls-ca-custom-ca-cert", "readOnly": true }, { @@ -231,7 +231,7 @@ } }, { - "name": "tls-ca-cert", + "name": "tls-ca-custom-ca-cert", "secret": { "defaultMode": 292, "secretName": "custom-ca-cert" diff --git a/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-system-cas-disabled.json b/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-system-cas-disabled.json index 620c29c9f5..331af12705 100644 --- a/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-system-cas-disabled.json +++ b/go/core/internal/controller/translator/agent/testdata/outputs/tls-with-system-cas-disabled.json @@ -26,7 +26,7 @@ "max_tokens": 1024, "model": "gpt-4o", "temperature": 0.7, - "tls_ca_cert_path": "/etc/ssl/certs/custom/ca.crt", + "tls_ca_cert_path": "/etc/ssl/certs/custom/corporate-ca-cert/ca.crt", "tls_disable_system_cas": true, "tls_insecure_skip_verify": false, "type": "openai" @@ -60,7 +60,7 @@ }, "stringData": { "agent-card.json": "{\"name\":\"tls_agent_with_system_cas_disabled\",\"description\":\"\",\"url\":\"http://tls-agent-with-system-cas-disabled.test:8080\",\"version\":\"\",\"capabilities\":{\"streaming\":true,\"pushNotifications\":false,\"stateTransitionHistory\":true},\"defaultInputModes\":[\"text\"],\"defaultOutputModes\":[\"text\"],\"skills\":[],\"preferredTransport\":\"JSONRPC\"}", - "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"tls_insecure_skip_verify\":false,\"tls_ca_cert_path\":\"/etc/ssl/certs/custom/ca.crt\",\"tls_disable_system_cas\":true,\"base_url\":\"https://corp-llm-gateway.internal\",\"max_tokens\":1024,\"temperature\":0.7},\"description\":\"\",\"instruction\":\"You are a helpful assistant in a corporate environment.\",\"stream\":false}" + "config.json": "{\"model\":{\"type\":\"openai\",\"model\":\"gpt-4o\",\"tls_insecure_skip_verify\":false,\"tls_ca_cert_path\":\"/etc/ssl/certs/custom/corporate-ca-cert/ca.crt\",\"tls_disable_system_cas\":true,\"base_url\":\"https://corp-llm-gateway.internal\",\"max_tokens\":1024,\"temperature\":0.7},\"description\":\"\",\"instruction\":\"You are a helpful assistant in a corporate environment.\",\"stream\":false}" } }, { @@ -129,7 +129,7 @@ "template": { "metadata": { "annotations": { - "kagent.dev/config-hash": "1527591075664209989" + "kagent.dev/config-hash": "5192670338632125074" }, "labels": { "app": "kagent", @@ -211,8 +211,8 @@ "name": "config" }, { - "mountPath": "/etc/ssl/certs/custom", - "name": "tls-ca-cert", + "mountPath": "/etc/ssl/certs/custom/corporate-ca-cert", + "name": "tls-ca-corporate-ca-cert", "readOnly": true }, { @@ -231,7 +231,7 @@ } }, { - "name": "tls-ca-cert", + "name": "tls-ca-corporate-ca-cert", "secret": { "defaultMode": 292, "secretName": "corporate-ca-cert" diff --git a/go/core/internal/controller/translator/agent/tls_mounting_test.go b/go/core/internal/controller/translator/agent/tls_mounting_test.go index e203d0167b..2ac688a1ac 100644 --- a/go/core/internal/controller/translator/agent/tls_mounting_test.go +++ b/go/core/internal/controller/translator/agent/tls_mounting_test.go @@ -1,6 +1,7 @@ package agent import ( + "path" "testing" "github.com/kagent-dev/kagent/go/api/v1alpha2" @@ -45,21 +46,19 @@ func Test_addTLSConfiguration_WithCACertSecret(t *testing.T) { addTLSConfiguration(mdd, tlsConfig) - // Verify volume is added - require.Len(t, mdd.Volumes, 1, "Expected 1 volume for TLS cert secret") + wantVolumeName, wantMountPath, _ := tlsCAPaths("internal-ca-cert", "ca.crt") + require.Len(t, mdd.Volumes, 1, "Expected 1 volume for TLS cert secret") volume := mdd.Volumes[0] - assert.Equal(t, tlsCACertVolumeName, volume.Name, "Volume name should match TLS CA cert volume name") + assert.Equal(t, wantVolumeName, volume.Name, "Volume name should be derived from Secret name") require.NotNil(t, volume.Secret, "Expected Secret volume source") assert.Equal(t, "internal-ca-cert", volume.Secret.SecretName, "Secret name should match CACertSecretRef") assert.Equal(t, int32(0444), *volume.Secret.DefaultMode, "DefaultMode should be 0444 for read-only cert") - // Verify volume mount is added require.Len(t, mdd.VolumeMounts, 1, "Expected 1 volume mount for TLS cert") - mount := mdd.VolumeMounts[0] - assert.Equal(t, tlsCACertVolumeName, mount.Name, "Volume mount name should match volume name") - assert.Equal(t, tlsCACertMountPath, mount.MountPath, "Mount path should be TLS cert mount path") + assert.Equal(t, wantVolumeName, mount.Name, "Volume mount name should match volume name") + assert.Equal(t, wantMountPath, mount.MountPath, "Mount path should be derived from Secret name") assert.True(t, mount.ReadOnly, "Volume mount should be read-only") } @@ -88,14 +87,13 @@ func Test_addTLSConfiguration_CustomCertKey(t *testing.T) { addTLSConfiguration(mdd, tlsConfig) - // Verify volume is added - require.Len(t, mdd.Volumes, 1, "Expected 1 volume for TLS cert with custom key") + _, wantMountPath, _ := tlsCAPaths("internal-ca-cert", "custom-ca.pem") - // Verify volume mount is added at the correct path + require.Len(t, mdd.Volumes, 1, "Expected 1 volume for TLS cert with custom key") require.Len(t, mdd.VolumeMounts, 1, "Expected 1 volume mount for TLS cert") mount := mdd.VolumeMounts[0] - assert.Equal(t, tlsCACertMountPath, mount.MountPath, "Mount path should be TLS cert mount path") + assert.Equal(t, wantMountPath, mount.MountPath, "Mount path should be derived from Secret name") } // Test_addTLSConfiguration_DisableSystemCAsFlag verifies no volumes added when no cert secret @@ -142,16 +140,143 @@ func Test_addTLSConfiguration_AllFieldsCombined(t *testing.T) { addTLSConfiguration(mdd, tlsConfig) - // Verify volume and mount + _, wantMountPath, _ := tlsCAPaths("my-ca-bundle", "bundle.crt") + require.Len(t, mdd.Volumes, 1, "Expected 1 volume for combined TLS config") require.Len(t, mdd.VolumeMounts, 1, "Expected 1 volume mount for combined TLS config") - // Verify volume references correct Secret volume := mdd.Volumes[0] require.NotNil(t, volume.Secret, "Expected Secret volume source") assert.Equal(t, "my-ca-bundle", volume.Secret.SecretName, "Secret name should match CACertSecretRef") - // Verify mount path is correct mount := mdd.VolumeMounts[0] - assert.Equal(t, tlsCACertMountPath, mount.MountPath, "Mount path should be TLS cert mount path") + assert.Equal(t, wantMountPath, mount.MountPath, "Mount path should be derived from Secret name") +} + +// Test_addTLSConfiguration_MultipleSecretsDoNotCollide verifies that mounting +// CAs from different Secrets on the same agent produces distinct volumes and +// distinct mount paths so the merge dedupe (mergeDeploymentData) preserves +// both. Without per-Secret naming, the merge would silently drop one because +// it dedupes by volume name and mount path. That matters now that RMSs and +// ModelConfigs can each contribute CA mounts to the same agent pod. +func Test_addTLSConfiguration_MultipleSecretsDoNotCollide(t *testing.T) { + mdd := &modelDeploymentData{} + + addTLSConfiguration(mdd, &v1alpha2.TLSConfig{ + CACertSecretRef: "chat-ca", + CACertSecretKey: "ca.crt", + }) + addTLSConfiguration(mdd, &v1alpha2.TLSConfig{ + CACertSecretRef: "embedding-ca", + CACertSecretKey: "ca.crt", + }) + addTLSConfiguration(mdd, &v1alpha2.TLSConfig{ + CACertSecretRef: "rms-corp-ca", + CACertSecretKey: "ca.crt", + }) + + require.Len(t, mdd.Volumes, 3, "Three distinct Secrets should produce three volumes") + require.Len(t, mdd.VolumeMounts, 3, "Three distinct Secrets should produce three mounts") + + seenNames := map[string]struct{}{} + seenPaths := map[string]struct{}{} + for _, v := range mdd.Volumes { + seenNames[v.Name] = struct{}{} + } + for _, m := range mdd.VolumeMounts { + seenPaths[m.MountPath] = struct{}{} + } + assert.Len(t, seenNames, 3, "Volume names should be distinct per Secret") + assert.Len(t, seenPaths, 3, "Mount paths should be distinct per Secret") +} + +// Test_addTLSConfiguration_SameSecretMergesCleanly verifies that referencing +// the same Secret + key from multiple sources produces identical volume name +// and mount path on each call. The downstream merge dedupes by both, so the +// agent pod ends up with exactly one mount even when several +// ModelConfigs/RMSs share a Secret. +func Test_addTLSConfiguration_SameSecretMergesCleanly(t *testing.T) { + a := &modelDeploymentData{} + b := &modelDeploymentData{} + + addTLSConfiguration(a, &v1alpha2.TLSConfig{ + CACertSecretRef: "shared-ca", + CACertSecretKey: "ca.crt", + }) + addTLSConfiguration(b, &v1alpha2.TLSConfig{ + CACertSecretRef: "shared-ca", + CACertSecretKey: "ca.crt", + }) + + require.Len(t, a.Volumes, 1) + require.Len(t, b.Volumes, 1) + assert.Equal(t, a.Volumes[0].Name, b.Volumes[0].Name, "Same Secret should produce identical volume name") + assert.Equal(t, a.VolumeMounts[0].MountPath, b.VolumeMounts[0].MountPath, "Same Secret should produce identical mount path") +} + +// Test_tlsCAPaths_LongSecretNameHashes verifies that Secret names which would +// overflow the K8s DNS_LABEL 63-char limit on the volume name get hashed. +func Test_tlsCAPaths_LongSecretNameHashes(t *testing.T) { + // 60-char Secret name → "tls-ca-" + 60 = 67 chars > 63. Must hash. + longName := "a-very-long-secret-name-that-just-exceeds-the-volume-limit-x" + require.Len(t, longName, 60) + + volumeName, mountPath, certPath := tlsCAPaths(longName, "ca.crt") + + assert.LessOrEqual(t, len(volumeName), 63, "Volume name must fit DNS_LABEL") + assert.NotContains(t, volumeName, longName, "Long names should be hashed, not embedded literally") + assert.True(t, len(mountPath) > len(tlsCAMountRoot), "Mount path should include hashed component") + assert.Equal(t, certPath, path.Join(mountPath, "ca.crt")) +} + +// Test_tlsCAPaths_ShortSecretNameEmbeds verifies that Secret names that fit +// the volume-name budget appear in the volume name and mount path verbatim, +// for operator readability when inspecting an agent's pod spec. +func Test_tlsCAPaths_ShortSecretNameEmbeds(t *testing.T) { + volumeName, mountPath, certPath := tlsCAPaths("corp-ca", "ca.crt") + + assert.Equal(t, "tls-ca-corp-ca", volumeName) + assert.Equal(t, "/etc/ssl/certs/custom/corp-ca", mountPath) + assert.Equal(t, "/etc/ssl/certs/custom/corp-ca/ca.crt", certPath) +} + +// Test_tlsCAPaths_DottedSecretNameHashes covers the case where the Secret +// name follows DNS_SUBDOMAIN (allowed by K8s for Secret names: dots and +// up to 253 chars) but violates DNS_LABEL (required for volume names: no +// dots, ≤ 63 chars). Common examples: cert-manager output Secrets named +// after the certificate (`mcp.example.com-tls`), or operator-authored +// names that mirror an FQDN. Without charset-aware hashing, the produced +// volume name would embed the dot and the agent Deployment would fail to +// apply with a `spec.template.spec.volumes[N].name: Invalid value` error. +func Test_tlsCAPaths_DottedSecretNameHashes(t *testing.T) { + cases := []string{ + "corp.ca", + "mcp.example.com-tls", + "my.cert.bundle", + } + for _, name := range cases { + t.Run(name, func(t *testing.T) { + volumeName, mountPath, certPath := tlsCAPaths(name, "ca.crt") + assert.NotContains(t, volumeName, ".", + "Volume name must not contain dots (DNS_LABEL) — hash when Secret name does") + assert.Regexp(t, `^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`, volumeName, + "Volume name must satisfy RFC 1123 label") + assert.LessOrEqual(t, len(volumeName), 63, "Volume name must fit DNS_LABEL") + assert.Equal(t, certPath, path.Join(mountPath, "ca.crt")) + }) + } +} + +// Test_tlsCAPaths_Deterministic verifies that two calls with the same input +// produce identical output, including when hashing kicks in. The merge +// dedupe relies on identical volume names + mount paths for the same +// Secret regardless of which call site produced them. +func Test_tlsCAPaths_Deterministic(t *testing.T) { + for _, name := range []string{"short-name", "corp.ca", "a-very-long-secret-name-that-just-exceeds-the-volume-limit-x"} { + v1, m1, c1 := tlsCAPaths(name, "ca.crt") + v2, m2, c2 := tlsCAPaths(name, "ca.crt") + assert.Equal(t, v1, v2, "tlsCAPaths must be deterministic for volume name (%s)", name) + assert.Equal(t, m1, m2, "tlsCAPaths must be deterministic for mount path (%s)", name) + assert.Equal(t, c1, c2, "tlsCAPaths must be deterministic for cert path (%s)", name) + } } diff --git a/go/core/internal/httpserver/handlers/companion_secrets.go b/go/core/internal/httpserver/handlers/companion_secrets.go new file mode 100644 index 0000000000..3f3a00b525 --- /dev/null +++ b/go/core/internal/httpserver/handlers/companion_secrets.go @@ -0,0 +1,159 @@ +package handlers + +import ( + "context" + stderrors "errors" + "fmt" + "maps" + "strings" + + api "github.com/kagent-dev/kagent/go/api/httpapi" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// errInvalidCompanionSecret tags errors that should surface as a 400 Bad +// Request to the API caller (e.g. an existing Secret has the wrong type +// or is owned by a different resource). +var errInvalidCompanionSecret = stderrors.New("invalid companion secret") + +// companionSecretAPIError translates a companion-Secret error into the +// appropriate HTTP API error. errInvalidCompanionSecret indicates +// caller-fixable conditions and surfaces as 400; anything else is an +// internal failure (kube API error, marshal failure, etc.). +func companionSecretAPIError(err error) *errors.APIError { + if stderrors.Is(err, errInvalidCompanionSecret) { + return errors.NewBadRequestError(err.Error(), err) + } + return errors.NewInternalServerError("Failed to create or update companion secrets", err) +} + +// validateSecretMaterials checks each SecretMaterial's name and key +// against Kubernetes naming rules. Returns a single error for the first +// invalid material so the caller can return a 400 with a precise reason. +func validateSecretMaterials(secrets []api.SecretMaterial) error { + for _, secret := range secrets { + if errs := validation.IsDNS1123Subdomain(secret.Name); len(errs) > 0 { + return fmt.Errorf("invalid secret name %q: %s", secret.Name, strings.Join(errs, "; ")) + } + if errs := validation.IsConfigMapKey(secret.Key); len(errs) > 0 { + return fmt.Errorf("invalid key %q for secret %q: %s", secret.Key, secret.Name, strings.Join(errs, "; ")) + } + } + return nil +} + +// createOrUpdateCompanionSecrets writes each SecretMaterial as an Opaque +// Secret in the owner's namespace, with an OwnerReference back to the +// owner so K8s garbage collection cleans them up when the parent is +// deleted. Materials grouped by `name` accumulate keys into a single +// Secret object. +// +// When a Secret with the same name already exists, the helper merges +// the new keys into the existing Data. Two safety checks: +// - the existing Secret's type must be Opaque (mismatched types are a +// 400-class error because the caller's payload is incompatible). +// - the existing Secret must already carry an OwnerReference back to +// this owner (a Secret managed by someone else is not safe to +// mutate from a different parent's create/update path). +// +// Caller is responsible for ensuring the owner has been written to the +// API server first (so .GetUID() is populated) — the OwnerReference +// uses the owner's live UID. +func createOrUpdateCompanionSecrets( + ctx context.Context, + kubeClient client.Client, + owner client.Object, + gvk schema.GroupVersionKind, + secrets []api.SecretMaterial, +) error { + // Group secrets by name and key. + secretsByName := map[string]map[string][]byte{} + for _, secret := range secrets { + if _, ok := secretsByName[secret.Name]; !ok { + secretsByName[secret.Name] = map[string][]byte{} + } + secretsByName[secret.Name][secret.Key] = []byte(secret.Value) + } + + namespace := owner.GetNamespace() + for name, data := range secretsByName { + existingSecret := &corev1.Secret{} + err := kubeClient.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, existingSecret) + if err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to get companion secret %s/%s: %w", namespace, name, err) + } + + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + OwnerReferences: []metav1.OwnerReference{ownerReferenceFor(owner, gvk)}, + }, + Type: corev1.SecretTypeOpaque, + Data: data, + } + if err := kubeClient.Create(ctx, secret); err != nil { + return fmt.Errorf("failed to create companion secret %s/%s: %w", namespace, name, err) + } + continue + } + + if existingSecret.Type != corev1.SecretTypeOpaque { + return fmt.Errorf("%w: companion secret %s/%s must be type %q, got %q", errInvalidCompanionSecret, namespace, name, corev1.SecretTypeOpaque, existingSecret.Type) + } + if !isOwnedBy(existingSecret, owner, gvk) { + return fmt.Errorf("%w: companion secret %s/%s is not managed by %s %s/%s", errInvalidCompanionSecret, namespace, name, gvk.Kind, owner.GetNamespace(), owner.GetName()) + } + + if existingSecret.Data == nil { + existingSecret.Data = map[string][]byte{} + } + maps.Copy(existingSecret.Data, data) + if err := kubeClient.Update(ctx, existingSecret); err != nil { + return fmt.Errorf("failed to update companion secret %s/%s: %w", namespace, name, err) + } + } + + return nil +} + +// ownerReferenceFor returns an OwnerReference pointing at the given +// owner. GVK is taken explicitly rather than from owner's TypeMeta +// because objects roundtripped through client.Get often have empty +// TypeMeta; the caller knows the kind at the call site. +func ownerReferenceFor(owner client.Object, gvk schema.GroupVersionKind) metav1.OwnerReference { + controller := true + return metav1.OwnerReference{ + APIVersion: gvk.GroupVersion().Identifier(), + Kind: gvk.Kind, + Name: owner.GetName(), + UID: owner.GetUID(), + Controller: &controller, + } +} + +// isOwnedBy reports whether the secret carries an OwnerReference back +// to the named owner of the given GVK. UID is compared when the owner +// has one (so unit tests that build owners without a live UID still +// match by name+kind, which is the intended semantic). +func isOwnedBy(secret *corev1.Secret, owner client.Object, gvk schema.GroupVersionKind) bool { + for _, ownerRef := range secret.GetOwnerReferences() { + if ownerRef.APIVersion != gvk.GroupVersion().Identifier() || + ownerRef.Kind != gvk.Kind || + ownerRef.Name != owner.GetName() { + continue + } + if owner.GetUID() != "" && ownerRef.UID != owner.GetUID() { + continue + } + return true + } + return false +} diff --git a/go/core/internal/httpserver/handlers/modelconfig.go b/go/core/internal/httpserver/handlers/modelconfig.go index a07c313e90..4e74c0c01f 100644 --- a/go/core/internal/httpserver/handlers/modelconfig.go +++ b/go/core/internal/httpserver/handlers/modelconfig.go @@ -1,26 +1,22 @@ package handlers import ( - "context" "encoding/json" - stderrors "errors" "fmt" - "maps" "net/http" "strings" - api "github.com/kagent-dev/kagent/go/api/httpapi" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/httpserver/errors" common "github.com/kagent-dev/kagent/go/core/internal/utils" "github.com/kagent-dev/kagent/go/core/pkg/auth" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/validation" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" + + api "github.com/kagent-dev/kagent/go/api/httpapi" ) // ModelConfigHandler handles ModelConfiguration requests @@ -190,7 +186,7 @@ func (h *ModelConfigHandler) HandleCreateModelConfig(w ErrorResponseWriter, r *h } } - if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, modelConfig, req.Secrets); err != nil { + if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, modelConfig, modelConfigGVK, req.Secrets); err != nil { log.Error(err, "Failed to create or update companion secrets") w.RespondWithError(companionSecretAPIError(err)) return @@ -280,7 +276,7 @@ func (h *ModelConfigHandler) HandleUpdateModelConfig(w ErrorResponseWriter, r *h log.V(1).Info("Successfully updated API key secret") } - if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, modelConfig, req.Secrets); err != nil { + if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, modelConfig, modelConfigGVK, req.Secrets); err != nil { log.Error(err, "Failed to create or update companion secrets") w.RespondWithError(companionSecretAPIError(err)) return @@ -350,107 +346,7 @@ func validateAPIKeySecretRef(apiKeySecret, apiKeySecretKey string, provider v1al return nil } -func validateSecretMaterials(secrets []api.SecretMaterial) error { - for _, secret := range secrets { - if errs := validation.IsDNS1123Subdomain(secret.Name); len(errs) > 0 { - return fmt.Errorf("invalid secret name %q: %s", secret.Name, strings.Join(errs, "; ")) - } - if errs := validation.IsConfigMapKey(secret.Key); len(errs) > 0 { - return fmt.Errorf("invalid key %q for secret %q: %s", secret.Key, secret.Name, strings.Join(errs, "; ")) - } - } - return nil -} - -var errInvalidCompanionSecret = stderrors.New("invalid companion secret") - -// companionSecretAPIError returns an API error for companion secret validation errors. -func companionSecretAPIError(err error) *errors.APIError { - if stderrors.Is(err, errInvalidCompanionSecret) { - return errors.NewBadRequestError(err.Error(), err) - } - return errors.NewInternalServerError("Failed to create or update companion secrets", err) -} - -func createOrUpdateCompanionSecrets(ctx context.Context, kubeClient client.Client, owner *v1alpha2.ModelConfig, secrets []api.SecretMaterial) error { - // Group secrets by name and key. - secretsByName := map[string]map[string][]byte{} - for _, secret := range secrets { - if _, ok := secretsByName[secret.Name]; !ok { - secretsByName[secret.Name] = map[string][]byte{} - } - secretsByName[secret.Name][secret.Key] = []byte(secret.Value) - } - - namespace := owner.GetNamespace() - for name, data := range secretsByName { - existingSecret := &corev1.Secret{} - // Get the existing secret by name and namespace. - err := kubeClient.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, existingSecret) - if err != nil { - if !apierrors.IsNotFound(err) { - return fmt.Errorf("failed to get companion secret %s/%s: %w", namespace, name, err) - } - - // Create the secret if it doesn't exist. - secret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - OwnerReferences: []metav1.OwnerReference{modelConfigOwnerReference(owner)}, - }, - Type: corev1.SecretTypeOpaque, - Data: data, - } - if err := kubeClient.Create(ctx, secret); err != nil { - return fmt.Errorf("failed to create companion secret %s/%s: %w", namespace, name, err) - } - continue - } - - if existingSecret.Type != corev1.SecretTypeOpaque { - return fmt.Errorf("%w: companion secret %s/%s must be type %q, got %q", errInvalidCompanionSecret, namespace, name, corev1.SecretTypeOpaque, existingSecret.Type) - } - if !isOwnedByModelConfig(existingSecret, owner) { - return fmt.Errorf("%w: companion secret %s/%s is not managed by ModelConfig %s/%s", errInvalidCompanionSecret, namespace, name, owner.GetNamespace(), owner.GetName()) - } - - if existingSecret.Data == nil { - existingSecret.Data = map[string][]byte{} - } - maps.Copy(existingSecret.Data, data) - if err := kubeClient.Update(ctx, existingSecret); err != nil { - return fmt.Errorf("failed to update companion secret %s/%s: %w", namespace, name, err) - } - } - - return nil -} - -// modelConfigOwnerReference returns the owner reference for the model config. -func modelConfigOwnerReference(owner *v1alpha2.ModelConfig) metav1.OwnerReference { - controller := true - return metav1.OwnerReference{ - APIVersion: v1alpha2.GroupVersion.Identifier(), - Kind: "ModelConfig", - Name: owner.GetName(), - UID: owner.GetUID(), - Controller: &controller, - } -} - -// isOwnedByModelConfig checks if the secret is owned by the model config. -func isOwnedByModelConfig(secret *corev1.Secret, owner *v1alpha2.ModelConfig) bool { - for _, ownerRef := range secret.GetOwnerReferences() { - if ownerRef.APIVersion != v1alpha2.GroupVersion.Identifier() || - ownerRef.Kind != "ModelConfig" || - ownerRef.Name != owner.GetName() { - continue - } - if owner.GetUID() != "" && ownerRef.UID != owner.GetUID() { - continue - } - return true - } - return false -} +// modelConfigGVK is passed to companion-secret helpers so the +// OwnerReference and isOwnedBy check use the right Kind for this +// resource. +var modelConfigGVK = v1alpha2.GroupVersion.WithKind("ModelConfig") diff --git a/go/core/internal/httpserver/handlers/toolservers.go b/go/core/internal/httpserver/handlers/toolservers.go index e061b670c0..59acd66b57 100644 --- a/go/core/internal/httpserver/handlers/toolservers.go +++ b/go/core/internal/httpserver/handlers/toolservers.go @@ -39,8 +39,29 @@ type ToolServerCreateRequest struct { // MCPServer is used when Type is "MCPServer" MCPServer *v1alpha1.MCPServer `json:"mcpServer,omitempty"` + + // Secrets are optional companion Secrets to create or update + // alongside the ToolServer. Each entry materializes as a key in a + // Kubernetes Secret of type Opaque, owned by the created + // ToolServer so K8s GC cleans them up on delete. Names referenced + // here (e.g. RemoteMCPServer.spec.tls.caCertSecretRef, + // RemoteMCPServer.spec.headersFrom[].valueFrom.secretRef.name, + // MCPServer.spec.secretRefs[].name) must match a Secret described + // in this list when the operator wants the API to materialize the + // content inline. Pre-existing Secrets can also be referenced + // directly without supplying material here. + Secrets []api.SecretMaterial `json:"secrets,omitempty"` } +// remoteMCPServerGVK and mcpServerGVK are passed to the +// companion-secret helpers so the OwnerReference and ownership check +// use the right Kind. kmcp.MCPServer shares the kagent.dev group with +// v1alpha2 (see kmcp/api/v1alpha1/groupversion_info.go). +var ( + remoteMCPServerGVK = v1alpha2.GroupVersion.WithKind("RemoteMCPServer") + mcpServerGVK = v1alpha1.GroupVersion.WithKind("MCPServer") +) + // HandleListToolServers handles GET /api/toolservers requests func (h *ToolServersHandler) HandleListToolServers(w ErrorResponseWriter, r *http.Request) { log := ctrllog.FromContext(r.Context()).WithName("toolservers-handler").WithValues("operation", "list") @@ -107,26 +128,31 @@ func (h *ToolServersHandler) HandleCreateToolServer(w ErrorResponseWriter, r *ht return } + if err := validateSecretMaterials(toolServerRequest.Secrets); err != nil { + w.RespondWithError(errors.NewBadRequestError(err.Error(), err)) + return + } + switch toolServerRequest.Type { case ToolServerTypeRemoteMCPServer: if toolServerRequest.RemoteMCPServer == nil { w.RespondWithError(errors.NewBadRequestError("RemoteMCPServer data is required when type is RemoteMCPServer", nil)) return } - h.handleCreateRemoteMCPServer(w, r, toolServerRequest.RemoteMCPServer, log) + h.handleCreateRemoteMCPServer(w, r, toolServerRequest.RemoteMCPServer, toolServerRequest.Secrets, log) case ToolServerTypeMCPServer: if toolServerRequest.MCPServer == nil { w.RespondWithError(errors.NewBadRequestError("MCPServer data is required when type is MCPServer", nil)) return } - h.handleCreateMCPServer(w, r, toolServerRequest.MCPServer, log) + h.handleCreateMCPServer(w, r, toolServerRequest.MCPServer, toolServerRequest.Secrets, log) default: w.RespondWithError(errors.NewBadRequestError(fmt.Sprintf("Invalid tool server type. Must be one of %s", toolServerTypes.Join(", ")), nil)) } } // handleCreateRemoteMCPServer handles the creation of a RemoteMCPServer -func (h *ToolServersHandler) handleCreateRemoteMCPServer(w ErrorResponseWriter, r *http.Request, toolServerRequest *v1alpha2.RemoteMCPServer, log logr.Logger) { +func (h *ToolServersHandler) handleCreateRemoteMCPServer(w ErrorResponseWriter, r *http.Request, toolServerRequest *v1alpha2.RemoteMCPServer, secrets []api.SecretMaterial, log logr.Logger) { if toolServerRequest.Namespace == "" { toolServerRequest.Namespace = common.GetResourceNamespace() } @@ -150,13 +176,19 @@ func (h *ToolServersHandler) handleCreateRemoteMCPServer(w ErrorResponseWriter, return } + if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, toolServerRequest, remoteMCPServerGVK, secrets); err != nil { + log.Error(err, "Failed to create or update companion secrets") + w.RespondWithError(companionSecretAPIError(err)) + return + } + log.Info("Successfully created RemoteMCPServer") data := api.NewResponse(toolServerRequest, "Successfully created RemoteMCPServer", false) RespondWithJSON(w, http.StatusCreated, data) } // handleCreateMCPServer handles the creation of an MCPServer (stdio-based) -func (h *ToolServersHandler) handleCreateMCPServer(w ErrorResponseWriter, r *http.Request, toolServerRequest *v1alpha1.MCPServer, log logr.Logger) { +func (h *ToolServersHandler) handleCreateMCPServer(w ErrorResponseWriter, r *http.Request, toolServerRequest *v1alpha1.MCPServer, secrets []api.SecretMaterial, log logr.Logger) { if toolServerRequest.Namespace == "" { toolServerRequest.Namespace = common.GetResourceNamespace() } @@ -184,6 +216,12 @@ func (h *ToolServersHandler) handleCreateMCPServer(w ErrorResponseWriter, r *htt return } + if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, toolServerRequest, mcpServerGVK, secrets); err != nil { + log.Error(err, "Failed to create or update companion secrets") + w.RespondWithError(companionSecretAPIError(err)) + return + } + log.Info("Successfully created MCPServer") data := api.NewResponse(toolServerRequest, "Successfully created MCPServer", false) RespondWithJSON(w, http.StatusCreated, data) diff --git a/go/core/internal/httpserver/handlers/toolservers_test.go b/go/core/internal/httpserver/handlers/toolservers_test.go index 79d0caadce..b82f161257 100644 --- a/go/core/internal/httpserver/handlers/toolservers_test.go +++ b/go/core/internal/httpserver/handlers/toolservers_test.go @@ -377,6 +377,193 @@ func TestToolServersHandler(t *testing.T) { require.NotNil(t, responseRecorder.errorReceived) }) + // SecretMaterials companion-Secret support mirrors the ModelConfig + // inline-Secret pattern so operators can create an RMS/MCPServer + // and its referenced Secrets in a single POST without + // pre-creating Secret objects out of band. + t.Run("Success_RemoteMCPServer_WithSecretMaterials_CreatesCASecret", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "RemoteMCPServer", + RemoteMCPServer: &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-mcp", Namespace: "default"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "Corp-CA MCP", + URL: "https://mcp.corp.internal/mcp", + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: "corp-ca", + CACertSecretKey: "ca.crt", + }, + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "corp-ca", Key: "ca.crt", Value: "FAKE PEM"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + + handler.HandleCreateToolServer(responseRecorder, req) + require.Equal(t, http.StatusCreated, responseRecorder.Code) + + // Companion Secret created in the same namespace. + secret := &corev1.Secret{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "corp-ca"}, secret) + require.NoError(t, err) + assert.Equal(t, corev1.SecretTypeOpaque, secret.Type) + assert.Equal(t, []byte("FAKE PEM"), secret.Data["ca.crt"]) + + // OwnerReference points back at the RMS so K8s GC cleans it up. + require.Len(t, secret.OwnerReferences, 1) + or := secret.OwnerReferences[0] + assert.Equal(t, "RemoteMCPServer", or.Kind) + assert.Equal(t, "corp-mcp", or.Name) + assert.Equal(t, v1alpha2.GroupVersion.Identifier(), or.APIVersion) + }) + + t.Run("Success_MCPServer_WithSecretMaterials_CreatesEnvSecret", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "MCPServer", + MCPServer: &v1alpha1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "kmcp-with-creds", Namespace: "default"}, + Spec: v1alpha1.MCPServerSpec{ + Deployment: v1alpha1.MCPServerDeployment{ + Image: "example/kmcp:latest", + Port: 8080, + Cmd: "/bin/serve", + SecretRefs: []corev1.LocalObjectReference{ + {Name: "kmcp-creds"}, + }, + }, + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "kmcp-creds", Key: "API_TOKEN", Value: "shhh"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + + handler.HandleCreateToolServer(responseRecorder, req) + require.Equal(t, http.StatusCreated, responseRecorder.Code) + + secret := &corev1.Secret{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "kmcp-creds"}, secret) + require.NoError(t, err) + assert.Equal(t, []byte("shhh"), secret.Data["API_TOKEN"]) + require.Len(t, secret.OwnerReferences, 1) + or := secret.OwnerReferences[0] + assert.Equal(t, "MCPServer", or.Kind) + assert.Equal(t, "kmcp-with-creds", or.Name) + }) + + t.Run("SecretMaterial_GroupsMultipleKeysIntoSingleSecret", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "RemoteMCPServer", + RemoteMCPServer: &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "multi-secret-mcp", Namespace: "default"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "RMS with multi-key Secret", + URL: "https://mcp.corp.internal/mcp", + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "shared", Key: "ca.crt", Value: "PEM"}, + {Name: "shared", Key: "token", Value: "abc"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + + handler.HandleCreateToolServer(responseRecorder, req) + require.Equal(t, http.StatusCreated, responseRecorder.Code) + + secret := &corev1.Secret{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "shared"}, secret) + require.NoError(t, err) + assert.Equal(t, []byte("PEM"), secret.Data["ca.crt"]) + assert.Equal(t, []byte("abc"), secret.Data["token"]) + }) + + t.Run("SecretMaterial_InvalidName_Rejected", func(t *testing.T) { + handler, _, _, responseRecorder := setupHandler(t) + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "RemoteMCPServer", + RemoteMCPServer: &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "rms-invalid-secret", Namespace: "default"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "x", URL: "https://x/y", + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "INVALID NAME WITH SPACES", Key: "ca.crt", Value: "x"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + + handler.HandleCreateToolServer(responseRecorder, req) + assert.Equal(t, http.StatusBadRequest, responseRecorder.Code) + }) + + t.Run("SecretMaterial_ExistingSecretNotOwned_Rejected", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + + // Pre-create a Secret that isn't owned by any RMS. + preexisting := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "stranger", Namespace: "default"}, + Type: corev1.SecretTypeOpaque, + Data: map[string][]byte{"ca.crt": []byte("OLD")}, + } + require.NoError(t, kubeClient.Create(context.Background(), preexisting)) + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "RemoteMCPServer", + RemoteMCPServer: &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "stranger-rms", Namespace: "default"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "x", URL: "https://x/y", + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "stranger", Key: "ca.crt", Value: "NEW"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + + handler.HandleCreateToolServer(responseRecorder, req) + // The 400 surfaces from companionSecretAPIError when the + // existing Secret isn't already owned by this RMS. + assert.Equal(t, http.StatusBadRequest, responseRecorder.Code) + + // Confirm the unrelated Secret wasn't mutated. + fresh := &corev1.Secret{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "stranger"}, fresh) + require.NoError(t, err) + assert.Equal(t, []byte("OLD"), fresh.Data["ca.crt"]) + }) + t.Run("ToolServerAlreadyExists", func(t *testing.T) { handler, kubeClient, _, responseRecorder := setupHandler(t) diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index d47ab55adf..e3bb2dcc5a 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -292,6 +292,11 @@ type ExtensionConfig struct { AgentPlugins []agent_translator.TranslatorPlugin MCPServerPlugins []translator.MCPTranslatorPlugin SandboxBackend sandboxbackend.Backend + + // RemoteMCPServerURLRewriter optionally transforms the URL the + // controller dials when discovering tools on a RemoteMCPServer. + // Nil means use the spec.URL verbatim. + RemoteMCPServerURLRewriter translator.RemoteMCPServerURLRewriter } type GetExtensionConfig func(bootstrap BootstrapConfig) (*ExtensionConfig, error) @@ -527,6 +532,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne cfg.DefaultModelConfig, watchNamespacesList, extensionCfg.SandboxBackend, + extensionCfg.RemoteMCPServerURLRewriter, ) if err := (&controller.ServiceController{ diff --git a/go/core/pkg/translator/adk_api_translator_types.go b/go/core/pkg/translator/adk_api_translator_types.go index 07fd4268d5..1f431d0305 100644 --- a/go/core/pkg/translator/adk_api_translator_types.go +++ b/go/core/pkg/translator/adk_api_translator_types.go @@ -20,3 +20,21 @@ type TranslatorPlugin interface { ProcessAgent(ctx context.Context, agent v1alpha2.AgentObject, outputs *AgentOutputs) error GetOwnedResourceTypes() []client.Object } + +// RemoteMCPServerURLRewriter optionally transforms the URL the kagent +// controller dials when discovering tools on a RemoteMCPServer. +// +// The reconciler treats the returned string as the dial target. +// Returning the input URL unchanged is the correct behavior when the +// rewriter does not apply. A nil rewriter means the controller dials +// the RemoteMCPServer spec.URL verbatim. +// +// The hook controls the URL only. TLS configuration is built from +// spec.tls independently and applied to the underlying http.Client. +// Implementations that change scheme (e.g. https:// → http:// to route +// through a destination that handles TLS elsewhere) must ensure +// spec.tls remains consistent with the new dial target. The rewriter +// currently cannot supply TLS config of its own. +type RemoteMCPServerURLRewriter interface { + RewriteRemoteMCPServerURL(ctx context.Context, remoteMCPServer *v1alpha2.RemoteMCPServer) (string, error) +} diff --git a/go/core/test/e2e/mocks/invoke_remotemcpserver_tls_agent.json b/go/core/test/e2e/mocks/invoke_remotemcpserver_tls_agent.json new file mode 100644 index 0000000000..5f35559cae --- /dev/null +++ b/go/core/test/e2e/mocks/invoke_remotemcpserver_tls_agent.json @@ -0,0 +1,66 @@ +{ + "openai": [ + { + "name": "initial_request", + "match": { + "match_type": "contains", + "message": { + "content": "add 2 and 3", + "role": "user" + } + }, + "response": { + "id": "chatcmpl-1", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4.1-mini", + "choices": [ + { + "index": 0, + "role": "assistant", + "message": { + "content": "", + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": { + "name": "add_numbers", + "arguments": "{\"a\": 2, \"b\": 3}" + } + } + ] + }, + "finish_reason": "tool_calls" + } + ] + } + }, + { + "name": "add_response", + "match": { + "match_type": "contains", + "message": { + "content": "5", + "role": "tool", + "tool_call_id": "call_1" + } + }, + "response": { + "id": "call_1", + "object": "chat.completion.tool_message", + "created": 1677652288, + "model": "gpt-4.1-mini", + "choices": [ + { + "index": 0, + "message": { + "content": "I have added 2 and 3. The result is 5.", + "role": "assistant" + } + } + ] + } + } + ] +} diff --git a/go/core/test/e2e/remotemcpserver_tls_test.go b/go/core/test/e2e/remotemcpserver_tls_test.go new file mode 100644 index 0000000000..54702dc37f --- /dev/null +++ b/go/core/test/e2e/remotemcpserver_tls_test.go @@ -0,0 +1,504 @@ +// E2E coverage for RemoteMCPServer TLS support and the tool-server +// companion-Secret API. These tests stand up a real mockmcp fixture +// (in-process, optionally with TLS) on the test host, install matching +// kagent resources in the cluster, drive an agent invocation through +// the A2A endpoint, and assert on the mockmcp request recorder. +// +// Prerequisites (mirror the existing e2e tests in invoke_api_test.go): +// +// - A kind cluster with kagent installed. +// - `kubectl port-forward -n kagent deployments/kagent-controller 8083` +// (or KAGENT_URL set) so the tests can reach the HTTP API. +// - The cluster must be able to dial the test host on `host.docker.internal` +// (Mac) / `172.17.0.1` (Linux) — same indirection mockllm uses; +// buildK8sURL() in invoke_api_test.go does the translation. + +package e2e_test + +import ( + "bytes" + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/json" + "encoding/pem" + "fmt" + "math/big" + "net" + "net/http" + "testing" + "time" + + "github.com/kagent-dev/kagent/go/api/httpapi" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" + "github.com/kagent-dev/mockmcp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// mockmcpFixture is a started mockmcp.Server plus the host-visible base +// URL agents and the controller should dial. baseURL routes through +// host.docker.internal (Mac) / 172.17.0.1 (Linux) so in-cluster pods can +// reach the test host — same shape as setupMockServer() does for the +// mock LLM. +type mockmcpFixture struct { + server *mockmcp.Server + baseURL string // dial URL from inside the cluster + mcpURL string // dial URL with the /mcp path appended + sseURL string // dial URL with the /sse path appended + caCertPEM []byte // when TLS, the PEM bundle to install as a Secret (nil for plaintext fixtures) + hostBindURL string // local 127.0.0.1 URL for debug logging — NOT the cluster-facing URL +} + +// setupMockMCP starts a mockmcp fixture. When withTLS is true a fresh +// self-signed cert is minted for the duration of the test and returned +// in caCertPEM so the caller can write it into a Secret for the RMS to +// reference. Server is registered with t.Cleanup; callers don't manage +// shutdown. +func setupMockMCP(t *testing.T, withTLS bool, opts mockmcp.Options) *mockmcpFixture { + t.Helper() + + // Bind ephemerally so concurrent tests don't fight for ports. + if opts.Addr == "" { + opts.Addr = "127.0.0.1:0" + } + opts.RecordRequests = true + + var caPEM []byte + if withTLS { + // Mint a self-signed CA + server cert. SANs include + // host.docker.internal and the Docker-bridge IP so the + // certificate validates whichever host alias the in-cluster + // pod ends up using. + certPEM, keyPEM, ca := generateSelfSignedCert(t, []string{ + "localhost", + "host.docker.internal", + }, []net.IP{ + net.ParseIP("127.0.0.1"), + net.ParseIP("172.17.0.1"), + }) + opts.CertPEM = certPEM + opts.KeyPEM = keyPEM + caPEM = ca + } + + server, err := mockmcp.NewServer(opts) + require.NoError(t, err) + + hostURL, err := server.Start(t.Context()) + require.NoError(t, err) + t.Cleanup(func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = server.Stop(ctx) + }) + + clusterURL := buildK8sURL(hostURL) + + return &mockmcpFixture{ + server: server, + baseURL: clusterURL, + mcpURL: clusterURL + mockmcp.MCPPath, + sseURL: clusterURL + mockmcp.SSEPath, + caCertPEM: caPEM, + hostBindURL: hostURL, + } +} + +// createCASecret writes the CA PEM produced by setupMockMCP into a +// Kubernetes Secret in the kagent namespace under the given name and +// key. Registers for cleanup. Returns the Secret so the caller can +// reference it from RMS.spec.tls. +func createCASecret(t *testing.T, cli client.Client, name, key string, ca []byte) *corev1.Secret { + t.Helper() + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "kagent"}, + Type: corev1.SecretTypeOpaque, + Data: map[string][]byte{key: ca}, + } + require.NoError(t, cli.Create(t.Context(), secret)) + cleanup(t, cli, secret) + return secret +} + +// createRMS creates the RMS in the kagent namespace via the typed +// client and waits for Status.DiscoveredTools to populate — that's the +// signal the controller's tool-discovery code path completed end-to-end +// (including the TLS handshake when applicable). Returns the RMS so the +// caller can assert on Status if needed. +func createRMS(t *testing.T, cli client.Client, spec v1alpha2.RemoteMCPServerSpec) *v1alpha2.RemoteMCPServer { + t.Helper() + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-rms-", + Namespace: "kagent", + }, + Spec: spec, + } + require.NoError(t, cli.Create(t.Context(), rms)) + cleanup(t, cli, rms) + + // Poll Status.DiscoveredTools — populates only after a successful + // ListTools call against the upstream, which is the end-to-end + // signal we want. + pollErr := wait.PollUntilContextTimeout(t.Context(), 2*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) { + fresh := &v1alpha2.RemoteMCPServer{} + if err := cli.Get(ctx, client.ObjectKey{Namespace: rms.Namespace, Name: rms.Name}, fresh); err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + return false, err + } + if len(fresh.Status.DiscoveredTools) > 0 { + *rms = *fresh + return true, nil + } + return false, nil + }) + require.NoError(t, pollErr, "RMS Status.DiscoveredTools did not populate within timeout") + return rms +} + +// waitForAPIDiscoveredTools polls GET /api/toolservers until the named +// RemoteMCPServer's DiscoveredTools list is non-empty, then returns the +// tools. The CRD status field and this HTTP response come from +// different write paths in the reconciler (the field is set by +// setRemoteMCPServerStatusConditions, the API serves rows the +// reconciler persisted via RefreshToolsForServer), so a working CRD +// status doesn't strictly imply the API has caught up — poll instead +// of assuming. +func waitForAPIDiscoveredTools(t *testing.T, namespace, name string) []*v1alpha2.MCPTool { + t.Helper() + ref := namespace + "/" + name + var matched []*v1alpha2.MCPTool + + pollErr := wait.PollUntilContextTimeout(t.Context(), 2*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, kagentURL()+"/api/toolservers", nil) + if err != nil { + return false, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return false, nil + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return false, nil + } + var list httpapi.StandardResponse[[]httpapi.ToolServerResponse] + if err := json.NewDecoder(resp.Body).Decode(&list); err != nil { + return false, err + } + for _, ts := range list.Data { + if ts.Ref == ref && len(ts.DiscoveredTools) > 0 { + matched = ts.DiscoveredTools + return true, nil + } + } + return false, nil + }) + require.NoError(t, pollErr, "timed out waiting for tools on /api/toolservers for %s", ref) + return matched +} + +// generateSelfSignedCert mints an ECDSA self-signed certificate scoped +// to the given DNS names + IPs. Returns the certificate PEM, the key +// PEM (both bundled for mockmcp), and a CA PEM identical to the +// certificate (since it's self-signed the cert IS its own trust +// anchor). For more elaborate chains the CA could be separate, but +// self-signed-as-CA is sufficient for what RMS TLS needs to exercise: +// the controller pins a single PEM as RootCAs. +func generateSelfSignedCert(t *testing.T, dnsNames []string, ips []net.IP) (certPEM, keyPEM, caPEM []byte) { + t.Helper() + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + require.NoError(t, err) + + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(time.Now().UnixNano()), + Subject: pkix.Name{CommonName: "kagent-e2e-mockmcp"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment | x509.KeyUsageCertSign, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + DNSNames: dnsNames, + IPAddresses: ips, + IsCA: true, + BasicConstraintsValid: true, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key) + require.NoError(t, err) + cert := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) + + keyDER, err := x509.MarshalPKCS8PrivateKey(key) + require.NoError(t, err) + keyPEM = pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: keyDER}) + + // Self-signed: the cert IS the CA bundle. + return cert, keyPEM, cert +} + +// TestE2E_RMS_PrivateCAUpstream exercises the production path: +// RMS.spec.tls.caCertSecretRef pins a CA Secret, the controller reads +// the Secret to build its tool-discovery http.Client, and the agent +// pod mounts the same Secret to use during invocation. The mockmcp +// fixture is the upstream MCP server; its request recorder lets us +// assert that the controller's ListTools call AND the agent's +// tools/call both reached the fixture over TLS with the right cert +// chain. +func TestE2E_RMS_PrivateCAUpstream(t *testing.T) { + baseURL, stopServer := setupMockServer(t, "mocks/invoke_remotemcpserver_tls_agent.json") + defer stopServer() + + cli := setupK8sClient(t, false) + + mcp := setupMockMCP(t, true, mockmcp.Options{}) + caSecret := createCASecret(t, cli, "e2e-rms-tls-ca", "ca.crt", mcp.caCertPEM) + + rms := createRMS(t, cli, v1alpha2.RemoteMCPServerSpec{ + Description: "Private-CA HTTPS upstream", + URL: mcp.mcpURL, + Protocol: v1alpha2.RemoteMCPServerProtocolStreamableHttp, + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: caSecret.Name, + CACertSecretKey: "ca.crt", + }, + }) + + // Sanity-check the controller's ListTools call landed on mockmcp. + requests := mcp.server.Requests() + assert.NotEmpty(t, requests, "mockmcp should have recorded the controller's ListTools call") + + // DiscoveredTools should include mockmcp's default tool set on the + // CRD status (the K8s API surface). + toolNames := make(map[string]bool) + for _, tool := range rms.Status.DiscoveredTools { + toolNames[tool.Name] = true + } + assert.True(t, toolNames["add_numbers"], "expected add_numbers in Status.DiscoveredTools, got %v", toolNames) + + // And on the kagent HTTP API (what the UI calls). The reconciler + // persists the post-ListTools result to the DB at + // reconciler.go:RefreshToolsForServer, separate from the CRD status + // write — verifying both confirms the controller "shows" tools on + // every surface an operator might look at. + apiTools := waitForAPIDiscoveredTools(t, rms.Namespace, rms.Name) + apiNames := make(map[string]bool) + for _, tool := range apiTools { + apiNames[tool.Name] = true + } + assert.True(t, apiNames["add_numbers"], "expected add_numbers via /api/toolservers, got %v", apiNames) + + // Drive an agent invocation through the A2A endpoint. The mock LLM + // is preprogrammed to issue a tools/call to add_numbers; mockmcp + // answers it via the request recorder. + modelCfg := setupModelConfig(t, cli, baseURL) + agent := setupAgent(t, cli, modelCfg.Name, []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + ApiGroup: "kagent.dev", + Kind: "RemoteMCPServer", + Name: rms.Name, + }, + ToolNames: []string{"add_numbers"}, + }, + }}) + a2aClient := setupA2AClient(t, agent) + + runSyncTest(t, a2aClient, "add 2 and 3", "5", nil) + + // The agent's tools/call should also have reached mockmcp. + postInvoke := mcp.server.Requests() + assert.Greater(t, len(postInvoke), len(requests), + "mockmcp should have recorded additional requests from the agent's tools/call") +} + +// TestE2E_RMS_DisableVerify confirms the test-fixture escape hatch +// works end-to-end: an RMS with disableVerify=true (and no Secret) +// connects successfully to the same self-signed fixture that +// PrivateCAUpstream uses. The point of the test is to prove the +// controller actually skipped verification — if disableVerify +// silently fell back to system trust, the handshake would fail on +// the self-signed cert. +func TestE2E_RMS_DisableVerify(t *testing.T) { + baseURL, stopServer := setupMockServer(t, "mocks/invoke_remotemcpserver_tls_agent.json") + defer stopServer() + + cli := setupK8sClient(t, false) + + mcp := setupMockMCP(t, true, mockmcp.Options{}) + + rms := createRMS(t, cli, v1alpha2.RemoteMCPServerSpec{ + Description: "Skip-verify HTTPS upstream", + URL: mcp.mcpURL, + Protocol: v1alpha2.RemoteMCPServerProtocolStreamableHttp, + TLS: &v1alpha2.TLSConfig{ + DisableVerify: true, + }, + }) + + assert.NotEmpty(t, rms.Status.DiscoveredTools, + "DiscoveredTools should populate even though the upstream cert isn't trusted by system CAs") + + modelCfg := setupModelConfig(t, cli, baseURL) + agent := setupAgent(t, cli, modelCfg.Name, []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + ApiGroup: "kagent.dev", + Kind: "RemoteMCPServer", + Name: rms.Name, + }, + ToolNames: []string{"add_numbers"}, + }, + }}) + a2aClient := setupA2AClient(t, agent) + + runSyncTest(t, a2aClient, "add 2 and 3", "5", nil) +} + +// TestE2E_RMS_SSE_TLS exercises the SSE-transport-with-TLS code path. +// SSE and Streamable HTTP go through different translator functions +// (translateSseHttpTool vs translateStreamableHttpTool) and the Python +// runtime has separate factory-installation logic per transport. Both +// must work against a real /sse endpoint backed by a private CA. +func TestE2E_RMS_SSE_TLS(t *testing.T) { + baseURL, stopServer := setupMockServer(t, "mocks/invoke_remotemcpserver_tls_agent.json") + defer stopServer() + + cli := setupK8sClient(t, false) + + mcp := setupMockMCP(t, true, mockmcp.Options{}) + caSecret := createCASecret(t, cli, "e2e-rms-sse-ca", "ca.crt", mcp.caCertPEM) + + rms := createRMS(t, cli, v1alpha2.RemoteMCPServerSpec{ + Description: "Private-CA SSE upstream", + URL: mcp.sseURL, + Protocol: v1alpha2.RemoteMCPServerProtocolSse, + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: caSecret.Name, + CACertSecretKey: "ca.crt", + }, + }) + + // Find the actual mockmcp request path the controller landed on to + // confirm SSE was used (not Streamable HTTP). The mockmcp.SSEPath + // is the request side; the recorder captures it on every request. + requests := mcp.server.Requests() + require.NotEmpty(t, requests, "mockmcp should have recorded the controller's ListTools call over SSE") + var sawSSE bool + for _, req := range requests { + if req.Path == mockmcp.SSEPath { + sawSSE = true + break + } + } + assert.True(t, sawSSE, "expected at least one request on the SSE path (%s); recorded paths: %v", + mockmcp.SSEPath, recordedPaths(requests)) + + modelCfg := setupModelConfig(t, cli, baseURL) + agent := setupAgent(t, cli, modelCfg.Name, []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + ApiGroup: "kagent.dev", + Kind: "RemoteMCPServer", + Name: rms.Name, + }, + ToolNames: []string{"add_numbers"}, + }, + }}) + a2aClient := setupA2AClient(t, agent) + + runSyncTest(t, a2aClient, "add 2 and 3", "5", nil) +} + +// TestE2E_API_ToolServerCompanionSecrets posts a ToolServerCreateRequest +// with inline SecretMaterials and asserts the controller materializes +// both the RMS and the companion Secret in a single round-trip — the +// "one POST" UX equivalent of ModelConfig's existing inline-Secret +// support. Also exercises the OwnerReference: deleting the RMS should +// cascade-delete the Secret via K8s GC. +// +// This test doesn't need mockmcp running; it stops at "did the +// controller create the resources correctly?", which is the API +// contract under test. +func TestE2E_API_ToolServerCompanionSecrets(t *testing.T) { + cli := setupK8sClient(t, true) + + rmsName := fmt.Sprintf("e2e-rms-companion-%d", time.Now().UnixNano()) + caSecretName := rmsName + "-ca" + + body, err := json.Marshal(handlers.ToolServerCreateRequest{ + Type: "RemoteMCPServer", + RemoteMCPServer: &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{ + Name: rmsName, + Namespace: "kagent", + }, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "Companion-Secret integration test", + URL: "https://nowhere.invalid/mcp", + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: caSecretName, + CACertSecretKey: "ca.crt", + }, + }, + }, + Secrets: []httpapi.SecretMaterial{ + {Name: caSecretName, Key: "ca.crt", Value: "FAKE PEM CONTENT"}, + }, + }) + require.NoError(t, err) + + req, err := http.NewRequestWithContext(t.Context(), "POST", + kagentURL()+"/api/toolservers", bytes.NewReader(body)) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + assert.Equal(t, http.StatusCreated, resp.StatusCode, "expected 201 Created on POST /api/toolservers") + + // The RMS should exist in K8s. + rms := &v1alpha2.RemoteMCPServer{} + require.NoError(t, cli.Get(t.Context(), + client.ObjectKey{Namespace: "kagent", Name: rmsName}, rms)) + // Companion-Secret deletion happens via OwnerReference GC when we + // delete the RMS in cleanup; register this for that path. + cleanup(t, cli, rms) + + // The Secret should exist in K8s with the expected content. + secret := &corev1.Secret{} + require.NoError(t, cli.Get(t.Context(), + client.ObjectKey{Namespace: "kagent", Name: caSecretName}, secret)) + assert.Equal(t, []byte("FAKE PEM CONTENT"), secret.Data["ca.crt"]) + + // OwnerReference should point back at the RMS so K8s GC cleans it up. + require.Len(t, secret.OwnerReferences, 1) + or := secret.OwnerReferences[0] + assert.Equal(t, "RemoteMCPServer", or.Kind) + assert.Equal(t, rmsName, or.Name) + assert.Equal(t, v1alpha2.GroupVersion.Identifier(), or.APIVersion) +} + +// recordedPaths is a small helper for assertion failure messages — turns +// the request recorder's snapshot into a human-readable list of paths +// when an expected request doesn't show up. +func recordedPaths(reqs []mockmcp.RecordedRequest) []string { + paths := make([]string, len(reqs)) + for i, req := range reqs { + paths[i] = req.Method + " " + req.Path + } + return paths +} diff --git a/go/go.mod b/go/go.mod index c44f073c79..a04296021a 100644 --- a/go/go.mod +++ b/go/go.mod @@ -65,6 +65,7 @@ require ( github.com/golang/protobuf v1.5.4 github.com/google/jsonschema-go v0.4.3 github.com/jackc/pgx/v5 v5.9.2 + github.com/kagent-dev/mockmcp v0.0.0-20260520211643-dcd475b74085 github.com/ollama/ollama v0.23.2 github.com/testcontainers/testcontainers-go v0.42.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0 diff --git a/go/go.sum b/go/go.sum index ad09ec9eaf..4e53e8ef78 100644 --- a/go/go.sum +++ b/go/go.sum @@ -498,6 +498,8 @@ github.com/kagent-dev/kmcp v0.3.0 h1:CuF8LN6JbPoy75saRzwI5OIcSVZUgiwB0BeWtq+rAO4 github.com/kagent-dev/kmcp v0.3.0/go.mod h1:g7wS/3m2wonRo/1DMwVoHxnilr/urPgV2hwV1DwkwrQ= github.com/kagent-dev/mockllm v0.0.5 h1:mm9Ml3NH6/E/YKVMgMwWYMNsNGkDze6I6TC0ppHZAo8= github.com/kagent-dev/mockllm v0.0.5/go.mod h1:tDLemRsTZa1NdHaDbg3sgFk9cT1QWvMPlBtLVD6I2mA= +github.com/kagent-dev/mockmcp v0.0.0-20260520211643-dcd475b74085 h1:5bYxRc6K6+JiFMYGSJpQ7y18PzTGXUmUtVXY7Fqh0Ew= +github.com/kagent-dev/mockmcp v0.0.0-20260520211643-dcd475b74085/go.mod h1:vTMste7c+XiDOQyhe6iGPKLbMy3chY/h2fL3Dox5fAE= github.com/karamaru-alpha/copyloopvar v1.2.2 h1:yfNQvP9YaGQR7VaWLYcfZUlRP2eo2vhExWKxD/fP6q0= github.com/karamaru-alpha/copyloopvar v1.2.2/go.mod h1:oY4rGZqZ879JkJMtX3RRkcXRkmUvH0x35ykgaKgsgJY= github.com/kisielk/errcheck v1.10.0 h1:Lvs/YAHP24YKg08LA8oDw2z9fJVme090RAXd90S+rrw= diff --git a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml index 00b21b6da0..a3d8025a6c 100644 --- a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml +++ b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml @@ -682,6 +682,20 @@ spec: Production deployments MUST use proper certificates. type: boolean type: object + x-kubernetes-validations: + - message: caCertSecretKey requires caCertSecretRef + rule: '!(has(self.caCertSecretKey) && size(self.caCertSecretKey) + > 0 && (!has(self.caCertSecretRef) || size(self.caCertSecretRef) + == 0))' + - message: caCertSecretRef requires caCertSecretKey + rule: '!(has(self.caCertSecretRef) && size(self.caCertSecretRef) + > 0 && (!has(self.caCertSecretKey) || size(self.caCertSecretKey) + == 0))' + - message: disableSystemCAs requires caCertSecretRef or disableVerify + (trust-nothing config rejects every upstream) + rule: '!(has(self.disableSystemCAs) && self.disableSystemCAs && + (!has(self.disableVerify) || !self.disableVerify) && (!has(self.caCertSecretRef) + || size(self.caCertSecretRef) == 0))' required: - model type: object @@ -719,22 +733,6 @@ spec: rule: '!(has(self.apiKeyPassthrough) && self.apiKeyPassthrough && (self.provider == ''Gemini'' || self.provider == ''GeminiVertexAI'' || self.provider == ''AnthropicVertexAI''))' - - message: caCertSecretKey requires caCertSecretRef - rule: '!(has(self.tls) && has(self.tls.caCertSecretKey) && size(self.tls.caCertSecretKey) - > 0 && (!has(self.tls.caCertSecretRef) || size(self.tls.caCertSecretRef) - == 0))' - - message: caCertSecretKey requires caCertSecretRef (unless disableVerify - is true) - rule: '!(has(self.tls) && (!has(self.tls.disableVerify) || !self.tls.disableVerify) - && has(self.tls.caCertSecretKey) && size(self.tls.caCertSecretKey) - > 0 && (!has(self.tls.caCertSecretRef) || size(self.tls.caCertSecretRef) - == 0))' - - message: caCertSecretRef requires caCertSecretKey (unless disableVerify - is true) - rule: '!(has(self.tls) && (!has(self.tls.disableVerify) || !self.tls.disableVerify) - && has(self.tls.caCertSecretRef) && size(self.tls.caCertSecretRef) - > 0 && (!has(self.tls.caCertSecretKey) || size(self.tls.caCertSecretKey) - == 0))' - message: openAI.tokenExchange requires apiKeySecret (the service account secret) rule: '!(has(self.openAI) && has(self.openAI.tokenExchange) && (!has(self.apiKeySecret) diff --git a/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml b/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml index f8bbcf0b1d..e0b4e272ab 100644 --- a/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml +++ b/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml @@ -179,6 +179,63 @@ spec: timeout: default: 30s type: string + tls: + description: |- + TLS configuration for the upstream MCP server connection. + Use this for HTTPS upstreams that present a certificate the agent's + system trust store does not include (corporate CA, self-signed cert + on a test fixture, internal MCP gateway). Reuses the same TLSConfig + type as ModelConfig.spec.tls, with identical semantics: disableVerify + turns off certificate validation entirely, caCertSecretRef + + caCertSecretKey point at a PEM bundle Secret in the same namespace, + and disableSystemCAs trusts only the named bundle. + properties: + caCertSecretKey: + description: |- + CACertSecretKey is the key within the Secret that contains the CA certificate data. + This field follows the same pattern as APIKeySecretKey. + Required when CACertSecretRef is set (unless DisableVerify is true). + type: string + caCertSecretRef: + description: |- + CACertSecretRef is a reference to a Kubernetes Secret containing + CA certificate(s) in PEM format. The Secret must be in the same + namespace as the ModelConfig. + When set, the certificate will be used to verify the provider's SSL certificate. + This field follows the same pattern as APIKeySecret. + type: string + disableSystemCAs: + default: false + description: |- + DisableSystemCAs disables the use of system CA certificates. + When false (default), system CA certificates are used for verification (safe behavior). + When true, only the custom CA from CACertSecretRef is trusted. + This allows strict security policies where only corporate CAs should be trusted. + type: boolean + disableVerify: + default: false + description: |- + DisableVerify disables SSL certificate verification entirely. + When false (default), SSL certificates are verified. + When true, SSL certificate verification is disabled. + WARNING: This should ONLY be used in development/testing environments. + Production deployments MUST use proper certificates. + type: boolean + type: object + x-kubernetes-validations: + - message: caCertSecretKey requires caCertSecretRef + rule: '!(has(self.caCertSecretKey) && size(self.caCertSecretKey) + > 0 && (!has(self.caCertSecretRef) || size(self.caCertSecretRef) + == 0))' + - message: caCertSecretRef requires caCertSecretKey + rule: '!(has(self.caCertSecretRef) && size(self.caCertSecretRef) + > 0 && (!has(self.caCertSecretKey) || size(self.caCertSecretKey) + == 0))' + - message: disableSystemCAs requires caCertSecretRef or disableVerify + (trust-nothing config rejects every upstream) + rule: '!(has(self.disableSystemCAs) && self.disableSystemCAs && + (!has(self.disableVerify) || !self.disableVerify) && (!has(self.caCertSecretRef) + || size(self.caCertSecretRef) == 0))' url: minLength: 1 type: string diff --git a/python/packages/kagent-adk/src/kagent/adk/models/_ssl.py b/python/packages/kagent-adk/src/kagent/adk/models/_ssl.py index ffd0b831eb..a634a58e49 100644 --- a/python/packages/kagent-adk/src/kagent/adk/models/_ssl.py +++ b/python/packages/kagent-adk/src/kagent/adk/models/_ssl.py @@ -175,13 +175,13 @@ def create_ssl_context( >>> # Use only custom CA certificate >>> ctx = create_ssl_context( - ... disable_verify=False, ca_cert_path="/etc/ssl/certs/custom/ca.crt", disable_system_cas=True + ... disable_verify=False, ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", disable_system_cas=True ... ) >>> assert isinstance(ctx, ssl.SSLContext) >>> # Use system CAs plus custom CA >>> ctx = create_ssl_context( - ... disable_verify=False, ca_cert_path="/etc/ssl/certs/custom/ca.crt", disable_system_cas=False + ... disable_verify=False, ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", disable_system_cas=False ... ) >>> assert isinstance(ctx, ssl.SSLContext) """ diff --git a/python/packages/kagent-adk/src/kagent/adk/types.py b/python/packages/kagent-adk/src/kagent/adk/types.py index 5e2f4a97af..ca6da6b4a9 100644 --- a/python/packages/kagent-adk/src/kagent/adk/types.py +++ b/python/packages/kagent-adk/src/kagent/adk/types.py @@ -11,10 +11,11 @@ from google.adk.models.anthropic_llm import Claude as ClaudeLLM from google.adk.models.google_llm import Gemini as GeminiLLM from google.adk.tools.mcp_tool import SseConnectionParams, StreamableHTTPConnectionParams -from pydantic import AliasChoices, BaseModel, Field +from pydantic import AliasChoices, BaseModel, Field, model_validator from kagent.adk._approval import make_approval_callback, strip_confirmation_parts_callback from kagent.adk._mcp_toolset import KAgentMcpToolset +from kagent.adk.models._ssl import create_ssl_context from kagent.adk._remote_a2a_tool import KAgentRemoteA2AToolset from kagent.adk.models._anthropic import KAgentAnthropicLlm from kagent.adk.models._bedrock import KAgentBedrockLlm @@ -141,14 +142,113 @@ def _convert_ollama_options(options: dict[str, str] | None) -> dict[str, Any]: return converted -class HttpMcpServerConfig(BaseModel): +def _build_tls_httpx_client_factory( + *, + disable_verify: bool, + ca_cert_path: str | None, + disable_system_cas: bool, +) -> Callable[..., httpx.AsyncClient]: + """Returns an httpx_client_factory suitable for google-adk MCP transport + connection params. The factory applies the kagent TLS configuration + (disableVerify / caCert / disableSystemCAs) to every httpx.AsyncClient + the MCP session manager constructs. + """ + ssl_ctx = create_ssl_context( + disable_verify=disable_verify, + ca_cert_path=ca_cert_path, + disable_system_cas=disable_system_cas, + ) + + def _factory( + headers: dict[str, str] | None = None, + timeout: httpx.Timeout | None = None, + auth: httpx.Auth | None = None, + ) -> httpx.AsyncClient: + kwargs: dict[str, Any] = { + "follow_redirects": True, + "verify": ssl_ctx, + } + if timeout is not None: + kwargs["timeout"] = timeout + else: + # Mirror google-adk's create_mcp_http_client defaults. + kwargs["timeout"] = httpx.Timeout(30, read=300) + if headers is not None: + kwargs["headers"] = headers + if auth is not None: + kwargs["auth"] = auth + return httpx.AsyncClient(**kwargs) + + return _factory + + +class _McpTlsMixin(BaseModel): + """Shared TLS plumbing for HttpMcpServerConfig / SseMcpServerConfig. + + The kagent controller emits TLS keys nested under `params` on the + wire (so the same JSON schema works for both Go and Python + runtimes); we lift them up at load time and install a TLS-aware + httpx_client_factory on the params instance at toolset construction + time. Lift-into-wire-config + install-into-params keeps google-adk's + upstream types unchanged and avoids exporting our own params + subclasses that would shadow the upstream names. + """ + + tls_insecure_skip_verify: bool | None = None + tls_ca_cert_path: str | None = None + tls_disable_system_cas: bool | None = None + + @model_validator(mode="before") + @classmethod + def _lift_tls_from_params(cls, values: Any) -> Any: + # The kagent controller emits TLS keys nested under `params` so + # the same JSON wire schema works for both Go and Python runtimes; + # lift them to the top level. google-adk's params types use + # pydantic's default extra="ignore", so the duplicate copies that + # remain on `params` are silently dropped at construction. + if isinstance(values, dict) and isinstance(values.get("params"), dict): + params = values["params"] + for key in ("tls_insecure_skip_verify", "tls_ca_cert_path", "tls_disable_system_cas"): + if key in params and key not in values: + values[key] = params[key] + return values + + def _apply_tls_to_params(self, params: Any) -> None: + """Install a TLS-aware httpx_client_factory on the params instance + if any TLS field is set. No-op when no TLS config was supplied so + the upstream google-adk default factory remains in place.""" + if ( + self.tls_insecure_skip_verify is None + and self.tls_ca_cert_path is None + and self.tls_disable_system_cas is None + ): + return + factory = _build_tls_httpx_client_factory( + disable_verify=self.tls_insecure_skip_verify or False, + ca_cert_path=self.tls_ca_cert_path, + disable_system_cas=self.tls_disable_system_cas or False, + ) + if hasattr(params, "httpx_client_factory"): + params.httpx_client_factory = factory + else: + # Older google-adk (< 1.28) lacked httpx_client_factory on + # SseConnectionParams. The pyproject floor forbids this in + # production; warn loudly for stale dev environments. + logger.warning( + "TLS configuration ignored on %s: google-adk does not expose " + "httpx_client_factory on this params type — upgrade to ≥ 1.28.1.", + type(params).__name__, + ) + + +class HttpMcpServerConfig(_McpTlsMixin): params: StreamableHTTPConnectionParams tools: list[str] = Field(default_factory=list) allowed_headers: list[str] | None = None # Headers to forward from A2A request to MCP calls require_approval: list[str] | None = None # Tools requiring human approval before execution -class SseMcpServerConfig(BaseModel): +class SseMcpServerConfig(_McpTlsMixin): params: SseConnectionParams tools: list[str] = Field(default_factory=list) allowed_headers: list[str] | None = None # Headers to forward from A2A request to MCP calls @@ -310,6 +410,10 @@ def to_agent( sts_header_provider = sts_integration.header_provider if self.http_tools: for http_tool in self.http_tools: # add http tools + # Install a TLS-aware httpx_client_factory on the params + # before constructing the toolset, so every MCP session + # the session manager opens trusts the configured CA. + http_tool._apply_tls_to_params(http_tool.params) # Create header provider combining STS and allowed headers for this tool tool_header_provider = create_header_provider( allowed_headers=http_tool.allowed_headers, @@ -326,6 +430,7 @@ def to_agent( tools_requiring_approval.update(http_tool.require_approval) if self.sse_tools: for sse_tool in self.sse_tools: # add sse tools + sse_tool._apply_tls_to_params(sse_tool.params) # Create header provider combining STS and allowed headers for this tool tool_header_provider = create_header_provider( allowed_headers=sse_tool.allowed_headers, diff --git a/python/packages/kagent-adk/tests/unittests/models/test_openai.py b/python/packages/kagent-adk/tests/unittests/models/test_openai.py index 392c2d8a4b..5dba0e52ac 100644 --- a/python/packages/kagent-adk/tests/unittests/models/test_openai.py +++ b/python/packages/kagent-adk/tests/unittests/models/test_openai.py @@ -597,7 +597,7 @@ def test_openai_client_with_custom_ca_certificate(): model="gpt-3.5-turbo", type="openai", api_key="fake", - tls_ca_cert_path="/etc/ssl/certs/custom/ca.crt", + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", tls_disable_system_cas=False, ) @@ -607,7 +607,7 @@ def test_openai_client_with_custom_ca_certificate(): # Verify create_ssl_context was called with correct parameters mock_create_ssl.assert_called_once_with( disable_verify=False, - ca_cert_path="/etc/ssl/certs/custom/ca.crt", + ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", disable_system_cas=False, ) @@ -633,7 +633,7 @@ def test_openai_client_with_custom_ca_only(): model="gpt-3.5-turbo", type="openai", api_key="fake", - tls_ca_cert_path="/etc/ssl/certs/custom/ca.crt", + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", tls_disable_system_cas=True, ) @@ -643,7 +643,7 @@ def test_openai_client_with_custom_ca_only(): # Verify create_ssl_context was called with disable_system_cas=True mock_create_ssl.assert_called_once_with( disable_verify=False, - ca_cert_path="/etc/ssl/certs/custom/ca.crt", + ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", disable_system_cas=True, ) @@ -691,7 +691,7 @@ def test_azure_openai_client_with_tls(): api_key="fake", azure_endpoint="https://test.openai.azure.com", api_version="2024-02-15-preview", - tls_ca_cert_path="/etc/ssl/certs/custom/ca.crt", + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", ) # Access _client to trigger client creation @@ -700,7 +700,7 @@ def test_azure_openai_client_with_tls(): # Verify SSL context was created mock_create_ssl.assert_called_once_with( disable_verify=False, - ca_cert_path="/etc/ssl/certs/custom/ca.crt", + ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", disable_system_cas=False, ) @@ -732,7 +732,7 @@ def test_openai_client_with_base_url_and_tls(): type="openai", api_key="fake", base_url="https://litellm.internal.corp:8080", - tls_ca_cert_path="/etc/ssl/certs/custom/ca.crt", + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", ) # Access _client to trigger client creation diff --git a/python/packages/kagent-adk/tests/unittests/models/test_tls_e2e.py b/python/packages/kagent-adk/tests/unittests/models/test_tls_e2e.py index d0af98388e..edd7ca6b00 100644 --- a/python/packages/kagent-adk/tests/unittests/models/test_tls_e2e.py +++ b/python/packages/kagent-adk/tests/unittests/models/test_tls_e2e.py @@ -549,7 +549,7 @@ async def test_e2e_ssl_error_contains_troubleshooting_info(): # Generate troubleshooting message message = get_ssl_troubleshooting_message( error=error, - ca_cert_path="/etc/ssl/certs/custom/ca.crt", + ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", server_url="localhost:8443", ) @@ -559,7 +559,7 @@ async def test_e2e_ssl_error_contains_troubleshooting_info(): assert "kubectl exec" in message assert "openssl x509" in message assert "openssl s_client" in message - assert "/etc/ssl/certs/custom/ca.crt" in message + assert "/etc/ssl/certs/custom/corp-ca/ca.crt" in message assert "localhost:8443" in message assert "kagent.dev/docs" in message diff --git a/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py b/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py new file mode 100644 index 0000000000..fcf02539d8 --- /dev/null +++ b/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py @@ -0,0 +1,219 @@ +"""Unit tests for the kagent TLS wiring on MCP server wire configs. + +The kagent controller emits TLS keys nested under ``params`` on the +wire (matching the Go runtime's shape). kagent-adk lifts those keys up +to the wire-config level via a pydantic ``model_validator(mode="before")`` +and installs a TLS-aware ``httpx_client_factory`` on the underlying +google-adk params instance at toolset construction time, so every MCP +session the manager opens applies the operator's TLS config. + +These tests exercise the lift and the factory installation without +standing up a real MCP server. +""" + +from unittest import mock + +import httpx +import pytest +from google.adk.tools.mcp_tool import SseConnectionParams, StreamableHTTPConnectionParams + +from kagent.adk.types import HttpMcpServerConfig, SseMcpServerConfig + + +def test_lifts_tls_fields_from_nested_params_on_streamable(): + """The controller emits ``{"params": {"url": ..., "tls_ca_cert_path": ...}}``; + the pre-validator lifts the TLS keys onto the wire-config object so + the runtime can read them without digging into google-adk's params + type (whose ``extra="ignore"`` would silently drop them).""" + cfg = HttpMcpServerConfig.model_validate( + { + "params": { + "url": "https://upstream.example.com/mcp", + "tls_insecure_skip_verify": True, + "tls_ca_cert_path": "/etc/ssl/certs/custom/corp-ca/ca.crt", + "tls_disable_system_cas": False, + }, + "tools": ["t1"], + } + ) + assert cfg.tls_insecure_skip_verify is True + assert cfg.tls_ca_cert_path == "/etc/ssl/certs/custom/corp-ca/ca.crt" + assert cfg.tls_disable_system_cas is False + + +def test_lifts_tls_fields_from_nested_params_on_sse(): + """SSE wire-config gets the same lift behavior as Streamable HTTP so + operators don't need to know which transport is in use.""" + cfg = SseMcpServerConfig.model_validate( + { + "params": { + "url": "https://upstream.example.com/sse", + "tls_insecure_skip_verify": False, + "tls_ca_cert_path": "/etc/ssl/certs/custom/corp-ca/ca.crt", + }, + } + ) + assert cfg.tls_insecure_skip_verify is False + assert cfg.tls_ca_cert_path == "/etc/ssl/certs/custom/corp-ca/ca.crt" + + +def test_lift_respects_explicit_top_level_value(): + """When the wire config already carries an explicit top-level TLS + value, the nested-params lift must not override it. Production never + emits both, but a Python caller building configs directly might.""" + cfg = HttpMcpServerConfig.model_validate( + { + "params": { + "url": "https://upstream.example.com/mcp", + "tls_ca_cert_path": "/from/params", + }, + "tools": [], + "tls_ca_cert_path": "/from/top-level", + } + ) + assert cfg.tls_ca_cert_path == "/from/top-level" + + +def test_no_tls_config_leaves_factory_default(): + """When no TLS fields are set, ``_apply_tls_to_params`` is a no-op + so google-adk's upstream default factory remains in place. This is + the common case for any RemoteMCPServer without ``spec.tls`` set.""" + params = StreamableHTTPConnectionParams(url="https://upstream.example.com/mcp") + original_factory = params.httpx_client_factory + + cfg = HttpMcpServerConfig(params=params, tools=[]) + cfg._apply_tls_to_params(cfg.params) + + assert cfg.params.httpx_client_factory is original_factory + + +def test_disable_verify_installs_factory_with_verify_false(): + """``tls_insecure_skip_verify=True`` is the test-fixture escape hatch. + The factory it installs must hand httpx ``verify=False`` so self-signed + upstreams accept the connection.""" + params = StreamableHTTPConnectionParams(url="https://upstream.example.com/mcp") + cfg = HttpMcpServerConfig( + params=params, + tools=[], + tls_insecure_skip_verify=True, + ) + + with mock.patch("kagent.adk.types.create_ssl_context") as mock_create: + mock_create.return_value = False # httpx accepts False to disable + cfg._apply_tls_to_params(cfg.params) + + mock_create.assert_called_once_with( + disable_verify=True, + ca_cert_path=None, + disable_system_cas=False, + ) + + # Calling the installed factory should produce an httpx.AsyncClient + # configured with the SSL context returned by create_ssl_context. + with mock.patch("kagent.adk.types.httpx.AsyncClient") as mock_client: + cfg.params.httpx_client_factory() + kwargs = mock_client.call_args[1] + assert kwargs["verify"] is False + + +def test_custom_ca_path_installs_factory_with_ssl_context(): + """The production case: ``tls_ca_cert_path`` pins a corporate CA. The + factory must call ``create_ssl_context`` with the same path so the + resulting SSL context trusts that CA.""" + params = StreamableHTTPConnectionParams(url="https://mcp.corp.internal/mcp") + cfg = HttpMcpServerConfig( + params=params, + tools=[], + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", + ) + + fake_ctx = object() # stand-in for an ssl.SSLContext + with mock.patch("kagent.adk.types.create_ssl_context") as mock_create: + mock_create.return_value = fake_ctx + cfg._apply_tls_to_params(cfg.params) + + mock_create.assert_called_once_with( + disable_verify=False, + ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", + disable_system_cas=False, + ) + + with mock.patch("kagent.adk.types.httpx.AsyncClient") as mock_client: + cfg.params.httpx_client_factory() + kwargs = mock_client.call_args[1] + assert kwargs["verify"] is fake_ctx + # Caller defaults preserved: follow_redirects + timeout matching + # google-adk's create_mcp_http_client defaults. + assert kwargs["follow_redirects"] is True + assert isinstance(kwargs["timeout"], httpx.Timeout) + + +def test_disable_system_cas_propagates_to_create_ssl_context(): + """``tls_disable_system_cas=True`` is the strict-trust mode: the SSL + context should trust ONLY the supplied CA. The wire-config must pass + the flag through to ``create_ssl_context`` without rewriting.""" + params = StreamableHTTPConnectionParams(url="https://mcp.corp.internal/mcp") + cfg = HttpMcpServerConfig( + params=params, + tools=[], + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", + tls_disable_system_cas=True, + ) + + with mock.patch("kagent.adk.types.create_ssl_context") as mock_create: + cfg._apply_tls_to_params(cfg.params) + + mock_create.assert_called_once_with( + disable_verify=False, + ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", + disable_system_cas=True, + ) + + +def test_factory_forwards_caller_kwargs(): + """google-adk's MCP session manager invokes the factory with optional + headers/timeout/auth on a per-session basis. The TLS-wrapping factory + must forward those kwargs so per-session overrides still work.""" + params = StreamableHTTPConnectionParams(url="https://mcp.corp.internal/mcp") + cfg = HttpMcpServerConfig( + params=params, + tools=[], + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", + ) + with mock.patch("kagent.adk.types.create_ssl_context", return_value=object()): + cfg._apply_tls_to_params(cfg.params) + + auth = httpx.BasicAuth(username="u", password="p") + timeout = httpx.Timeout(60) + with mock.patch("kagent.adk.types.httpx.AsyncClient") as mock_client: + cfg.params.httpx_client_factory( + headers={"X-Token": "abc"}, + timeout=timeout, + auth=auth, + ) + + kwargs = mock_client.call_args[1] + assert kwargs["headers"] == {"X-Token": "abc"} + assert kwargs["timeout"] is timeout + assert kwargs["auth"] is auth + + +def test_sse_params_get_factory_when_supported(): + """SSE transport should receive the same factory treatment when the + installed google-adk version is recent enough to expose + ``httpx_client_factory`` on ``SseConnectionParams`` (≥ 1.28.1, the + kagent-adk pyproject floor).""" + params = SseConnectionParams(url="https://upstream.example.com/sse") + cfg = SseMcpServerConfig( + params=params, + tools=[], + tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", + ) + + if not hasattr(SseConnectionParams, "model_fields") or "httpx_client_factory" not in SseConnectionParams.model_fields: + pytest.skip("installed google-adk lacks httpx_client_factory on SSE — upgrade to ≥ 1.28.1") + + with mock.patch("kagent.adk.types.create_ssl_context", return_value=object()): + cfg._apply_tls_to_params(cfg.params) + + assert callable(cfg.params.httpx_client_factory) diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts index f0c32803ec..d379fd0a8f 100644 --- a/ui/src/types/index.ts +++ b/ui/src/types/index.ts @@ -484,6 +484,7 @@ export interface RemoteMCPServerSpec { timeout?: string; sseReadTimeout?: string; terminateOnClose?: boolean; + tls?: TLSConfig; } export interface RemoteMCPServerResponse { From 9d90c173bbab73e0ed0384783a70150288eb2d27 Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Thu, 21 May 2026 10:47:53 -0700 Subject: [PATCH 2/8] Add secrets?: SecretMaterial[] for RMS to ui types Signed-off-by: Jeremy Alvis --- ui/src/types/index.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts index d379fd0a8f..3b68f3f7ef 100644 --- a/ui/src/types/index.ts +++ b/ui/src/types/index.ts @@ -540,6 +540,14 @@ export interface ToolServerCreateRequest { type: "RemoteMCPServer" | "MCPServer"; remoteMCPServer?: RemoteMCPServer; mcpServer?: MCPServer; + // Optional companion Secrets to create or update alongside the + // ToolServer. Each entry materializes as a key in an Opaque Secret + // owned by the created resource so K8s GC cleans up on delete. + // Names referenced here must match a Secret described in this list + // (e.g. RemoteMCPServer.spec.tls.caCertSecretRef) for inline + // materialization; pre-existing Secrets can also be referenced by + // name without supplying material here. + secrets?: SecretMaterial[]; } From 0a5f8bfbb228e13ee534fde350c40159806f2be5 Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Fri, 22 May 2026 11:08:27 -0700 Subject: [PATCH 3/8] Fix ci failure Signed-off-by: Jeremy Alvis --- python/packages/kagent-adk/tests/unittests/test_mcp_tls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py b/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py index fcf02539d8..915db3eab2 100644 --- a/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py +++ b/python/packages/kagent-adk/tests/unittests/test_mcp_tls.py @@ -210,7 +210,10 @@ def test_sse_params_get_factory_when_supported(): tls_ca_cert_path="/etc/ssl/certs/custom/corp-ca/ca.crt", ) - if not hasattr(SseConnectionParams, "model_fields") or "httpx_client_factory" not in SseConnectionParams.model_fields: + if ( + not hasattr(SseConnectionParams, "model_fields") + or "httpx_client_factory" not in SseConnectionParams.model_fields + ): pytest.skip("installed google-adk lacks httpx_client_factory on SSE — upgrade to ≥ 1.28.1") with mock.patch("kagent.adk.types.create_ssl_context", return_value=object()): From 0f263f644921c9b4cd7d1d8b7d4c628cb9cd6bdc Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Fri, 22 May 2026 12:35:41 -0700 Subject: [PATCH 4/8] Fix ci issues Signed-off-by: Jeremy Alvis --- .../crd/bases/kagent.dev_modelconfigs.yaml | 13 +-- .../bases/kagent.dev_remotemcpservers.yaml | 13 +-- go/api/v1alpha2/modelconfig_types.go | 13 +-- .../controller/reconciler/reconciler.go | 2 +- .../translator/agent/adk_api_translator.go | 18 ++-- .../httpserver/handlers/toolservers.go | 4 + .../httpserver/handlers/toolservers_test.go | 90 +++++++++++++++++++ go/core/test/e2e/remotemcpserver_tls_test.go | 48 ++++++---- .../templates/kagent.dev_modelconfigs.yaml | 13 +-- .../kagent.dev_remotemcpservers.yaml | 13 +-- 10 files changed, 172 insertions(+), 55 deletions(-) diff --git a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml index a3d8025a6c..5aa3eeff71 100644 --- a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml +++ b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml @@ -652,17 +652,18 @@ spec: properties: caCertSecretKey: description: |- - CACertSecretKey is the key within the Secret that contains the CA certificate data. - This field follows the same pattern as APIKeySecretKey. - Required when CACertSecretRef is set (unless DisableVerify is true). + CACertSecretKey is the key within the Secret that contains the + CA certificate data (PEM-encoded). Required when CACertSecretRef + is set (unless DisableVerify is true). type: string caCertSecretRef: description: |- CACertSecretRef is a reference to a Kubernetes Secret containing CA certificate(s) in PEM format. The Secret must be in the same - namespace as the ModelConfig. - When set, the certificate will be used to verify the provider's SSL certificate. - This field follows the same pattern as APIKeySecret. + namespace as the resource referencing it (ModelConfig, + RemoteMCPServer, or any future consumer of TLSConfig). + When set, the certificate will be used to verify the upstream's + SSL certificate. type: string disableSystemCAs: default: false diff --git a/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml b/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml index e0b4e272ab..a842ced11c 100644 --- a/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml +++ b/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml @@ -192,17 +192,18 @@ spec: properties: caCertSecretKey: description: |- - CACertSecretKey is the key within the Secret that contains the CA certificate data. - This field follows the same pattern as APIKeySecretKey. - Required when CACertSecretRef is set (unless DisableVerify is true). + CACertSecretKey is the key within the Secret that contains the + CA certificate data (PEM-encoded). Required when CACertSecretRef + is set (unless DisableVerify is true). type: string caCertSecretRef: description: |- CACertSecretRef is a reference to a Kubernetes Secret containing CA certificate(s) in PEM format. The Secret must be in the same - namespace as the ModelConfig. - When set, the certificate will be used to verify the provider's SSL certificate. - This field follows the same pattern as APIKeySecret. + namespace as the resource referencing it (ModelConfig, + RemoteMCPServer, or any future consumer of TLSConfig). + When set, the certificate will be used to verify the upstream's + SSL certificate. type: string disableSystemCAs: default: false diff --git a/go/api/v1alpha2/modelconfig_types.go b/go/api/v1alpha2/modelconfig_types.go index 6caaf1f434..b7de32aac0 100644 --- a/go/api/v1alpha2/modelconfig_types.go +++ b/go/api/v1alpha2/modelconfig_types.go @@ -294,15 +294,16 @@ type TLSConfig struct { // CACertSecretRef is a reference to a Kubernetes Secret containing // CA certificate(s) in PEM format. The Secret must be in the same - // namespace as the ModelConfig. - // When set, the certificate will be used to verify the provider's SSL certificate. - // This field follows the same pattern as APIKeySecret. + // namespace as the resource referencing it (ModelConfig, + // RemoteMCPServer, or any future consumer of TLSConfig). + // When set, the certificate will be used to verify the upstream's + // SSL certificate. // +optional CACertSecretRef string `json:"caCertSecretRef,omitempty"` - // CACertSecretKey is the key within the Secret that contains the CA certificate data. - // This field follows the same pattern as APIKeySecretKey. - // Required when CACertSecretRef is set (unless DisableVerify is true). + // CACertSecretKey is the key within the Secret that contains the + // CA certificate data (PEM-encoded). Required when CACertSecretRef + // is set (unless DisableVerify is true). // +optional CACertSecretKey string `json:"caCertSecretKey,omitempty"` diff --git a/go/core/internal/controller/reconciler/reconciler.go b/go/core/internal/controller/reconciler/reconciler.go index 1541b44cc5..67e28a6ea8 100644 --- a/go/core/internal/controller/reconciler/reconciler.go +++ b/go/core/internal/controller/reconciler/reconciler.go @@ -1130,7 +1130,7 @@ func (a *kagentReconciler) buildRemoteMCPServerTLSConfig(ctx context.Context, s // requests. When tlsConfig is non-nil it's installed on a cloned // transport so tool discovery honors RemoteMCPServer.spec.tls. func newHTTPClient(headers map[string]string, timeout time.Duration, tlsConfig *tls.Config) *http.Client { - var base http.RoundTripper = http.DefaultTransport + var base = http.DefaultTransport if tlsConfig != nil { // Clone the default transport to preserve its dial/keepalive // settings (proxies, dual-stack, HTTP/2) and override only the diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index 6735528291..a82b5c5c01 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -415,7 +415,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - openai.BaseModel.TLSInsecureSkipVerify, openai.BaseModel.TLSCACertPath, openai.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + openai.TLSInsecureSkipVerify, openai.TLSCACertPath, openai.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) // Populate TokenExchange fields (OpenAI-specific) addTokenExchangeConfiguration(openai, modelDeploymentData, &model.Spec) openai.APIKeyPassthrough = model.Spec.APIKeyPassthrough @@ -473,7 +473,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - anthropic.BaseModel.TLSInsecureSkipVerify, anthropic.BaseModel.TLSCACertPath, anthropic.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + anthropic.TLSInsecureSkipVerify, anthropic.TLSCACertPath, anthropic.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough if model.Spec.Anthropic != nil { @@ -522,7 +522,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - azureOpenAI.BaseModel.TLSInsecureSkipVerify, azureOpenAI.BaseModel.TLSCACertPath, azureOpenAI.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + azureOpenAI.TLSInsecureSkipVerify, azureOpenAI.TLSCACertPath, azureOpenAI.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) azureOpenAI.APIKeyPassthrough = model.Spec.APIKeyPassthrough return azureOpenAI, modelDeploymentData, secretHashBytes, nil @@ -567,7 +567,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - gemini.BaseModel.TLSInsecureSkipVerify, gemini.BaseModel.TLSCACertPath, gemini.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + gemini.TLSInsecureSkipVerify, gemini.TLSCACertPath, gemini.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) gemini.APIKeyPassthrough = model.Spec.APIKeyPassthrough return gemini, modelDeploymentData, secretHashBytes, nil @@ -608,7 +608,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - anthropic.BaseModel.TLSInsecureSkipVerify, anthropic.BaseModel.TLSCACertPath, anthropic.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + anthropic.TLSInsecureSkipVerify, anthropic.TLSCACertPath, anthropic.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) anthropic.APIKeyPassthrough = model.Spec.APIKeyPassthrough return anthropic, modelDeploymentData, secretHashBytes, nil @@ -632,7 +632,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC Options: model.Spec.Ollama.Options, } // Populate TLS fields in BaseModel - ollama.BaseModel.TLSInsecureSkipVerify, ollama.BaseModel.TLSCACertPath, ollama.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + ollama.TLSInsecureSkipVerify, ollama.TLSCACertPath, ollama.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) ollama.APIKeyPassthrough = model.Spec.APIKeyPassthrough return ollama, modelDeploymentData, secretHashBytes, nil @@ -655,7 +655,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, } // Populate TLS fields in BaseModel - gemini.BaseModel.TLSInsecureSkipVerify, gemini.BaseModel.TLSCACertPath, gemini.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + gemini.TLSInsecureSkipVerify, gemini.TLSCACertPath, gemini.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) return gemini, modelDeploymentData, secretHashBytes, nil case v1alpha2.ModelProviderBedrock: if model.Spec.Bedrock == nil { @@ -743,7 +743,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC } // Populate TLS fields in BaseModel - bedrock.BaseModel.TLSInsecureSkipVerify, bedrock.BaseModel.TLSCACertPath, bedrock.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + bedrock.TLSInsecureSkipVerify, bedrock.TLSCACertPath, bedrock.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) bedrock.APIKeyPassthrough = model.Spec.APIKeyPassthrough return bedrock, modelDeploymentData, secretHashBytes, nil @@ -792,7 +792,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC AuthUrl: model.Spec.SAPAICore.AuthURL, } - sapAICore.BaseModel.TLSInsecureSkipVerify, sapAICore.BaseModel.TLSCACertPath, sapAICore.BaseModel.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) + sapAICore.TLSInsecureSkipVerify, sapAICore.TLSCACertPath, sapAICore.TLSDisableSystemCAs = deriveTLSFields(model.Spec.TLS) sapAICore.APIKeyPassthrough = model.Spec.APIKeyPassthrough return sapAICore, modelDeploymentData, secretHashBytes, nil diff --git a/go/core/internal/httpserver/handlers/toolservers.go b/go/core/internal/httpserver/handlers/toolservers.go index 59acd66b57..84e679a82b 100644 --- a/go/core/internal/httpserver/handlers/toolservers.go +++ b/go/core/internal/httpserver/handlers/toolservers.go @@ -170,6 +170,10 @@ func (h *ToolServersHandler) handleCreateRemoteMCPServer(w ErrorResponseWriter, "toolServerName", toolRef.Name, "toolServerNamespace", toolRef.Namespace, ) + if err := Check(h.Authorizer, r, auth.Resource{Type: "ToolServer", Name: toolRef.String()}); err != nil { + w.RespondWithError(err) + return + } if err := h.KubeClient.Create(r.Context(), toolServerRequest); err != nil { w.RespondWithError(errors.NewInternalServerError("Failed to create RemoteMCPServer in Kubernetes", err)) diff --git a/go/core/internal/httpserver/handlers/toolservers_test.go b/go/core/internal/httpserver/handlers/toolservers_test.go index b82f161257..c1f06bb90d 100644 --- a/go/core/internal/httpserver/handlers/toolservers_test.go +++ b/go/core/internal/httpserver/handlers/toolservers_test.go @@ -27,9 +27,22 @@ import ( "github.com/kagent-dev/kagent/go/core/internal/httpserver/auth" "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" common "github.com/kagent-dev/kagent/go/core/internal/utils" + pkgauth "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kmcp/api/v1alpha1" ) +// denyAuthorizer satisfies pkgauth.Authorizer by refusing every Check. +// Used to pin the authorization gate on the create endpoints: a request +// from an unauthorized caller must surface a 403 BEFORE the handler +// reaches KubeClient.Create or createOrUpdateCompanionSecrets. +type denyAuthorizer struct{} + +func (denyAuthorizer) Check(_ context.Context, _ pkgauth.Principal, _ pkgauth.Verb, _ pkgauth.Resource) error { + return assert.AnError +} + +var _ pkgauth.Authorizer = denyAuthorizer{} + func TestToolServersHandler(t *testing.T) { scheme := runtime.NewScheme() @@ -564,6 +577,83 @@ func TestToolServersHandler(t *testing.T) { assert.Equal(t, []byte("OLD"), fresh.Data["ca.crt"]) }) + // AuthorizationRequired_RemoteMCPServer pins the authz gate on the + // RMS create path. A caller the authorizer rejects must get a 403 + // AND no RMS, no companion Secret should land in the cluster. + t.Run("AuthorizationRequired_RemoteMCPServer", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + // Swap the Noop authorizer for one that denies every Check. + handler.Authorizer = denyAuthorizer{} + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "RemoteMCPServer", + RemoteMCPServer: &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "denied-rms", Namespace: "default"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "should not be created", + URL: "https://x/y", + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "denied-rms-ca", Key: "ca.crt", Value: "PEM"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "unauthorized-user") + + handler.HandleCreateToolServer(responseRecorder, req) + + assert.Equal(t, http.StatusForbidden, responseRecorder.Code, + "unauthorized RMS create must surface 403") + // Neither the RMS nor the companion Secret should have been + // created — the authz gate fires before any KubeClient.Create. + rms := &v1alpha2.RemoteMCPServer{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "denied-rms"}, rms) + assert.Error(t, err, "denied request must not create the RemoteMCPServer") + secret := &corev1.Secret{} + err = kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "denied-rms-ca"}, secret) + assert.Error(t, err, "denied request must not create the companion Secret") + }) + + // AuthorizationRequired_MCPServer pins the symmetric authz gate + // on the kmcp MCPServer create path, so a regression on either + // branch surfaces in the test suite. + t.Run("AuthorizationRequired_MCPServer", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + handler.Authorizer = denyAuthorizer{} + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "MCPServer", + MCPServer: &v1alpha1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "denied-mcp", Namespace: "default"}, + Spec: v1alpha1.MCPServerSpec{ + Deployment: v1alpha1.MCPServerDeployment{ + Image: "example/kmcp:latest", + Port: 8080, + Cmd: "/bin/serve", + }, + }, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "unauthorized-user") + + handler.HandleCreateToolServer(responseRecorder, req) + + assert.Equal(t, http.StatusForbidden, responseRecorder.Code, + "unauthorized MCPServer create must surface 403") + mcp := &v1alpha1.MCPServer{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "denied-mcp"}, mcp) + assert.Error(t, err, "denied request must not create the MCPServer") + }) + t.Run("ToolServerAlreadyExists", func(t *testing.T) { handler, kubeClient, _, responseRecorder := setupHandler(t) diff --git a/go/core/test/e2e/remotemcpserver_tls_test.go b/go/core/test/e2e/remotemcpserver_tls_test.go index 54702dc37f..366417612d 100644 --- a/go/core/test/e2e/remotemcpserver_tls_test.go +++ b/go/core/test/e2e/remotemcpserver_tls_test.go @@ -29,6 +29,7 @@ import ( "math/big" "net" "net/http" + "strings" "testing" "time" @@ -52,11 +53,11 @@ import ( // mock LLM. type mockmcpFixture struct { server *mockmcp.Server - baseURL string // dial URL from inside the cluster - mcpURL string // dial URL with the /mcp path appended - sseURL string // dial URL with the /sse path appended - caCertPEM []byte // when TLS, the PEM bundle to install as a Secret (nil for plaintext fixtures) - hostBindURL string // local 127.0.0.1 URL for debug logging — NOT the cluster-facing URL + baseURL string // dial URL from inside the cluster + mcpURL string // dial URL with the /mcp path appended + sseURL string // dial URL with the /sse path appended + caCertPEM []byte // when TLS, the PEM bundle to install as a Secret (nil for plaintext fixtures) + hostBindURL string // local 127.0.0.1 URL for debug logging — NOT the cluster-facing URL } // setupMockMCP starts a mockmcp fixture. When withTLS is true a fresh @@ -102,7 +103,12 @@ func setupMockMCP(t *testing.T, withTLS bool, opts mockmcp.Options) *mockmcpFixt _ = server.Stop(ctx) }) - clusterURL := buildK8sURL(hostURL) + // buildK8sURL hard-codes "http://" — fine for the mockllm path it + // was written for, but mockmcp's TLS scenarios serve HTTPS and the + // scheme must round-trip to the controller's dial config. Reuse + // buildK8sURL to pick the right host (kind gateway IP / docker + // alias), then restore the actual scheme mockmcp listened on. + clusterURL := strings.Replace(buildK8sURL(hostURL), "http://", schemeOf(hostURL)+"://", 1) return &mockmcpFixture{ server: server, @@ -114,6 +120,18 @@ func setupMockMCP(t *testing.T, withTLS bool, opts mockmcp.Options) *mockmcpFixt } } +// schemeOf returns "http" or "https" depending on the URL's prefix. +// Used by setupMockMCP to preserve mockmcp's actual listening scheme +// when buildK8sURL rewrites the host (which it does by string-edit, +// not by url.Parse, so it can't be asked to preserve the scheme +// itself without changing its signature for one caller). +func schemeOf(rawURL string) string { + if strings.HasPrefix(rawURL, "https://") { + return "https" + } + return "http" +} + // createCASecret writes the CA PEM produced by setupMockMCP into a // Kubernetes Secret in the kagent namespace under the given name and // key. Registers for cleanup. Returns the Secret so the caller can @@ -223,15 +241,15 @@ func generateSelfSignedCert(t *testing.T, dnsNames []string, ips []net.IP) (cert require.NoError(t, err) tmpl := &x509.Certificate{ - SerialNumber: big.NewInt(time.Now().UnixNano()), - Subject: pkix.Name{CommonName: "kagent-e2e-mockmcp"}, - NotBefore: time.Now().Add(-time.Hour), - NotAfter: time.Now().Add(24 * time.Hour), - KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment | x509.KeyUsageCertSign, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - DNSNames: dnsNames, - IPAddresses: ips, - IsCA: true, + SerialNumber: big.NewInt(time.Now().UnixNano()), + Subject: pkix.Name{CommonName: "kagent-e2e-mockmcp"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment | x509.KeyUsageCertSign, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + DNSNames: dnsNames, + IPAddresses: ips, + IsCA: true, BasicConstraintsValid: true, } der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key) diff --git a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml index a3d8025a6c..5aa3eeff71 100644 --- a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml +++ b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml @@ -652,17 +652,18 @@ spec: properties: caCertSecretKey: description: |- - CACertSecretKey is the key within the Secret that contains the CA certificate data. - This field follows the same pattern as APIKeySecretKey. - Required when CACertSecretRef is set (unless DisableVerify is true). + CACertSecretKey is the key within the Secret that contains the + CA certificate data (PEM-encoded). Required when CACertSecretRef + is set (unless DisableVerify is true). type: string caCertSecretRef: description: |- CACertSecretRef is a reference to a Kubernetes Secret containing CA certificate(s) in PEM format. The Secret must be in the same - namespace as the ModelConfig. - When set, the certificate will be used to verify the provider's SSL certificate. - This field follows the same pattern as APIKeySecret. + namespace as the resource referencing it (ModelConfig, + RemoteMCPServer, or any future consumer of TLSConfig). + When set, the certificate will be used to verify the upstream's + SSL certificate. type: string disableSystemCAs: default: false diff --git a/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml b/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml index e0b4e272ab..a842ced11c 100644 --- a/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml +++ b/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml @@ -192,17 +192,18 @@ spec: properties: caCertSecretKey: description: |- - CACertSecretKey is the key within the Secret that contains the CA certificate data. - This field follows the same pattern as APIKeySecretKey. - Required when CACertSecretRef is set (unless DisableVerify is true). + CACertSecretKey is the key within the Secret that contains the + CA certificate data (PEM-encoded). Required when CACertSecretRef + is set (unless DisableVerify is true). type: string caCertSecretRef: description: |- CACertSecretRef is a reference to a Kubernetes Secret containing CA certificate(s) in PEM format. The Secret must be in the same - namespace as the ModelConfig. - When set, the certificate will be used to verify the provider's SSL certificate. - This field follows the same pattern as APIKeySecret. + namespace as the resource referencing it (ModelConfig, + RemoteMCPServer, or any future consumer of TLSConfig). + When set, the certificate will be used to verify the upstream's + SSL certificate. type: string disableSystemCAs: default: false From c7707a2d8cce2993a3f50c22af3a269a50ab9bdf Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Fri, 22 May 2026 14:08:54 -0700 Subject: [PATCH 5/8] Fix e2e test Signed-off-by: Jeremy Alvis --- go/core/test/e2e/remotemcpserver_tls_test.go | 60 +++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/go/core/test/e2e/remotemcpserver_tls_test.go b/go/core/test/e2e/remotemcpserver_tls_test.go index 366417612d..c40331a0b8 100644 --- a/go/core/test/e2e/remotemcpserver_tls_test.go +++ b/go/core/test/e2e/remotemcpserver_tls_test.go @@ -29,6 +29,7 @@ import ( "math/big" "net" "net/http" + "os" "strings" "testing" "time" @@ -68,25 +69,28 @@ type mockmcpFixture struct { func setupMockMCP(t *testing.T, withTLS bool, opts mockmcp.Options) *mockmcpFixture { t.Helper() - // Bind ephemerally so concurrent tests don't fight for ports. + // Bind on all interfaces so the controller pod (inside kind) can + // reach the listener via the kind-network gateway IP. Binding to + // "127.0.0.1:0" rejects connections from outside the loopback + // interface, which is what kind pod traffic looks like after + // routing through the network gateway. Ephemeral port still keeps + // concurrent tests from fighting. if opts.Addr == "" { - opts.Addr = "127.0.0.1:0" + opts.Addr = ":0" } opts.RecordRequests = true var caPEM []byte if withTLS { - // Mint a self-signed CA + server cert. SANs include - // host.docker.internal and the Docker-bridge IP so the - // certificate validates whichever host alias the in-cluster - // pod ends up using. - certPEM, keyPEM, ca := generateSelfSignedCert(t, []string{ - "localhost", - "host.docker.internal", - }, []net.IP{ - net.ParseIP("127.0.0.1"), - net.ParseIP("172.17.0.1"), - }) + // Mint a self-signed CA + server cert. SANs must include + // whichever host alias / IP the in-cluster pod ends up + // dialing — that varies by network setup: + // * Mac without override: host.docker.internal (DNS) + // * Linux Docker default bridge: 172.17.0.1 + // * kind on Linux: kind network gateway (often 172.18.0.1) + // — passed via KAGENT_LOCAL_HOST in CI + dnsNames, ips := certSubjectAltNames() + certPEM, keyPEM, ca := generateSelfSignedCert(t, dnsNames, ips) opts.CertPEM = certPEM opts.KeyPEM = keyPEM caPEM = ca @@ -132,6 +136,36 @@ func schemeOf(rawURL string) string { return "http" } +// certSubjectAltNames returns the DNS-name and IP SAN lists for the +// mockmcp self-signed cert. Mirrors buildK8sURL's host-selection +// logic so the cert validates the same host the controller actually +// dials. Includes liberal fallbacks (kind default + Docker default) +// so the local-dev path doesn't require an explicit env override. +func certSubjectAltNames() ([]string, []net.IP) { + dns := []string{"localhost", "host.docker.internal"} + ips := []net.IP{ + net.ParseIP("127.0.0.1"), + net.ParseIP("172.17.0.1"), // Docker default bridge + net.ParseIP("172.18.0.1"), // common kind gateway + } + + // KAGENT_LOCAL_HOST is the dynamic gateway IP CI detects from + // `docker network inspect kind`. Include it as a SAN so the + // controller's TLS verification accepts whatever IP the runner + // happens to have allocated. buildK8sURL's OS-default fallbacks + // (host.docker.internal on darwin, 172.17.0.1 on linux) are + // already covered by the static DNS/IP lists above. + if override := os.Getenv("KAGENT_LOCAL_HOST"); override != "" { + if ip := net.ParseIP(override); ip != nil { + ips = append(ips, ip) + } else { + dns = append(dns, override) + } + } + + return dns, ips +} + // createCASecret writes the CA PEM produced by setupMockMCP into a // Kubernetes Secret in the kagent namespace under the given name and // key. Registers for cleanup. Returns the Secret so the caller can From eefa465b1b009d782c7661b6c28885556a5c41ab Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Tue, 26 May 2026 08:47:03 -0700 Subject: [PATCH 6/8] Fixes based on claude review Signed-off-by: Jeremy Alvis --- .../bases/kagent.dev_remotemcpservers.yaml | 6 ++ go/api/v1alpha2/remotemcpserver_types.go | 5 ++ .../controller/reconciler/reconciler.go | 54 ++++++++++++ .../controller/reconciler/reconciler_test.go | 85 +++++++++++++++++++ .../translator/agent/adk_api_translator.go | 52 +++++++++--- .../controller/translator/agent/compiler.go | 11 ++- .../translator/agent/manifest_builder.go | 49 ++++++++--- .../agent/remotemcpserver_tls_test.go | 74 ++++++++++++++++ .../httpserver/handlers/companion_secrets.go | 33 ++++++- .../httpserver/handlers/modelconfig.go | 5 ++ .../httpserver/handlers/toolservers.go | 24 ++++-- .../httpserver/handlers/toolservers_test.go | 55 ++++++++++++ .../sandboxbackend/agentsxk8s/agentsxk8s.go | 15 ++++ go/core/pkg/sandboxbackend/backend.go | 7 ++ .../kagent.dev_remotemcpservers.yaml | 6 ++ 15 files changed, 445 insertions(+), 36 deletions(-) diff --git a/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml b/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml index a842ced11c..4ff81906eb 100644 --- a/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml +++ b/go/api/config/crd/bases/kagent.dev_remotemcpservers.yaml @@ -321,6 +321,12 @@ spec: Important: Run "make" to regenerate code after modifying this file format: int64 type: integer + secretHash: + description: |- + SecretHash stores a hash of the TLS Secret referenced by spec.tls so + agents that consume this RemoteMCPServer can detect cert rotation and + roll on the next reconcile. Empty when spec.tls.caCertSecretRef is unset. + type: string type: object type: object served: true diff --git a/go/api/v1alpha2/remotemcpserver_types.go b/go/api/v1alpha2/remotemcpserver_types.go index d63cc75347..aaa15125be 100644 --- a/go/api/v1alpha2/remotemcpserver_types.go +++ b/go/api/v1alpha2/remotemcpserver_types.go @@ -102,6 +102,11 @@ type RemoteMCPServerStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` // +optional DiscoveredTools []*MCPTool `json:"discoveredTools,omitempty"` + // SecretHash stores a hash of the TLS Secret referenced by spec.tls so + // agents that consume this RemoteMCPServer can detect cert rotation and + // roll on the next reconcile. Empty when spec.tls.caCertSecretRef is unset. + // +optional + SecretHash string `json:"secretHash,omitempty"` } type MCPTool struct { diff --git a/go/core/internal/controller/reconciler/reconciler.go b/go/core/internal/controller/reconciler/reconciler.go index 67e28a6ea8..0e697d9fcb 100644 --- a/go/core/internal/controller/reconciler/reconciler.go +++ b/go/core/internal/controller/reconciler/reconciler.go @@ -437,6 +437,18 @@ func (a *kagentReconciler) ReconcileKagentModelConfig(ctx context.Context, req c if kubeErr := a.kube.Get(ctx, namespacedName, secret); kubeErr != nil { err = multierror.Append(err, fmt.Errorf("failed to get secret %s: %w", modelConfig.Spec.TLS.CACertSecretRef, kubeErr)) } else { + // Surface the misconfiguration on the ModelConfig's Accepted + // condition: mounting an absent key would crash the agent at + // startup with FileNotFoundError without explaining which + // resource owns the bad reference. Translation still proceeds + // using whatever path the spec derived — agents fail loudly + // at startup, which matches the existing "missing-Secret" + // surface above. + if modelConfig.Spec.TLS.CACertSecretKey != "" { + if _, ok := secret.Data[modelConfig.Spec.TLS.CACertSecretKey]; !ok { + err = multierror.Append(err, fmt.Errorf("tls secret %s does not contain key %q", modelConfig.Spec.TLS.CACertSecretRef, modelConfig.Spec.TLS.CACertSecretKey)) + } + } secrets = append(secrets, secretRef{ NamespacedName: namespacedName, Secret: secret, @@ -604,6 +616,15 @@ func (a *kagentReconciler) ReconcileKagentRemoteMCPServer(ctx context.Context, r GroupKind: server.GroupVersionKind().GroupKind().String(), } + // Compute the TLS-Secret hash before tool discovery so the status + // reflects the operator's current spec.tls.caCertSecretRef contents. + // Agents that mount this Secret read the hash from Status.SecretHash + // at translate time so an in-place cert rotation triggers a rollout + // even when the Secret name (and therefore the cert path) didn't + // change. Missing-Secret errors are folded into the registration + // error path so a misconfigured RMS still surfaces a Failed condition. + secretHash, secretErr := a.computeRemoteMCPServerSecretHash(ctx, server) + l.Info("registering remote MCP server", "url", server.Spec.URL, "protocol", server.Spec.Protocol) start := time.Now() tools, err := a.upsertToolServerForRemoteMCPServer(ctx, dbServer, server) @@ -619,12 +640,16 @@ func (a *kagentReconciler) ReconcileKagentRemoteMCPServer(ctx context.Context, r } else { l.Info("successfully registered remote MCP server", "url", server.Spec.URL, "toolCount", len(tools), "duration", time.Since(start)) } + if secretErr != nil { + err = multierror.Append(err, secretErr) + } // update the tool server status as the agents depend on it if err := a.reconcileRemoteMCPServerStatus( ctx, server, tools, + secretHash, err, ); err != nil { return fmt.Errorf("failed to reconcile remote mcp server status %s: %w", req.NamespacedName, err) @@ -633,10 +658,37 @@ func (a *kagentReconciler) ReconcileKagentRemoteMCPServer(ctx context.Context, r return nil } +// computeRemoteMCPServerSecretHash returns a hash over the TLS Secret +// referenced by spec.tls.caCertSecretRef, matching the shape used by +// ModelConfig's secret hash so agents can detect cert rotation. Returns +// the empty string (no error) when no TLS Secret is referenced. Also +// validates that the named key exists in the Secret so the operator +// gets a clear Accepted=false on the RMS rather than a startup crash +// (FileNotFoundError from the Python ADK) on every consuming agent — +// mirrors the equivalent check in ReconcileKagentModelConfig. +func (a *kagentReconciler) computeRemoteMCPServerSecretHash(ctx context.Context, server *v1alpha2.RemoteMCPServer) (string, error) { + tlsSpec := server.Spec.TLS + if tlsSpec == nil || tlsSpec.CACertSecretRef == "" { + return "", nil + } + secret := &corev1.Secret{} + nn := types.NamespacedName{Namespace: server.Namespace, Name: tlsSpec.CACertSecretRef} + if err := a.kube.Get(ctx, nn, secret); err != nil { + return "", fmt.Errorf("failed to get TLS secret %s: %w", tlsSpec.CACertSecretRef, err) + } + if tlsSpec.CACertSecretKey != "" { + if _, ok := secret.Data[tlsSpec.CACertSecretKey]; !ok { + return "", fmt.Errorf("tls secret %s does not contain key %q", tlsSpec.CACertSecretRef, tlsSpec.CACertSecretKey) + } + } + return computeStatusSecretHash([]secretRef{{NamespacedName: nn, Secret: secret}}), nil +} + func (a *kagentReconciler) reconcileRemoteMCPServerStatus( ctx context.Context, server *v1alpha2.RemoteMCPServer, discoveredTools []*v1alpha2.MCPTool, + secretHash string, err error, ) error { var ( @@ -664,12 +716,14 @@ func (a *kagentReconciler) reconcileRemoteMCPServerStatus( // only update if the status has changed to prevent looping the reconciler if !conditionChanged && server.Status.ObservedGeneration == server.Generation && + server.Status.SecretHash == secretHash && reflect.DeepEqual(server.Status.DiscoveredTools, discoveredTools) { return nil } server.Status.ObservedGeneration = server.Generation server.Status.DiscoveredTools = discoveredTools + server.Status.SecretHash = secretHash if err := a.kube.Status().Update(ctx, server); err != nil { return fmt.Errorf("failed to update remote mcp server status: %w", err) diff --git a/go/core/internal/controller/reconciler/reconciler_test.go b/go/core/internal/controller/reconciler/reconciler_test.go index d09217c078..0d69b455d6 100644 --- a/go/core/internal/controller/reconciler/reconciler_test.go +++ b/go/core/internal/controller/reconciler/reconciler_test.go @@ -683,6 +683,91 @@ func TestValidateCrossNamespaceReferences(t *testing.T) { } } +// TestComputeRemoteMCPServerSecretHash pins the controller-side shape of +// the RMS TLS-Secret hash: the empty string when no TLS Secret is +// referenced, a deterministic hex string when one is, and a different +// hex string after the Secret contents rotate. Agents fold this value +// into their config-hash so an in-place cert rotation triggers a +// rollout (the Python ADK loads the cert at startup, so without a +// rollout pods keep the stale trust chain in memory). +func TestComputeRemoteMCPServerSecretHash(t *testing.T) { + scheme := clientgoscheme.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + caV1 := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-ca", Namespace: "ns"}, + Data: map[string][]byte{"ca.crt": []byte("PEM-V1")}, + } + caV2 := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-ca", Namespace: "ns"}, + Data: map[string][]byte{"ca.crt": []byte("PEM-V2")}, + } + + rmsNoTLS := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "x", Namespace: "ns"}, + Spec: v1alpha2.RemoteMCPServerSpec{URL: "https://x/y"}, + } + rmsWithTLS := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "x", Namespace: "ns"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + URL: "https://x/y", + TLS: &v1alpha2.TLSConfig{CACertSecretRef: "corp-ca", CACertSecretKey: "ca.crt"}, + }, + } + + t.Run("no TLS → empty hash", func(t *testing.T) { + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + r := &kagentReconciler{kube: kube} + hash, err := r.computeRemoteMCPServerSecretHash(context.Background(), rmsNoTLS) + require.NoError(t, err) + assert.Empty(t, hash) + }) + + t.Run("missing secret → error", func(t *testing.T) { + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + r := &kagentReconciler{kube: kube} + _, err := r.computeRemoteMCPServerSecretHash(context.Background(), rmsWithTLS) + require.Error(t, err) + }) + + t.Run("missing key in present secret → error", func(t *testing.T) { + // Secret exists but has the wrong key — the operator typo'd + // caCertSecretKey. Without this guard the agent would mount + // the Secret, fail to find the file at startup, and produce a + // FileNotFoundError that doesn't identify the resource owner. + // Surfacing the error on the RMS's Accepted condition gives + // the operator a precise pointer. + wrongKey := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-ca", Namespace: "ns"}, + Data: map[string][]byte{"other.crt": []byte("PEM")}, + } + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(wrongKey).Build() + r := &kagentReconciler{kube: kube} + _, err := r.computeRemoteMCPServerSecretHash(context.Background(), rmsWithTLS) + require.Error(t, err) + assert.Contains(t, err.Error(), "ca.crt") + }) + + t.Run("present secret → stable hex; rotation → different hex", func(t *testing.T) { + kube1 := fake.NewClientBuilder().WithScheme(scheme).WithObjects(caV1.DeepCopy()).Build() + kube2 := fake.NewClientBuilder().WithScheme(scheme).WithObjects(caV2.DeepCopy()).Build() + r1 := &kagentReconciler{kube: kube1} + r2 := &kagentReconciler{kube: kube2} + + h1, err := r1.computeRemoteMCPServerSecretHash(context.Background(), rmsWithTLS) + require.NoError(t, err) + require.NotEmpty(t, h1) + + h1Again, err := r1.computeRemoteMCPServerSecretHash(context.Background(), rmsWithTLS) + require.NoError(t, err) + assert.Equal(t, h1, h1Again, "same Secret content must produce identical hash") + + h2, err := r2.computeRemoteMCPServerSecretHash(context.Background(), rmsWithTLS) + require.NoError(t, err) + assert.NotEqual(t, h1, h2, "rotating the Secret content must change the hash") + }) +} + // TestRemoteMCPRegistrationTimeout verifies that remoteMCPRegistrationTimeout // returns spec.timeout when set and falls back to the package default otherwise. func TestRemoteMCPRegistrationTimeout(t *testing.T) { diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index a82b5c5c01..1d8468ce87 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -284,6 +284,14 @@ func deriveTLSFields(tlsConfig *v1alpha2.TLSConfig) (insecureSkipVerify *bool, c // RemoteMCPServer path (translateRemoteMCPServerTarget) appends directly to the // already-merged modelDeploymentData rather than through // mergeDeploymentData. +// +// Spec validation (Secret exists, named key present) is the reconciler's +// job for both ModelConfig and RemoteMCPServer — see the TLS branches of +// ReconcileKagentModelConfig and ReconcileKagentRemoteMCPServer. The +// translator trusts that the Status has already surfaced any +// misconfiguration; mounting an absent key here would crash the agent at +// startup, but the operator gets the early signal on the resource's own +// Accepted condition. func addTLSConfiguration(modelDeploymentData *modelDeploymentData, tlsConfig *v1alpha2.TLSConfig) { if tlsConfig == nil { return @@ -867,7 +875,7 @@ func (a *adkApiTranslator) translateSseHttpTool(ctx context.Context, server *v1a return params, nil } -func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, mdd *modelDeploymentData, agentNamespace string, toolServer *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) error { +func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, mdd *modelDeploymentData, agentNamespace string, toolServer *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) ([]byte, error) { gvk := toolServer.GroupKind() switch gvk { @@ -890,12 +898,12 @@ func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent * err := a.kube.Get(ctx, mcpServerRef, mcpServer) if err != nil { - return err + return nil, err } remoteMcpServer, err := ConvertMCPServerToRemoteMCPServer(mcpServer) if err != nil { - return err + return nil, err } return a.translateRemoteMCPServerTarget(ctx, agent, mdd, remoteMcpServer, toolServer, agentHeaders, proxyURL) @@ -914,7 +922,7 @@ func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent * err := a.kube.Get(ctx, remoteMcpServerRef, remoteMcpServer) if err != nil { - return err + return nil, err } // RemoteMCPServer uses user-supplied URLs, but if the URL points to an internal k8s service, @@ -939,26 +947,26 @@ func (a *adkApiTranslator) translateMCPServerTarget(ctx context.Context, agent * err := a.kube.Get(ctx, svcRef, svc) if err != nil { - return err + return nil, err } remoteMcpServer, err := ConvertServiceToRemoteMCPServer(svc) if err != nil { - return err + return nil, err } return a.translateRemoteMCPServerTarget(ctx, agent, mdd, remoteMcpServer, toolServer, agentHeaders, proxyURL) default: - return fmt.Errorf("unknown tool server type: %s", gvk) + return nil, fmt.Errorf("unknown tool server type: %s", gvk) } } -func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, mdd *modelDeploymentData, remoteMcpServer *v1alpha2.RemoteMCPServer, mcpServerTool *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) error { +func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, agent *adk.AgentConfig, mdd *modelDeploymentData, remoteMcpServer *v1alpha2.RemoteMCPServer, mcpServerTool *v1alpha2.McpServerTool, agentHeaders map[string]string, proxyURL string) ([]byte, error) { switch remoteMcpServer.Spec.Protocol { case v1alpha2.RemoteMCPServerProtocolSse: tool, err := a.translateSseHttpTool(ctx, remoteMcpServer, agentHeaders, proxyURL) if err != nil { - return err + return nil, err } agent.SseTools = append(agent.SseTools, adk.SseMcpServerConfig{ Params: *tool, @@ -969,7 +977,7 @@ func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, a default: tool, err := a.translateStreamableHttpTool(ctx, remoteMcpServer, agentHeaders, proxyURL) if err != nil { - return err + return nil, err } agent.HttpTools = append(agent.HttpTools, adk.HttpMcpServerConfig{ Params: *tool, @@ -980,11 +988,31 @@ func (a *adkApiTranslator) translateRemoteMCPServerTarget(ctx context.Context, a } // Mount the CA Secret on the agent pod when the RemoteMCPServer pins a TLS bundle. // Converters that synthesize RMSs from in-cluster MCPServer/Service - // references don't set Spec.TLS, so this is a no-op for those. + // references don't set Spec.TLS, so this is a no-op for those. Returns + // the controller-resolved TLS Secret hash so callers can mix it into + // the agent's config hash — that's the signal that drives a rollout + // when the CA Secret rotates in place (same Secret name, new PEM). if mdd != nil { addTLSConfiguration(mdd, remoteMcpServer.Spec.TLS) } - return nil + return remoteMCPServerSecretHashBytes(remoteMcpServer), nil +} + +// remoteMCPServerSecretHashBytes returns the hex-decoded bytes of the +// RMS's Status.SecretHash so the agent translator can fold them into the +// agent's config hash. Returns nil (no contribution, no error) when the +// status hash is empty or malformed — the controller is responsible for +// keeping Status.SecretHash in sync, and a transient missing/garbage +// value should not block agent translation. +func remoteMCPServerSecretHashBytes(remoteMcpServer *v1alpha2.RemoteMCPServer) []byte { + if remoteMcpServer == nil || remoteMcpServer.Status.SecretHash == "" { + return nil + } + decoded, err := hex.DecodeString(remoteMcpServer.Status.SecretHash) + if err != nil { + return nil + } + return decoded } // Helper functions diff --git a/go/core/internal/controller/translator/agent/compiler.go b/go/core/internal/controller/translator/agent/compiler.go index 801297ad15..93cb893a58 100644 --- a/go/core/internal/controller/translator/agent/compiler.go +++ b/go/core/internal/controller/translator/agent/compiler.go @@ -252,10 +252,19 @@ func (a *adkApiTranslator) translateInlineAgent(ctx context.Context, agent v1alp switch { case tool.McpServer != nil: - err := a.translateMCPServerTarget(ctx, cfg, mdd, agent.GetNamespace(), tool.McpServer, headers, a.globalProxyURL) + toolHashBytes, err := a.translateMCPServerTarget(ctx, cfg, mdd, agent.GetNamespace(), tool.McpServer, headers, a.globalProxyURL) if err != nil { return nil, nil, nil, err } + // Fold the RemoteMCPServer's TLS-Secret hash into the agent + // config hash so an in-place rotation of the RMS CA Secret + // (same Secret name, new PEM) triggers a rollout — the + // agent pod has the old cert cached in memory and would + // otherwise keep dialing with stale trust. Mirrors how + // ModelConfig.Status.SecretHash flows in above. + if len(toolHashBytes) > 0 { + secretHashBytes = append(secretHashBytes, toolHashBytes...) + } case tool.Agent != nil: agentRef := tool.Agent.NamespacedName(agent.GetNamespace()) diff --git a/go/core/internal/controller/translator/agent/manifest_builder.go b/go/core/internal/controller/translator/agent/manifest_builder.go index 704cc8fe04..66d0148746 100644 --- a/go/core/internal/controller/translator/agent/manifest_builder.go +++ b/go/core/internal/controller/translator/agent/manifest_builder.go @@ -72,15 +72,10 @@ func (a *adkApiTranslator) BuildManifest( } // Build the pod template with a placeholder config-hash. The real - // hash is computed and stamped after plugins run (see below): a - // plugin that mutates the config Secret needs the hash to reflect - // the mutation so pods roll on the next reconcile. buildPodTemplate - // initializes the Annotations map by reference; the post-plugin - // stamp writes into that same map, which workloadObjects share via - // Go's map-reference semantics — so the stamp propagates to - // whichever workload type buildWorkloadObjects produced (Deployment - // in normal mode, Sandbox in sandbox mode) without the translator - // needing to know about the workload's concrete type. + // hash is computed and stamped onto each workload's pod template + // after plugins run (see below): a plugin that mutates the config + // Secret needs the hash to reflect the mutation so pods roll on the + // next reconcile. podTemplate := buildPodTemplate(manifestCtx, podRuntime, 0) workloadObjects, err := a.buildWorkloadObjects(ctx, manifestCtx, podTemplate) @@ -102,16 +97,42 @@ func (a *adkApiTranslator) BuildManifest( return outputs, err } - // Stamp the post-plugin config-hash. The mutation propagates to - // every workload object that embedded podTemplate, since their - // pod-template Annotations map is the same reference as - // podTemplate.Annotations (see buildPodTemplate). + // Stamp the post-plugin config-hash explicitly onto every workload + // object in the manifest. Walking outputs.Manifest is structural — + // it survives a plugin that reassigns or defensively clones the + // PodTemplateSpec.Annotations map, which would silently break a + // shared-reference scheme. New workload types only need a case in + // stampConfigHashOnWorkloads (or, for backend-emitted kinds, an + // implementation of Backend.StampPodTemplateAnnotation) for the + // hash to flow through. configHash := computeHashFromConfigSecret(configSecret.secret, inputs.SecretHashBytes) - podTemplate.Annotations["kagent.dev/config-hash"] = fmt.Sprintf("%d", configHash) + a.stampConfigHashOnWorkloads(outputs.Manifest, configHash) return outputs, nil } +// stampConfigHashOnWorkloads walks the translated manifest and sets the +// kagent.dev/config-hash annotation on every workload object's pod +// template. Deployments are stamped here directly; sandbox-backend kinds +// delegate to the backend so this file stays independent of the +// agent-sandbox CRD type. +func (a *adkApiTranslator) stampConfigHashOnWorkloads(manifest []client.Object, configHash uint64) { + const key = "kagent.dev/config-hash" + value := fmt.Sprintf("%d", configHash) + for _, obj := range manifest { + if d, ok := obj.(*appsv1.Deployment); ok { + if d.Spec.Template.Annotations == nil { + d.Spec.Template.Annotations = map[string]string{} + } + d.Spec.Template.Annotations[key] = value + continue + } + if a.sandboxBackend != nil && a.sandboxBackend.StampPodTemplateAnnotation(obj, key, value) { + continue + } + } +} + func newManifestContext(agent v1alpha2.AgentObject, dep *resolvedDeployment) manifestContext { return manifestContext{ agent: agent, diff --git a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go index 29347ad997..7996c003af 100644 --- a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go +++ b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go @@ -522,3 +522,77 @@ func Test_AdkApiTranslator_RMSTLS_CoexistsWithModelConfigTLS(t *testing.T) { assert.Len(t, tlsVolumeNames, 3, "Three TLS Secrets must produce three distinct volume names") assert.Len(t, tlsMountPaths, 3, "Three TLS Secrets must produce three distinct mount paths") } + +// Test_AdkApiTranslator_RMSTLS_SecretHashChangesAgentConfigHash pins the +// cert-rotation behavior: an in-place rotation of the RMS TLS Secret +// (same Secret name, new PEM, status SecretHash flipped by the +// controller) must change the agent's kagent.dev/config-hash so the +// deployment rolls. Without this, agent pods retain the old cert loaded +// at process startup via ssl.create_default_context. +func Test_AdkApiTranslator_RMSTLS_SecretHashChangesAgentConfigHash(t *testing.T) { + scheme := schemev1.Scheme + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + build := func(rmsHash string) string { + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "rotate-test"}} + caSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-ca", Namespace: "rotate-test"}, + Data: map[string][]byte{"ca.crt": []byte("FAKE CA PEM")}, + } + modelConfig := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "rotate-test"}, + Spec: v1alpha2.ModelConfigSpec{Model: "gpt-4o", Provider: v1alpha2.ModelProviderOpenAI}, + } + rms := &v1alpha2.RemoteMCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "corp-upstream", Namespace: "rotate-test"}, + Spec: v1alpha2.RemoteMCPServerSpec{ + Description: "Corp upstream", + URL: "https://mcp.corp.internal/mcp", + TLS: &v1alpha2.TLSConfig{ + CACertSecretRef: "corp-ca", + CACertSecretKey: "ca.crt", + }, + }, + Status: v1alpha2.RemoteMCPServerStatus{SecretHash: rmsHash}, + } + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "rotate-test"}, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Description: "Agent", + Declarative: &v1alpha2.DeclarativeAgentSpec{ + SystemMessage: "System", + ModelConfig: "model", + Tools: []*v1alpha2.Tool{{ + Type: v1alpha2.ToolProviderType_McpServer, + McpServer: &v1alpha2.McpServerTool{ + TypedReference: v1alpha2.TypedReference{ + Kind: "RemoteMCPServer", ApiGroup: "kagent.dev", Name: "corp-upstream", + }, + }, + }}, + }, + }, + } + + kube := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(ns, caSecret, modelConfig, rms, agent). + Build() + trans := translator.NewAdkApiTranslator( + kube, + types.NamespacedName{Namespace: "rotate-test", Name: "model"}, + nil, "", nil, + ) + outputs, err := translator.TranslateAgent(context.Background(), trans, agent) + require.NoError(t, err) + dep := findDeployment(t, outputs) + return dep.Spec.Template.Annotations["kagent.dev/config-hash"] + } + + preRotate := build("deadbeef") + postRotate := build("cafef00d") + assert.NotEqual(t, preRotate, postRotate, + "agent config-hash must change when RMS Status.SecretHash rotates") +} + diff --git a/go/core/internal/httpserver/handlers/companion_secrets.go b/go/core/internal/httpserver/handlers/companion_secrets.go index 3f3a00b525..aa38b7a671 100644 --- a/go/core/internal/httpserver/handlers/companion_secrets.go +++ b/go/core/internal/httpserver/handlers/companion_secrets.go @@ -7,6 +7,7 @@ import ( "maps" "strings" + "github.com/go-logr/logr" api "github.com/kagent-dev/kagent/go/api/httpapi" "github.com/kagent-dev/kagent/go/core/internal/httpserver/errors" corev1 "k8s.io/api/core/v1" @@ -33,6 +34,23 @@ func companionSecretAPIError(err error) *errors.APIError { return errors.NewInternalServerError("Failed to create or update companion secrets", err) } +// rollbackOwnerOnCompanionSecretFailure deletes the owner resource the +// caller just created when the companion-Secret pass that follows fails. +// Use it to close the partial-failure window where the owner is in K8s +// but its referenced Secrets aren't — the operator's retry of the same +// POST would otherwise hit AlreadyExists on the owner without realizing +// the prior attempt half-succeeded. Best-effort: a delete failure is +// logged via the caller's logger but does not change the outer error; +// the caller already surfaces the companion-Secret error to the client. +func rollbackOwnerOnCompanionSecretFailure(ctx context.Context, kubeClient client.Client, owner client.Object, log logr.Logger) { + if err := kubeClient.Delete(ctx, owner); err != nil && !apierrors.IsNotFound(err) { + log.Error(err, "failed to roll back owner after companion-secret failure", + "kind", owner.GetObjectKind().GroupVersionKind().Kind, + "namespace", owner.GetNamespace(), + "name", owner.GetName()) + } +} + // validateSecretMaterials checks each SecretMaterial's name and key // against Kubernetes naming rules. Returns a single error for the first // invalid material so the caller can return a 400 with a precise reason. @@ -140,17 +158,24 @@ func ownerReferenceFor(owner client.Object, gvk schema.GroupVersionKind) metav1. } // isOwnedBy reports whether the secret carries an OwnerReference back -// to the named owner of the given GVK. UID is compared when the owner -// has one (so unit tests that build owners without a live UID still -// match by name+kind, which is the intended semantic). +// to the given owner. Compares APIVersion, Kind, Name, AND UID — the UID +// match is load-bearing because a delete-recreate of an owner with the +// same name issues a fresh UID, and the prior owner's secrets may still +// be visible through K8s GC's deletion delay. Requires a non-empty UID +// on the caller's owner; in production K8s always populates this after +// Create, so a zero UID here means a programming error (calling the +// helper before persisting the owner). func isOwnedBy(secret *corev1.Secret, owner client.Object, gvk schema.GroupVersionKind) bool { + if owner.GetUID() == "" { + return false + } for _, ownerRef := range secret.GetOwnerReferences() { if ownerRef.APIVersion != gvk.GroupVersion().Identifier() || ownerRef.Kind != gvk.Kind || ownerRef.Name != owner.GetName() { continue } - if owner.GetUID() != "" && ownerRef.UID != owner.GetUID() { + if ownerRef.UID != owner.GetUID() { continue } return true diff --git a/go/core/internal/httpserver/handlers/modelconfig.go b/go/core/internal/httpserver/handlers/modelconfig.go index 4e74c0c01f..045a9c3718 100644 --- a/go/core/internal/httpserver/handlers/modelconfig.go +++ b/go/core/internal/httpserver/handlers/modelconfig.go @@ -188,6 +188,11 @@ func (h *ModelConfigHandler) HandleCreateModelConfig(w ErrorResponseWriter, r *h if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, modelConfig, modelConfigGVK, req.Secrets); err != nil { log.Error(err, "Failed to create or update companion secrets") + // Close the partial-failure window: the ModelConfig is in K8s + // but its companion Secrets aren't. The operator's retry would + // otherwise hit AlreadyExists on the ModelConfig without a hint + // that the prior attempt half-succeeded. + rollbackOwnerOnCompanionSecretFailure(r.Context(), h.KubeClient, modelConfig, log) w.RespondWithError(companionSecretAPIError(err)) return } diff --git a/go/core/internal/httpserver/handlers/toolservers.go b/go/core/internal/httpserver/handlers/toolservers.go index 84e679a82b..24cd60ac9c 100644 --- a/go/core/internal/httpserver/handlers/toolservers.go +++ b/go/core/internal/httpserver/handlers/toolservers.go @@ -128,11 +128,6 @@ func (h *ToolServersHandler) HandleCreateToolServer(w ErrorResponseWriter, r *ht return } - if err := validateSecretMaterials(toolServerRequest.Secrets); err != nil { - w.RespondWithError(errors.NewBadRequestError(err.Error(), err)) - return - } - switch toolServerRequest.Type { case ToolServerTypeRemoteMCPServer: if toolServerRequest.RemoteMCPServer == nil { @@ -175,6 +170,14 @@ func (h *ToolServersHandler) handleCreateRemoteMCPServer(w ErrorResponseWriter, return } + // validateSecretMaterials runs after authz so an unauthorized caller + // gets 403 regardless of payload shape — keeps the error surface + // dependent only on authz, not on request structure. + if err := validateSecretMaterials(secrets); err != nil { + w.RespondWithError(errors.NewBadRequestError(err.Error(), err)) + return + } + if err := h.KubeClient.Create(r.Context(), toolServerRequest); err != nil { w.RespondWithError(errors.NewInternalServerError("Failed to create RemoteMCPServer in Kubernetes", err)) return @@ -182,6 +185,11 @@ func (h *ToolServersHandler) handleCreateRemoteMCPServer(w ErrorResponseWriter, if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, toolServerRequest, remoteMCPServerGVK, secrets); err != nil { log.Error(err, "Failed to create or update companion secrets") + // Close the partial-failure window: the RMS is already in K8s + // but its companion Secrets aren't. Leaving it would force the + // operator to delete-then-retry on a confusing 500 AlreadyExists + // on the next POST. Roll back, surface the original error. + rollbackOwnerOnCompanionSecretFailure(r.Context(), h.KubeClient, toolServerRequest, log) w.RespondWithError(companionSecretAPIError(err)) return } @@ -215,6 +223,11 @@ func (h *ToolServersHandler) handleCreateMCPServer(w ErrorResponseWriter, r *htt return } + if err := validateSecretMaterials(secrets); err != nil { + w.RespondWithError(errors.NewBadRequestError(err.Error(), err)) + return + } + if err := h.KubeClient.Create(r.Context(), toolServerRequest); err != nil { w.RespondWithError(errors.NewInternalServerError("Failed to create MCPServer in Kubernetes", err)) return @@ -222,6 +235,7 @@ func (h *ToolServersHandler) handleCreateMCPServer(w ErrorResponseWriter, r *htt if err := createOrUpdateCompanionSecrets(r.Context(), h.KubeClient, toolServerRequest, mcpServerGVK, secrets); err != nil { log.Error(err, "Failed to create or update companion secrets") + rollbackOwnerOnCompanionSecretFailure(r.Context(), h.KubeClient, toolServerRequest, log) w.RespondWithError(companionSecretAPIError(err)) return } diff --git a/go/core/internal/httpserver/handlers/toolservers_test.go b/go/core/internal/httpserver/handlers/toolservers_test.go index c1f06bb90d..07d53a9224 100644 --- a/go/core/internal/httpserver/handlers/toolservers_test.go +++ b/go/core/internal/httpserver/handlers/toolservers_test.go @@ -13,6 +13,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -575,6 +576,60 @@ func TestToolServersHandler(t *testing.T) { ctrl_client.ObjectKey{Namespace: "default", Name: "stranger"}, fresh) require.NoError(t, err) assert.Equal(t, []byte("OLD"), fresh.Data["ca.crt"]) + + // Companion-secret failure must roll back the RMS so the + // operator's retry doesn't hit AlreadyExists. Pins the + // partial-failure fix; without rollback the orphan would + // be readable here. + orphan := &v1alpha2.RemoteMCPServer{} + err = kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "stranger-rms"}, orphan) + assert.True(t, apierrors.IsNotFound(err), + "RMS must be rolled back when companion-secret creation fails; got err=%v", err) + }) + + // CompanionSecretFailure_RollsBackMCPServer pins the symmetric + // rollback behavior on the kmcp MCPServer create path so a + // regression on either branch surfaces in CI. + t.Run("CompanionSecretFailure_RollsBackMCPServer", func(t *testing.T) { + handler, kubeClient, _, responseRecorder := setupHandler(t) + + preexisting := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "stranger-kmcp", Namespace: "default"}, + Type: corev1.SecretTypeOpaque, + Data: map[string][]byte{"x": []byte("OLD")}, + } + require.NoError(t, kubeClient.Create(context.Background(), preexisting)) + + reqBody := &handlers.ToolServerCreateRequest{ + Type: "MCPServer", + MCPServer: &v1alpha1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "stranger-mcp", Namespace: "default"}, + Spec: v1alpha1.MCPServerSpec{ + Deployment: v1alpha1.MCPServerDeployment{ + Image: "example/kmcp:latest", + Port: 8080, + Cmd: "/bin/serve", + }, + }, + }, + Secrets: []api.SecretMaterial{ + {Name: "stranger-kmcp", Key: "x", Value: "NEW"}, + }, + } + jsonBody, _ := json.Marshal(reqBody) + req := httptest.NewRequest("POST", "/api/toolservers/", bytes.NewBuffer(jsonBody)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + + handler.HandleCreateToolServer(responseRecorder, req) + assert.Equal(t, http.StatusBadRequest, responseRecorder.Code) + + orphan := &v1alpha1.MCPServer{} + err := kubeClient.Get(context.Background(), + ctrl_client.ObjectKey{Namespace: "default", Name: "stranger-mcp"}, orphan) + assert.True(t, apierrors.IsNotFound(err), + "MCPServer must be rolled back when companion-secret creation fails; got err=%v", err) }) // AuthorizationRequired_RemoteMCPServer pins the authz gate on the diff --git a/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go b/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go index e5940816a8..1d1c0112cc 100644 --- a/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go +++ b/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go @@ -88,6 +88,21 @@ func mapsUnion(podLabels map[string]string, agentLabels map[string]string) map[s return out } +// StampPodTemplateAnnotation sets key=value on the Sandbox's pod-template +// annotations. Returns false for objects that aren't Sandboxes so the +// caller can fall through to its other workload kinds. +func (b *Backend) StampPodTemplateAnnotation(obj client.Object, key, value string) bool { + sb, ok := obj.(*agentsandboxv1.Sandbox) + if !ok { + return false + } + if sb.Spec.PodTemplate.ObjectMeta.Annotations == nil { + sb.Spec.PodTemplate.ObjectMeta.Annotations = map[string]string{} + } + sb.Spec.PodTemplate.ObjectMeta.Annotations[key] = value + return true +} + func (b *Backend) ComputeReady(ctx context.Context, cl client.Client, nn types.NamespacedName) (metav1.ConditionStatus, string, string) { sb := &agentsandboxv1.Sandbox{} if err := cl.Get(ctx, nn, sb); err != nil { diff --git a/go/core/pkg/sandboxbackend/backend.go b/go/core/pkg/sandboxbackend/backend.go index 02b17c14af..816ad4c2e1 100644 --- a/go/core/pkg/sandboxbackend/backend.go +++ b/go/core/pkg/sandboxbackend/backend.go @@ -25,4 +25,11 @@ type Backend interface { // ComputeReady reflects implementation-specific status into condition pieces for Agent.status. ComputeReady(ctx context.Context, cl client.Client, nn types.NamespacedName) (status metav1.ConditionStatus, reason, message string) + + // StampPodTemplateAnnotation sets the given annotation on the + // backend's workload pod template, returning true if obj is a kind + // the backend produces. Used by the agent translator to stamp the + // post-plugin config-hash without needing to import the backend's + // concrete CRD types. + StampPodTemplateAnnotation(obj client.Object, key, value string) bool } diff --git a/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml b/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml index a842ced11c..4ff81906eb 100644 --- a/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml +++ b/helm/kagent-crds/templates/kagent.dev_remotemcpservers.yaml @@ -321,6 +321,12 @@ spec: Important: Run "make" to regenerate code after modifying this file format: int64 type: integer + secretHash: + description: |- + SecretHash stores a hash of the TLS Secret referenced by spec.tls so + agents that consume this RemoteMCPServer can detect cert rotation and + roll on the next reconcile. Empty when spec.tls.caCertSecretRef is unset. + type: string type: object type: object served: true From 6a57b1446bb3e1fb0577029fa0f309c720d33871 Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Tue, 26 May 2026 09:12:11 -0700 Subject: [PATCH 7/8] Fix go lint error Signed-off-by: Jeremy Alvis --- .../controller/translator/agent/remotemcpserver_tls_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go index 7996c003af..994ce5df16 100644 --- a/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go +++ b/go/core/internal/controller/translator/agent/remotemcpserver_tls_test.go @@ -595,4 +595,3 @@ func Test_AdkApiTranslator_RMSTLS_SecretHashChangesAgentConfigHash(t *testing.T) assert.NotEqual(t, preRotate, postRotate, "agent config-hash must change when RMS Status.SecretHash rotates") } - From 70d3a25c928b60a47c8d963f3d36fb9dc917f832 Mon Sep 17 00:00:00 2001 From: Jeremy Alvis Date: Tue, 26 May 2026 15:04:19 -0700 Subject: [PATCH 8/8] Remove config hash re-calculation, handle on plugin side Signed-off-by: Jeremy Alvis --- .../translator/agent/manifest_builder.go | 108 ++---- .../translator/agent/manifest_builder_test.go | 337 ------------------ .../sandboxbackend/agentsxk8s/agentsxk8s.go | 15 - go/core/pkg/sandboxbackend/backend.go | 7 - 4 files changed, 20 insertions(+), 447 deletions(-) diff --git a/go/core/internal/controller/translator/agent/manifest_builder.go b/go/core/internal/controller/translator/agent/manifest_builder.go index 66d0148746..3b059bbef5 100644 --- a/go/core/internal/controller/translator/agent/manifest_builder.go +++ b/go/core/internal/controller/translator/agent/manifest_builder.go @@ -28,9 +28,10 @@ type manifestContext struct { } type configSecretInputs struct { - secret *corev1.Secret - volumes []corev1.Volume - mounts []corev1.VolumeMount + secret *corev1.Secret + configHash uint64 + volumes []corev1.Volume + mounts []corev1.VolumeMount } type podRuntimeInputs struct { @@ -56,7 +57,7 @@ func (a *adkApiTranslator) BuildManifest( outputs := &AgentOutputs{} manifestCtx := newManifestContext(agent, inputs.Deployment) - configSecret, err := a.buildConfigSecret(manifestCtx, inputs.Config, inputs.Sandbox, inputs.AgentCard) + configSecret, err := a.buildConfigSecret(manifestCtx, inputs.Config, inputs.Sandbox, inputs.AgentCard, inputs.SecretHashBytes) if err != nil { return nil, err } @@ -71,12 +72,7 @@ func (a *adkApiTranslator) BuildManifest( return nil, err } - // Build the pod template with a placeholder config-hash. The real - // hash is computed and stamped onto each workload's pod template - // after plugins run (see below): a plugin that mutates the config - // Secret needs the hash to reflect the mutation so pods roll on the - // next reconcile. - podTemplate := buildPodTemplate(manifestCtx, podRuntime, 0) + podTemplate := buildPodTemplate(manifestCtx, podRuntime, configSecret.configHash) workloadObjects, err := a.buildWorkloadObjects(ctx, manifestCtx, podTemplate) if err != nil { @@ -93,44 +89,7 @@ func (a *adkApiTranslator) BuildManifest( outputs.AgentCard = *inputs.AgentCard } - if err := a.runPlugins(ctx, agent, outputs); err != nil { - return outputs, err - } - - // Stamp the post-plugin config-hash explicitly onto every workload - // object in the manifest. Walking outputs.Manifest is structural — - // it survives a plugin that reassigns or defensively clones the - // PodTemplateSpec.Annotations map, which would silently break a - // shared-reference scheme. New workload types only need a case in - // stampConfigHashOnWorkloads (or, for backend-emitted kinds, an - // implementation of Backend.StampPodTemplateAnnotation) for the - // hash to flow through. - configHash := computeHashFromConfigSecret(configSecret.secret, inputs.SecretHashBytes) - a.stampConfigHashOnWorkloads(outputs.Manifest, configHash) - - return outputs, nil -} - -// stampConfigHashOnWorkloads walks the translated manifest and sets the -// kagent.dev/config-hash annotation on every workload object's pod -// template. Deployments are stamped here directly; sandbox-backend kinds -// delegate to the backend so this file stays independent of the -// agent-sandbox CRD type. -func (a *adkApiTranslator) stampConfigHashOnWorkloads(manifest []client.Object, configHash uint64) { - const key = "kagent.dev/config-hash" - value := fmt.Sprintf("%d", configHash) - for _, obj := range manifest { - if d, ok := obj.(*appsv1.Deployment); ok { - if d.Spec.Template.Annotations == nil { - d.Spec.Template.Annotations = map[string]string{} - } - d.Spec.Template.Annotations[key] = value - continue - } - if a.sandboxBackend != nil && a.sandboxBackend.StampPodTemplateAnnotation(obj, key, value) { - continue - } - } + return outputs, a.runPlugins(ctx, agent, outputs) } func newManifestContext(agent v1alpha2.AgentObject, dep *resolvedDeployment) manifestContext { @@ -170,10 +129,12 @@ func (a *adkApiTranslator) buildConfigSecret( cfg *adk.AgentConfig, sandboxCfg *v1alpha2.SandboxConfig, card *server.AgentCard, + modelConfigSecretHashBytes []byte, ) (*configSecretInputs, error) { cfgJSON := "" agentCard := "" srtSettingsJSON := "" + var configHash uint64 var volumes []corev1.Volume var mounts []corev1.VolumeMount @@ -200,6 +161,14 @@ func (a *adkApiTranslator) buildConfigSecret( } if cfg != nil || srtSettingsJSON != "" { + secretData := modelConfigSecretHashBytes + if secretData == nil { + secretData = []byte{} + } + hashData := make([]byte, 0, len(secretData)+len(srtSettingsJSON)) + hashData = append(hashData, secretData...) + hashData = append(hashData, srtSettingsJSON...) + configHash = computeConfigHash([]byte(cfgJSON), []byte(agentCard), hashData) volumes = []corev1.Volume{{ Name: "config", VolumeSource: corev1.VolumeSource{ @@ -215,49 +184,12 @@ func (a *adkApiTranslator) buildConfigSecret( ObjectMeta: manifestCtx.objectMeta(), StringData: buildConfigSecretData(cfgJSON, agentCard, srtSettingsJSON), }, - volumes: volumes, - mounts: mounts, + configHash: configHash, + volumes: volumes, + mounts: mounts, }, nil } -// computeHashFromConfigSecret derives the kagent.dev/config-hash -// annotation value from the agent's config Secret. Called after plugins -// have had a chance to mutate the Secret's contents — so the hash -// reflects what the agent pod will actually load at startup, and -// plugin-driven mutations naturally trigger rollouts on the next -// reconcile. -// -// modelConfigSecretHashBytes is the controller-resolved hash over -// upstream credentials (API key, etc.) that aren't stored in the Secret -// itself but still need to participate in the rollout signal. It's not -// touched by plugins; passing it through preserves the pre/post-plugin -// hash equivalence in the no-mutation case so existing agents don't -// roll on this change alone. -// -// Returns 0 in the no-config case (matches the original gate in -// buildConfigSecret) so pods that have no config to roll on stay -// stable. -func computeHashFromConfigSecret(secret *corev1.Secret, modelConfigSecretHashBytes []byte) uint64 { - if secret == nil { - return 0 - } - cfgJSON := secret.StringData["config.json"] - agentCard := secret.StringData["agent-card.json"] - srtSettings := secret.StringData["srt-settings.json"] - if cfgJSON == "" && srtSettings == "" { - return 0 - } - - secretData := modelConfigSecretHashBytes - if secretData == nil { - secretData = []byte{} - } - hashData := make([]byte, 0, len(secretData)+len(srtSettings)) - hashData = append(hashData, secretData...) - hashData = append(hashData, srtSettings...) - return computeConfigHash([]byte(cfgJSON), []byte(agentCard), hashData) -} - func buildConfigSecretData(cfgJSON, agentCard, srtSettingsJSON string) map[string]string { data := map[string]string{ "config.json": cfgJSON, diff --git a/go/core/internal/controller/translator/agent/manifest_builder_test.go b/go/core/internal/controller/translator/agent/manifest_builder_test.go index 6f5008f7dc..8c01a2af58 100644 --- a/go/core/internal/controller/translator/agent/manifest_builder_test.go +++ b/go/core/internal/controller/translator/agent/manifest_builder_test.go @@ -1,26 +1,11 @@ package agent import ( - "context" "encoding/json" - "fmt" - "strings" "testing" - "github.com/kagent-dev/kagent/go/api/adk" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/agentsxk8s" - pkgtranslator "github.com/kagent-dev/kagent/go/core/pkg/translator" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - schemev1 "k8s.io/client-go/kubernetes/scheme" - agentsandboxv1 "sigs.k8s.io/agent-sandbox/api/v1alpha1" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/client/fake" ) func TestBuildSRTSettingsJSON_DefaultDenyConfig(t *testing.T) { @@ -132,325 +117,3 @@ func TestBuildConfigSecretData_IncludesSRTSettingsWhenPresent(t *testing.T) { t.Fatal("srt-settings.json should be present when non-empty") } } - -// TestComputeHashFromConfigSecret_NilSecretReturnsZero pins the -// fail-safe shape of the helper: when there's no Secret to read, we -// don't fabricate a hash. -func TestComputeHashFromConfigSecret_NilSecretReturnsZero(t *testing.T) { - if got := computeHashFromConfigSecret(nil, nil); got != 0 { - t.Fatalf("nil secret should hash to 0, got %d", got) - } -} - -// TestComputeHashFromConfigSecret_EmptyConfigReturnsZero preserves the -// pre-PR no-config behavior. Agents without compiled config produce a -// zero hash so pods that have nothing to roll on stay stable. -func TestComputeHashFromConfigSecret_EmptyConfigReturnsZero(t *testing.T) { - secret := &corev1.Secret{ - StringData: map[string]string{ - "config.json": "", - "agent-card.json": "", - }, - } - if got := computeHashFromConfigSecret(secret, nil); got != 0 { - t.Fatalf("empty config secret should hash to 0, got %d", got) - } -} - -// TestComputeHashFromConfigSecret_DifferentContentDifferentHash is the -// load-bearing assertion for the plugin-restamp fix: if a plugin mutates -// the Secret's config.json, the recomputed hash must differ from the -// pre-mutation hash so pods actually roll. -func TestComputeHashFromConfigSecret_DifferentContentDifferentHash(t *testing.T) { - pre := &corev1.Secret{StringData: map[string]string{ - "config.json": `{"url":"https://upstream/mcp"}`, - "agent-card.json": `{"name":"a"}`, - }} - post := &corev1.Secret{StringData: map[string]string{ - "config.json": `{"url":"http://upstream:443/mcp"}`, - "agent-card.json": `{"name":"a"}`, - }} - - preHash := computeHashFromConfigSecret(pre, []byte("model-hash")) - postHash := computeHashFromConfigSecret(post, []byte("model-hash")) - - assert.NotEqual(t, preHash, postHash, "mutating config.json must change the hash") -} - -// TestComputeHashFromConfigSecret_SameContentSameHash pins determinism: -// the recompute path must produce identical hashes for identical inputs -// so the no-plugin case doesn't cause spurious pod rollouts on -// controller restart or version upgrade. -func TestComputeHashFromConfigSecret_SameContentSameHash(t *testing.T) { - secret := &corev1.Secret{StringData: map[string]string{ - "config.json": `{"url":"https://upstream/mcp"}`, - "agent-card.json": `{"name":"a"}`, - }} - a := computeHashFromConfigSecret(secret, []byte("model-hash")) - b := computeHashFromConfigSecret(secret, []byte("model-hash")) - assert.Equal(t, a, b) -} - -// configSecretMutatingPlugin walks outputs.Manifest, finds the agent -// config Secret by name, parses config.json, mutates it, and -// re-marshals. Used to verify that BuildManifest re-stamps the -// kagent.dev/config-hash annotation after plugins so the workload -// rolls on the next reconcile. -type configSecretMutatingPlugin struct { - agentName string - mutate func(cfg *adk.AgentConfig) -} - -func (p *configSecretMutatingPlugin) ProcessAgent(_ context.Context, _ v1alpha2.AgentObject, outputs *pkgtranslator.AgentOutputs) error { - for _, obj := range outputs.Manifest { - secret, ok := obj.(*corev1.Secret) - if !ok || secret.GetName() != p.agentName { - continue - } - raw := secret.StringData["config.json"] - if raw == "" { - return nil - } - var cfg adk.AgentConfig - if err := json.Unmarshal([]byte(raw), &cfg); err != nil { - return fmt.Errorf("unmarshal config.json: %w", err) - } - p.mutate(&cfg) - out, err := json.Marshal(&cfg) - if err != nil { - return fmt.Errorf("marshal config.json: %w", err) - } - secret.StringData["config.json"] = string(out) - return nil - } - return nil -} - -func (p *configSecretMutatingPlugin) GetOwnedResourceTypes() []client.Object { return nil } - -// TestBuildManifest_PluginMutatesSecret_HashReflectsPostPluginContent -// is the integration regression for the OSS gap that forced operators -// to `kubectl rollout restart` after toggling enterprise plugins. -// -// Pre-fix: buildPodTemplate stamped the kagent.dev/config-hash -// annotation from the pre-plugin AgentConfig, then plugins ran and -// mutated the Secret's config.json. The annotation never reflected the -// mutation, so the Deployment's pod template hash stayed stable and -// pods didn't roll. Operators had to restart agents manually. -// -// Post-fix: BuildManifest builds the workload with a placeholder hash, -// runs plugins, then stamps the real hash computed from the post-plugin -// Secret content. The annotation on the workload (via shared Go map -// reference) flips, the next reconcile sees the new hash, and pods roll -// automatically. -func TestBuildManifest_PluginMutatesSecret_HashReflectsPostPluginContent(t *testing.T) { - scheme := schemev1.Scheme - require.NoError(t, v1alpha2.AddToScheme(scheme)) - - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "hash-test"}} - modelConfig := &v1alpha2.ModelConfig{ - ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "hash-test"}, - Spec: v1alpha2.ModelConfigSpec{ - Model: "gpt-4o", - Provider: v1alpha2.ModelProviderOpenAI, - }, - } - agent := &v1alpha2.Agent{ - ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "hash-test"}, - Spec: v1alpha2.AgentSpec{ - Type: v1alpha2.AgentType_Declarative, - Description: "Agent", - Declarative: &v1alpha2.DeclarativeAgentSpec{ - SystemMessage: "System", - ModelConfig: "model", - }, - }, - } - - // Capture the unmodified pre-plugin hash by running once without a - // plugin installed. - kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ns, modelConfig, agent).Build() - defaultModel := types.NamespacedName{Namespace: "hash-test", Name: "model"} - transNoPlugin := NewAdkApiTranslator(kube, defaultModel, nil, "", nil) - noPluginOutputs, err := TranslateAgent(context.Background(), transNoPlugin, agent) - require.NoError(t, err) - preHash := configHashFromDeployment(t, noPluginOutputs) - - // Now run the same compile through a plugin that rewrites - // config.json's instruction field. The annotation MUST differ. - mutator := &configSecretMutatingPlugin{ - agentName: "agent", - mutate: func(cfg *adk.AgentConfig) { - cfg.Instruction = "MUTATED BY PLUGIN" - }, - } - transWithPlugin := NewAdkApiTranslator(kube, defaultModel, []TranslatorPlugin{mutator}, "", nil) - pluginOutputs, err := TranslateAgent(context.Background(), transWithPlugin, agent) - require.NoError(t, err) - postHash := configHashFromDeployment(t, pluginOutputs) - - assert.NotEqual(t, preHash, postHash, - "kagent.dev/config-hash on the workload must change when a plugin mutates the config Secret") - - // Also verify the Secret itself carries the mutation (sanity-check - // the plugin actually ran). - mutatedSecret := configSecretFromManifest(t, pluginOutputs, "agent") - assert.Contains(t, mutatedSecret.StringData["config.json"], "MUTATED BY PLUGIN") -} - -// TestBuildManifest_NoPlugin_HashMatchesGoldenBehavior is the -// no-spurious-rollout guard. When no plugin runs, the post-plugin -// recompute must produce the same hash that the pre-PR code would have -// stamped — otherwise upgrading the controller would roll every agent -// pod for no functional reason. -// -// The golden testdata under testdata/outputs/ already pins specific -// hash values for many agent shapes; this is the dedicated regression -// test that names the invariant. -func TestBuildManifest_NoPlugin_HashMatchesGoldenBehavior(t *testing.T) { - scheme := schemev1.Scheme - require.NoError(t, v1alpha2.AddToScheme(scheme)) - - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "no-plugin"}} - modelConfig := &v1alpha2.ModelConfig{ - ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "no-plugin"}, - Spec: v1alpha2.ModelConfigSpec{Model: "gpt-4o", Provider: v1alpha2.ModelProviderOpenAI}, - } - agent := &v1alpha2.Agent{ - ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "no-plugin"}, - Spec: v1alpha2.AgentSpec{ - Type: v1alpha2.AgentType_Declarative, - Description: "Agent", - Declarative: &v1alpha2.DeclarativeAgentSpec{ - SystemMessage: "System", - ModelConfig: "model", - }, - }, - } - - kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ns, modelConfig, agent).Build() - trans := NewAdkApiTranslator(kube, types.NamespacedName{Namespace: "no-plugin", Name: "model"}, nil, "", nil) - - first, err := TranslateAgent(context.Background(), trans, agent) - require.NoError(t, err) - second, err := TranslateAgent(context.Background(), trans, agent) - require.NoError(t, err) - - assert.Equal(t, configHashFromDeployment(t, first), configHashFromDeployment(t, second), - "two translations of the same agent must produce identical hashes (deterministic, no rollout drift)") - assert.NotEqual(t, "0", configHashFromDeployment(t, first), - "a non-empty config should produce a non-zero hash") -} - -// configHashFromDeployment extracts the kagent.dev/config-hash -// annotation value from the Deployment's pod template in an outputs -// bundle. Test helper. -func configHashFromDeployment(t *testing.T, outputs *pkgtranslator.AgentOutputs) string { - t.Helper() - for _, obj := range outputs.Manifest { - if dep, ok := obj.(*appsv1.Deployment); ok { - return dep.Spec.Template.Annotations["kagent.dev/config-hash"] - } - } - t.Fatal("Deployment not found in outputs.Manifest") - return "" -} - -// configSecretFromManifest finds the agent's config Secret in an -// outputs bundle. Test helper. -func configSecretFromManifest(t *testing.T, outputs *pkgtranslator.AgentOutputs, name string) *corev1.Secret { - t.Helper() - for _, obj := range outputs.Manifest { - if s, ok := obj.(*corev1.Secret); ok && strings.Contains(s.GetName(), name) { - return s - } - } - t.Fatal("config Secret not found in outputs.Manifest") - return nil -} - -// configHashFromSandbox extracts the kagent.dev/config-hash annotation -// from the Sandbox CR's pod template. Sandbox-mode workloads carry the -// same annotation as Deployment-mode workloads, just nested through -// Sandbox.Spec.PodTemplate.ObjectMeta (agent-sandbox uses its own -// PodMetadata type, not corev1.ObjectMeta). Test helper. -func configHashFromSandbox(t *testing.T, outputs *pkgtranslator.AgentOutputs) string { - t.Helper() - for _, obj := range outputs.Manifest { - if sb, ok := obj.(*agentsandboxv1.Sandbox); ok { - return sb.Spec.PodTemplate.ObjectMeta.Annotations["kagent.dev/config-hash"] - } - } - t.Fatal("Sandbox not found in outputs.Manifest") - return "" -} - -// TestBuildManifest_SandboxMode_PluginMutatesSecret_HashReflectsPostPluginContent -// is the sandbox-mode sibling of the Deployment-mode regression test. -// -// The post-plugin hash stamp relies on Go map-reference sharing: the -// Annotations map on the local podTemplate variable, on -// Deployment.Spec.Template.Annotations (normal mode), and on -// Sandbox.Spec.PodTemplate.Annotations (sandbox mode) must all point at -// the same underlying map. If a future sandbox backend "defensively" -// deep-copies that map via maps.Clone, the post-plugin mutation silently -// stops propagating to the Sandbox CR — sandbox-mode agents stop rolling -// on plugin-driven config changes. -// -// This test specifically pins the sandbox-path invariant. The -// agentsxk8s.New() backend at agentsxk8s.go:50 currently does -// `Annotations: in.PodTemplate.Annotations` (reference assignment, not -// clone) and a regression to clone here breaks this assertion. -func TestBuildManifest_SandboxMode_PluginMutatesSecret_HashReflectsPostPluginContent(t *testing.T) { - scheme := schemev1.Scheme - require.NoError(t, v1alpha2.AddToScheme(scheme)) - require.NoError(t, agentsandboxv1.AddToScheme(scheme)) - - ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "sb-hash-test"}} - modelConfig := &v1alpha2.ModelConfig{ - ObjectMeta: metav1.ObjectMeta{Name: "model", Namespace: "sb-hash-test"}, - Spec: v1alpha2.ModelConfigSpec{Model: "gpt-4o", Provider: v1alpha2.ModelProviderOpenAI}, - } - // SandboxAgent (not Agent) drives the sandbox path through - // buildWorkloadObjects → sandboxBackend.BuildSandbox. - sb := &v1alpha2.SandboxAgent{ - ObjectMeta: metav1.ObjectMeta{Name: "agent", Namespace: "sb-hash-test"}, - Spec: v1alpha2.AgentSpec{ - Type: v1alpha2.AgentType_Declarative, - Declarative: &v1alpha2.DeclarativeAgentSpec{ - SystemMessage: "System", - ModelConfig: "model", - }, - }, - } - - kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ns, modelConfig).Build() - defaultModel := types.NamespacedName{Namespace: "sb-hash-test", Name: "model"} - - // No-plugin baseline. - noPluginTrans := NewAdkApiTranslator(kube, defaultModel, nil, "", agentsxk8s.New()) - noPluginOutputs, err := TranslateAgent(context.Background(), noPluginTrans, sb) - require.NoError(t, err) - preHash := configHashFromSandbox(t, noPluginOutputs) - assert.NotEqual(t, "0", preHash, "sandbox-mode workload should have a non-placeholder hash post-stamp") - - // With a plugin that mutates the Secret, the Sandbox CR's hash - // MUST differ. Failure here means the map-sharing invariant broke - // on the sandbox path (typically a future agentsxk8s change that - // clones the annotations map). - mutator := &configSecretMutatingPlugin{ - agentName: "agent", - mutate: func(cfg *adk.AgentConfig) { - cfg.Instruction = "MUTATED BY PLUGIN" - }, - } - pluginTrans := NewAdkApiTranslator(kube, defaultModel, []TranslatorPlugin{mutator}, "", agentsxk8s.New()) - pluginOutputs, err := TranslateAgent(context.Background(), pluginTrans, sb) - require.NoError(t, err) - postHash := configHashFromSandbox(t, pluginOutputs) - - assert.NotEqual(t, preHash, postHash, - "sandbox CR's config-hash annotation must change when a plugin mutates the config Secret — "+ - "failure typically indicates the sandbox backend started deep-copying the pod template annotations map, "+ - "which silently breaks plugin-driven rollouts") -} diff --git a/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go b/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go index 1d1c0112cc..e5940816a8 100644 --- a/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go +++ b/go/core/pkg/sandboxbackend/agentsxk8s/agentsxk8s.go @@ -88,21 +88,6 @@ func mapsUnion(podLabels map[string]string, agentLabels map[string]string) map[s return out } -// StampPodTemplateAnnotation sets key=value on the Sandbox's pod-template -// annotations. Returns false for objects that aren't Sandboxes so the -// caller can fall through to its other workload kinds. -func (b *Backend) StampPodTemplateAnnotation(obj client.Object, key, value string) bool { - sb, ok := obj.(*agentsandboxv1.Sandbox) - if !ok { - return false - } - if sb.Spec.PodTemplate.ObjectMeta.Annotations == nil { - sb.Spec.PodTemplate.ObjectMeta.Annotations = map[string]string{} - } - sb.Spec.PodTemplate.ObjectMeta.Annotations[key] = value - return true -} - func (b *Backend) ComputeReady(ctx context.Context, cl client.Client, nn types.NamespacedName) (metav1.ConditionStatus, string, string) { sb := &agentsandboxv1.Sandbox{} if err := cl.Get(ctx, nn, sb); err != nil { diff --git a/go/core/pkg/sandboxbackend/backend.go b/go/core/pkg/sandboxbackend/backend.go index 816ad4c2e1..02b17c14af 100644 --- a/go/core/pkg/sandboxbackend/backend.go +++ b/go/core/pkg/sandboxbackend/backend.go @@ -25,11 +25,4 @@ type Backend interface { // ComputeReady reflects implementation-specific status into condition pieces for Agent.status. ComputeReady(ctx context.Context, cl client.Client, nn types.NamespacedName) (status metav1.ConditionStatus, reason, message string) - - // StampPodTemplateAnnotation sets the given annotation on the - // backend's workload pod template, returning true if obj is a kind - // the backend produces. Used by the agent translator to stamp the - // post-plugin config-hash without needing to import the backend's - // concrete CRD types. - StampPodTemplateAnnotation(obj client.Object, key, value string) bool }