Skip to content

Commit 3d0712c

Browse files
JAORMXclaudeyrobla
authored
Gate proxyrunner StatefulSet apply by MCPServer generation (#5024)
Guard proxyrunner StatefulSet apply with a generation gate During a proxy Deployment rolling update the old and new ReplicaSet pods both run DeployWorkload, each calling server-side apply on the backing StatefulSet with the same field manager and Force: true. The stale pod's apply can land after the fresh pod's and clobber the image, leaving the StatefulSet pinned to the old digest even after MCPServer.spec.image has been bumped. The operator now stamps MCPServer.metadata.generation into the RunConfig as a monotonic version. Proxyrunner threads it through to DeployWorkloadOptions. Before apply, the K8s runtime reads the existing StatefulSet's mcpserver-generation annotation; if it is strictly greater than ours, the apply is skipped. Otherwise our generation is stamped on the pod template and apply proceeds. Zero generation is the backward-compat signal and bypasses both the gate and the stamp, preserving existing behavior for callers that don't pass one. Signed-off-by: Juan Antonio Osorio <ozz@stacklok.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Yolanda Robla Mota <yolanda@stacklok.com>
1 parent bbe8b85 commit 3d0712c

13 files changed

Lines changed: 412 additions & 13 deletions

File tree

cmd/thv-operator/controllers/mcpserver_runconfig.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ func (r *MCPServerReconciler) createRunConfigFromMCPServer(m *mcpv1beta1.MCPServ
144144
options := []runner.RunConfigBuilderOption{
145145
runner.WithName(m.Name),
146146
runner.WithImage(m.Spec.Image),
147+
runner.WithMCPServerGeneration(m.Generation),
147148
runner.WithCmdArgs(m.Spec.Args),
148149
runner.WithTransportAndPorts(m.Spec.Transport, int(m.GetProxyPort()), int(m.GetMCPPort())),
149150
runner.WithProxyMode(transporttypes.ProxyMode(effectiveProxyMode)),

cmd/thv-operator/controllers/mcpserver_runconfig_test.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,3 +1782,34 @@ func TestCreateRunConfigFromMCPServer_RateLimiting(t *testing.T) {
17821782
})
17831783
}
17841784
}
1785+
1786+
func TestCreateRunConfigFromMCPServer_SetsMCPServerGeneration(t *testing.T) {
1787+
t.Parallel()
1788+
1789+
m := &mcpv1beta1.MCPServer{
1790+
ObjectMeta: metav1.ObjectMeta{
1791+
Name: "generation-server",
1792+
Namespace: "default",
1793+
Generation: 7,
1794+
},
1795+
Spec: mcpv1beta1.MCPServerSpec{
1796+
Image: "ghcr.io/example/mcp:v1",
1797+
Transport: stdioTransport,
1798+
ProxyPort: 8080,
1799+
},
1800+
}
1801+
1802+
r := newTestMCPServerReconciler(
1803+
fake.NewClientBuilder().WithScheme(createRunConfigTestScheme()).WithObjects(m).Build(),
1804+
createRunConfigTestScheme(),
1805+
kubernetes.PlatformKubernetes,
1806+
)
1807+
1808+
rc, err := r.createRunConfigFromMCPServer(m)
1809+
1810+
require.NoError(t, err)
1811+
require.NotNil(t, rc)
1812+
1813+
assert.Equal(t, int64(7), rc.MCPServerGeneration,
1814+
"MCPServerGeneration should match MCPServer .metadata.generation")
1815+
}

docs/server/docs.go

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/server/swagger.json

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/server/swagger.yaml

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/container/kubernetes/client.go

Lines changed: 104 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,15 @@ const (
4949
defaultNamespace = "default"
5050
// serviceFieldManager is the field manager name for server-side apply operations
5151
serviceFieldManager = "toolhive-container-manager"
52+
53+
// RunConfigMCPServerGenerationAnnotation carries the MCPServer .metadata.generation that
54+
// produced the RunConfig applied to this StatefulSet. Used as a monotonic version stamp
55+
// to prevent stale proxyrunner pods (from an old Deployment ReplicaSet) from clobbering
56+
// a newer RunConfig's apply. The gate only becomes effective once proxyrunner is upgraded
57+
// to a version that reads this annotation; operator-only upgrades leave the race window
58+
// in place until proxyrunner is also rolled. Exported because it forms a wire contract
59+
// that external readers (operator, diagnostic tooling) may consume.
60+
RunConfigMCPServerGenerationAnnotation = "toolhive.stacklok.dev/mcpserver-generation"
5261
)
5362

5463
// RuntimeName is the name identifier for the Kubernetes runtime
@@ -397,22 +406,26 @@ func (c *Client) DeployWorkload(ctx context.Context,
397406
return 0, err
398407
}
399408

400-
// Create an apply configuration for the statefulset
401-
statefulSetApply := appsv1apply.StatefulSet(containerName, namespace).
402-
WithLabels(containerLabels).
403-
WithSpec(buildStatefulSetSpec(containerName, podTemplateSpec, options))
404-
405-
// Apply the statefulset using server-side apply
406-
createdStatefulSet, err := c.client.AppsV1().StatefulSets(namespace).
407-
Apply(ctx, statefulSetApply, metav1.ApplyOptions{
408-
FieldManager: serviceFieldManager,
409-
Force: true,
410-
})
409+
ourGen := runConfigGeneration(options)
410+
skip, err := c.shouldSkipStatefulSetApply(ctx, namespace, containerName, ourGen)
411411
if err != nil {
412-
return 0, fmt.Errorf("failed to apply statefulset: %w", err)
412+
return 0, err
413+
}
414+
if skip {
415+
// Intentionally skip ensureBackendServices in the gated path: this pod's RunConfig
416+
// is stale, so reconciling services here would clobber port/config fields set by
417+
// the newer-generation pod under the same field manager + Force: true — the same
418+
// race this gate prevents for the StatefulSet. The newer pod already reconciled
419+
// services; if that failed, it returns an error and retries on its own.
420+
return 0, nil
413421
}
414422

415-
slog.Debug("applied statefulset", "name", createdStatefulSet.Name)
423+
createdStatefulSet, err := c.applyStatefulSet(
424+
ctx, namespace, containerName, containerLabels, podTemplateSpec, options, ourGen,
425+
)
426+
if err != nil {
427+
return 0, err
428+
}
416429

417430
err = c.ensureBackendServices(
418431
ctx, containerName, namespace, containerLabels, transportType, options, createdStatefulSet)
@@ -435,6 +448,84 @@ func (c *Client) DeployWorkload(ctx context.Context,
435448
return 0, nil
436449
}
437450

451+
// runConfigGeneration extracts the RunConfig MCPServer generation from options,
452+
// returning 0 when options is nil (backward-compat / non-operator callers).
453+
func runConfigGeneration(options *runtime.DeployWorkloadOptions) int64 {
454+
if options == nil {
455+
return 0
456+
}
457+
return options.RunConfigMCPServerGeneration
458+
}
459+
460+
// applyStatefulSet stamps the MCPServer generation annotation when non-zero,
461+
// builds the StatefulSet apply configuration, and performs the server-side apply.
462+
func (c *Client) applyStatefulSet(
463+
ctx context.Context,
464+
namespace, containerName string,
465+
containerLabels map[string]string,
466+
podTemplateSpec *corev1apply.PodTemplateSpecApplyConfiguration,
467+
options *runtime.DeployWorkloadOptions,
468+
ourGen int64,
469+
) (*appsv1.StatefulSet, error) {
470+
if ourGen > 0 {
471+
podTemplateSpec = podTemplateSpec.WithAnnotations(map[string]string{
472+
RunConfigMCPServerGenerationAnnotation: strconv.FormatInt(ourGen, 10),
473+
})
474+
}
475+
statefulSetApply := appsv1apply.StatefulSet(containerName, namespace).
476+
WithLabels(containerLabels).
477+
WithSpec(buildStatefulSetSpec(containerName, podTemplateSpec, options))
478+
createdStatefulSet, err := c.client.AppsV1().StatefulSets(namespace).
479+
Apply(ctx, statefulSetApply, metav1.ApplyOptions{
480+
FieldManager: serviceFieldManager,
481+
Force: true,
482+
})
483+
if err != nil {
484+
return nil, fmt.Errorf("failed to apply statefulset: %w", err)
485+
}
486+
slog.Debug("applied statefulset", "name", createdStatefulSet.Name)
487+
return createdStatefulSet, nil
488+
}
489+
490+
// shouldSkipStatefulSetApply returns true when the existing StatefulSet is already
491+
// stamped with a strictly greater MCPServer generation than ours, meaning a newer
492+
// proxyrunner pod has already reconciled the workload and ours would be a regression.
493+
// Returns false (apply as normal) when ourGen is zero or negative, when the StatefulSet
494+
// does not yet exist, when the annotation is absent, or when the annotation is unparsable.
495+
func (c *Client) shouldSkipStatefulSetApply(
496+
ctx context.Context, namespace, name string, ourGen int64,
497+
) (bool, error) {
498+
if ourGen <= 0 {
499+
return false, nil
500+
}
501+
existing, err := c.client.AppsV1().StatefulSets(namespace).Get(ctx, name, metav1.GetOptions{})
502+
if err != nil {
503+
if errors.IsNotFound(err) {
504+
return false, nil
505+
}
506+
return false, fmt.Errorf("failed to get existing statefulset: %w", err)
507+
}
508+
if existing.Spec.Template.Annotations == nil {
509+
return false, nil
510+
}
511+
theirs := existing.Spec.Template.Annotations[RunConfigMCPServerGenerationAnnotation]
512+
if theirs == "" {
513+
return false, nil
514+
}
515+
theirsGen, parseErr := strconv.ParseInt(theirs, 10, 64)
516+
if parseErr != nil {
517+
slog.Warn("unparsable mcpserver-generation annotation; proceeding with apply",
518+
"sts", name, "value", theirs, "err", parseErr)
519+
return false, nil
520+
}
521+
if theirsGen > ourGen {
522+
slog.Debug("skipping StatefulSet apply; newer MCPServer generation already applied",
523+
"sts", name, "ours", ourGen, "theirs", theirsGen)
524+
return true, nil
525+
}
526+
return false, nil
527+
}
528+
438529
// buildStatefulSetSpec constructs the StatefulSet spec apply configuration.
439530
// WithReplicas is only included when BackendReplicas is explicitly set; omitting
440531
// the field lets the existing field manager (e.g. HPA or kubectl) retain control

0 commit comments

Comments
 (0)