fix(kubernetes): propagate request labels to pods and apply CPU/memory changes on Scale

juicycleff · juicycleff · commit 1f01bfaa6a04 · 2026-06-16T21:35:53.000-05:00
diff --git a/provider/kubernetes/provider.go b/provider/kubernetes/provider.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"io"
+	"maps"
 
 	"helm.sh/helm/v3/pkg/action"
 	"helm.sh/helm/v3/pkg/chart"
@@ -143,7 +144,16 @@ func (p *Provider) Capabilities() []provider.Capability {
 // (ClusterIP=None) + volumeClaimTemplates per persistent volume + per-
 // service ConfigMaps. Each replica gets its own PVC by name.
 func (p *Provider) Provision(ctx context.Context, req provider.ProvisionRequest) (*provider.ProvisionResult, error) {
-	labels := instanceLabels(req.InstanceID, req.TenantID, p.cfg.Labels)
+	// Per-instance labels (req.Labels — e.g. a caller's workspace/component
+	// tags) are layered onto the provider's static cfg labels so the pods
+	// are identifiable and queryable by the caller's own scheme, not just
+	// the opaque instance ID. cfg labels are the base; req labels win on
+	// collision; the reserved ctrlplane.io/* keys are set last by
+	// instanceLabels and always survive.
+	extra := make(map[string]string, len(p.cfg.Labels)+len(req.Labels))
+	maps.Copy(extra, p.cfg.Labels)
+	maps.Copy(extra, req.Labels)
+	labels := instanceLabels(req.InstanceID, req.TenantID, extra)
 	ns := p.cfg.Namespace
 
 	// Create per-service ConfigMaps before the controller object so the
@@ -413,15 +423,69 @@ func (p *Provider) Rollback(_ context.Context, _ id.ID, _ id.ID) error {
 	return nil
 }
 
-// Scale adjusts the instance's resource allocation.
+// Scale adjusts the instance's resource allocation. A ScaleRequest may
+// carry CPU/memory changes, a replica change, or both: resource changes
+// patch the pod template's container resources (rolling the pods),
+// replica changes (capped at the int32 maximum) patch the scale subresource.
 func (p *Provider) Scale(ctx context.Context, instanceID id.ID, spec provider.ResourceSpec) error {
+	if spec.CPUMillis > 0 || spec.MemoryMB > 0 {
+		if err := p.applyResources(ctx, instanceID, spec); err != nil {
+			return err
+		}
+	}
+
 	if spec.Replicas > 0 {
-		return p.scaleReplicas(ctx, instanceID, int32(min(spec.Replicas, int(^int32(0))))) //nolint:gosec // clamped to int32 range via min
+		return p.scaleReplicas(ctx, instanceID, int32(min(spec.Replicas, 1<<31-1))) //nolint:gosec // capped at MaxInt32; ^int32(0) is -1, not the max
 	}
 
 	return nil
 }
 
+// applyResources rewrites the pod template's container requests+limits
+// to spec, which rolls the pods. As in instance.Service's Scale contract
+// only app containers are targeted: each one in a multi-service workload
+// gets the same spec (per-service resourcing is a future API) and init
+// containers are skipped. Like Deploy, it tries the Deployment first and
+// falls back to the StatefulSet of the same name.
+func (p *Provider) applyResources(ctx context.Context, instanceID id.ID, spec provider.ResourceSpec) error {
+	workload, namespace := deploymentName(instanceID), p.cfg.Namespace
+
+	deployment, getDepErr := p.client.AppsV1().Deployments(namespace).Get(ctx, workload, metav1.GetOptions{})
+	if getDepErr == nil {
+		setContainerResources(deployment.Spec.Template.Spec.Containers, spec)
+
+		_, updErr := p.client.AppsV1().Deployments(namespace).Update(ctx, deployment, metav1.UpdateOptions{})
+		if updErr != nil {
+			return fmt.Errorf("kubernetes: patch deployment resources: %w", updErr)
+		}
+
+		return nil
+	}
+
+	statefulSet, getSSErr := p.client.AppsV1().StatefulSets(namespace).Get(ctx, workload, metav1.GetOptions{})
+	if getSSErr != nil {
+		return fmt.Errorf("kubernetes: get workload for resize: deployment: %w; statefulset: %w", getDepErr, getSSErr)
+	}
+
+	setContainerResources(statefulSet.Spec.Template.Spec.Containers, spec)
+
+	_, updErr := p.client.AppsV1().StatefulSets(namespace).Update(ctx, statefulSet, metav1.UpdateOptions{})
+	if updErr != nil {
+		return fmt.Errorf("kubernetes: patch statefulset resources: %w", updErr)
+	}
+
+	return nil
+}
+
+// setContainerResources overwrites each container's requests and limits
+// with spec, calling buildResourceList once per field so they share no map.
+func setContainerResources(containers []corev1.Container, spec provider.ResourceSpec) {
+	for idx := range containers {
+		res := &containers[idx].Resources
+		res.Requests, res.Limits = buildResourceList(spec), buildResourceList(spec)
+	}
+}
+
 // Resources returns a one-shot point-in-time sample of the
 // instance's pod resource usage via the metrics.k8s.io API.
 //
diff --git a/provider/kubernetes/resources.go b/provider/kubernetes/resources.go
@@ -57,16 +57,20 @@ func providerRef(namespace string, instanceID id.ID) string {
 	return fmt.Sprintf("k8s:%s/%s", namespace, deploymentName(instanceID))
 }
 
-// instanceLabels builds the standard label set for a ctrlplane-managed resource.
+// instanceLabels returns a newly allocated label map for a
+// ctrlplane-managed resource. extra (nil is fine) is copied in first;
+// the instance-ID, tenant-ID and managed-by keys are assigned afterwards,
+// so a colliding key in extra is overwritten and a caller's labels can
+// never clobber instance identity (the selector depends on labelInstanceID).
 func instanceLabels(instanceID id.ID, tenantID string, extra map[string]string) map[string]string {
-	labels := map[string]string{
-		labelInstanceID: instanceID.String(),
-		labelTenantID:   tenantID,
-		labelManagedBy:  labelManagedByValue,
-	}
+	labels := make(map[string]string, len(extra)+3)
 
 	maps.Copy(labels, extra)
 
+	labels[labelInstanceID] = instanceID.String()
+	labels[labelTenantID] = tenantID
+	labels[labelManagedBy] = labelManagedByValue
+
 	return labels
 }
 
diff --git a/provider/kubernetes/scale_resources_test.go b/provider/kubernetes/scale_resources_test.go
@@ -0,0 +1,166 @@
+package kubernetes
+
+import (
+	"context"
+	"testing"
+
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	k8sfake "k8s.io/client-go/kubernetes/fake"
+
+	"github.com/xraph/ctrlplane/id"
+	"github.com/xraph/ctrlplane/provider"
+)
+
+// TestScale_PatchesContainerResources is the regression for the bug where
+// Provider.Scale ignored CPU/memory and only acted on replicas — leaving a
+// provisioned workload's resources immutable. A resize must now patch the
+// pod template's container requests AND limits, for both CPU and memory.
+func TestScale_PatchesContainerResources(t *testing.T) {
+	instID := id.New(id.PrefixInstance)
+	name := deploymentName(instID)
+
+	const ns = "default"
+
+	big := provider.ResourceSpec{CPUMillis: 500, MemoryMB: 512}
+	dep := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Spec: appsv1.DeploymentSpec{
+			Template: corev1.PodTemplateSpec{
+				Spec: corev1.PodSpec{
+					Containers: []corev1.Container{{
+						Name:      "twinos",
+						Image:     "img:1",
+						Resources: corev1.ResourceRequirements{Requests: buildResourceList(big), Limits: buildResourceList(big)},
+					}},
+				},
+			},
+		},
+	}
+
+	p := &Provider{cfg: Config{Namespace: ns}, client: k8sfake.NewSimpleClientset(dep)}
+
+	// Shrink to values that fit a small node.
+	err := p.Scale(context.Background(), instID, provider.ResourceSpec{CPUMillis: 150, MemoryMB: 256})
+	if err != nil {
+		t.Fatalf("Scale: %v", err)
+	}
+
+	got, err := p.client.AppsV1().Deployments(ns).Get(context.Background(), name, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("get deployment: %v", err)
+	}
+
+	c := got.Spec.Template.Spec.Containers[0]
+
+	if cpu := c.Resources.Requests.Cpu().MilliValue(); cpu != 150 {
+		t.Errorf("cpu request: want 150m, got %dm", cpu)
+	}
+
+	if cpu := c.Resources.Limits.Cpu().MilliValue(); cpu != 150 {
+		t.Errorf("cpu limit: want 150m, got %dm", cpu)
+	}
+
+	if memMB := c.Resources.Requests.Memory().Value() / (1024 * 1024); memMB != 256 {
+		t.Errorf("mem request: want 256Mi, got %dMi", memMB)
+	}
+
+	// The memory limit is patched alongside the request; check it so a
+	// regression that only updates requests cannot slip through.
+	if memMB := c.Resources.Limits.Memory().Value() / (1024 * 1024); memMB != 256 {
+		t.Errorf("mem limit: want 256Mi, got %dMi", memMB)
+	}
+}
+
+// TestScale_noResourceChange_isNoop checks that scaling with a zero-value
+// ResourceSpec (no CPU, memory, or replicas) succeeds and leaves the
+// Deployment's CPU request at its original value.
+func TestScale_noResourceChange_isNoop(t *testing.T) {
+	const ns = "default"
+
+	ctx := context.Background()
+	instID := id.New(id.PrefixInstance)
+	name := deploymentName(instID)
+	orig := provider.ResourceSpec{CPUMillis: 500, MemoryMB: 512}
+
+	container := corev1.Container{
+		Name:      "twinos",
+		Image:     "img:1",
+		Resources: corev1.ResourceRequirements{Requests: buildResourceList(orig), Limits: buildResourceList(orig)},
+	}
+	dep := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Spec: appsv1.DeploymentSpec{
+			Template: corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{container}}},
+		},
+	}
+
+	p := &Provider{cfg: Config{Namespace: ns}, client: k8sfake.NewSimpleClientset(dep)}
+
+	err := p.Scale(ctx, instID, provider.ResourceSpec{})
+	if err != nil {
+		t.Fatalf("Scale (empty spec): %v", err)
+	}
+
+	got, err := p.client.AppsV1().Deployments(ns).Get(ctx, name, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("get deployment: %v", err)
+	}
+
+	requests := got.Spec.Template.Spec.Containers[0].Resources.Requests
+	if cpu := requests.Cpu().MilliValue(); cpu != 500 {
+		t.Errorf("empty scale must not change cpu: got %dm, want 500m", cpu)
+	}
+}
+
+// TestProvision_propagatesRequestLabelsToPods guards against pods being
+// labelled with ctrlplane keys only. Labels supplied on the
+// ProvisionRequest (e.g. twinos.workspace / twinos.component) must show
+// up on the Deployment's pod template, and a request label that reuses
+// the reserved instance-ID key must lose to the real instance ID.
+func TestProvision_propagatesRequestLabelsToPods(t *testing.T) {
+	const ns = "default"
+
+	ctx := context.Background()
+	instID := id.New(id.PrefixInstance)
+	p := &Provider{cfg: Config{Namespace: ns}, client: k8sfake.NewSimpleClientset()}
+
+	callerLabels := map[string]string{
+		"twinos.workspace": "ws_acme",
+		"twinos.component": "twinos",
+		// A caller trying to clobber a reserved key must NOT win.
+		labelInstanceID: "spoofed",
+	}
+
+	mainService := provider.ServiceSpec{Name: "twinos", Image: "img:1", Role: provider.RoleMain}
+	req := provider.ProvisionRequest{
+		InstanceID: instID,
+		TenantID:   "ten_abc",
+		Kind:       provider.KindDeployment,
+		Labels:     callerLabels,
+		Services:   []provider.ServiceSpec{mainService},
+	}
+
+	_, err := p.Provision(ctx, req)
+	if err != nil {
+		t.Fatalf("Provision: %v", err)
+	}
+
+	dep, err := p.client.AppsV1().Deployments(ns).Get(ctx, deploymentName(instID), metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("get deployment: %v", err)
+	}
+
+	pod := dep.Spec.Template.Labels
+
+	hasWorkspace := pod["twinos.workspace"] == "ws_acme"
+	hasComponent := pod["twinos.component"] == "twinos"
+	if !hasWorkspace || !hasComponent {
+		t.Fatalf("caller labels missing from pod template: %v", pod)
+	}
+
+	if gotID, wantID := pod[labelInstanceID], instID.String(); gotID != wantID {
+		t.Fatalf("reserved instance-id label was clobbered: got %q, want %q", gotID, wantID)
+	}
+}