Skip to content

Commit 492eb6c

Browse files
scotwells and claude committed
feat: implement federated deployment scheduling across POP cells
Workloads targeting a city location are now automatically routed to the correct physical site via a Karmada-based federation layer. Each POP cell operates independently, instance health is surfaced back to the control plane in real time, and the platform remains available even when parts of the control plane are temporarily unreachable.

Controllers added:
- WorkloadDeploymentFederator: replicates WDs into Karmada and manages PropagationPolicies per city code
- InstanceProjector: mirrors Instance write-backs from Karmada into the project namespace on the control plane

ResourceInterpreterCustomization deployed at config time teaches Karmada how to aggregate replica counts and conditions across POP cells.

Operator flags --enable-management-controllers and --enable-cell-controllers allow each deployment to opt into only the controllers it needs.

Includes a 6-test Chainsaw e2e suite covering federation, deletion cascade, propagation policy lifecycle, instance projection, instance write-back, and the full end-to-end chain.

Resolves #85

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 9d96bd5 commit 492eb6c

57 files changed

Lines changed: 4208 additions & 404 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/publish.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ jobs:
1818
secrets: inherit
1919

2020
publish-kustomize-bundles:
21+
needs: publish-container-image
2122
permissions:
2223
id-token: write
2324
contents: read
@@ -26,4 +27,6 @@ jobs:
2627
with:
2728
bundle-name: ghcr.io/datum-cloud/compute-kustomize
2829
bundle-path: config
30+
image-name: ghcr.io/datum-cloud/compute
31+
image-overlays: config/base/manager
2932
secrets: inherit

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,6 @@ go.work.sum
2525
.env
2626

2727
bin/
28+
29+
# Local e2e environment artefacts (Kind kubeconfigs, etc.)
30+
tmp/

Taskfile.yaml

Lines changed: 481 additions & 0 deletions
Large diffs are not rendered by default.

api/v1alpha/workloaddeployment_types.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ package v1alpha
22

33
import (
44
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
5-
6-
networkingv1alpha "go.datum.net/network-services-operator/api/v1alpha"
75
)
86

97
// WorkloadDeploymentSpec defines the desired state of WorkloadDeployment
@@ -37,11 +35,6 @@ type WorkloadDeploymentSpec struct {
3735

3836
// WorkloadDeploymentStatus defines the observed state of WorkloadDeployment
3937
type WorkloadDeploymentStatus struct {
40-
// The location which the deployment has been scheduled to
41-
//
42-
// +kubebuilder:validation:Optional
43-
Location *networkingv1alpha.LocationReference `json:"location,omitempty"`
44-
4538
// Represents the observations of a deployment's current state.
4639
// Known condition types are: "Available", "Progressing"
4740
Conditions []metav1.Condition `json:"conditions,omitempty"`
@@ -80,8 +73,6 @@ const (
8073
// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.readyReplicas`
8174
// +kubebuilder:printcolumn:name="Desired",type=string,JSONPath=`.status.desiredReplicas`
8275
// +kubebuilder:printcolumn:name="Up-to-date",type=string,JSONPath=`.status.currentReplicas`
83-
// +kubebuilder:printcolumn:name="Location Namespace",type=string,JSONPath=`.status.location.namespace`,priority=1
84-
// +kubebuilder:printcolumn:name="Location Name",type=string,JSONPath=`.status.location.name`,priority=1
8576
type WorkloadDeployment struct {
8677
metav1.TypeMeta `json:",inline"`
8778
metav1.ObjectMeta `json:"metadata,omitempty"`

api/v1alpha/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/main.go

Lines changed: 100 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,22 @@ import (
1818
"k8s.io/apimachinery/pkg/runtime/serializer"
1919
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2020
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
21+
"k8s.io/client-go/rest"
22+
"k8s.io/client-go/tools/clientcmd"
2123
ctrl "sigs.k8s.io/controller-runtime"
2224
"sigs.k8s.io/controller-runtime/pkg/client"
2325
"sigs.k8s.io/controller-runtime/pkg/cluster"
2426
"sigs.k8s.io/controller-runtime/pkg/healthz"
2527
"sigs.k8s.io/controller-runtime/pkg/log/zap"
2628
"sigs.k8s.io/controller-runtime/pkg/manager"
29+
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
2730
"sigs.k8s.io/controller-runtime/pkg/webhook"
2831
mcmanager "sigs.k8s.io/multicluster-runtime/pkg/manager"
2932
"sigs.k8s.io/multicluster-runtime/pkg/multicluster"
3033
mcsingle "sigs.k8s.io/multicluster-runtime/providers/single"
3134

35+
karmadaclusterv1alpha1 "github.com/karmada-io/api/cluster/v1alpha1"
36+
karmadapolicyv1alpha1 "github.com/karmada-io/api/policy/v1alpha1"
3237
computev1alpha "go.datum.net/compute/api/v1alpha"
3338
"go.datum.net/compute/internal/config"
3439
"go.datum.net/compute/internal/controller"
@@ -51,6 +56,11 @@ var (
5156
gitCommit = "unknown"
5257
gitTreeState = "unknown"
5358
buildDate = "unknown"
59+
60+
// downstreamRestConfig holds the REST config for the downstream control plane.
61+
// It is populated from --downstream-kubeconfig when set, and is nil when the
62+
// flag is omitted (e.g. in non-federation deployments).
63+
downstreamRestConfig *rest.Config
5464
)
5565

5666
func init() {
@@ -61,6 +71,8 @@ func init() {
6171
utilruntime.Must(computev1alpha.AddToScheme(scheme))
6272
utilruntime.Must(networkingv1alpha.AddToScheme(scheme))
6373
utilruntime.Must(quotav1alpha1.AddToScheme(scheme))
74+
utilruntime.Must(karmadapolicyv1alpha1.Install(scheme))
75+
utilruntime.Must(karmadaclusterv1alpha1.Install(scheme))
6476

6577
// +kubebuilder:scaffold:scheme
6678
}
@@ -71,12 +83,27 @@ func main() {
7183
var leaderElectionNamespace string
7284
var probeAddr string
7385
var serverConfigFile string
86+
var downstreamKubeconfig string
87+
var downstreamContext string
88+
var enableManagementControllers bool
89+
var enableCellControllers bool
7490

7591
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
7692
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
7793
"Enable leader election for controller manager. "+
7894
"Enabling this will ensure there is only one active controller manager.")
7995
flag.StringVar(&leaderElectionNamespace, "leader-elect-namespace", "", "The namespace to use for leader election.")
96+
flag.StringVar(&downstreamKubeconfig, "downstream-kubeconfig", "",
97+
"Path to the kubeconfig file for the downstream control plane. "+
98+
"When omitted, downstream federation features are disabled.")
99+
flag.StringVar(&downstreamContext, "downstream-context", "",
100+
"Context to use from the downstream kubeconfig. When omitted, the current context is used.")
101+
flag.BoolVar(&enableManagementControllers, "enable-management-controllers", true,
102+
"Enable management-plane controllers (WorkloadDeploymentFederator, InstanceProjector). "+
103+
"Disable when running a cell-only operator instance.")
104+
flag.BoolVar(&enableCellControllers, "enable-cell-controllers", true,
105+
"Enable cell controllers (WorkloadDeploymentReconciler, InstanceReconciler). "+
106+
"Disable when running a management-only operator instance.")
80107

81108
opts := zap.Options{
82109
Development: true,
@@ -89,6 +116,23 @@ func main() {
89116

90117
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
91118

119+
// Load the downstream REST config when --downstream-kubeconfig is provided.
120+
// When the flag is omitted, downstreamRestConfig remains nil and federation
121+
// features will be skipped at controller setup time.
122+
if downstreamKubeconfig != "" {
123+
loader := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
124+
&clientcmd.ClientConfigLoadingRules{ExplicitPath: downstreamKubeconfig},
125+
&clientcmd.ConfigOverrides{CurrentContext: downstreamContext},
126+
)
127+
var err error
128+
downstreamRestConfig, err = loader.ClientConfig()
129+
if err != nil {
130+
setupLog.Error(err, "unable to load downstream kubeconfig", "path", downstreamKubeconfig)
131+
os.Exit(1)
132+
}
133+
setupLog.Info("downstream kubeconfig loaded", "path", downstreamKubeconfig)
134+
}
135+
92136
setupLog.Info("starting compute",
93137
"version", version,
94138
"gitCommit", gitCommit,
@@ -180,17 +224,63 @@ func main() {
180224
setupLog.Error(err, "unable to create controller", "controller", "Workload")
181225
os.Exit(1)
182226
}
183-
if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil {
184-
setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment")
185-
os.Exit(1)
227+
228+
// Build a single downstream client shared across all controllers that need
229+
// to read or write to the downstream control plane. Nil when federation is disabled.
230+
var downstreamClient client.Client
231+
if downstreamRestConfig != nil {
232+
downstreamClient, err = client.New(downstreamRestConfig, client.Options{Scheme: scheme})
233+
if err != nil {
234+
setupLog.Error(err, "unable to create downstream client")
235+
os.Exit(1)
236+
}
186237
}
187-
if err = (&controller.WorkloadDeploymentScheduler{}).SetupWithManager(mgr); err != nil {
188-
setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentScheduler")
189-
os.Exit(1)
238+
239+
if enableCellControllers {
240+
if err = (&controller.WorkloadDeploymentReconciler{}).SetupWithManager(mgr); err != nil {
241+
setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeployment")
242+
os.Exit(1)
243+
}
190244
}
191-
if err = (&controller.InstanceReconciler{}).SetupWithManager(mgr, deploymentCluster); err != nil {
192-
setupLog.Error(err, "unable to create controller", "controller", "Instance")
193-
os.Exit(1)
245+
246+
if enableCellControllers {
247+
instanceReconciler := &controller.InstanceReconciler{DownstreamClient: downstreamClient}
248+
if err = instanceReconciler.SetupWithManager(mgr, deploymentCluster); err != nil {
249+
setupLog.Error(err, "unable to create controller", "controller", "Instance")
250+
os.Exit(1)
251+
}
252+
}
253+
254+
// WorkloadDeploymentFederator and InstanceProjector are management-plane
255+
// controllers that run on the control-plane cluster. They require a downstream
256+
// control plane to be configured (--downstream-kubeconfig provided).
257+
if enableManagementControllers && downstreamRestConfig != nil {
258+
federator := &controller.WorkloadDeploymentFederator{DownstreamClient: downstreamClient}
259+
if err = federator.SetupWithManager(mgr); err != nil {
260+
setupLog.Error(err, "unable to create controller", "controller", "WorkloadDeploymentFederator")
261+
os.Exit(1)
262+
}
263+
264+
// InstanceProjector: runs in the Control Plane Cell, watches Instances
265+
// written back to the downstream control plane by POP-cell operators, and
266+
// projects them into the corresponding project namespaces via the
267+
// multicluster manager.
268+
downstreamMgr, err := manager.New(downstreamRestConfig, manager.Options{
269+
Scheme: scheme,
270+
Metrics: metricsserver.Options{BindAddress: "0"},
271+
})
272+
if err != nil {
273+
setupLog.Error(err, "unable to create downstream manager for InstanceProjector")
274+
os.Exit(1)
275+
}
276+
if err = (&controller.InstanceProjector{
277+
DownstreamClient: downstreamClient,
278+
MCManager: mgr,
279+
}).SetupWithManager(downstreamMgr); err != nil {
280+
setupLog.Error(err, "unable to create controller", "controller", "InstanceProjector")
281+
os.Exit(1)
282+
}
283+
runnables = append(runnables, downstreamMgr)
194284
}
195285

196286
if serverConfig.WebhookServer != nil {
@@ -284,6 +374,7 @@ func initializeClusterDiscovery(
284374
}
285375

286376
discoveryManager, err := manager.New(discoveryRestConfig, manager.Options{
377+
Metrics: metricsserver.Options{BindAddress: "0"},
287378
Client: client.Options{
288379
Cache: &client.CacheOptions{
289380
Unstructured: true,
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
resources:
5+
- rbac.yaml
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRole
3+
metadata:
4+
name: compute-manager
5+
rules:
6+
- apiGroups: ["compute.datumapis.com"]
7+
resources: ["workloaddeployments", "workloaddeployments/status", "instances", "instances/status"]
8+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
9+
- apiGroups: ["policy.karmada.io"]
10+
resources: ["propagationpolicies", "clusterpropagationpolicies"]
11+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
12+
- apiGroups: ["cluster.karmada.io"]
13+
resources: ["clusters"]
14+
verbs: ["get", "list", "watch"]
15+
- apiGroups: ["work.karmada.io"]
16+
resources: ["resourcebindings", "clusterresourcebindings"]
17+
verbs: ["get", "list", "watch"]
18+
- apiGroups: ["config.karmada.io"]
19+
resources: ["resourceinterpreterwebhookconfigurations", "resourceinterpretercustomizations"]
20+
verbs: ["get", "list", "watch"]
21+
---
22+
apiVersion: rbac.authorization.k8s.io/v1
23+
kind: ClusterRoleBinding
24+
metadata:
25+
name: compute-manager
26+
roleRef:
27+
apiGroup: rbac.authorization.k8s.io
28+
kind: ClusterRole
29+
name: compute-manager
30+
subjects:
31+
- kind: User
32+
name: system:serviceaccount:compute-system:compute-manager
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
resources:
5+
- ../crd/bases/compute.datumapis.com_instances.yaml
6+
- ../crd/bases/compute.datumapis.com_workloaddeployments.yaml
7+
- ../crd/bases/compute.datumapis.com_workloads.yaml
8+
9+
components:
10+
- ../../components/federation

config/base/manager/manager.yaml

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,30 @@ spec:
2626
seccompProfile:
2727
type: RuntimeDefault
2828
containers:
29-
- command:
29+
- name: manager
30+
command:
3031
- /manager
3132
args:
32-
- --leader-elect
33-
- --health-probe-bind-address=:8081
34-
- --server-config=/config/config.yaml
33+
- --leader-elect=$(LEADER_ELECT)
34+
- --health-probe-bind-address=$(HEALTH_PROBE_BIND_ADDRESS)
35+
- --server-config=$(SERVER_CONFIG)
36+
- --downstream-kubeconfig=$(DOWNSTREAM_KUBECONFIG)
37+
- --enable-management-controllers=$(ENABLE_MANAGEMENT_CONTROLLERS)
38+
- --enable-cell-controllers=$(ENABLE_CELL_CONTROLLERS)
39+
env:
40+
- name: LEADER_ELECT
41+
value: "true"
42+
- name: HEALTH_PROBE_BIND_ADDRESS
43+
value: ":8081"
44+
- name: SERVER_CONFIG
45+
value: /config/config.yaml
46+
- name: DOWNSTREAM_KUBECONFIG
47+
value: ""
48+
- name: ENABLE_MANAGEMENT_CONTROLLERS
49+
value: "false"
50+
- name: ENABLE_CELL_CONTROLLERS
51+
value: "false"
3552
image: ghcr.io/datum-cloud/compute:latest
36-
name: manager
3753
ports:
3854
- containerPort: 9443
3955
name: webhook-server
@@ -69,7 +85,7 @@ spec:
6985
- name: webhook-cert
7086
mountPath: /tmp/k8s-webhook-server/serving-certs
7187
readOnly: true
72-
serviceAccountName: compute
88+
serviceAccountName: compute-manager
7389
terminationGracePeriodSeconds: 10
7490
volumes:
7591
- name: config

0 commit comments

Comments
 (0)