Skip to content

Commit b3da392

Browse files
committed
ctrl: sched: add topologySpreadConstraints to deployment
Spread the pods evenly across the nodes in a balanced way, taking into account the new replicaset pods during a rollout. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent fdfbc1e commit b3da392

5 files changed

Lines changed: 395 additions & 1 deletion

File tree

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,10 @@ func (r *NUMAResourcesSchedulerReconciler) syncNUMASchedulerResources(ctx contex
364364
return nropv1.NUMAResourcesSchedulerStatus{}, err
365365
}
366366

367+
if err := schedupdate.DeploymentTopologySpreadConstraints(r.SchedulerManifests.Deployment); err != nil {
368+
return nropv1.NUMAResourcesSchedulerStatus{}, err
369+
}
370+
367371
k8swgrbacupdate.RoleForLeaderElection(r.SchedulerManifests.Role, r.Namespace, nrosched.LeaderElectionResourceName)
368372
k8swgrbacupdate.RoleBinding(r.SchedulerManifests.RoleBinding, r.SchedulerManifests.ServiceAccount.Name, r.Namespace)
369373

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,28 @@ var _ = Describe("Test NUMAResourcesScheduler Reconcile", func() {
10301030
Expect(dp.Spec.Template.Spec.Containers[0].Args).To(ContainElement("--tls-min-version=" + updatedSettings.MinVersion))
10311031
Expect(dp.Spec.Template.Spec.Containers[0].Args).To(ContainElement("--tls-cipher-suites=" + updatedSettings.CipherSuites))
10321032
})
1033+
1034+
// Reconciles once, then fetches the deployment referenced by the
// NUMAResourcesScheduler status and checks that it carries exactly one
// topology spread constraint with the expected field values.
It("should set the TopologySpreadConstraints in the deployment by default", func() {
1035+
_, err := reconciler.Reconcile(context.TODO(), reconcile.Request{NamespacedName: nrsKey})
1036+
Expect(err).ToNot(HaveOccurred())
1037+
1038+
// Re-read the object so that nrs.Status.Deployment reflects the reconcile result.
Expect(reconciler.Client.Get(context.TODO(), nrsKey, nrs)).To(Succeed())
1039+
dpKey := client.ObjectKey{Namespace: nrs.Status.Deployment.Namespace, Name: nrs.Status.Deployment.Name}
1040+
dp := &appsv1.Deployment{}
1041+
Expect(reconciler.Client.Get(context.TODO(), dpKey, dp)).To(Succeed())
1042+
Expect(dp.Spec.Template.Spec.TopologySpreadConstraints).To(HaveLen(1))
1043+
1044+
// The expected selector is built from the labels of the fetched pod template.
expectedConstraint := corev1.TopologySpreadConstraint{
1045+
MaxSkew: 1,
1046+
TopologyKey: "kubernetes.io/hostname",
1047+
WhenUnsatisfiable: corev1.DoNotSchedule,
1048+
MatchLabelKeys: []string{"pod-template-hash"},
1049+
LabelSelector: &metav1.LabelSelector{
1050+
MatchLabels: dp.Spec.Template.Labels,
1051+
},
1052+
}
1053+
Expect(dp.Spec.Template.Spec.TopologySpreadConstraints[0]).To(Equal(expectedConstraint))
1054+
})
10331055
})
10341056
})
10351057

pkg/objectupdate/sched/sched.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
appsv1 "k8s.io/api/apps/v1"
2323
corev1 "k8s.io/api/core/v1"
2424
"k8s.io/apimachinery/pkg/api/resource"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2526
"k8s.io/klog/v2"
2627

2728
"github.com/k8stopologyawareschedwg/deployer/pkg/flagcodec"
@@ -102,6 +103,29 @@ func DeploymentTLSSettings(dp *appsv1.Deployment, tlsSettings objtls.Settings) e
102103
return nil
103104
}
104105

106+
// DeploymentTopologySpreadConstraints sets a single topology spread constraint
// on the pod template of dp, replacing any constraints already present. The
// constraint selects pods by the pod template labels and uses the
// "kubernetes.io/hostname" topology key with MaxSkew 1 and DoNotSchedule.
// It returns an error, leaving dp unchanged, when the pod template has no labels.
func DeploymentTopologySpreadConstraints(dp *appsv1.Deployment) error {
107+
labels := dp.Spec.Template.Labels
108+
// Reject an empty (non-nil) map as well as nil: a selector without any
// match labels would not be restricted to this deployment's pods.
if len(labels) == 0 {
109+
return fmt.Errorf("no labels found in deployment template")
110+
}
111+
112+
dp.Spec.Template.Spec.TopologySpreadConstraints = []corev1.TopologySpreadConstraint{
113+
{
114+
LabelSelector: &metav1.LabelSelector{
115+
MatchLabels: labels,
116+
},
117+
MaxSkew: 1,
118+
TopologyKey: "kubernetes.io/hostname",
119+
WhenUnsatisfiable: corev1.DoNotSchedule,
120+
// this is needed for safe rollouts: it allows ignoring the old replicaset and calculating the spread
121+
// purely based on the new replicaset, ensuring the final state is perfectly balanced without stalling.
122+
MatchLabelKeys: []string{"pod-template-hash"},
123+
},
124+
}
125+
126+
klog.V(3).InfoS("scheduler deployment topology spread constraints", "constraints", dp.Spec.Template.Spec.TopologySpreadConstraints[0].String())
127+
return nil
128+
}
105129
func SchedulerConfig(cm *corev1.ConfigMap, name string, params *k8swgmanifests.ConfigParams) error {
106130
if cm.Data == nil {
107131
return fmt.Errorf("no data found in ConfigMap: %s/%s", cm.Namespace, cm.Name)

pkg/objectupdate/sched/sched_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,38 @@ func TestDeploymentTLSSettingsRepeated(t *testing.T) {
631631
}
632632
}
633633

634+
// TestDeploymentTopologySpreadConstraintsNoLabels checks that
// DeploymentTopologySpreadConstraints returns an error when the pod template
// labels of the deployment are nil.
func TestDeploymentTopologySpreadConstraintsNoLabels(t *testing.T) {
635+
dp := dpMinimal.DeepCopy()
636+
dp.Spec.Template.Labels = nil
637+
if err := DeploymentTopologySpreadConstraints(dp); err == nil {
638+
t.Fatalf("expected error but got nil")
639+
}
640+
}
641+
642+
// TestDeploymentTopologySpreadConstraints checks that, for a deployment whose
// pod template has labels, DeploymentTopologySpreadConstraints succeeds and
// sets exactly the expected single constraint on the pod template.
func TestDeploymentTopologySpreadConstraints(t *testing.T) {
643+
dp := dpMinimal.DeepCopy()
644+
dp.Spec.Template.Labels = map[string]string{
645+
"app": "numaresources-scheduler",
646+
}
647+
if err := DeploymentTopologySpreadConstraints(dp); err != nil {
648+
t.Fatalf("unexpected error: %v", err)
649+
}
650+
// The expected selector reuses the template labels set above.
expectedConstraints := []corev1.TopologySpreadConstraint{
651+
{
652+
LabelSelector: &metav1.LabelSelector{
653+
MatchLabels: dp.Spec.Template.Labels,
654+
},
655+
MaxSkew: 1,
656+
TopologyKey: "kubernetes.io/hostname",
657+
WhenUnsatisfiable: corev1.DoNotSchedule,
658+
MatchLabelKeys: []string{"pod-template-hash"},
659+
},
660+
}
661+
if !reflect.DeepEqual(dp.Spec.Template.Spec.TopologySpreadConstraints, expectedConstraints) {
662+
t.Errorf("constraints mismatch\ngot: %v\nexpected: %v", dp.Spec.Template.Spec.TopologySpreadConstraints, expectedConstraints)
663+
}
664+
}
665+
634666
func mustParseResource(t *testing.T, v string) resource.Quantity {
635667
t.Helper()
636668
qty, err := resource.ParseQuantity(v)

0 commit comments

Comments
 (0)