Skip to content

Commit fdfbc1e

Browse files
committed
ctrl: sched: explicit replicas should not exceed nodes number
It's reasonable to have high-availability enabled to have scheduler pods number matches the control-plane nodes' number or workers on hypershift. but a number greater than that makes no point. This commit degrades the scheduler status in case a higher number of replicas is requested. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent 31ed1ac commit fdfbc1e

2 files changed

Lines changed: 150 additions & 26 deletions

File tree

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,11 +244,6 @@ func isDeploymentRunning(ctx context.Context, c client.Client, key nropv1.Namesp
244244
}
245245

246246
func (r *NUMAResourcesSchedulerReconciler) computeSchedulerReplicas(ctx context.Context, schedSpec nropv1.NUMAResourcesSchedulerSpec) (*int32, error) {
247-
// do not autodetect if explicitly set by the user
248-
if schedSpec.Replicas != nil {
249-
return schedSpec.Replicas, nil
250-
}
251-
252247
var labelSelector map[string]string
253248
var nodeRoleDescription string
254249

@@ -276,6 +271,14 @@ func (r *NUMAResourcesSchedulerReconciler) computeSchedulerReplicas(ctx context.
276271

277272
replicaCount := int32(len(nodeList.Items))
278273

274+
if schedSpec.Replicas != nil {
275+
// should not allow more replicas than max found nodes
276+
if *schedSpec.Replicas > replicaCount {
277+
return nil, fmt.Errorf("explicitly set replicas count should not be greater than the number of the nodes with role %s", nodeRoleDescription)
278+
}
279+
return schedSpec.Replicas, nil
280+
}
281+
279282
// check if no nodes are found for the target role
280283
if replicaCount == 0 {
281284
return nil, fmt.Errorf("failed to compute scheduler replicas: no %s nodes found for platform %s", nodeRoleDescription, r.PlatformInfo.Platform)

internal/controller/numaresourcesscheduler_controller_test.go

Lines changed: 142 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,36 +1105,157 @@ var _ = Describe("Test computeSchedulerReplicas", func() {
11051105
Entry("HyperShift: 3 control-plane + 5 worker should return 3 replicas (uses worker)", platform.HyperShift, 3, 5, ptr.To(int32(3)), false),
11061106
)
11071107

1108-
Context("when replicas are explicitly set", func() {
1109-
It("should return the explicitly set replicas without checking nodes", func() {
1110-
// setup reconciler with HyperShift platform (uses worker nodes)
1111-
reconciler.PlatformInfo = platforminfo.PlatformInfo{
1112-
Platform: platform.HyperShift,
1113-
Version: "v4.14.0",
1114-
}
1108+
When("replicas count is explicitly set", func() {
1109+
type testCase struct {
1110+
platform platform.Platform
1111+
numControlPlane int
1112+
numWorker int
1113+
explicitReplicas *int32
1114+
expectedReplicas *int32
1115+
expectError string
1116+
}
1117+
DescribeTable(
1118+
"should not allow more replicas than the number of nodes", func(ctx context.Context, tc testCase) {
1119+
nrs := testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
1120+
nrs.Spec.Replicas = tc.explicitReplicas
1121+
initObjects := []runtime.Object{nrs}
1122+
initObjects = append(initObjects, fakeNodes(tc.numControlPlane, tc.numWorker)...)
1123+
reconciler, err := NewFakeNUMAResourcesSchedulerReconciler(initObjects...)
1124+
Expect(err).ToNot(HaveOccurred())
1125+
reconciler.PlatformInfo = platforminfo.PlatformInfo{
1126+
Platform: tc.platform,
1127+
Version: "v4.14.0",
1128+
}
11151129

1116-
// create scenario with no worker nodes but explicit replicas
1117-
nodes := fakeNodes(3, 0) // only control-plane nodes
1118-
reconciler, err := NewFakeNUMAResourcesSchedulerReconciler(nodes...)
1130+
key := client.ObjectKeyFromObject(nrs)
1131+
_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
1132+
if tc.expectError == "" {
1133+
Expect(err).ToNot(HaveOccurred())
1134+
} else {
1135+
Expect(err).To(HaveOccurred())
1136+
Expect(err.Error()).To(ContainSubstring(tc.expectError))
1137+
}
1138+
1139+
Expect(reconciler.Client.Get(ctx, key, nrs)).To(Succeed())
1140+
1141+
if tc.expectError != "" {
1142+
degradedCondition := getConditionByType(nrs.Status.Conditions, status.ConditionDegraded)
1143+
Expect(degradedCondition.Status).To(Equal(metav1.ConditionTrue))
1144+
Expect(degradedCondition.Reason).To(Equal(status.ReasonInternalError))
1145+
Expect(degradedCondition.Message).To(ContainSubstring(tc.expectError))
1146+
return
1147+
}
1148+
1149+
dp := &appsv1.Deployment{}
1150+
dpKey := client.ObjectKey{Namespace: nrs.Status.Deployment.Namespace, Name: nrs.Status.Deployment.Name}
1151+
Expect(reconciler.Client.Get(ctx, dpKey, dp)).To(Succeed())
1152+
Expect(*dp.Spec.Replicas).To(Equal(*tc.expectedReplicas))
1153+
},
1154+
// openshift scenarios
1155+
Entry("OpenShift: 3 control-plane with explicit replicas count equal to the number of nodes",
1156+
testCase{
1157+
platform: platform.OpenShift,
1158+
numControlPlane: 3,
1159+
numWorker: 5,
1160+
explicitReplicas: ptr.To(int32(3)),
1161+
expectedReplicas: ptr.To(int32(3)),
1162+
}),
1163+
Entry("OpenShift: 3 control-plane with explicit replicas count less than the number of nodes",
1164+
testCase{
1165+
platform: platform.OpenShift,
1166+
numControlPlane: 3,
1167+
numWorker: 0,
1168+
explicitReplicas: ptr.To(int32(2)),
1169+
expectedReplicas: ptr.To(int32(2)),
1170+
}),
1171+
Entry("OpenShift: 3 control-plane with explicit replicas count greater than the number of nodes",
1172+
testCase{
1173+
platform: platform.OpenShift,
1174+
numControlPlane: 3,
1175+
numWorker: 5,
1176+
explicitReplicas: ptr.To(int32(5)),
1177+
expectError: "explicitly set replicas count should not be greater than the number of the nodes with role",
1178+
}),
1179+
1180+
// hypershift scenarios
1181+
Entry("HyperShift: 3 control-plane with explicit replicas count equal to the number of nodes",
1182+
testCase{
1183+
platform: platform.HyperShift,
1184+
numControlPlane: 0,
1185+
numWorker: 3,
1186+
explicitReplicas: ptr.To(int32(3)),
1187+
expectedReplicas: ptr.To(int32(3)),
1188+
}),
1189+
Entry("HyperShift: 3 control-plane with explicit replicas count less than the number of nodes",
1190+
testCase{
1191+
platform: platform.HyperShift,
1192+
numControlPlane: 0,
1193+
numWorker: 3,
1194+
explicitReplicas: ptr.To(int32(2)),
1195+
expectedReplicas: ptr.To(int32(2)),
1196+
}),
1197+
Entry("HyperShift: 3 control-plane with explicit replicas count greater than the number of nodes",
1198+
testCase{
1199+
platform: platform.HyperShift,
1200+
numControlPlane: 0,
1201+
numWorker: 5,
1202+
explicitReplicas: ptr.To(int32(6)),
1203+
expectError: "explicitly set replicas count should not be greater than the number of the nodes with role",
1204+
}),
1205+
)
1206+
1207+
It("should degrade when nodes are scaled down with higher number of explicit replicas", func(ctx context.Context) {
1208+
nrs := testobjs.NewNUMAResourcesScheduler("numaresourcesscheduler", "some/url:latest", testSchedulerName, 11*time.Second)
1209+
initialReplicas := int32(3)
1210+
nrs.Spec.Replicas = ptr.To(initialReplicas)
1211+
initObjects := []runtime.Object{nrs}
1212+
initialNodes := fakeNodes(4, 4)
1213+
initObjects = append(initObjects, initialNodes...)
1214+
reconciler, err := NewFakeNUMAResourcesSchedulerReconciler(initObjects...)
11191215
Expect(err).ToNot(HaveOccurred())
11201216
reconciler.PlatformInfo = platforminfo.PlatformInfo{
1121-
Platform: platform.HyperShift,
1217+
Platform: platform.OpenShift,
11221218
Version: "v4.14.0",
11231219
}
11241220

1125-
// create test scheduler spec with explicit replicas
1126-
explicitReplicas := int32(5)
1127-
schedSpec := nropv1.NUMAResourcesSchedulerSpec{
1128-
Replicas: &explicitReplicas,
1129-
}
1221+
key := client.ObjectKeyFromObject(nrs)
1222+
_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
1223+
Expect(err).ToNot(HaveOccurred())
11301224

1131-
// call the function under test
1132-
result, err := reconciler.computeSchedulerReplicas(context.TODO(), schedSpec)
1225+
Expect(reconciler.Client.Get(ctx, key, nrs)).To(Succeed())
1226+
dp := &appsv1.Deployment{}
1227+
dpKey := client.ObjectKey{Namespace: nrs.Status.Deployment.Namespace, Name: nrs.Status.Deployment.Name}
1228+
Expect(reconciler.Client.Get(ctx, dpKey, dp)).To(Succeed())
1229+
Expect(*dp.Spec.Replicas).To(Equal(initialReplicas))
11331230

1134-
// should return the explicit value without error
1231+
// scale down the nodes by one but keep it same as the explicit replicas count
1232+
Eventually(func(g Gomega) {
1233+
g.Expect(reconciler.Client.Delete(ctx, initialNodes[0].(*corev1.Node))).To(Succeed())
1234+
}).WithPolling(5 * time.Second).WithTimeout(30 * time.Second).Should(Succeed())
1235+
1236+
_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
11351237
Expect(err).ToNot(HaveOccurred())
1136-
Expect(result).ToNot(BeNil())
1137-
Expect(*result).To(Equal(explicitReplicas))
1238+
1239+
Expect(reconciler.Client.Get(ctx, key, nrs)).To(Succeed())
1240+
Expect(reconciler.Client.Get(ctx, dpKey, dp)).To(Succeed())
1241+
Expect(*dp.Spec.Replicas).To(Equal(initialReplicas))
1242+
1243+
// scale down the nodes by one but keep it same as the explicit replicas count
1244+
Eventually(func(g Gomega) {
1245+
g.Expect(reconciler.Client.Delete(ctx, initialNodes[1].(*corev1.Node))).To(Succeed())
1246+
}).WithPolling(5 * time.Second).WithTimeout(30 * time.Second).Should(Succeed())
1247+
1248+
errMessage := "explicitly set replicas count should not be greater than the number of the nodes with role"
1249+
_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
1250+
Expect(err).To(HaveOccurred())
1251+
Expect(err.Error()).To(ContainSubstring(errMessage))
1252+
1253+
Expect(reconciler.Client.Get(ctx, key, nrs)).To(Succeed())
1254+
1255+
degradedCondition := getConditionByType(nrs.Status.Conditions, status.ConditionDegraded)
1256+
Expect(degradedCondition.Status).To(Equal(metav1.ConditionTrue))
1257+
Expect(degradedCondition.Reason).To(Equal(status.ReasonInternalError))
1258+
Expect(degradedCondition.Message).To(ContainSubstring(errMessage))
11381259
})
11391260
})
11401261
})

0 commit comments

Comments
 (0)