Skip to content

Commit 9903dc3

Browse files
committed
internal: fix waiting for deployment get ready with new replicas
So far the old function was waiting for the deployment to be ready with the same replicas count as the old one. It is however possible that the new deployment version includes a different replicas field hence should expect to a different replicas count. The old version would fail on such case where the old count vs the new one are not the same. This commit fixes this issue by introducing new function that accepts the expected new replicas count and compare against it. The old function API still exists but is redirected to use the new function with same old behavior. Existing tests are not affected by this. Signed-off-by: Shereen Haj <shajmakh@redhat.com> (cherry picked from commit f30a924)
1 parent 27d6897 commit 9903dc3

5 files changed

Lines changed: 54 additions & 23 deletions

File tree

internal/wait/deployment.go

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@ import (
2424
"k8s.io/klog/v2"
2525
)
2626

27+
// ForDeploymentComplete waits for the deployment to be complete with the number of replicas specified in the deployment spec.
2728
func (wt Waiter) ForDeploymentComplete(ctx context.Context, dp *appsv1.Deployment) (*appsv1.Deployment, error) {
28-
// This function waits for the readiness of the pods under the deployment. The best use of this check is for
29-
// completely new deployments. If the deployment exists on the cluster and simply updated, this check is
30-
// not enough to guarantee that the deployment is ready with the NEW replica, thus need to cover that by
31-
// additional checks as the context requires
29+
return wt.ForDeploymentCompleteWithReplicas(ctx, dp, *(dp.Spec.Replicas))
30+
}
31+
32+
// ForDeploymentCompleteWithReplicas waits for the deployment to be complete and have the specified number of replicas.
33+
// Use this function when the deployment is expected to have new number of replicas after an update.
34+
func (wt Waiter) ForDeploymentCompleteWithReplicas(ctx context.Context, dp *appsv1.Deployment, newExpectedReplicas int32) (*appsv1.Deployment, error) {
3235
key := ObjectKeyFromObject(dp)
3336
updatedDp := &appsv1.Deployment{}
3437
immediate := true
@@ -39,7 +42,7 @@ func (wt Waiter) ForDeploymentComplete(ctx context.Context, dp *appsv1.Deploymen
3942
return false, err
4043
}
4144

42-
if !IsDeploymentComplete(dp, &updatedDp.Status) {
45+
if !IsDeploymentComplete(dp.Generation, &updatedDp.Status, newExpectedReplicas) {
4346
klog.Warningf("deployment %s not yet complete", key.String())
4447
return false, nil
4548
}
@@ -66,13 +69,13 @@ func areDeploymentReplicasAvailable(newStatus *appsv1.DeploymentStatus, replicas
6669
return true
6770
}
6871

69-
func IsDeploymentComplete(dp *appsv1.Deployment, newStatus *appsv1.DeploymentStatus) bool {
70-
if newStatus.ObservedGeneration < dp.Generation {
71-
klog.InfoS("generation is older than expected to indicate the deployment is complete", "old", dp.Generation, "new", newStatus.ObservedGeneration)
72+
func IsDeploymentComplete(oldGeneration int64, newStatus *appsv1.DeploymentStatus, expectedReplicas int32) bool {
73+
if newStatus.ObservedGeneration < oldGeneration {
74+
klog.InfoS("generation is older than expected to indicate the deployment is complete", "old", oldGeneration, "new", newStatus.ObservedGeneration)
7275
return false
7376
}
7477

75-
return areDeploymentReplicasAvailable(newStatus, *(dp.Spec.Replicas))
78+
return areDeploymentReplicasAvailable(newStatus, expectedReplicas)
7679
}
7780

7881
func (wt Waiter) ForDeploymentReplicasCreation(ctx context.Context, dp *appsv1.Deployment, expectedReplicas int32) (*appsv1.Deployment, error) {

test/e2e/sched/sched_test.go

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,23 @@ import (
2626
appsv1 "k8s.io/api/apps/v1"
2727
corev1 "k8s.io/api/core/v1"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/apimachinery/pkg/labels"
2930
"k8s.io/apimachinery/pkg/types"
3031
"k8s.io/apimachinery/pkg/util/sets"
3132
klog "k8s.io/klog/v2"
3233
"k8s.io/utils/ptr"
3334

3435
"sigs.k8s.io/controller-runtime/pkg/client"
3536

37+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
3638
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
3739

3840
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
3941
"github.com/openshift-kni/numaresources-operator/internal/podlist"
4042
"github.com/openshift-kni/numaresources-operator/internal/wait"
4143
schedstate "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/objectstate/sched"
4244
e2eclient "github.com/openshift-kni/numaresources-operator/test/internal/clients"
45+
"github.com/openshift-kni/numaresources-operator/test/internal/configuration"
4346
e2efixture "github.com/openshift-kni/numaresources-operator/test/internal/fixture"
4447
e2eimages "github.com/openshift-kni/numaresources-operator/test/internal/images"
4548
"github.com/openshift-kni/numaresources-operator/test/internal/objects"
@@ -80,7 +83,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
8083
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.UID)
8184
Expect(err).ToNot(HaveOccurred(), "unable to get deployment by owner reference")
8285

83-
_, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(context.TODO(), dp)
86+
_, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
8487
Expect(err).ToNot(HaveOccurred())
8588
})
8689
})
@@ -137,7 +140,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
137140
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.GetUID())
138141
Expect(err).ToNot(HaveOccurred())
139142

140-
_, err = wait.With(e2eclient.Client).Interval(30*time.Second).Timeout(2*time.Minute).ForDeploymentComplete(context.TODO(), dp)
143+
_, err = wait.With(e2eclient.Client).Interval(30*time.Second).Timeout(2*time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
141144
Expect(err).ToNot(HaveOccurred())
142145
})
143146

@@ -298,7 +301,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
298301
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.UID)
299302
Expect(err).ToNot(HaveOccurred(), "unable to get deployment by owner reference")
300303

301-
dp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(context.TODO(), dp)
304+
dp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
302305
Expect(err).ToNot(HaveOccurred())
303306

304307
podList, err = podlist.With(e2eclient.Client).ByDeployment(context.TODO(), *dp)
@@ -311,6 +314,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
311314
})
312315

313316
When("testing deployment TopologySpreadConstraints", func() {
317+
314318
var (
315319
nroSchedKey client.ObjectKey
316320
autoDetectedReplicasCount *int32
@@ -326,14 +330,38 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
326330
Expect(err).ToNot(HaveOccurred())
327331

328332
if currentReplicas != nil {
333+
By("compute expected autodetected replicas count")
334+
var labelSelector map[string]string
335+
switch configuration.Plat {
336+
case platform.HyperShift:
337+
labelSelector = map[string]string{
338+
"node-role.kubernetes.io/worker": "",
339+
}
340+
case platform.OpenShift:
341+
labelSelector = map[string]string{
342+
"node-role.kubernetes.io/control-plane": "",
343+
}
344+
default:
345+
Fail("unknown platform")
346+
}
347+
348+
nodeList := &corev1.NodeList{}
349+
Expect(e2eclient.Client.List(ctx, nodeList, &client.ListOptions{LabelSelector: labels.SelectorFromSet(labelSelector)})).To(Succeed())
350+
Expect(nodeList.Items).ToNot(BeEmpty(), "no nodes found with control-plane or worker role")
351+
expectedAutodetectedCount := int32(len(nodeList.Items))
352+
if expectedAutodetectedCount > 3 {
353+
// this is how it is capped in the controller
354+
expectedAutodetectedCount = 3
355+
}
356+
329357
e2efixture.By("configure NRS replicas for autodetection: current=%d desired=nil", *currentReplicas)
330358
Eventually(func(g Gomega) {
331359
g.Expect(e2eclient.Client.Get(ctx, nroSchedKey, nroSchedObj)).To(Succeed())
332360
nroSchedObj.Spec.Replicas = nil
333361
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
334362
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
335363

336-
schedDp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
364+
schedDp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, expectedAutodetectedCount)
337365
if err != nil {
338366
infoUponDeploymentCompleteFailure(ctx, schedDp)
339367
}
@@ -346,7 +374,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
346374
nroSchedObj.Spec.Replicas = currentReplicas
347375
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
348376
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
349-
schedDp, err := wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
377+
schedDp, err := wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *currentReplicas)
350378
if err != nil {
351379
infoUponDeploymentCompleteFailure(ctx, schedDp)
352380
}
@@ -394,7 +422,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
394422
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
395423
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with explicit replicas count")
396424

397-
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
425+
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, 0)
398426
if err != nil {
399427
infoUponDeploymentCompleteFailure(ctx, schedDp)
400428
}
@@ -420,7 +448,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
420448
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
421449
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
422450

423-
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
451+
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *autoDetectedReplicasCount)
424452
if err != nil {
425453
infoUponDeploymentCompleteFailure(ctx, schedDp)
426454
}
@@ -482,7 +510,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
482510
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
483511
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with explicit replicas count")
484512

485-
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
513+
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, 1)
486514
if err != nil {
487515
infoUponDeploymentCompleteFailure(ctx, schedDp)
488516
}
@@ -504,7 +532,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
504532
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
505533
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
506534

507-
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
535+
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *autoDetectedReplicasCount)
508536
if err != nil {
509537
infoUponDeploymentCompleteFailure(ctx, schedDp)
510538
}

test/e2e/serial/tests/configuration.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -918,7 +918,7 @@ var _ = Describe("[serial][disruptive] numaresources configuration management",
918918
By(fmt.Sprintf("ensuring the deployment %q keep being pending %d/%d", deployment.Name, step+1, maxStep))
919919
err = fxt.Client.Get(ctx, client.ObjectKeyFromObject(deployment), &updatedDeployment)
920920
Expect(err).ToNot(HaveOccurred())
921-
Expect(wait.IsDeploymentComplete(deployment, &updatedDeployment.Status)).To(BeFalse(), "deployment %q become ready", deployment.Name)
921+
Expect(wait.IsDeploymentComplete(deployment.Generation, &updatedDeployment.Status, *(deployment.Spec.Replicas))).To(BeFalse(), "deployment %q become ready", deployment.Name)
922922
}
923923

924924
By("checking the deployment pod has failed scheduling and its at the pending status")

test/e2e/serial/tests/scheduler_removal.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
9696
err = fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(dp), updatedDp)
9797
Expect(err).ToNot(HaveOccurred())
9898

99-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeTrue(), "deployment %q become unready", dp.Name)
99+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *dp.Spec.Replicas)).To(BeTrue(), "deployment %q become unready", dp.Name)
100100
}
101101
})
102102

@@ -123,7 +123,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
123123
err = fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(dp), updatedDp)
124124
Expect(err).ToNot(HaveOccurred())
125125

126-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeFalse(), "deployment %q become ready", dp.Name)
126+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *dp.Spec.Replicas)).To(BeFalse(), "deployment %q become ready", dp.Name)
127127
}
128128
})
129129
})
@@ -154,7 +154,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
154154
err = fxt.Client.Get(ctx, client.ObjectKeyFromObject(dp), updatedDp)
155155
Expect(err).ToNot(HaveOccurred())
156156

157-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeFalse(), "deployment %q become ready", dp.Name)
157+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *dp.Spec.Replicas)).To(BeFalse(), "deployment %q become ready", dp.Name)
158158
}
159159

160160
restoreScheduler(fxt, nroSchedObj)

test/e2e/serial/tests/workload_placement_nodelabel.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
348348
updatedDp := &appsv1.Deployment{}
349349
err := fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(deployment), updatedDp)
350350
Expect(err).ToNot(HaveOccurred())
351-
return wait.IsDeploymentComplete(deployment, &updatedDp.Status)
351+
return wait.IsDeploymentComplete(deployment.Generation, &updatedDp.Status, *deployment.Spec.Replicas)
352352
}).WithTimeout(time.Second*30).WithPolling(time.Second*5).Should(BeTrue(), "deployment %q became unready", deployment.Name)
353353
},
354354
Entry("[test_id:47597] should be able to schedule pod with affinity property requiredDuringSchedulingIgnoredDuringExecution on the available node with feasible numa zone", createNodeAffinityRequiredDuringSchedulingIgnoredDuringExecution),

0 commit comments

Comments
 (0)