Skip to content

Commit f30a924

Browse files
committed
internal: fix waiting for deployment get ready with new replicas
So far the old function was waiting for the deployment to be ready with the same replicas count as the old one. It is however possible that the new deployment version includes a different replicas field hence should expect to a different replicas count. The old version would fail on such case where the old count vs the new one are not the same. This commit fixes this issue by introducing new function that accepts the expected new replicas count and compare against it. The old function API still exists but is redirected to use the new function with same old behavior. Existing tests are not affected by this. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent 9a8f010 commit f30a924

5 files changed

Lines changed: 54 additions & 23 deletions

File tree

internal/wait/deployment.go

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@ import (
2424
"k8s.io/klog/v2"
2525
)
2626

27+
// ForDeploymentComplete waits for the deployment to be complete with the number of replicas specified in the deployment spec.
2728
func (wt Waiter) ForDeploymentComplete(ctx context.Context, dp *appsv1.Deployment) (*appsv1.Deployment, error) {
28-
// This function waits for the readiness of the pods under the deployment. The best use of this check is for
29-
// completely new deployments. If the deployment exists on the cluster and simply updated, this check is
30-
// not enough to guarantee that the deployment is ready with the NEW replica, thus need to cover that by
31-
// additional checks as the context requires
29+
return wt.ForDeploymentCompleteWithReplicas(ctx, dp, *(dp.Spec.Replicas))
30+
}
31+
32+
// ForDeploymentCompleteWithReplicas waits for the deployment to be complete and have the specified number of replicas.
33+
// Use this function when the deployment is expected to have new number of replicas after an update.
34+
func (wt Waiter) ForDeploymentCompleteWithReplicas(ctx context.Context, dp *appsv1.Deployment, newExpectedReplicas int32) (*appsv1.Deployment, error) {
3235
key := ObjectKeyFromObject(dp)
3336
updatedDp := &appsv1.Deployment{}
3437
immediate := true
@@ -39,7 +42,7 @@ func (wt Waiter) ForDeploymentComplete(ctx context.Context, dp *appsv1.Deploymen
3942
return false, err
4043
}
4144

42-
if !IsDeploymentComplete(dp, &updatedDp.Status) {
45+
if !IsDeploymentComplete(dp.Generation, &updatedDp.Status, newExpectedReplicas) {
4346
klog.Warningf("deployment %s not yet complete", key.String())
4447
return false, nil
4548
}
@@ -66,13 +69,13 @@ func areDeploymentReplicasAvailable(newStatus *appsv1.DeploymentStatus, replicas
6669
return true
6770
}
6871

69-
func IsDeploymentComplete(dp *appsv1.Deployment, newStatus *appsv1.DeploymentStatus) bool {
70-
if newStatus.ObservedGeneration < dp.Generation {
71-
klog.InfoS("generation is older than expected to indicate the deployment is complete", "old", dp.Generation, "new", newStatus.ObservedGeneration)
72+
func IsDeploymentComplete(oldGeneration int64, newStatus *appsv1.DeploymentStatus, expectedReplicas int32) bool {
73+
if newStatus.ObservedGeneration < oldGeneration {
74+
klog.InfoS("generation is older than expected to indicate the deployment is complete", "old", oldGeneration, "new", newStatus.ObservedGeneration)
7275
return false
7376
}
7477

75-
return areDeploymentReplicasAvailable(newStatus, *(dp.Spec.Replicas))
78+
return areDeploymentReplicasAvailable(newStatus, expectedReplicas)
7679
}
7780

7881
func (wt Waiter) ForDeploymentReplicasCreation(ctx context.Context, dp *appsv1.Deployment, expectedReplicas int32) (*appsv1.Deployment, error) {

test/e2e/sched/sched_test.go

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,23 @@ import (
2626
appsv1 "k8s.io/api/apps/v1"
2727
corev1 "k8s.io/api/core/v1"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/apimachinery/pkg/labels"
2930
"k8s.io/apimachinery/pkg/types"
3031
"k8s.io/apimachinery/pkg/util/sets"
3132
klog "k8s.io/klog/v2"
3233
"k8s.io/utils/ptr"
3334

3435
"sigs.k8s.io/controller-runtime/pkg/client"
3536

37+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
3638
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
3739

3840
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
3941
"github.com/openshift-kni/numaresources-operator/internal/podlist"
4042
"github.com/openshift-kni/numaresources-operator/internal/wait"
4143
schedstate "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/objectstate/sched"
4244
e2eclient "github.com/openshift-kni/numaresources-operator/test/internal/clients"
45+
"github.com/openshift-kni/numaresources-operator/test/internal/configuration"
4346
e2efixture "github.com/openshift-kni/numaresources-operator/test/internal/fixture"
4447
e2eimages "github.com/openshift-kni/numaresources-operator/test/internal/images"
4548
"github.com/openshift-kni/numaresources-operator/test/internal/objects"
@@ -74,7 +77,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
7477
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.UID)
7578
Expect(err).ToNot(HaveOccurred(), "unable to get deployment by owner reference")
7679

77-
_, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(context.TODO(), dp)
80+
_, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
7881
Expect(err).ToNot(HaveOccurred())
7982
})
8083
})
@@ -129,7 +132,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
129132
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.GetUID())
130133
Expect(err).ToNot(HaveOccurred())
131134

132-
_, err = wait.With(e2eclient.Client).Interval(30*time.Second).Timeout(2*time.Minute).ForDeploymentComplete(context.TODO(), dp)
135+
_, err = wait.With(e2eclient.Client).Interval(30*time.Second).Timeout(2*time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
133136
Expect(err).ToNot(HaveOccurred())
134137
})
135138

@@ -267,7 +270,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
267270
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.UID)
268271
Expect(err).ToNot(HaveOccurred(), "unable to get deployment by owner reference")
269272

270-
dp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(context.TODO(), dp)
273+
dp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
271274
Expect(err).ToNot(HaveOccurred())
272275

273276
podList, err = podlist.With(e2eclient.Client).ByDeployment(context.TODO(), *dp)
@@ -280,6 +283,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
280283
})
281284

282285
When("testing deployment TopologySpreadConstraints", func() {
286+
283287
var (
284288
nroSchedKey client.ObjectKey
285289
autoDetectedReplicasCount *int32
@@ -295,14 +299,38 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
295299
Expect(err).ToNot(HaveOccurred())
296300

297301
if currentReplicas != nil {
302+
By("compute expected autodetected replicas count")
303+
var labelSelector map[string]string
304+
switch configuration.Plat {
305+
case platform.HyperShift:
306+
labelSelector = map[string]string{
307+
"node-role.kubernetes.io/worker": "",
308+
}
309+
case platform.OpenShift:
310+
labelSelector = map[string]string{
311+
"node-role.kubernetes.io/control-plane": "",
312+
}
313+
default:
314+
Fail("unknown platform")
315+
}
316+
317+
nodeList := &corev1.NodeList{}
318+
Expect(e2eclient.Client.List(ctx, nodeList, &client.ListOptions{LabelSelector: labels.SelectorFromSet(labelSelector)})).To(Succeed())
319+
Expect(nodeList.Items).ToNot(BeEmpty(), "no nodes found with control-plane or worker role")
320+
expectedAutodetectedCount := int32(len(nodeList.Items))
321+
if expectedAutodetectedCount > 3 {
322+
// this is how it is capped in the controller
323+
expectedAutodetectedCount = 3
324+
}
325+
298326
e2efixture.By("configure NRS replicas for autodetection: current=%d desired=nil", *currentReplicas)
299327
Eventually(func(g Gomega) {
300328
g.Expect(e2eclient.Client.Get(ctx, nroSchedKey, nroSchedObj)).To(Succeed())
301329
nroSchedObj.Spec.Replicas = nil
302330
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
303331
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
304332

305-
schedDp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
333+
schedDp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, expectedAutodetectedCount)
306334
if err != nil {
307335
infoUponDeploymentCompleteFailure(ctx, schedDp)
308336
}
@@ -315,7 +343,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
315343
nroSchedObj.Spec.Replicas = currentReplicas
316344
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
317345
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
318-
schedDp, err := wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
346+
schedDp, err := wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *currentReplicas)
319347
if err != nil {
320348
infoUponDeploymentCompleteFailure(ctx, schedDp)
321349
}
@@ -363,7 +391,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
363391
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
364392
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with explicit replicas count")
365393

366-
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
394+
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, 0)
367395
if err != nil {
368396
infoUponDeploymentCompleteFailure(ctx, schedDp)
369397
}
@@ -389,7 +417,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
389417
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
390418
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
391419

392-
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
420+
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *autoDetectedReplicasCount)
393421
if err != nil {
394422
infoUponDeploymentCompleteFailure(ctx, schedDp)
395423
}
@@ -451,7 +479,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
451479
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
452480
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with explicit replicas count")
453481

454-
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
482+
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, 1)
455483
if err != nil {
456484
infoUponDeploymentCompleteFailure(ctx, schedDp)
457485
}
@@ -473,7 +501,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
473501
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
474502
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
475503

476-
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
504+
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *autoDetectedReplicasCount)
477505
if err != nil {
478506
infoUponDeploymentCompleteFailure(ctx, schedDp)
479507
}

test/e2e/serial/tests/configuration.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1000,7 +1000,7 @@ var _ = Describe("[serial][disruptive] numaresources configuration management",
10001000
e2efixture.By("ensuring the deployment %q keep being pending %d/%d", deployment.Name, step+1, maxStep)
10011001
err = fxt.Client.Get(ctx, client.ObjectKeyFromObject(deployment), &updatedDeployment)
10021002
Expect(err).ToNot(HaveOccurred())
1003-
Expect(wait.IsDeploymentComplete(deployment, &updatedDeployment.Status)).To(BeFalse(), "deployment %q become ready", deployment.Name)
1003+
Expect(wait.IsDeploymentComplete(deployment.Generation, &updatedDeployment.Status, *(deployment.Spec.Replicas))).To(BeFalse(), "deployment %q become ready", deployment.Name)
10041004
}
10051005

10061006
By("checking the deployment pod has failed scheduling and its at the pending status")

test/e2e/serial/tests/scheduler_removal.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
9595
err = fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(dp), updatedDp)
9696
Expect(err).ToNot(HaveOccurred())
9797

98-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeTrue(), "deployment %q become unready", dp.Name)
98+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *dp.Spec.Replicas)).To(BeTrue(), "deployment %q become unready", dp.Name)
9999
}
100100
})
101101

@@ -122,7 +122,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
122122
err = fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(dp), updatedDp)
123123
Expect(err).ToNot(HaveOccurred())
124124

125-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeFalse(), "deployment %q become ready", dp.Name)
125+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *dp.Spec.Replicas)).To(BeFalse(), "deployment %q become ready", dp.Name)
126126
}
127127
})
128128
})
@@ -153,7 +153,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
153153
err = fxt.Client.Get(ctx, client.ObjectKeyFromObject(dp), updatedDp)
154154
Expect(err).ToNot(HaveOccurred())
155155

156-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeFalse(), "deployment %q become ready", dp.Name)
156+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *dp.Spec.Replicas)).To(BeFalse(), "deployment %q become ready", dp.Name)
157157
}
158158

159159
restoreScheduler(fxt, nroSchedObj)

test/e2e/serial/tests/workload_placement_nodelabel.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
347347
updatedDp := &appsv1.Deployment{}
348348
err := fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(deployment), updatedDp)
349349
Expect(err).ToNot(HaveOccurred())
350-
return wait.IsDeploymentComplete(deployment, &updatedDp.Status)
350+
return wait.IsDeploymentComplete(deployment.Generation, &updatedDp.Status, *deployment.Spec.Replicas)
351351
}).WithTimeout(time.Second*30).WithPolling(time.Second*5).Should(BeTrue(), "deployment %q became unready", deployment.Name)
352352
},
353353
Entry("[test_id:47597] should be able to schedule pod with affinity property requiredDuringSchedulingIgnoredDuringExecution on the available node with feasible numa zone", createNodeAffinityRequiredDuringSchedulingIgnoredDuringExecution),

0 commit comments

Comments
 (0)