Skip to content

Commit b1ee4d6

Browse files
committed
internal: fix waiting for deployment get ready with new replicas
So far the old function was waiting for the deployment to be ready with the same replicas count as the old one. It is however possible that the new deployment version includes a different replicas field hence should expect to a different replicas count. The old version would fail on such case where the old count vs the new one are not the same. This commit fixes this issue by allowing the function to accept the expected new replicas count and compare against it. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent 3d27406 commit b1ee4d6

10 files changed

Lines changed: 62 additions & 37 deletions

internal/wait/deployment.go

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,8 @@ import (
2424
"k8s.io/klog/v2"
2525
)
2626

27-
func (wt Waiter) ForDeploymentComplete(ctx context.Context, dp *appsv1.Deployment) (*appsv1.Deployment, error) {
28-
// This function waits for the readiness of the pods under the deployment. The best use of this check is for
29-
// completely new deployments. If the deployment exists on the cluster and simply updated, this check is
30-
// not enough to guarantee that the deployment is ready with the NEW replica, thus need to cover that by
31-
// additional checks as the context requires
27+
// ForDeploymentCompleteWithReplicas waits for the deployment to be complete and have the specified number of replicas.
28+
func (wt Waiter) ForDeploymentCompleteWithReplicas(ctx context.Context, dp *appsv1.Deployment, newExpectedReplicas int32) (*appsv1.Deployment, error) {
3229
key := ObjectKeyFromObject(dp)
3330
updatedDp := &appsv1.Deployment{}
3431
immediate := true
@@ -39,7 +36,7 @@ func (wt Waiter) ForDeploymentComplete(ctx context.Context, dp *appsv1.Deploymen
3936
return false, err
4037
}
4138

42-
if !IsDeploymentComplete(dp, &updatedDp.Status) {
39+
if !IsDeploymentComplete(dp.Generation, &updatedDp.Status, newExpectedReplicas) {
4340
klog.Warningf("deployment %s not yet complete", key.String())
4441
return false, nil
4542
}
@@ -66,13 +63,13 @@ func areDeploymentReplicasAvailable(newStatus *appsv1.DeploymentStatus, replicas
6663
return true
6764
}
6865

69-
func IsDeploymentComplete(dp *appsv1.Deployment, newStatus *appsv1.DeploymentStatus) bool {
70-
if newStatus.ObservedGeneration < dp.Generation {
71-
klog.InfoS("generation is older than expected to indicate the deployment is complete", "old", dp.Generation, "new", newStatus.ObservedGeneration)
66+
func IsDeploymentComplete(oldGeneration int64, newStatus *appsv1.DeploymentStatus, expectedReplicas int32) bool {
67+
if newStatus.ObservedGeneration < oldGeneration {
68+
klog.InfoS("generation is older than expected to indicate the deployment is complete", "old", oldGeneration, "new", newStatus.ObservedGeneration)
7269
return false
7370
}
7471

75-
return areDeploymentReplicasAvailable(newStatus, *(dp.Spec.Replicas))
72+
return areDeploymentReplicasAvailable(newStatus, expectedReplicas)
7673
}
7774

7875
func (wt Waiter) ForDeploymentReplicasCreation(ctx context.Context, dp *appsv1.Deployment, expectedReplicas int32) (*appsv1.Deployment, error) {

test/e2e/sched/sched_test.go

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,23 @@ import (
2626
appsv1 "k8s.io/api/apps/v1"
2727
corev1 "k8s.io/api/core/v1"
2828
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/apimachinery/pkg/labels"
2930
"k8s.io/apimachinery/pkg/types"
3031
"k8s.io/apimachinery/pkg/util/sets"
3132
klog "k8s.io/klog/v2"
3233
"k8s.io/utils/ptr"
3334

3435
"sigs.k8s.io/controller-runtime/pkg/client"
3536

37+
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
3638
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
3739

3840
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
3941
"github.com/openshift-kni/numaresources-operator/internal/podlist"
4042
"github.com/openshift-kni/numaresources-operator/internal/wait"
4143
schedstate "github.com/openshift-kni/numaresources-operator/pkg/numaresourcesscheduler/objectstate/sched"
4244
e2eclient "github.com/openshift-kni/numaresources-operator/test/internal/clients"
45+
"github.com/openshift-kni/numaresources-operator/test/internal/configuration"
4346
e2efixture "github.com/openshift-kni/numaresources-operator/test/internal/fixture"
4447
e2eimages "github.com/openshift-kni/numaresources-operator/test/internal/images"
4548
"github.com/openshift-kni/numaresources-operator/test/internal/objects"
@@ -74,7 +77,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
7477
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.UID)
7578
Expect(err).ToNot(HaveOccurred(), "unable to get deployment by owner reference")
7679

77-
_, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(context.TODO(), dp)
80+
_, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
7881
Expect(err).ToNot(HaveOccurred())
7982
})
8083
})
@@ -129,7 +132,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
129132
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.GetUID())
130133
Expect(err).ToNot(HaveOccurred())
131134

132-
_, err = wait.With(e2eclient.Client).Interval(30*time.Second).Timeout(2*time.Minute).ForDeploymentComplete(context.TODO(), dp)
135+
_, err = wait.With(e2eclient.Client).Interval(30*time.Second).Timeout(2*time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
133136
Expect(err).ToNot(HaveOccurred())
134137
})
135138

@@ -267,7 +270,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
267270
dp, err := podlist.With(e2eclient.Client).DeploymentByOwnerReference(context.TODO(), nroSchedObj.UID)
268271
Expect(err).ToNot(HaveOccurred(), "unable to get deployment by owner reference")
269272

270-
dp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(context.TODO(), dp)
273+
dp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
271274
Expect(err).ToNot(HaveOccurred())
272275

273276
podList, err = podlist.With(e2eclient.Client).ByDeployment(context.TODO(), *dp)
@@ -280,6 +283,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
280283
})
281284

282285
When("testing deployment TopologySpreadConstraints", func() {
286+
283287
var (
284288
nroSchedKey client.ObjectKey
285289
autoDetectedReplicasCount *int32
@@ -295,14 +299,38 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
295299
Expect(err).ToNot(HaveOccurred())
296300

297301
if currentReplicas != nil {
302+
By("compute expected autodetected replicas count")
303+
var labelSelector map[string]string
304+
switch configuration.Plat {
305+
case platform.HyperShift:
306+
labelSelector = map[string]string{
307+
"node-role.kubernetes.io/worker": "",
308+
}
309+
case platform.OpenShift:
310+
labelSelector = map[string]string{
311+
"node-role.kubernetes.io/control-plane": "",
312+
}
313+
default:
314+
Fail("unknown platform")
315+
}
316+
317+
nodeList := &corev1.NodeList{}
318+
Expect(e2eclient.Client.List(ctx, nodeList, &client.ListOptions{LabelSelector: labels.SelectorFromSet(labelSelector)})).To(Succeed())
319+
Expect(nodeList.Items).ToNot(BeEmpty(), "no nodes found with control-plane or worker role")
320+
expectedAutodetectedCount := int32(len(nodeList.Items))
321+
if expectedAutodetectedCount > 3 {
322+
// this is how it is capped in the controller
323+
expectedAutodetectedCount = 3
324+
}
325+
298326
e2efixture.By("configure NRS replicas for autodetection: current=%d desired=nil", *currentReplicas)
299327
Eventually(func(g Gomega) {
300328
g.Expect(e2eclient.Client.Get(ctx, nroSchedKey, nroSchedObj)).To(Succeed())
301329
nroSchedObj.Spec.Replicas = nil
302330
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
303331
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
304332

305-
schedDp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
333+
schedDp, err = wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, expectedAutodetectedCount)
306334
if err != nil {
307335
infoUponDeploymentCompleteFailure(ctx, schedDp)
308336
}
@@ -315,7 +343,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
315343
nroSchedObj.Spec.Replicas = currentReplicas
316344
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
317345
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
318-
schedDp, err := wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
346+
schedDp, err := wait.With(e2eclient.Client).Timeout(5*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *currentReplicas)
319347
if err != nil {
320348
infoUponDeploymentCompleteFailure(ctx, schedDp)
321349
}
@@ -363,7 +391,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
363391
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
364392
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with explicit replicas count")
365393

366-
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
394+
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, 0)
367395
if err != nil {
368396
infoUponDeploymentCompleteFailure(ctx, schedDp)
369397
}
@@ -389,7 +417,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
389417
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
390418
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
391419

392-
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
420+
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *autoDetectedReplicasCount)
393421
if err != nil {
394422
infoUponDeploymentCompleteFailure(ctx, schedDp)
395423
}
@@ -451,7 +479,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
451479
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
452480
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with explicit replicas count")
453481

454-
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
482+
schedDp, err := wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, 1)
455483
if err != nil {
456484
infoUponDeploymentCompleteFailure(ctx, schedDp)
457485
}
@@ -473,7 +501,7 @@ var _ = Describe("[Scheduler] scheduler object updates", func() {
473501
g.Expect(e2eclient.Client.Update(ctx, nroSchedObj)).To(Succeed())
474502
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).Should(Succeed(), "failed to update NRS with autodetection mode for replicas")
475503

476-
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentComplete(ctx, schedDp)
504+
schedDp, err = wait.With(e2eclient.Client).Timeout(10*time.Minute).Interval(10*time.Second).ForDeploymentCompleteWithReplicas(ctx, schedDp, *autoDetectedReplicasCount)
477505
if err != nil {
478506
infoUponDeploymentCompleteFailure(ctx, schedDp)
479507
}

test/e2e/serial/tests/configuration.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -946,7 +946,7 @@ var _ = Describe("[serial][disruptive] numaresources configuration management",
946946
e2efixture.By("deleting the NRO Scheduler object to trigger the pod to restart: %s", nroSchedObj.Name)
947947
Expect(fxt.Client.Delete(ctx, &schedPods[0])).ToNot(HaveOccurred())
948948

949-
_, err = wait.With(fxt.Client).Interval(10*time.Second).Timeout(time.Minute).ForDeploymentComplete(ctx, schedDeployment)
949+
_, err = wait.With(fxt.Client).Interval(10*time.Second).Timeout(time.Minute).ForDeploymentCompleteWithReplicas(ctx, schedDeployment, *schedDeployment.Spec.Replicas)
950950
Expect(err).ToNot(HaveOccurred())
951951

952952
mcpsInfo, err = buildMCPsInfo(fxt.Client, ctx, nroOperObj)
@@ -1000,7 +1000,7 @@ var _ = Describe("[serial][disruptive] numaresources configuration management",
10001000
e2efixture.By("ensuring the deployment %q keep being pending %d/%d", deployment.Name, step+1, maxStep)
10011001
err = fxt.Client.Get(ctx, client.ObjectKeyFromObject(deployment), &updatedDeployment)
10021002
Expect(err).ToNot(HaveOccurred())
1003-
Expect(wait.IsDeploymentComplete(deployment, &updatedDeployment.Status)).To(BeFalse(), "deployment %q become ready", deployment.Name)
1003+
Expect(wait.IsDeploymentComplete(deployment.Generation, &updatedDeployment.Status, *(deployment.Spec.Replicas))).To(BeFalse(), "deployment %q become ready", deployment.Name)
10041004
}
10051005

10061006
By("checking the deployment pod has failed scheduling and its at the pending status")

test/e2e/serial/tests/non_regression.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload placeme
255255
err = fxt.Client.Create(context.TODO(), dp)
256256
Expect(err).ToNot(HaveOccurred())
257257

258-
updatedDp, err := wait.With(fxt.Client).Interval(10*time.Second).Timeout(time.Minute).ForDeploymentComplete(context.TODO(), dp)
258+
updatedDp, err := wait.With(fxt.Client).Interval(10*time.Second).Timeout(time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
259259
Expect(err).ToNot(HaveOccurred())
260260

261261
nrtPostCreateDeploymentList, err := e2enrt.GetUpdated(fxt.Client, nrtInitialList, time.Minute)

test/e2e/serial/tests/resource_accounting.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ var _ = Describe("[serial][disruptive][scheduler][resacct] numaresources workloa
486486
Expect(err).NotTo(HaveOccurred(), "unable to create deployment %q", deployment.Name)
487487

488488
By("waiting for deployment to be up & running")
489-
_, err = wait.With(fxt.Client).Interval(10*time.Second).Timeout(1*time.Minute).ForDeploymentComplete(context.TODO(), deployment)
489+
_, err = wait.With(fxt.Client).Interval(10*time.Second).Timeout(1*time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), deployment, *deployment.Spec.Replicas)
490490
Expect(err).NotTo(HaveOccurred(), "Deployment %q not up & running after %v", deployment.Name, 1*time.Minute)
491491

492492
e2efixture.By("checking deployment pods have been scheduled with the topology aware scheduler %q and in the proper node %q", serialconfig.Config.SchedulerName, targetNodeName)

test/e2e/serial/tests/scheduler_removal.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
9595
err = fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(dp), updatedDp)
9696
Expect(err).ToNot(HaveOccurred())
9797

98-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeTrue(), "deployment %q become unready", dp.Name)
98+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *(dp.Spec.Replicas))).To(BeTrue(), "deployment %q become unready", dp.Name)
9999
}
100100
})
101101

@@ -122,7 +122,7 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
122122
err = fxt.Client.Get(context.TODO(), client.ObjectKeyFromObject(dp), updatedDp)
123123
Expect(err).ToNot(HaveOccurred())
124124

125-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeFalse(), "deployment %q become ready", dp.Name)
125+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *(dp.Spec.Replicas))).To(BeFalse(), "deployment %q become ready", dp.Name)
126126
}
127127
})
128128
})
@@ -153,14 +153,14 @@ var _ = Describe("[serial][disruptive][scheduler][schedrst] numaresources schedu
153153
err = fxt.Client.Get(ctx, client.ObjectKeyFromObject(dp), updatedDp)
154154
Expect(err).ToNot(HaveOccurred())
155155

156-
Expect(wait.IsDeploymentComplete(dp, &updatedDp.Status)).To(BeFalse(), "deployment %q become ready", dp.Name)
156+
Expect(wait.IsDeploymentComplete(dp.Generation, &updatedDp.Status, *(dp.Spec.Replicas))).To(BeFalse(), "deployment %q become ready", dp.Name)
157157
}
158158

159159
restoreScheduler(fxt, nroSchedObj)
160160
nroSchedObj = nrosched.CheckNROSchedulerAvailable(ctx, fxt.Client, nroSchedObj.Name)
161161

162162
e2efixture.By("waiting for the test deployment %q to become complete and ready", updatedDp.Name)
163-
_, err = wait.With(fxt.Client).Interval(2*time.Second).Interval(30*time.Second).ForDeploymentComplete(ctx, updatedDp)
163+
_, err = wait.With(fxt.Client).Interval(2*time.Second).Interval(30*time.Second).ForDeploymentCompleteWithReplicas(ctx, updatedDp, *updatedDp.Spec.Replicas)
164164
Expect(err).ToNot(HaveOccurred())
165165
})
166166
})
@@ -205,7 +205,7 @@ func createDeployment(fxt *e2efixture.Fixture, name, schedulerName string) *apps
205205
func createDeploymentSync(fxt *e2efixture.Fixture, name, schedulerName string) *appsv1.Deployment {
206206
dp := createDeployment(fxt, name, schedulerName)
207207
e2efixture.By("waiting for the test deployment %q to be complete and ready", name)
208-
_, err := wait.With(fxt.Client).Interval(10*time.Second).Timeout(time.Minute).ForDeploymentComplete(context.TODO(), dp)
208+
_, err := wait.With(fxt.Client).Interval(10*time.Second).Timeout(time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), dp, *dp.Spec.Replicas)
209209
Expect(err).ToNot(HaveOccurred(), "Deployment %q is not up & running after %v", dp.Name, time.Minute)
210210
return dp
211211
}

test/e2e/serial/tests/workload_overhead.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ var _ = Describe("[serial][disruptive][scheduler] numaresources workload overhea
234234
Expect(err).NotTo(HaveOccurred(), "unable to create deployment %q", deployment.Name)
235235

236236
By("waiting for deployment to be up&running")
237-
_, err = wait.With(fxt.Client).Interval(10*time.Second).Timeout(2*time.Minute).ForDeploymentComplete(context.TODO(), deployment)
237+
_, err = wait.With(fxt.Client).Interval(10*time.Second).Timeout(2*time.Minute).ForDeploymentCompleteWithReplicas(context.TODO(), deployment, *deployment.Spec.Replicas)
238238
Expect(err).NotTo(HaveOccurred(), "Deployment %q not up&running after %v", deployment.Name, 2*time.Minute)
239239

240240
By("wait for NRT data to settle")

0 commit comments

Comments
 (0)