Skip to content

Commit 311d8e2

Browse files
committed
Onboarding: keep retrying after instance creation/error failures
When the test instance creation in the onboarding smoke test fails, the controller used to return a bare error from Reconcile and never updated the Onboarding condition. Operators saw a stale message frozen for days while the loop kept silently retrying — combined with the SSA condition helper preserving LastTransitionTime when Status does not change, the resource looked stuck even though reconciliation was healthy. This change: - In smokeTest, when createOrGetTestServer fails, set the Onboarding condition to Testing with the current error message and requeue with defaultWaitTime instead of returning the error. This makes each retry observable in the resource itself and matches the style used by every other smokeTest error path. - Include the failed server's UUID in the ERROR-state, console-output-failure, timeout, and terminate-failure messages. A fresh attempt allocates a new instance UUID, so each retry yields a different message even when Nova keeps returning the same fault text preventing SetApplyConfigurationStatusCondition from treating successive failures as no-op updates. - Tests: add Ginkgo specs covering POST /servers returning 500 and a server in ERROR state, asserting the condition message, the requeue, the absence of a returned error, and (for ERROR) that the delete is issued and the UUID is in the message.
1 parent 9458d90 commit 311d8e2

2 files changed

Lines changed: 140 additions & 8 deletions

File tree

internal/controller/onboarding_controller.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,19 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
226226
zone := hv.Labels[corev1.LabelTopologyZone]
227227
server, err := r.createOrGetTestServer(ctx, zone, host, hv.UID)
228228
if err != nil {
229-
return ctrl.Result{}, fmt.Errorf("failed to create or get test instance: %w", err)
229+
// Surface the failure in the Onboarding condition so the user can see
230+
// the latest error and confirm the controller is still retrying.
231+
// Returning a bare error here would leave the previously-set message
232+
// in place and the resource looks frozen even though we keep retrying.
233+
if err2 := r.applyOnboardingCondition(ctx, hv,
234+
*k8sacmetav1.Condition().
235+
WithType(kvmv1.ConditionTypeOnboarding).
236+
WithStatus(metav1.ConditionTrue).
237+
WithReason(kvmv1.ConditionReasonTesting).
238+
WithMessage(fmt.Sprintf("failed to create or get test instance: %v", err))); err2 != nil {
239+
return ctrl.Result{}, err2
240+
}
241+
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil
230242
}
231243

232244
switch server.Status {
@@ -239,13 +251,16 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
239251
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil
240252
}
241253

242-
// Set condition back to testing
254+
// Set condition back to testing. Include the server ID so each retry
255+
// (which creates a fresh instance with a new UUID) yields a different
256+
// message — otherwise SetApplyConfigurationStatusCondition would treat
257+
// the update as a no-op when Nova keeps reporting the same fault text.
243258
if err = r.applyOnboardingCondition(ctx, hv,
244259
*k8sacmetav1.Condition().
245260
WithType(kvmv1.ConditionTypeOnboarding).
246261
WithStatus(metav1.ConditionTrue).
247262
WithReason(kvmv1.ConditionReasonTesting).
248-
WithMessage("Server ended up in error state: "+server.Fault.Message)); err != nil {
263+
WithMessage(fmt.Sprintf("Server %s ended up in error state: %s", id, server.Fault.Message))); err != nil {
249264
return ctrl.Result{}, err
250265
}
251266

@@ -266,7 +281,7 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
266281
WithType(kvmv1.ConditionTypeOnboarding).
267282
WithStatus(metav1.ConditionTrue).
268283
WithReason(kvmv1.ConditionReasonTesting).
269-
WithMessage(fmt.Sprintf("could not get console output %v", err))); err2 != nil {
284+
WithMessage(fmt.Sprintf("could not get console output for server %s: %v", server.ID, err))); err2 != nil {
270285
return ctrl.Result{}, err2
271286
}
272287
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil
@@ -279,7 +294,7 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
279294
WithType(kvmv1.ConditionTypeOnboarding).
280295
WithStatus(metav1.ConditionTrue).
281296
WithReason(kvmv1.ConditionReasonTesting).
282-
WithMessage(fmt.Sprintf("timeout waiting for console output since %v", server.LaunchedAt))); err2 != nil {
297+
WithMessage(fmt.Sprintf("timeout waiting for console output of server %s since %v", server.ID, server.LaunchedAt))); err2 != nil {
283298
return ctrl.Result{}, err2
284299
}
285300
if err = servers.Delete(ctx, r.testComputeClient, server.ID).ExtractErr(); err != nil {
@@ -297,7 +312,7 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
297312
WithType(kvmv1.ConditionTypeOnboarding).
298313
WithStatus(metav1.ConditionTrue).
299314
WithReason(kvmv1.ConditionReasonTesting).
300-
WithMessage(fmt.Sprintf("failed to terminate instance %v", err))); err2 != nil {
315+
WithMessage(fmt.Sprintf("failed to terminate instance %s: %v", server.ID, err))); err2 != nil {
301316
return ctrl.Result{}, err2
302317
}
303318
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil

internal/controller/onboarding_controller_test.go

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ var _ = Describe("Onboarding Controller", func() {
336336
Context("running tests after initial setup", func() {
337337
var (
338338
serverActionHandler func(http.ResponseWriter, *http.Request)
339+
serverCreateHandler func(http.ResponseWriter, *http.Request)
339340
serverDeleteHandler func(http.ResponseWriter, *http.Request)
340341
serverDetailHandler func(http.ResponseWriter, *http.Request)
341342
)
@@ -397,11 +398,14 @@ var _ = Describe("Onboarding Controller", func() {
397398
Expect(err).NotTo(HaveOccurred())
398399
})
399400

400-
fakeServer.Mux.HandleFunc("POST /servers", func(w http.ResponseWriter, r *http.Request) {
401+
serverCreateHandler = func(w http.ResponseWriter, _ *http.Request) {
401402
w.Header().Add("Content-Type", "application/json")
402403
w.WriteHeader(http.StatusOK)
403404
_, err := fmt.Fprint(w, createServerBody)
404405
Expect(err).NotTo(HaveOccurred())
406+
}
407+
fakeServer.Mux.HandleFunc("POST /servers", func(w http.ResponseWriter, r *http.Request) {
408+
serverCreateHandler(w, r)
405409
})
406410

407411
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
@@ -632,13 +636,126 @@ var _ = Describe("Onboarding Controller", func() {
632636
By("Verifying the timed-out server was deleted")
633637
Expect(serverDeletedCalled).To(BeTrue())
634638

635-
By("Verifying the onboarding condition message indicates a timeout")
639+
By("Verifying the onboarding condition message indicates a timeout and includes the server ID")
636640
hv := &kvmv1.Hypervisor{}
637641
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
638642
onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
639643
Expect(onboardingCond).NotTo(BeNil())
640644
Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting))
641645
Expect(onboardingCond.Message).To(ContainSubstring("timeout"))
646+
Expect(onboardingCond.Message).To(ContainSubstring("server-id"))
647+
})
648+
})
649+
650+
When("test server creation fails", func() {
651+
BeforeEach(func(ctx SpecContext) {
652+
By("Overriding HV status to Testing state")
653+
hv := &kvmv1.Hypervisor{}
654+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
655+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
656+
Type: kvmv1.ConditionTypeOnboarding,
657+
Status: metav1.ConditionTrue,
658+
Reason: kvmv1.ConditionReasonTesting,
659+
Message: "previously stuck on something else",
660+
})
661+
Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed())
662+
})
663+
664+
It("should surface the error in the Onboarding condition and requeue without returning an error", func(ctx SpecContext) {
665+
By("Overriding the POST /servers handler to return 500")
666+
serverCreateHandler = func(w http.ResponseWriter, _ *http.Request) {
667+
w.WriteHeader(http.StatusInternalServerError)
668+
_, err := fmt.Fprint(w, `{"computeFault": {"message": "Cinder volume backend is degraded"}}`)
669+
Expect(err).NotTo(HaveOccurred())
670+
}
671+
672+
By("Reconciling: createOrGetTestServer should fail at POST /servers")
673+
result, err := onboardingReconciler.Reconcile(ctx, reconcileReq)
674+
Expect(err).NotTo(HaveOccurred())
675+
Expect(result.RequeueAfter).To(Equal(defaultWaitTime))
676+
677+
By("Verifying the Onboarding condition reflects the create error")
678+
hv := &kvmv1.Hypervisor{}
679+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
680+
onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
681+
Expect(onboardingCond).NotTo(BeNil())
682+
Expect(onboardingCond.Status).To(Equal(metav1.ConditionTrue))
683+
Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting))
684+
Expect(onboardingCond.Message).To(ContainSubstring("failed to create or get test instance"))
685+
})
686+
})
687+
688+
When("test server is in ERROR state", func() {
689+
var (
690+
serverGetCalled bool
691+
serverDeleteCalled bool
692+
)
693+
694+
BeforeEach(func(ctx SpecContext) {
695+
By("Overriding HV status to Testing state")
696+
hv := &kvmv1.Hypervisor{}
697+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
698+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
699+
Type: kvmv1.ConditionTypeOnboarding,
700+
Status: metav1.ConditionTrue,
701+
Reason: kvmv1.ConditionReasonTesting,
702+
Message: "Running onboarding tests",
703+
})
704+
Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed())
705+
706+
serverName := fmt.Sprintf("%s-%s-%s", testPrefixName, hypervisorName, string(hv.UID))
707+
serverGetCalled = false
708+
serverDeleteCalled = false
709+
710+
// GET /servers/detail returns a single ERROR-state server with the
711+
// expected name so createOrGetTestServer returns it as foundServer.
712+
serverDetailHandler = func(w http.ResponseWriter, _ *http.Request) {
713+
_, err := fmt.Fprintf(w,
714+
`{"servers": [{"id": "server-id", "name": %q, "status": "ERROR"}], "servers_links": []}`,
715+
serverName)
716+
Expect(err).NotTo(HaveOccurred())
717+
}
718+
719+
// GET /servers/server-id returns the ERROR server with a fault
720+
// message so smokeTest can record it.
721+
fakeServer.Mux.HandleFunc("GET /servers/server-id", func(w http.ResponseWriter, _ *http.Request) {
722+
serverGetCalled = true
723+
w.Header().Add("Content-Type", "application/json")
724+
w.WriteHeader(http.StatusOK)
725+
_, err := fmt.Fprintf(w,
726+
`{"server": {"id": "server-id", "name": %q, "status": "ERROR", "fault": {"message": "Build of instance aborted: volume creation failed"}}}`,
727+
serverName)
728+
Expect(err).NotTo(HaveOccurred())
729+
})
730+
731+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
732+
serverDeleteCalled = true
733+
w.WriteHeader(http.StatusAccepted)
734+
}
735+
})
736+
737+
It("should record the failure with the server UUID and issue a delete, without returning an error", func(ctx SpecContext) {
738+
By("Reconciling once to enter the ERROR branch")
739+
result, err := onboardingReconciler.Reconcile(ctx, reconcileReq)
740+
Expect(err).NotTo(HaveOccurred())
741+
Expect(result.RequeueAfter).To(Equal(defaultWaitTime))
742+
743+
By("Verifying GET on the server happened so the fault could be read")
744+
Expect(serverGetCalled).To(BeTrue())
745+
746+
By("Verifying the server delete was attempted")
747+
Expect(serverDeleteCalled).To(BeTrue())
748+
749+
By("Verifying the Onboarding condition message includes the server UUID and the fault text")
750+
hv := &kvmv1.Hypervisor{}
751+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
752+
onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
753+
Expect(onboardingCond).NotTo(BeNil())
754+
Expect(onboardingCond.Status).To(Equal(metav1.ConditionTrue))
755+
Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting))
756+
Expect(onboardingCond.Message).To(ContainSubstring("server-id"))
757+
Expect(onboardingCond.Message).To(ContainSubstring("ended up in error state"))
758+
Expect(onboardingCond.Message).To(ContainSubstring("Build of instance aborted"))
642759
})
643760
})
644761

0 commit comments

Comments
 (0)