Skip to content

Commit 65bc78f

Browse files
committed
Fix eviction overwriting failed migration condition with success
When an eviction processes a list of VMs and one is in ERROR state, the controller correctly skips it via moveToBack and sets a Failed migration condition. However, the next successful migration of another VM unconditionally overwrote that condition with Succeeded, hiding the fact that an errored VM was still outstanding. Preserve the Failed condition while there are still outstanding VMs. The condition resolves naturally: either the errored VM is fixed and migrates (overwriting with Succeeded), or the eviction terminates with Evicting=Succeeded once the list empties. Adds a unit test covering a mixed list of healthy and errored VMs that exercises the skip-and-retry pattern across multiple reconciliations.
1 parent c252897 commit 65bc78f

2 files changed

Lines changed: 178 additions & 6 deletions

File tree

internal/controller/eviction_controller.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -326,12 +326,24 @@ func (r *EvictionReconciler) evictNext(ctx context.Context, eviction *kvmv1.Evic
326326

327327
if currentHypervisor != eviction.Spec.Hypervisor {
328328
log.Info("migrated")
329-
meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{
330-
Type: kvmv1.ConditionTypeMigration,
331-
Status: metav1.ConditionFalse,
332-
Message: fmt.Sprintf("Migration of instance %s finished", vm.ID),
333-
Reason: kvmv1.ConditionReasonSucceeded,
334-
})
329+
// Don't overwrite a sticky Failed migration condition with Succeeded
330+
// while there are still other outstanding VMs - an earlier ERROR VM
331+
// has been moved to the back of the queue and the eviction is not
332+
// actually clean. The condition is reset only when the whole
333+
// eviction completes (OutstandingInstances becomes empty).
334+
remaining, _ := popInstance(eviction.Status.OutstandingInstances)
335+
prior := meta.FindStatusCondition(eviction.Status.Conditions, kvmv1.ConditionTypeMigration)
336+
stickyFailure := len(remaining) > 0 && prior != nil &&
337+
prior.Status == metav1.ConditionFalse &&
338+
prior.Reason == kvmv1.ConditionReasonFailed
339+
if !stickyFailure {
340+
meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{
341+
Type: kvmv1.ConditionTypeMigration,
342+
Status: metav1.ConditionFalse,
343+
Message: fmt.Sprintf("Migration of instance %s finished", vm.ID),
344+
Reason: kvmv1.ConditionReasonSucceeded,
345+
})
346+
}
335347

336348
// So, it is already off this one, do we need to verify it?
337349
if vm.Status == "VERIFY_RESIZE" {

internal/controller/eviction_controller_test.go

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,5 +439,165 @@ var _ = Describe("Eviction Controller", func() {
439439
})
440440
})
441441
})
442+
443+
Context("Mixed VM Eviction", func() {
444+
// serverTpl renders a single server response. The eviction controller
445+
// reads OS-EXT-SRV-ATTR:hypervisor_hostname (compared against the
446+
// short-form hypervisor name from spec) plus status/task_state/power_state.
447+
const serverTpl = `{
448+
"server": {
449+
"id": "%[1]s",
450+
"status": "%[2]s",
451+
"OS-EXT-SRV-ATTR:hypervisor_hostname": "%[3]s",
452+
"OS-EXT-STS:task_state": "%[4]s",
453+
"OS-EXT-STS:power_state": %[5]d,
454+
"fault": {"code": 500, "message": "%[6]s"}
455+
}
456+
}`
457+
458+
// migratedVMs is updated as the test simulates successful migrations.
459+
// When a VM has been "migrated", its hypervisor_hostname response
460+
// changes to a different host, signalling the controller it has left.
461+
var migratedVMs map[string]bool
462+
var liveMigrateCalls map[string]int
463+
464+
BeforeEach(func(ctx SpecContext) {
465+
migratedVMs = map[string]bool{}
466+
liveMigrateCalls = map[string]int{}
467+
468+
By("Seeding the eviction status with a list of VMs to evict")
469+
// OutstandingInstances is processed from the END (peekInstance returns
470+
// last). With [good-1, error-1, good-2], processing order is:
471+
// 1) good-2 (last) - migrate, then drop
472+
// 2) error-1 (now last) - skipped via moveToBack
473+
// 3) good-1 - migrate, then drop
474+
// 4) error-1 (alone) - keeps erroring
475+
eviction := &kvmv1.Eviction{}
476+
Expect(k8sClient.Get(ctx, typeNamespacedName, eviction)).To(Succeed())
477+
meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{
478+
Type: kvmv1.ConditionTypeEvicting,
479+
Status: metav1.ConditionTrue,
480+
Message: "Running",
481+
Reason: kvmv1.ConditionReasonRunning,
482+
})
483+
meta.SetStatusCondition(&eviction.Status.Conditions, metav1.Condition{
484+
Type: kvmv1.ConditionTypePreflight,
485+
Status: metav1.ConditionTrue,
486+
Message: "preflight passed",
487+
Reason: kvmv1.ConditionReasonSucceeded,
488+
})
489+
eviction.Status.HypervisorServiceId = serviceId
490+
eviction.Status.OutstandingInstances = []string{"good-1", "error-1", "good-2"}
491+
eviction.Status.OutstandingRamMb = 4096
492+
Expect(k8sClient.Status().Update(ctx, eviction)).To(Succeed())
493+
494+
By("Mocking GET /servers/{id} to return per-VM state")
495+
fakeServer.Mux.HandleFunc("GET /servers/{server_id}", func(w http.ResponseWriter, r *http.Request) {
496+
serverID := r.PathValue("server_id")
497+
w.Header().Add("Content-Type", "application/json")
498+
499+
// hypervisor_hostname uses the FQDN-style name; the controller
500+
// only compares the short prefix (before the first ".") against
501+
// eviction.Spec.Hypervisor. After we mark a VM as migrated,
502+
// pretend it lives on a different host so the controller treats
503+
// it as "already moved" and removes it from the list.
504+
hvHost := hypervisorName + ".example.local"
505+
if migratedVMs[serverID] {
506+
hvHost = "other-host.example.local"
507+
}
508+
509+
switch serverID {
510+
case "good-1", "good-2":
511+
status := "ACTIVE"
512+
if migratedVMs[serverID] {
513+
// Once migrated, status doesn't really matter, but
514+
// keep it ACTIVE so we exercise the "different host"
515+
// branch rather than VERIFY_RESIZE.
516+
status = "ACTIVE"
517+
}
518+
w.WriteHeader(http.StatusOK)
519+
_, err := fmt.Fprintf(w, serverTpl, serverID, status, hvHost, "", 1, "")
520+
Expect(err).NotTo(HaveOccurred())
521+
case "error-1":
522+
w.WriteHeader(http.StatusOK)
523+
_, err := fmt.Fprintf(w, serverTpl, serverID, "ERROR", hvHost, "", 0,
524+
"manual intervention required")
525+
Expect(err).NotTo(HaveOccurred())
526+
default:
527+
Fail("unexpected server id: " + serverID)
528+
}
529+
})
530+
531+
By("Mocking POST /servers/{id}/action for live-migration")
532+
fakeServer.Mux.HandleFunc("POST /servers/{server_id}/action", func(w http.ResponseWriter, r *http.Request) {
533+
serverID := r.PathValue("server_id")
534+
liveMigrateCalls[serverID]++
535+
// Mark this VM as migrated so the next GET reports a different host.
536+
migratedVMs[serverID] = true
537+
w.WriteHeader(http.StatusAccepted)
538+
})
539+
})
540+
541+
It("skips errored VMs, evicts healthy ones, and retries errored VMs in subsequent loops", func(ctx SpecContext) {
542+
resource := &kvmv1.Eviction{}
543+
544+
// Reconcile loop until the list is empty or we've gone too long.
545+
// We expect: good-2 migrated, error-1 skipped, good-1 migrated, then
546+
// only error-1 remains and keeps erroring.
547+
By("Running reconciliations until only the errored VM remains")
548+
const maxLoops = 20
549+
for i := range maxLoops {
550+
// Tolerate errors here: when the controller hits the ERROR
551+
// VM it returns an error (joined with the status update).
552+
// That is part of the pattern under test, not a test failure.
553+
_, reconcileErr := evictionReconciler.Reconcile(ctx, reconcileRequest)
554+
_ = reconcileErr // expected on ERROR-VM iterations
555+
Expect(k8sClient.Get(ctx, typeNamespacedName, resource)).To(Succeed())
556+
557+
// Once both healthy VMs have been migrated and removed, we are
558+
// in the steady "only errored VM left" state.
559+
remaining := resource.Status.OutstandingInstances
560+
if len(remaining) == 1 && remaining[0] == "error-1" {
561+
By(fmt.Sprintf("Reached steady state after %d reconciliations", i+1))
562+
break
563+
}
564+
}
565+
566+
By("Both healthy VMs were live-migrated exactly once")
567+
Expect(liveMigrateCalls["good-1"]).To(Equal(1),
568+
"good-1 should have been migrated once")
569+
Expect(liveMigrateCalls["good-2"]).To(Equal(1),
570+
"good-2 should have been migrated once")
571+
572+
By("The errored VM is still outstanding and never received a migrate call")
573+
Expect(resource.Status.OutstandingInstances).To(Equal([]string{"error-1"}))
574+
Expect(liveMigrateCalls).NotTo(HaveKey("error-1"))
575+
576+
By("The migration condition reflects the most recent failure")
577+
Expect(resource.Status.Conditions).To(ContainElement(SatisfyAll(
578+
HaveField("Type", kvmv1.ConditionTypeMigration),
579+
HaveField("Status", metav1.ConditionFalse),
580+
HaveField("Reason", kvmv1.ConditionReasonFailed),
581+
HaveField("Message", ContainSubstring("error-1")),
582+
)))
583+
584+
By("The eviction is NOT marked successful while the errored VM remains")
585+
Expect(resource.Status.Conditions).NotTo(ContainElement(SatisfyAll(
586+
HaveField("Type", kvmv1.ConditionTypeEvicting),
587+
HaveField("Status", metav1.ConditionFalse),
588+
HaveField("Reason", kvmv1.ConditionReasonSucceeded),
589+
)))
590+
591+
By("Subsequent reconciliations keep retrying the errored VM (and surfacing the error)")
592+
_, err := evictionReconciler.Reconcile(ctx, reconcileRequest)
593+
// The controller returns an error when it encounters a VM in ERROR
594+
// state. The reconcile error should mention the errored UUID.
595+
if err != nil {
596+
Expect(err.Error()).To(ContainSubstring("error-1"))
597+
}
598+
Expect(k8sClient.Get(ctx, typeNamespacedName, resource)).To(Succeed())
599+
Expect(resource.Status.OutstandingInstances).To(Equal([]string{"error-1"}))
600+
})
601+
})
442602
})
443603
})

0 commit comments

Comments
 (0)