Skip to content

Commit daae898

Browse files
authored
fix(failover): keep allocation if VM missing from postgres but on hyper (#909)
When the VM source (postgres) is missing data but VMs are still active on hypervisors (e.g. after a postgres data loss), the failover controller previously treated every allocated VM as 'no longer exists' and wiped the allocations. Empty reservations were then deleted, causing total loss of failover coverage despite the VMs still running. Cross-check hv1.HypervisorList.Status.Instances before removing a VM from allocations in reconcileRemoveInvalidVMFromReservations. If the VM is still reported active on any known hypervisor, keep the allocation and log a warning. The hypervisor-mismatch branch is intentionally left unchanged: it is already governed by the TrustHypervisorLocation flag. Adds unit tests covering: - VM missing from VM source but still on hypervisor (allocation kept) - VM source completely empty while VMs still on hypervisors (all reservations preserved -- the exact incident scenario) - Mixed: postgres-missing-but-on-HV vs. truly gone (only truly gone removed) - Hypervisor mismatch is not suppressed by the safeguard
1 parent b7fbbe1 commit daae898

2 files changed

Lines changed: 148 additions & 5 deletions

File tree

internal/scheduling/reservations/failover/controller.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,28 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
268268
allHypervisors = append(allHypervisors, hv.Name)
269269
}
270270

271+
// Build a set of VM UUIDs currently active on any known hypervisor.
272+
// This is used as a safeguard against removing failover allocations when the
273+
// VM source (postgres) is missing data but the VM is still alive on a
274+
// hypervisor (e.g. after a postgres data loss / restore). Without this
275+
// cross-check a wiped postgres would cause every failover reservation to be
276+
// emptied and then deleted.
277+
vmsOnHypervisor := make(map[string]string)
278+
for _, hv := range hypervisorList.Items {
279+
for _, inst := range hv.Status.Instances {
280+
if !inst.Active {
281+
continue
282+
}
283+
if _, exists := vmsOnHypervisor[inst.ID]; exists {
284+
// VM appears on multiple hypervisors (transient during live
285+
// migration). Keep the first occurrence; the safeguard only
286+
// needs to know it exists somewhere.
287+
continue
288+
}
289+
vmsOnHypervisor[inst.ID] = hv.Name
290+
}
291+
}
292+
271293
// 2. Get all VMs that might need failover reservations
272294
vms, err := c.VMSource.ListVMsOnHypervisors(ctx, &hypervisorList, c.Config.TrustHypervisorLocation)
273295
if err != nil {
@@ -289,7 +311,7 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
289311
logger.V(1).Info("found failover reservations", "count", len(failoverReservations))
290312

291313
// 3. Remove VMs from reservations if they are no longer valid
292-
failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, failoverReservations)
314+
failoverReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(ctx, vms, vmsOnHypervisor, failoverReservations)
293315

294316
for _, res := range reservationsToUpdate {
295317
if err := c.patchReservationStatus(ctx, res); err != nil {
@@ -381,9 +403,17 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
381403
// - The VM has moved to a different host
382404
// Returns the updated list of reservations (with modifications applied in-memory).
383405
// The caller is responsible for persisting any changes to the cluster.
406+
//
407+
// vmsOnHypervisor maps VM UUID -> hypervisor name for every VM currently active
408+
// on any known hypervisor (sourced from hv1.HypervisorList.Status.Instances).
409+
// It is used as a safeguard: if a VM is missing from the postgres-derived vms
410+
// slice but is still present on a hypervisor, we keep the allocation. This
411+
// protects failover reservations from being wiped by transient/total postgres
412+
// data loss.
384413
func reconcileRemoveInvalidVMFromReservations(
385414
ctx context.Context,
386415
vms []reservations.VM,
416+
vmsOnHypervisor map[string]string,
387417
failoverReservations []v1alpha1.Reservation,
388418
) (updatedReservations []v1alpha1.Reservation, reservationsToUpdate []*v1alpha1.Reservation) {
389419

@@ -404,6 +434,18 @@ func reconcileRemoveInvalidVMFromReservations(
404434
for vmUUID, allocatedHypervisor := range allocations {
405435
vmCurrentHypervisor, vmExists := vmToHypervisor[vmUUID]
406436
if !vmExists {
437+
// Safeguard: if the VM is missing from the VM source (e.g.
438+
// postgres) but is still reported active on a hypervisor by
439+
// the hypervisor operator CRD, keep the allocation. This
440+
// prevents a postgres data loss from cascading into a mass
441+
// deletion of failover reservations.
442+
if hv, stillOnHV := vmsOnHypervisor[vmUUID]; stillOnHV {
443+
logger.Info("keeping VM allocation despite missing from VM source: still active on hypervisor",
444+
"vmUUID", vmUUID, "reservation", res.Name,
445+
"allocatedHypervisor", allocatedHypervisor, "hypervisor", hv)
446+
updatedAllocations[vmUUID] = allocatedHypervisor
447+
continue
448+
}
407449
logger.Info("removing VM from reservation allocations because VM no longer exists",
408450
"vmUUID", vmUUID, "reservation", res.Name)
409451
needsUpdate = true

internal/scheduling/reservations/failover/controller_test.go

Lines changed: 105 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
417417
tests := []struct {
418418
name string
419419
vms []reservations.VM
420+
vmsOnHypervisor map[string]string
420421
reservations []v1alpha1.Reservation
421422
expectedUpdatedCount int // number of reservations in updatedReservations
422423
expectedToUpdateCount int // number of reservations that need cluster update
@@ -444,7 +445,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
444445
name: "VM no longer exists - remove from allocations",
445446
vms: []reservations.VM{
446447
newTestVM("vm-1", "host1"),
447-
// vm-2 no longer exists
448+
// vm-2 no longer exists (and not on any hypervisor)
448449
},
449450
reservations: []v1alpha1.Reservation{
450451
newTestReservation("res-1", "host3", map[string]string{
@@ -481,7 +482,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
481482
vms: []reservations.VM{
482483
newTestVM("vm-1", "host1"),
483484
newTestVM("vm-2", "host2"),
484-
// vm-3 no longer exists
485+
// vm-3 no longer exists (and not on any hypervisor)
485486
},
486487
reservations: []v1alpha1.Reservation{
487488
newTestReservation("res-1", "host3", map[string]string{
@@ -502,7 +503,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
502503
{
503504
name: "all VMs removed from reservation - empty allocations",
504505
vms: []reservations.VM{
505-
// no VMs exist
506+
// no VMs exist (and not on any hypervisor)
506507
},
507508
reservations: []v1alpha1.Reservation{
508509
newTestReservation("res-1", "host3", map[string]string{
@@ -545,7 +546,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
545546
vms: []reservations.VM{
546547
newTestVM("vm-1", "host1"), // valid
547548
newTestVM("vm-2", "host5"), // moved from host2 to host5
548-
// vm-3 deleted
549+
// vm-3 deleted (and not on any hypervisor)
549550
newTestVM("vm-4", "host4"), // valid
550551
},
551552
reservations: []v1alpha1.Reservation{
@@ -565,6 +566,105 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
565566
"res-2": {"vm-4": "host4"},
566567
},
567568
},
569+
// ====================================================================
570+
// Safeguard: VM missing from VM source but still on a hypervisor
571+
// (e.g. postgres data loss). Allocations must be preserved.
572+
// ====================================================================
573+
{
574+
name: "safeguard: VM missing from VM source but still on hypervisor - keep allocation",
575+
vms: []reservations.VM{
576+
newTestVM("vm-1", "host1"),
577+
// vm-2 missing from VM source (postgres lost data)
578+
},
579+
vmsOnHypervisor: map[string]string{
580+
"vm-1": "host1",
581+
"vm-2": "host2", // still alive on hypervisor
582+
},
583+
reservations: []v1alpha1.Reservation{
584+
newTestReservation("res-1", "host3", map[string]string{
585+
"vm-1": "host1",
586+
"vm-2": "host2",
587+
}),
588+
},
589+
expectedUpdatedCount: 1,
590+
expectedToUpdateCount: 0, // safeguard prevents the update
591+
expectedAllocationsPerRes: map[string]map[string]string{
592+
"res-1": {"vm-1": "host1", "vm-2": "host2"},
593+
},
594+
},
595+
{
596+
name: "safeguard: VM source completely empty but VMs still on hypervisors - keep all allocations",
597+
vms: []reservations.VM{
598+
// VM source returns nothing (postgres wiped)
599+
},
600+
vmsOnHypervisor: map[string]string{
601+
"vm-1": "host1",
602+
"vm-2": "host2",
603+
"vm-3": "host3",
604+
},
605+
reservations: []v1alpha1.Reservation{
606+
newTestReservation("res-1", "host4", map[string]string{
607+
"vm-1": "host1",
608+
"vm-2": "host2",
609+
}),
610+
newTestReservation("res-2", "host5", map[string]string{
611+
"vm-3": "host3",
612+
}),
613+
},
614+
expectedUpdatedCount: 2,
615+
expectedToUpdateCount: 0, // safeguard prevents both updates
616+
expectedAllocationsPerRes: map[string]map[string]string{
617+
"res-1": {"vm-1": "host1", "vm-2": "host2"},
618+
"res-2": {"vm-3": "host3"},
619+
},
620+
},
621+
{
622+
name: "safeguard: only some missing VMs are still on hypervisor - remove only fully gone ones",
623+
vms: []reservations.VM{
624+
newTestVM("vm-1", "host1"),
625+
// vm-2 missing from postgres but on hypervisor
626+
// vm-3 missing from both
627+
},
628+
vmsOnHypervisor: map[string]string{
629+
"vm-1": "host1",
630+
"vm-2": "host2",
631+
// vm-3 not on any hypervisor
632+
},
633+
reservations: []v1alpha1.Reservation{
634+
newTestReservation("res-1", "host4", map[string]string{
635+
"vm-1": "host1",
636+
"vm-2": "host2", // safeguarded
637+
"vm-3": "host3", // truly gone - remove
638+
}),
639+
},
640+
expectedUpdatedCount: 1,
641+
expectedToUpdateCount: 1,
642+
expectedAllocationsPerRes: map[string]map[string]string{
643+
"res-1": {"vm-1": "host1", "vm-2": "host2"},
644+
},
645+
},
646+
{
647+
name: "safeguard does not protect against hypervisor mismatch (VM is in postgres on different host)",
648+
vms: []reservations.VM{
649+
newTestVM("vm-1", "host1"),
650+
newTestVM("vm-2", "host5"), // moved to host5 in postgres
651+
},
652+
vmsOnHypervisor: map[string]string{
653+
"vm-1": "host1",
654+
"vm-2": "host5",
655+
},
656+
reservations: []v1alpha1.Reservation{
657+
newTestReservation("res-1", "host3", map[string]string{
658+
"vm-1": "host1",
659+
"vm-2": "host2", // host mismatch - should still be removed
660+
}),
661+
},
662+
expectedUpdatedCount: 1,
663+
expectedToUpdateCount: 1,
664+
expectedAllocationsPerRes: map[string]map[string]string{
665+
"res-1": {"vm-1": "host1"},
666+
},
667+
},
568668
}
569669

570670
for _, tt := range tests {
@@ -573,6 +673,7 @@ func TestReconcileRemoveInvalidVMFromReservations(t *testing.T) {
573673
updatedReservations, reservationsToUpdate := reconcileRemoveInvalidVMFromReservations(
574674
ctx,
575675
tt.vms,
676+
tt.vmsOnHypervisor,
576677
tt.reservations,
577678
)
578679

0 commit comments

Comments
 (0)