diff --git a/internal/controller/ensure_tiebreaker_backfill_bug393_test.go b/internal/controller/ensure_tiebreaker_backfill_bug393_test.go new file mode 100644 index 00000000..f144e92a --- /dev/null +++ b/internal/controller/ensure_tiebreaker_backfill_bug393_test.go @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller_test + +import ( + "context" + "slices" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + blockstoriov1alpha1 "github.com/cozystack/blockstor/api/v1alpha1" + controllerpkg "github.com/cozystack/blockstor/internal/controller" + apiv1 "github.com/cozystack/blockstor/pkg/api/v1" + "github.com/cozystack/blockstor/pkg/store" +) + +// Bug 393 (release-gate, durability/availability) — the auto-tiebreaker +// witness reaper `removeWitnesses` raced the placer's redundancy +// backfill. The reaper's old Get-then-Delete was non-atomic: it re-read +// the witness flags, but the Delete carried no version precondition, so +// a promotion landing in the gap between the read and the Delete was +// silently deleted BY NAME. Net effect: after an inactive-replica +// return / node event, `r c --auto-place` promoted the witness to a +// diskful backfill, the reaper then clobbered that very replica, and +// redundancy was never restored (fewer diskful than place_count) — the +// inactive-return-backfills-redundancy replay timed out at +// active_diskful_count >= 2. +// +// The fix routes the reap through store.ResourceStore.DeleteIfTieBreaker, +// which carries the flag re-check and the delete across as ONE +// optimistic-concurrency operation. This test pins it by injecting the +// concurrent promotion at the exact race point — between the reaper's +// observation of the witness and the delete commit — and asserting the +// promoted replica SURVIVES and redundancy holds. + +// promoteRacingStore decorates a Store and, on the FIRST +// DeleteIfTieBreaker call, promotes the targeted witness to diskful +// BEFORE delegating to the inner store — modelling the placer's backfill +// landing in the reaper's race window. The inner store's conditional +// delete (re-check + version/lock guard) must then refuse the reap. +type promoteRacingStore struct { + store.Store + + promoted *bool +} + +func (s *promoteRacingStore) Resources() store.ResourceStore { + return &promoteRacingResources{ResourceStore: s.Store.Resources(), promoted: s.promoted} +} + +type promoteRacingResources struct { + store.ResourceStore + + promoted *bool +} + +func (r *promoteRacingResources) DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) { + if !*r.promoted { + *r.promoted = true + + // The placer's redundancy backfill (corner-D2b promoteWitness): + // strip DISKLESS + TIE_BREAKER and stamp a storage pool, in place + // on the same (rd, node) key — exactly what pkg/placer does, and + // what bumps the backing object's version on the k8s store. + err := r.PatchResourceSpec(ctx, rdName, node, func(live *apiv1.Resource) error { + keep, _ := apiv1.PromoteWitnessFlags(live.Flags, true) + live.Flags = keep + + if live.Props == nil { + live.Props = map[string]string{} + } + + live.Props["StorPoolName"] = "pool" + + return nil + }) + if err != nil { + return false, err + } + } + + return r.ResourceStore.DeleteIfTieBreaker(ctx, rdName, node) +} + +// TestBug393BackfillSurvivesWitnessReap is the core regression: with the +// witness already promoted to a diskful backfill the instant the reaper +// fires its delete, the promoted replica must survive and the RD must +// still carry two diskful replicas (redundancy restored), with no +// surviving TIE_BREAKER. +func TestBug393BackfillSurvivesWitnessReap(t *testing.T) { + t.Parallel() + + scheme := newScheme(t) + inner := store.NewInMemory() + ctx := context.Background() + + for _, n := range []string{"n1", "n2", "n3"} { + if err := inner.Nodes().Create(ctx, &apiv1.Node{ + Name: n, Type: apiv1.NodeTypeSatellite, + }); err != nil { + t.Fatalf("seed node %s: %v", n, err) + } + } + + // Two diskful replicas → the witness invariant is NOT wanted, so the + // reconciler's witness pass will try to reap the witness on n3. + for _, n := range []string{"n1", "n2"} { + if err := inner.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393", NodeName: n, + Props: map[string]string{"StorPoolName": "pool"}, + }); err != nil { + t.Fatalf("seed diskful %s: %v", n, err) + } + } + + // The witness on n3 — the placer is concurrently promoting it to a + // diskful backfill replica (the inactive-return redundancy refill). + if err := inner.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393", NodeName: "n3", + Flags: []string{apiv1.ResourceFlagDiskless, apiv1.ResourceFlagTieBreaker}, + }); err != nil { + t.Fatalf("seed witness: %v", err) + } + + promoted := false + st := &promoteRacingStore{Store: inner, promoted: &promoted} + + rd := &blockstoriov1alpha1.ResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: "pvc-bug393"}, + } + + cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rd).Build() + + rec := &controllerpkg.ResourceDefinitionReconciler{ + Client: cli, + Scheme: scheme, + Store: st, + } + + // The witness list as snapshotted at the top of ensureTiebreaker — + // n3 still looks like a witness here. The reaper must NOT clobber the + // row once it has been promoted under it. + witnesses := []apiv1.Resource{ + { + Name: "pvc-bug393", NodeName: "n3", + Flags: []string{apiv1.ResourceFlagDiskless, apiv1.ResourceFlagTieBreaker}, + }, + } + + if err := rec.RemoveWitnesses(ctx, "pvc-bug393", witnesses); err != nil { + t.Fatalf("RemoveWitnesses: %v", err) + } + + if !promoted { + t.Fatalf("test bug: the racing promotion never fired") + } + + // The promoted backfill replica on n3 MUST survive. + got, err := inner.Resources().Get(ctx, "pvc-bug393", "n3") + if err != nil { + t.Fatalf("Bug 393 regression: promoted backfill on n3 was clobbered by the reaper: %v", err) + } + + if slices.Contains(got.Flags, apiv1.ResourceFlagTieBreaker) { + t.Errorf("n3 still carries TIE_BREAKER after promotion: %v", got.Flags) + } + + if slices.Contains(got.Flags, apiv1.ResourceFlagDiskless) { + t.Errorf("n3 still DISKLESS after diskful promotion: %v", got.Flags) + } + + // Redundancy holds: all three nodes carry a diskful replica, none a + // witness. place_count-2's worth of redundancy is restored, not lost. + all, err := inner.Resources().ListByDefinition(ctx, "pvc-bug393") + if err != nil { + t.Fatalf("list: %v", err) + } + + diskful := 0 + + for i := range all { + if slices.Contains(all[i].Flags, apiv1.ResourceFlagTieBreaker) { + t.Errorf("surviving witness on %s: %v", all[i].NodeName, all[i].Flags) + } + + if !slices.Contains(all[i].Flags, apiv1.ResourceFlagDiskless) { + diskful++ + } + } + + if diskful < 2 { + t.Errorf("redundancy not restored: got %d diskful replicas, want >= 2", diskful) + } +} + +// TestBug393WitnessStillReapedWhenGenuineOrphan guards against +// over-correction: a witness that is NOT racing any promotion (a genuine +// orphan, the legitimate reap path) must STILL be reaped. The fix must +// only skip the delete when identity actually changed under it. +func TestBug393WitnessStillReapedWhenGenuineOrphan(t *testing.T) { + t.Parallel() + + st := store.NewInMemory() + ctx := context.Background() + + for _, n := range []string{"n1", "n2", "n3"} { + if err := st.Nodes().Create(ctx, &apiv1.Node{ + Name: n, Type: apiv1.NodeTypeSatellite, + }); err != nil { + t.Fatalf("seed node %s: %v", n, err) + } + } + + for _, n := range []string{"n1", "n2"} { + if err := st.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393-orphan", NodeName: n, + Props: map[string]string{"StorPoolName": "pool"}, + }); err != nil { + t.Fatalf("seed diskful %s: %v", n, err) + } + } + + if err := st.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393-orphan", NodeName: "n3", + Flags: []string{apiv1.ResourceFlagDiskless, apiv1.ResourceFlagTieBreaker}, + }); err != nil { + t.Fatalf("seed orphan witness: %v", err) + } + + rec := &controllerpkg.ResourceDefinitionReconciler{Store: st} + + witnesses := []apiv1.Resource{ + { + Name: "pvc-bug393-orphan", NodeName: "n3", + Flags: []string{apiv1.ResourceFlagDiskless, apiv1.ResourceFlagTieBreaker}, + }, + } + + if err := rec.RemoveWitnesses(ctx, "pvc-bug393-orphan", witnesses); err != nil { + t.Fatalf("RemoveWitnesses: %v", err) + } + + // The genuine orphan witness must be gone. + if _, err := st.Resources().Get(ctx, "pvc-bug393-orphan", "n3"); err == nil { + t.Errorf("genuine orphan witness on n3 survived reap — over-correction") + } + + // The diskful pair is untouched. + for _, n := range []string{"n1", "n2"} { + if _, err := st.Resources().Get(ctx, "pvc-bug393-orphan", n); err != nil { + t.Errorf("diskful %s vanished: %v", n, err) + } + } +} + +// TestBug393WitnessKeptWhileInactivePresent pins the decision-level fix +// and is the direct unit-level reproduction of the +// inactive-return-backfills-redundancy stand hang. +// +// Topology after `r deactivate node2`: 1 active diskful (node1) + 1 +// INACTIVE diskful (node2) + 1 auto-tiebreaker witness (node3, left over +// from the pre-deactivate 2-active-diskful state). The placer's +// `r c --auto-place=2` wants to promote the node3 witness to a diskful +// backfill to restore active redundancy to 2. +// +// Pre-fix: ensureTiebreaker filters the INACTIVE node2 out, sees `1 +// diskful + 1 witness`, treats the witness as a genuine orphan, and +// reaps node3 — clobbering the backfill target. The reconciler then +// oscillates 1↔2 active diskful forever and active_diskful_count never +// stabilises at 2 (the 240s assertion timeout observed on stand). +// +// Post-fix: the INACTIVE replica marks the cluster as degraded / +// backfill-in-progress, so the witness on node3 is HELD — it is the +// backfill target, not an orphan. The placer can promote it (or it stays +// as the third voter once node2 returns). +func TestBug393WitnessKeptWhileInactivePresent(t *testing.T) { + t.Parallel() + + scheme := newScheme(t) + st := store.NewInMemory() + ctx := context.Background() + + for _, n := range []string{"n1", "n2", "n3"} { + if err := st.Nodes().Create(ctx, &apiv1.Node{ + Name: n, Type: apiv1.NodeTypeSatellite, + }); err != nil { + t.Fatalf("seed node %s: %v", n, err) + } + } + + // n1: active diskful (the only voting peer). + if err := st.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393-keep", NodeName: "n1", + Props: map[string]string{"StorPoolName": "pool"}, + }); err != nil { + t.Fatalf("seed active diskful: %v", err) + } + + // n2: INACTIVE diskful (operator-deactivated; redundancy degraded). + if err := st.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393-keep", NodeName: "n2", + Flags: []string{apiv1.ResourceFlagInactive}, + Props: map[string]string{"StorPoolName": "pool"}, + }); err != nil { + t.Fatalf("seed inactive diskful: %v", err) + } + + // n3: leftover auto-tiebreaker witness — the backfill target. + if err := st.Resources().Create(ctx, &apiv1.Resource{ + Name: "pvc-bug393-keep", NodeName: "n3", + Flags: []string{apiv1.ResourceFlagDiskless, apiv1.ResourceFlagTieBreaker}, + }); err != nil { + t.Fatalf("seed witness: %v", err) + } + + rd := &blockstoriov1alpha1.ResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: "pvc-bug393-keep"}, + } + + cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rd).Build() + + rec := &controllerpkg.ResourceDefinitionReconciler{ + Client: cli, + Scheme: scheme, + Store: st, + } + + // Run the full reconcile decision several times — the pre-fix + // behaviour reaped the witness on the very first pass; a stable fix + // must keep it across repeated reconciles (the reconciler fires many + // times per second under the Resource watch fan-out). + for pass := range 3 { + if err := rec.EnsureTiebreaker(ctx, rd); err != nil { + t.Fatalf("EnsureTiebreaker pass %d: %v", pass, err) + } + + got, err := st.Resources().Get(ctx, "pvc-bug393-keep", "n3") + if err != nil { + t.Fatalf("pass %d: witness on n3 was reaped while an INACTIVE replica is present "+ + "(backfill target clobbered — the oscillation root cause): %v", pass, err) + } + + if !slices.Contains(got.Flags, apiv1.ResourceFlagTieBreaker) { + t.Fatalf("pass %d: n3 lost TIE_BREAKER unexpectedly: %v", pass, got.Flags) + } + } + + // The INACTIVE replica must not have been touched, and no spurious + // extra witness/replica should have been created. + all, err := st.Resources().ListByDefinition(ctx, "pvc-bug393-keep") + if err != nil { + t.Fatalf("list: %v", err) + } + + if len(all) != 3 { + t.Errorf("replica set churned: got %d rows, want 3 (n1 active, n2 inactive, n3 witness); %+v", + len(all), all) + } +} diff --git a/internal/controller/resourcedefinition_controller.go b/internal/controller/resourcedefinition_controller.go index 24ee89d8..ab95d9e7 100644 --- a/internal/controller/resourcedefinition_controller.go +++ b/internal/controller/resourcedefinition_controller.go @@ -252,12 +252,29 @@ func (r *ResourceDefinitionReconciler) ensureTiebreaker(ctx context.Context, rd // INACTIVE replicas from the (already disabled-node-filtered) voting // set before the split so neither disabled-node nor deactivated // replicas influence the diskful or diskless/witness count. + // Bug 393: an INACTIVE diskful replica means active redundancy is + // degraded and a backfill is expected (the placer's `r c --auto-place` + // gap-fills a replacement active diskful on a healthy spare). While + // that is true, an EXISTING witness on a spare node is the backfill + // target the placer is about to promote in place — reaping it + // collapses the very node redundancy is being restored onto, and the + // reconciler oscillates 1↔2 active diskful forever (the + // inactive-return-backfills-redundancy hang). Detect the INACTIVE + // replica BEFORE filterActiveReplicas drops it from the voting set, so + // shouldTieBreakerExist can hold the witness through the backfill + // window. This never grows a NEW witness (the create branch is + // unaffected) — it only preserves one that already exists, which is + // also the forward-correct shape: when the INACTIVE replica returns + // (`r activate`) the cluster is immediately at the 2-diskful + + // 1-witness three-voter quorum with no recreate churn. + hasInactive := containsInactiveReplica(live) + live = filterActiveReplicas(live) diskful, diskless := splitByDiskless(live) witness := filterTieBreaker(diskless) - wantWitness := shouldTieBreakerExist(rd, diskful, diskless, witness) + wantWitness := shouldTieBreakerExist(rd, diskful, diskless, witness, hasInactive) // Bug 338: the orphan-witness shape (1 diskful + 1 TIE_BREAKER + // 0 user-diskless) collapses on this reconcile — no grace timer. @@ -376,6 +393,7 @@ func shouldKeepExistingWitness(diskful, _, witnessUnnecessaryDiskfulCount int) b func shouldTieBreakerExist( rd *blockstoriov1alpha1.ResourceDefinition, diskful, diskless, witness []apiv1.Resource, + hasInactive bool, ) bool { if !isAutoTieBreakerEnabled(rd) { return false @@ -388,12 +406,39 @@ func shouldTieBreakerExist( const witnessUnnecessaryDiskfulCount = 3 + // Bug 393: hold an EXISTING witness through the backfill window when + // an INACTIVE diskful replica is present (active redundancy degraded, + // a replacement active diskful is being backfilled onto the witness's + // node). Without this the orphan-collapse branch (1 active diskful + + // 1 witness) reaps the very spare the placer is promoting, and the + // reconciler flaps 1↔2 active diskful forever. Gated on an existing + // witness, so it never grows a new one. + if hasInactive && len(witness) > 0 && len(diskful) >= 1 { + return true + } + keepExistingWitness := keepExistingWitnessFor(rd, diskful, witness, nonWitnessDiskless, witnessUnnecessaryDiskfulCount) return wantNewWitness || keepExistingWitness } +// containsInactiveReplica reports whether any replica in the slice +// carries the INACTIVE flag (`drbdadm down`). Used by the tiebreaker +// decision to recognise the degraded-redundancy / backfill-in-progress +// state — see the Bug 393 keep branch in shouldTieBreakerExist. Probed +// on the pre-filter slice, before filterActiveReplicas drops INACTIVE +// replicas from the voting set. +func containsInactiveReplica(replicas []apiv1.Resource) bool { + for i := range replicas { + if slices.Contains(replicas[i].Flags, apiv1.ResourceFlagInactive) { + return true + } + } + + return false +} + // keepExistingWitnessFor folds the two "preserve an existing // TIE_BREAKER witness" branches into a single helper so // shouldTieBreakerExist stays under the gocyclo threshold. Both @@ -1092,34 +1137,36 @@ func quorumPropsAlreadySet(rd *blockstoriov1alpha1.ResourceDefinition, value, qu } // removeWitnesses deletes every TIE_BREAKER replica of the named RD. -// Best-effort: ErrNotFound is swallowed so concurrent reconciles -// converge. +// Best-effort: a witness that was already reaped, or has been promoted +// to a diskful backfill/relocate target, is silently skipped so +// concurrent reconciles converge. +// +// Why the delete must be a version-guarded conditional, not a plain +// Get-then-Delete: the `witnesses` slice is a snapshot taken at the top +// of ensureTiebreaker, and two distinct concurrent transitions promote a +// witness row IN PLACE on the same (rd, node) key: +// +// - Phase-3 relocate-onto-the-tiebreaker (`r-full-lifecycle.sh`: +// `r d ` leaves 1 diskful + 1 orphan witness, then +// `r c ` promotes that SAME witness via +// promoteDisklessReplica — TIE_BREAKER+DISKLESS stripped, +// StorPoolName stamped). +// - Bug 393 redundancy backfill: after an inactive-replica return / +// node event, `r c --auto-place` promotes the witness to a diskful +// backfill replica via pkg/placer.promoteWitness (corner-D2b). // -// Why the live re-check before each Delete: the `witnesses` slice is a -// snapshot taken at the top of ensureTiebreaker. During the Phase-3 -// relocate-onto-the-tiebreaker transition (`r-full-lifecycle.sh`: -// `r d ` leaves 1 diskful + 1 orphan witness, then -// `r c ` promotes that SAME witness row in-place to -// the diskful relocate target via promoteDisklessReplica — TIE_BREAKER -// + DISKLESS stripped, StorPoolName stamped on the same (rd, node) -// key). If the Bug-338 orphan-collapse fires concurrently it would -// Delete the just-promoted relocate target by node key, the topology -// resets, the next `r c`/reconcile re-creates, and the diskful count -// flip-flops 1↔2 forever (the ensureTiebreaker oscillation). Re-read -// each row and skip any that no longer carries TIE_BREAKER: a witness -// that became diskful is the relocate target, not an orphan, and must -// never be reaped here. +// A non-atomic Get-then-Delete narrows but does NOT close the window: the +// promotion can land in the gap between our re-Get and the Delete, and an +// unconditional Delete-by-name then clobbers the freshly-promoted diskful +// replica — redundancy is silently never restored (fewer diskful replicas +// than place_count). DeleteIfTieBreaker carries the re-check and the +// delete across as one optimistic-concurrency operation (ResourceVersion +// + UID precondition on the k8s store; lock-held on the in-memory store), +// so a racing promotion aborts the delete instead of clobbering it. func (r *ResourceDefinitionReconciler) removeWitnesses(ctx context.Context, rdName string, witnesses []apiv1.Resource) error { for i := range witnesses { - live, getErr := r.Store.Resources().Get(ctx, rdName, witnesses[i].NodeName) - if getErr == nil && !slices.Contains(live.Flags, apiv1.ResourceFlagTieBreaker) { - // Promoted to the diskful relocate target between the - // snapshot and now — leave it; it is no longer a witness. - continue - } - - err := r.Store.Resources().Delete(ctx, rdName, witnesses[i].NodeName) - if err != nil && !stderrors.Is(err, store.ErrNotFound) { + _, err := r.Store.Resources().DeleteIfTieBreaker(ctx, rdName, witnesses[i].NodeName) + if err != nil { return err } } diff --git a/pkg/rest/bug_359_mid_delete_promote_test.go b/pkg/rest/bug_359_mid_delete_promote_test.go index 3078f93a..4b9ee465 100644 --- a/pkg/rest/bug_359_mid_delete_promote_test.go +++ b/pkg/rest/bug_359_mid_delete_promote_test.go @@ -118,6 +118,10 @@ func (v *midDeleteTBResources) Delete(ctx context.Context, rdName, node string) return v.inner.Delete(ctx, rdName, node) //nolint:wrapcheck // test helper } +func (v *midDeleteTBResources) DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) { + return v.inner.DeleteIfTieBreaker(ctx, rdName, node) //nolint:wrapcheck // test helper +} + func (v *midDeleteTBResources) SetState(ctx context.Context, rdName, node string, state apiv1.ResourceState, volumes []apiv1.VolumeObservation, ) error { diff --git a/pkg/rest/bug_359_witness_collapse_race_test.go b/pkg/rest/bug_359_witness_collapse_race_test.go index edb4ef46..d0f2a5eb 100644 --- a/pkg/rest/bug_359_witness_collapse_race_test.go +++ b/pkg/rest/bug_359_witness_collapse_race_test.go @@ -106,6 +106,10 @@ func (v *vanishingTBResources) Delete(ctx context.Context, rdName, node string) return v.inner.Delete(ctx, rdName, node) //nolint:wrapcheck // test helper } +func (v *vanishingTBResources) DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) { + return v.inner.DeleteIfTieBreaker(ctx, rdName, node) //nolint:wrapcheck // test helper +} + func (v *vanishingTBResources) SetState(ctx context.Context, rdName, node string, state apiv1.ResourceState, volumes []apiv1.VolumeObservation, ) error { @@ -422,6 +426,10 @@ func (v *alwaysVanishingTBResources) Delete(ctx context.Context, rdName, node st return v.inner.Delete(ctx, rdName, node) //nolint:wrapcheck // test helper } +func (v *alwaysVanishingTBResources) DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) { + return v.inner.DeleteIfTieBreaker(ctx, rdName, node) //nolint:wrapcheck // test helper +} + func (v *alwaysVanishingTBResources) SetState(ctx context.Context, rdName, node string, state apiv1.ResourceState, volumes []apiv1.VolumeObservation, ) error { diff --git a/pkg/rest/cache_invalidation_bug_124_test.go b/pkg/rest/cache_invalidation_bug_124_test.go index 799f437f..421ce7da 100644 --- a/pkg/rest/cache_invalidation_bug_124_test.go +++ b/pkg/rest/cache_invalidation_bug_124_test.go @@ -93,6 +93,10 @@ func (l *laggingResources) Update(ctx context.Context, r *apiv1.Resource) error return l.inner.Update(ctx, r) //nolint:wrapcheck // test helper } +func (l *laggingResources) DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) { + return l.inner.DeleteIfTieBreaker(ctx, rdName, node) //nolint:wrapcheck // test helper +} + func (l *laggingResources) Delete(ctx context.Context, rdName, node string) error { // Confirm the row exists right now so the caller still sees // ErrNotFound for a row that genuinely never existed. The diff --git a/pkg/store/inmemory_resource.go b/pkg/store/inmemory_resource.go index 3cdf21dc..0ec9618c 100644 --- a/pkg/store/inmemory_resource.go +++ b/pkg/store/inmemory_resource.go @@ -20,6 +20,7 @@ package store import ( "context" + "slices" "sort" "sync" @@ -180,6 +181,35 @@ func (s *inMemoryResources) Delete(_ context.Context, rdName, node string) error return nil } +// DeleteIfTieBreaker deletes the named replica only if it still carries +// TIE_BREAKER, atomically under the write lock. The InMemory store has no +// resourceVersion surface, so the lock-held read-check-delete covers what +// the k8s store's ResourceVersion/UID Delete precondition does: a +// concurrent promoteWitness (PatchResourceSpec, also lock-held) either +// commits fully before us — in which case we observe the stripped flags +// and skip — or fully after us. There is no interleaving in which we +// delete a row the promote has already turned diskful. Same shim contract +// as PatchResourceSpec (Bug 204b). Bug 393. +func (s *inMemoryResources) DeleteIfTieBreaker(_ context.Context, rdName, node string) (bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + + key := rKey{rdName, node} + + r, exists := s.m[key] + if !exists { + return false, nil + } + + if !slices.Contains(r.Flags, apiv1.ResourceFlagTieBreaker) { + return false, nil + } + + delete(s.m, key) + + return true, nil +} + // SetState mutates the runtime-observed state without touching Spec. // In the in-memory store there's no Status subresource, so we just // merge the runtime fields onto the stored value. diff --git a/pkg/store/k8s/resources.go b/pkg/store/k8s/resources.go index 451e1584..95ef8146 100644 --- a/pkg/store/k8s/resources.go +++ b/pkg/store/k8s/resources.go @@ -308,6 +308,72 @@ func (s *resources) Delete(ctx context.Context, rdName, node string) error { return nil } +// DeleteIfTieBreaker deletes the named replica only if it is still an +// auto-tiebreaker witness (carries TIE_BREAKER) at the freshly-observed +// object version. The delete is issued under a Preconditions guard on +// both the ResourceVersion AND the UID of the object we just read: +// +// - ResourceVersion precondition: a concurrent in-place promotion +// (pkg/placer.promoteWitness flips DISKLESS+TIE_BREAKER → diskful via +// a JSON-merge-patch, bumping ResourceVersion) lands in the gap +// between our Get and our Delete. The apiserver rejects the +// version-guarded delete with Conflict, so the just-promoted diskful +// replica is NOT clobbered — the exact Bug 393 race. We translate the +// Conflict into a benign "skipped" (false, nil): the next reconcile +// re-evaluates the now-diskful row and leaves it alone. +// - UID precondition: closes the delete-then-recreate ABA window — if +// the witness were deleted and a fresh Resource recreated on the same +// (rd, node) name between our Get and Delete, the UID would differ and +// the apiserver rejects the delete rather than removing the new row. +// +// A NotFound (already reaped by a racing reconcile) is also a benign +// (false, nil). Genuine transport/permission errors propagate. +func (s *resources) DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) { + name := resourceCRDName(rdName, node) + + var crd crdv1alpha1.Resource + + err := s.c.Get(ctx, types.NamespacedName{Name: name}, &crd) + if err != nil { + if apierrors.IsNotFound(err) { + return false, nil + } + + return false, errors.Wrapf(err, "get Resource %s/%s", rdName, node) + } + + // Re-check identity under the version we are about to pin: a row that + // no longer carries TIE_BREAKER has been promoted to the relocate / + // backfill target (or downgraded to a plain diskless peer) and must + // never be reaped by the witness pass. + if !slices.Contains(crd.Spec.Flags, apiv1.ResourceFlagTieBreaker) { + return false, nil + } + + resourceVersion := crd.ResourceVersion + uid := crd.UID + + del := &crdv1alpha1.Resource{ObjectMeta: metav1.ObjectMeta{Name: name}} + + err = s.c.Delete(ctx, del, ctrlclient.Preconditions{ + ResourceVersion: &resourceVersion, + UID: &uid, + }) + if err != nil { + // Conflict: the row was mutated (promoted to diskful) between + // our Get and the version-guarded Delete — abort the reap so the + // freshly-promoted replica survives. NotFound: already reaped by + // a racing reconcile. Both are convergence-benign no-ops. + if apierrors.IsConflict(err) || apierrors.IsNotFound(err) { + return false, nil + } + + return false, errors.Wrapf(err, "delete Resource %s/%s", rdName, node) + } + + return true, nil +} + // SatelliteFieldOwner is the SSA field-manager identity the // satellite uses for its observed-state writes (DrbdState, // per-volume DiskState/CurrentGI). The controller uses diff --git a/pkg/store/store.go b/pkg/store/store.go index 1f3ae01f..32bcfc4d 100644 --- a/pkg/store/store.go +++ b/pkg/store/store.go @@ -177,6 +177,29 @@ type ResourceStore interface { Update(ctx context.Context, r *apiv1.Resource) error Delete(ctx context.Context, rdName, node string) error + // DeleteIfTieBreaker atomically deletes the named replica ONLY if, + // at the observed object version, it still carries the TIE_BREAKER + // flag (i.e. it is still an auto-tiebreaker witness). The delete is + // guarded by an optimistic-concurrency precondition on the backing + // object's identity: a concurrent promotion of that same (rd, node) + // row from witness/diskless to a diskful backfill replica (the + // placer's redundancy backfill — pkg/placer.promoteWitness flips the + // flags via PatchResourceSpec, bumping the object version) makes the + // delete a no-op instead of clobbering the freshly-promoted replica. + // + // This closes the Bug 393 / inactive-return-backfills-redundancy + // race: the witness reaper's plain Get-then-Delete was non-atomic — + // it re-read the flags but the Delete had no version precondition, so + // a promotion landing in the gap between the read and the Delete was + // silently deleted by name and redundancy was never restored. + // + // Returns (true, nil) when the witness was deleted; (false, nil) when + // it was skipped because it is no longer a witness (promoted, or the + // flags otherwise changed under us) or already gone — both are benign + // convergence outcomes the reaper swallows. A genuine error (lost + // connectivity, etc.) is returned as (false, err). + DeleteIfTieBreaker(ctx context.Context, rdName, node string) (bool, error) + // SetState writes the runtime-observed state subresource (InUse, // DrbdState, per-volume observations) without touching Spec. // Required because the satellite's events2 observer can race a diff --git a/pkg/store/storetest/storetest.go b/pkg/store/storetest/storetest.go index 94625137..5ae2a1c0 100644 --- a/pkg/store/storetest/storetest.go +++ b/pkg/store/storetest/storetest.go @@ -665,6 +665,11 @@ func RunResourceStore(t *testing.T, newStore Factory) { t.Errorf("Get missing: got %v, want ErrNotFound", err) } }) + // DeleteIfTieBreaker pins the Bug 393 conditional-delete contract + // across both backends: a TIE_BREAKER row is reaped; a row that has + // been promoted to diskful (TIE_BREAKER stripped) is left intact and + // the call reports (false, nil); a missing row is a benign no-op. + t.Run("DeleteIfTieBreaker", func(t *testing.T) { testResourceDeleteIfTieBreaker(t, newStore) }) // SetState pins the path the satellite's events2 observer drives: // runtime State (InUse) + DRBD-state props land on the existing // Resource without disturbing Spec. Tested across both InMemory @@ -894,6 +899,82 @@ func testResourceListSorted(t *testing.T, newStore Factory) { } } +// testResourceDeleteIfTieBreaker pins the Bug 393 conditional-delete +// contract. It must behave identically on both the in-memory and the +// CRD-backed store: a witness row is reaped, a promoted (diskful) row is +// preserved, and a missing row is a benign no-op. This is the store-level +// half of the inactive-return-backfills-redundancy fix — the controller's +// removeWitnesses relies on it to never clobber a freshly-promoted +// backfill replica. +func testResourceDeleteIfTieBreaker(t *testing.T, newStore Factory) { + t.Helper() + + t.Run("ReapsWitness", func(t *testing.T) { + s := newStore(t).Resources() + ctx := t.Context() + + if err := s.Create(ctx, &apiv1.Resource{ + Name: "pvc-1", NodeName: "n1", + Flags: []string{apiv1.ResourceFlagDiskless, apiv1.ResourceFlagTieBreaker}, + }); err != nil { + t.Fatalf("Create witness: %v", err) + } + + deleted, err := s.DeleteIfTieBreaker(ctx, "pvc-1", "n1") + if err != nil { + t.Fatalf("DeleteIfTieBreaker: %v", err) + } + + if !deleted { + t.Errorf("witness must be reaped: got deleted=false") + } + + if _, err := s.Get(ctx, "pvc-1", "n1"); !errors.Is(err, store.ErrNotFound) { + t.Errorf("witness survived: got %v, want ErrNotFound", err) + } + }) + + t.Run("PreservesPromotedDiskful", func(t *testing.T) { + s := newStore(t).Resources() + ctx := t.Context() + + // A diskful replica that USED to be a witness but has since been + // promoted (TIE_BREAKER stripped). It must never be reaped. + if err := s.Create(ctx, &apiv1.Resource{ + Name: "pvc-1", NodeName: "n1", + Props: map[string]string{"StorPoolName": "pool"}, + }); err != nil { + t.Fatalf("Create diskful: %v", err) + } + + deleted, err := s.DeleteIfTieBreaker(ctx, "pvc-1", "n1") + if err != nil { + t.Fatalf("DeleteIfTieBreaker: %v", err) + } + + if deleted { + t.Errorf("promoted diskful must NOT be reaped: got deleted=true") + } + + if _, err := s.Get(ctx, "pvc-1", "n1"); err != nil { + t.Errorf("promoted diskful clobbered: %v", err) + } + }) + + t.Run("MissingIsNoOp", func(t *testing.T) { + s := newStore(t).Resources() + + deleted, err := s.DeleteIfTieBreaker(t.Context(), "ghost", "n1") + if err != nil { + t.Errorf("DeleteIfTieBreaker(missing): got %v, want nil", err) + } + + if deleted { + t.Errorf("DeleteIfTieBreaker(missing): got deleted=true, want false") + } + }) +} + // RunResourceDefinitionStore exercises every branch of // store.ResourceDefinitionStore. func RunResourceDefinitionStore(t *testing.T, newStore Factory) {