From c60c7cca227627890d4b0c062dc88d064a803ffc Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Tue, 16 Jun 2026 09:46:37 +0300 Subject: [PATCH 1/3] fix(satellite): fire late-add metadata/self-heal only for unattached desired volumes (BUG-048 deadlock) PR #164 widened the ensurePerVolumeMetadata trigger gate from the .res-count hasLateAddedVolume() to the unconditional dr.GetMetadataCreated(), so every post-activation diskful reconcile ran a per-volume drbdadm dump-md (an md_buffer consumer) plus the cluster- wide late-add self-heals. During a vd s resize DRBD holds md_buffer for the whole cluster-wide size change (change_cluster_wide_device_size / drbd_determine_dev_size); the per-reconcile md probe + self-heals then perpetually lose the cluster-wide state-change arbitration, the resize never completes, md_buffer is never released, and the resource deadlocks cluster-wide, reboot-proof. Narrow the gate to the race-free, deadlock-safe truth: fire the per- volume metadata pass ONLY when some desired volume is NOT present-and- attached in the kernel (absent, or present but Diskless / no-metadata). An attached volume already has a valid DRBD-9 superblock (the kernel cannot attach a lower disk without one), so dump-md on it is pointless and is the md_buffer consumer that contends with the resize. This still fires for a genuine late-add (the new volume is unattached) so BUG-048 #164 and the pre-rendered-.res regression are preserved, but skips a converged steady-state and a resize-in-flight (all volumes attached). Add Adm.AttachedVolumes (parses drbdsetup status --json into the set of attached, metadata-bearing local volume numbers) as the kernel-truth backing for the new hasUnattachedDesiredVolume predicate. Also gate the late-add promote / resync-kick self-heals OUT of a reconcile that just issued drbdadm resize (defense-in-depth: their recovery actions are themselves cluster-wide state-change / md_buffer operations). Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/drbdsetup_show.go | 69 ++++++++++++++ pkg/satellite/reconciler.go | 183 +++++++++++++++++++++++++++--------- 2 files changed, 206 insertions(+), 46 deletions(-) diff --git a/pkg/drbd/drbdsetup_show.go b/pkg/drbd/drbdsetup_show.go index a4b702f8..33a9d6e9 100644 --- a/pkg/drbd/drbdsetup_show.go +++ b/pkg/drbd/drbdsetup_show.go @@ -324,3 +324,72 @@ func parseShowJSON(out []byte) map[string]KernelSlot { return slots } + +// AttachedVolumes probes `drbdsetup status --json` and returns the +// set of LOCAL volume numbers that are present-and-attached in the kernel — +// i.e. the kernel has a lower disk attached and a valid DRBD-9 metadata +// block for them (disk-state UpToDate / Inconsistent / Consistent / +// Outdated). A volume that is absent from the device list entirely, or +// present but Diskless / DUnknown / mid-transition (Attaching / Detaching / +// Failed / Negotiating), is NOT in the returned set. +// +// DRBD invariant exploited by the BUG-048 resize-deadlock fix: the kernel +// CANNOT attach a lower disk without a valid metadata block, so an attached +// volume ALREADY has metadata — there is nothing for ensurePerVolumeMetadata +// to create-md and a `drbdadm dump-md` probe on it is pointless. That +// dump-md is an md_buffer consumer; firing it on every post-activation +// reconcile contends with an in-flight `vd s` resize (DRBD holds md_buffer +// in change_cluster_wide_device_size / drbd_determine_dev_size), which is +// the reboot-proof cluster-wide deadlock BUG-048's #164 gate widening +// introduced. The reconciler gates the per-volume metadata pass on "some +// DESIRED volume is NOT in this set" so the probe only fires for a genuine +// late-add (a volume that really still needs metadata) and never on a +// converged or resizing RD whose volumes are all attached. +// +// Conservative on any probe/parse failure (kernel slot absent, status +// non-zero, malformed JSON, missing node-id) → empty set. An empty set means +// "nothing is known-attached", so the caller treats every desired volume as +// possibly needing metadata and runs the (idempotent) per-volume pass — the +// fail-toward-correctness direction (the pre-BUG-048 behaviour). +func (a *Adm) AttachedVolumes(ctx context.Context, resource string) map[int32]struct{} { + out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json") + if err != nil { + return nil + } + + var status drbdsetupStatusRoot + + err = json.Unmarshal(out, &status) + if err != nil || len(status) == 0 { + return nil + } + + return attachedVolumeSet(status[0].Devices) +} + +// attachedVolumeSet returns the volume numbers from `devices` whose +// disk-state proves the lower disk is attached with a valid metadata block. +// Split out of AttachedVolumes so unit tests can drive it from fixtures +// without the FakeExec round-trip. +func attachedVolumeSet(devices []drbdsetupStatusDevice) map[int32]struct{} { + out := map[int32]struct{}{} + + for _, dev := range devices { + switch DiskState(dev.DiskState) { + case DiskStateUpToDate, DiskStateInconsistent, + DiskStateConsistent, DiskStateOutdated: + // Lower disk attached with metadata — DRBD cannot reach any of + // these states without a valid DRBD-9 superblock. No create-md / + // dump-md needed for this volume. + out[dev.VolumeNumber] = struct{}{} + case DiskStateDiskless, DiskStateAttaching, DiskStateDetaching, + DiskStateFailed, DiskStateNegotiating, DiskStateDUnknown: + // Diskless (no lower disk), or a transient / failed state where + // the disk is not cleanly attached: treat as still-needing the + // metadata pass so a genuine late-add or a re-attach is not + // skipped. + } + } + + return out +} diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go index 8d56cc52..c41de5cb 100644 --- a/pkg/satellite/reconciler.go +++ b/pkg/satellite/reconciler.go @@ -2101,25 +2101,43 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource, // (MetadataCreated Condition / .md-created marker) rather than the // OLD .res's `volume N {` block count. // - // BUG-048 (P1, availability): the previous `.res`-count gate raced - // the FSM dispatch's renderResFile preamble. On a concurrent late - // vd-add the FSM `adjust` of an EARLIER reconcile already re-rendered - // `.res` to include the new volume's `volume N {` block BEFORE this - // gate read the file, so hasLateAddedVolume(resPath) flipped FALSE, - // ensurePerVolumeMetadata was SKIPPED, and the new volume came up via - // the kernel adjust with NO metadata + NO winner-election GI seed — - // Inconsistent on every replica, no SyncSource, permanently wedged - // (~half of concurrent two-VD adds). The on-disk metadata, not the - // rendered .res, is the race-free truth: ensurePerVolumeMetadata - // probes `drbdadm dump-md` per volume and only creates+seeds the ones - // that actually lack metadata (len(freshlyCreated)==0 → early return), - // so running it on every post-activation diskful reconcile is - // idempotent and converged passes pay only the cheap per-volume - // dump-md probe. Skipped on diskless replicas (no lower disk to + // BUG-048 (P1, availability): the original `.res`-count gate + // (hasLateAddedVolume) raced the FSM dispatch's renderResFile preamble. + // On a concurrent late vd-add the FSM `adjust` of an EARLIER reconcile + // already re-rendered `.res` to include the new volume's `volume N {` + // block BEFORE this gate read the file, so hasLateAddedVolume(resPath) + // flipped FALSE, ensurePerVolumeMetadata was SKIPPED, and the new volume + // came up via the kernel adjust with NO metadata + NO winner-election GI + // seed — Inconsistent on every replica, no SyncSource, permanently + // wedged (~half of concurrent two-VD adds). PR #164 widened the gate to + // the race-free `dr.GetMetadataCreated()` — but that fires the per-volume + // pass on EVERY post-activation diskful reconcile. + // + // BUG-048 RESIZE DEADLOCK (this fix): even though ensurePerVolumeMetadata + // only create-md's volumes that lack metadata, it still runs a per-volume + // `drbdadm dump-md` probe on EVERY volume first — and dump-md is an + // md_buffer consumer. During a `vd s` resize DRBD holds md_buffer for the + // whole cluster-wide size change (change_cluster_wide_device_size / + // drbd_determine_dev_size); a per-reconcile dump-md then perpetually + // loses the cluster-wide state-change arbitration, the resize never + // completes, md_buffer is never released, and the resource deadlocks + // cluster-wide, reboot-proof. The unconditional `GetMetadataCreated()` + // gate fired this on every reconcile of a converged/resizing RD. + // + // Narrow the gate to the race-free, deadlock-safe truth: fire the + // per-volume metadata pass ONLY when some DESIRED volume is NOT + // present-and-attached in the kernel (absent entirely, or present but + // Diskless / no-metadata). A volume that IS attached already has a valid + // DRBD-9 superblock (the kernel cannot attach a lower disk without one), + // so dump-md on it is pointless AND is the md_buffer consumer that + // contends with a resize. This still fires on a genuine late-add (the new + // volume is unattached → BUG-048 #164 / the pre-rendered-.res test + // preserved) and on a re-attach, but skips a converged steady-state and a + // resize-in-flight (every desired volume attached) → no dump-md → no + // md_buffer contention. Skipped on diskless replicas (no lower disk to // stamp) and on the diskless→diskful flip path (Bug 319 owns that // re-stamp). - if !diskless && dr.GetMetadataCreated() && - !r.isDisklessToDiskfulFlip(ctx, dr, diskless) { + if r.shouldEnsurePerVolumeMetadata(ctx, dr, diskless) { err = r.ensurePerVolumeMetadata(ctx, dr, devices, diskless) if err != nil { return err @@ -2515,19 +2533,32 @@ func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredReso // Steady-state promote self-heals — run after the firstActivation // promote so a wedged replica still converges. Folded into one helper - // so finishDRBDApply stays under the gocyclo budget. - return r.maybePromoteSelfHeals(ctx, dr, diskless, autoPromote, autoPrimaryReplica) + // so finishDRBDApply stays under the gocyclo budget. `resized` gates the + // BUG-048 late-add self-heals OUT for this pass (resize-deadlock guard). + return r.maybePromoteSelfHeals(ctx, dr, diskless, autoPromote, autoPrimaryReplica, resized) } -// maybePromoteSelfHeals runs the two steady-state force-primary self-heals -// in order: the Bug 366 recovery-promote (a fresh RD whose late replica -// wedged Inconsistent with a data-bearing peer and no Primary) and the -// solo-promote (a lone, peerless diskful replica wedged below UpToDate -// after a diskless→diskful toggle). Both read live kernel state and are -// self-limiting; their predicates are mutually exclusive (recovery needs a -// peer, solo needs none), so at most one acts per pass. Extracted from -// finishDRBDApply to keep that function under the gocyclo budget. -func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.DesiredResource, diskless, autoPromote, autoPrimaryReplica bool) error { +// maybePromoteSelfHeals runs the steady-state force-primary / resync self- +// heals in order: the Bug 366 recovery-promote (a fresh RD whose late replica +// wedged Inconsistent with a data-bearing peer and no Primary), the BUG-048 +// late-add promote + resync-kick, and the solo-promote (a lone, peerless +// diskful replica wedged below UpToDate after a diskless→diskful toggle). All +// read live kernel state and are self-limiting. Extracted from finishDRBDApply +// to keep that function under the gocyclo budget. +// +// `resizing` is true when THIS reconcile pass just issued a `drbdadm resize` +// (pickup-time `vd s`). The BUG-048 late-add self-heals are gated OUT in that +// case as a resize-deadlock guard: their kernel-truth probes only act on an +// Inconsistent volume and would already veto a normal converged resize (the +// volume stays UpToDate while only the grown region resyncs), but the +// cluster-wide size change can transiently dip a volume below UpToDate, and +// the heals' recovery actions (MintLateAddSource = disconnect + +// new-current-uuid --clear-bitmap + adjust; InvalidateVolume) are exactly the +// cluster-wide state-change / md_buffer operations that contend with DRBD's +// in-flight change_cluster_wide_device_size and reintroduce the BUG-048 +// reboot-proof deadlock. A genuine late-add wedge is recovered on the very +// next non-resize reconcile. +func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.DesiredResource, diskless, autoPromote, autoPrimaryReplica, resizing bool) error { // Bug 366 recovery-promote self-heal — re-arm the auto-primary seed // when a fresh RD wedged with the late replica stuck Inconsistent and // no Primary anywhere. See maybeRecoveryPromote for the full why. @@ -2541,25 +2572,29 @@ func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.Desir // holds data) to UpToDate. Distinct from maybeRecoveryPromote: that // path needs the local already-UpToDate + the dispatcher's // auto-primary, BOTH false for a late-add to an initialized RD. See - // maybeLateAddPromote / Adm.NeedsLateAddPromote. - err = r.maybeLateAddPromote(ctx, dr, diskless) - if err != nil { - return err - } + // maybeLateAddPromote / Adm.NeedsLateAddPromote. Skipped during a resize + // (see `resizing` note above). + if !resizing { + err = r.maybeLateAddPromote(ctx, dr, diskless) + if err != nil { + return err + } - // BUG-048 late-add resync-kick self-heal — unstick a late-added - // volume whose resync EXISTS but wedged in a paused/bitmap-exchange - // state that never advances (the ≥3-replica concurrent-add - // convergence wedge: a SyncSource is elected but its resync sits - // PausedSyncS/resync-suspended:dependency while partner peers wait in - // WFBitMapT/resync-suspended:peer, or one peer reaches UpToDate while - // a second stalls WFBitMapT/Outdated forever). Distinct from - // maybeLateAddPromote: that mints a source when NONE exists; this - // re-handshakes an EXISTING-but-stalled resync via disconnect+adjust. - // See maybeKickLateAddResync / Adm.NeedsLateAddResyncKick. - err = r.maybeKickLateAddResync(ctx, dr, diskless) - if err != nil { - return err + // BUG-048 late-add resync-kick self-heal — unstick a late-added + // volume whose resync EXISTS but wedged in a paused/bitmap-exchange + // state that never advances (the ≥3-replica concurrent-add + // convergence wedge: a SyncSource is elected but its resync sits + // PausedSyncS/resync-suspended:dependency while partner peers wait in + // WFBitMapT/resync-suspended:peer, or one peer reaches UpToDate while + // a second stalls WFBitMapT/Outdated forever). Distinct from + // maybeLateAddPromote: that mints a source when NONE exists; this + // re-handshakes an EXISTING-but-stalled resync via disconnect+adjust. + // See maybeKickLateAddResync / Adm.NeedsLateAddResyncKick. Skipped + // during a resize (see `resizing` note above). + err = r.maybeKickLateAddResync(ctx, dr, diskless) + if err != nil { + return err + } } // Solo diskless→diskful toggle self-heal — force-promote a lone, @@ -3295,6 +3330,62 @@ func (r *Reconciler) isDisklessToDiskfulFlip(ctx context.Context, dr *intent.Des return disklessVol } +// hasUnattachedDesiredVolume reports whether some DESIRED volume of dr is +// NOT present-and-attached in the kernel — i.e. a volume that genuinely +// still needs DRBD-9 metadata (absent from the kernel device list entirely, +// or present but Diskless / mid-transition / no-metadata). It is the gate +// for ensurePerVolumeMetadata's per-volume `drbdadm dump-md` + create-md + +// winner-seed pass. +// +// BUG-048 resize deadlock (this fix): a volume that IS attached already has +// a valid DRBD-9 superblock (the kernel cannot attach a lower disk without +// one), so dump-md on it is pointless — and dump-md is an md_buffer consumer +// that, fired on every post-activation reconcile by PR #164's +// `GetMetadataCreated()` gate, contends with an in-flight `vd s` resize's +// cluster-wide md_buffer hold and deadlocks the cluster reboot-proof. Firing +// the pass ONLY when a desired volume is still unattached: +// - preserves BUG-048 #164: a genuine late-add volume is unattached (or +// comes up Diskless without metadata) → the pass fires → it gets +// create-md + the winner-election seed → converges; +// - skips a converged steady-state and a resize-in-flight (every desired +// volume attached, just resyncing the grown region) → no dump-md → no +// md_buffer contention → no deadlock. +// +// Conservative toward FIRING (fail-toward-correctness): AttachedVolumes +// returns an empty set on any probe/parse failure AND when the kernel slot +// is not loaded yet (cold start / transient down window). An empty set means +// "nothing is known-attached", so every desired volume looks unattached and +// the (idempotent, HasMD-gated) pass runs — exactly the pre-#164 behaviour. +// The deadlock-relevant skip only happens once the kernel positively reports +// every desired volume attached, which is precisely the converged/resizing +// state where dump-md must NOT run. +func (r *Reconciler) hasUnattachedDesiredVolume(ctx context.Context, dr *intent.DesiredResource) bool { + attached := r.cfg.Adm.AttachedVolumes(ctx, dr.GetName()) + + for _, vol := range dr.GetVolumes() { + if _, ok := attached[vol.GetVolumeNumber()]; !ok { + return true + } + } + + return false +} + +// shouldEnsurePerVolumeMetadata gates the per-volume create-md + winner-seed +// pass (ensurePerVolumeMetadata). It fires only for a diskful replica of an +// RD that is past first activation (MetadataCreated) which still has a +// genuinely-unattached desired volume (hasUnattachedDesiredVolume — the +// BUG-048 resize-deadlock guard that keeps the md_buffer-consuming dump-md +// off a converged / resizing RD), excluding the diskless→diskful flip path +// (Bug 319 owns that re-stamp). Extracted from applyDRBD so the gate's +// reasoning lives in one place and applyDRBD stays under the gocyclo budget. +func (r *Reconciler) shouldEnsurePerVolumeMetadata(ctx context.Context, dr *intent.DesiredResource, diskless bool) bool { + return !diskless && + dr.GetMetadataCreated() && + r.hasUnattachedDesiredVolume(ctx, dr) && + !r.isDisklessToDiskfulFlip(ctx, dr, diskless) +} + // ensureMetadata is the upstream-aligned create-md entry point. It // runs in three cases: // From 06cf0c7125d3c75bcbbe91a945e1847a5bc6737c Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Tue, 16 Jun 2026 09:58:44 +0300 Subject: [PATCH 2/3] test(satellite,drbd): regression pin for BUG-048 resize deadlock (no dump-md/self-heal on attached volumes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L1 regression for the BUG-048 resize deadlock. The satellite test drives a full rec.Apply() and observes the mock-recorded command stream: - converged steady-state (all desired volumes attached UpToDate) → NO drbdadm dump-md, NO cluster-wide self-heal (disconnect / new-current-uuid --clear-bitmap / invalidate); - resize-in-flight (drbdadm resize issued, grown volume transiently Inconsistent) → same: no dump-md, no self-heal; - BUG-048 preserved: a desired volume NOT attached in the kernel (genuine late-add) → the per-volume metadata pass DOES fire (dump-md + create-md for the missing volume). Proven to FAIL on the pre-fix dr.GetMetadataCreated() gate (dump-md fired on the converged reconcile; MintLateAddSource disconnect + new-current-uuid fired mid-resize) and PASS with the fix. drbd AttachedVolumes unit pins the kernel-truth parse: only metadata-bearing disk-states (UpToDate/Inconsistent/Consistent/Outdated) count as attached; Diskless / transient / absent volumes do not; empty on no kernel slot (fail-toward-firing). Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- pkg/drbd/attached_volumes_test.go | 114 ++++++ .../reconciler_bug048_resize_deadlock_test.go | 378 ++++++++++++++++++ 2 files changed, 492 insertions(+) create mode 100644 pkg/drbd/attached_volumes_test.go create mode 100644 pkg/satellite/reconciler_bug048_resize_deadlock_test.go diff --git a/pkg/drbd/attached_volumes_test.go b/pkg/drbd/attached_volumes_test.go new file mode 100644 index 00000000..43f5efe7 --- /dev/null +++ b/pkg/drbd/attached_volumes_test.go @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drbd_test + +import ( + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/storage" +) + +// AttachedVolumes is the kernel-truth backing for the BUG-048 resize-deadlock +// gate: it returns the LOCAL volumes that are present-and-attached (have a +// valid DRBD-9 metadata block — disk-state UpToDate / Inconsistent / +// Consistent / Outdated). A volume absent from the device list, or Diskless / +// mid-transition, is excluded so the reconciler still runs the metadata pass +// for a genuine late-add but never for a converged/resizing RD. + +func attachedVolumesAdm(t *testing.T, json string) *drbd.Adm { + t.Helper() + + fx := storage.NewFakeExec() + fx.Responses["drbdsetup status pvc-av --json"] = storage.FakeResponse{Stdout: []byte(json)} + + return drbd.NewAdm(fx) +} + +func assertAttachedSet(t *testing.T, got map[int32]struct{}, want ...int32) { + t.Helper() + + if len(got) != len(want) { + t.Fatalf("attached set size = %d, want %d (got %v, want %v)", len(got), len(want), got, want) + } + + for _, v := range want { + if _, ok := got[v]; !ok { + t.Errorf("expected volume %d in attached set %v", v, got) + } + } +} + +// Every metadata-bearing disk-state counts as attached. Diskless and +// transient states do NOT — those are volumes that still need the metadata +// pass (a genuine late-add or a re-attach). +func TestAttachedVolumes_OnlyMetadataBearingStatesCount(t *testing.T) { + adm := attachedVolumesAdm(t, `[{ + "name":"pvc-av","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"}, + {"volume":1,"disk-state":"Inconsistent"}, + {"volume":2,"disk-state":"Consistent"}, + {"volume":3,"disk-state":"Outdated"}, + {"volume":4,"disk-state":"Diskless"}, + {"volume":5,"disk-state":"Negotiating"}, + {"volume":6,"disk-state":"DUnknown"} + ] + }]`) + + // 0..3 are attached (metadata-bearing); 4 (Diskless), 5 (Negotiating), + // 6 (DUnknown) are not. + assertAttachedSet(t, adm.AttachedVolumes(t.Context(), "pvc-av"), 0, 1, 2, 3) +} + +// A volume entirely absent from the kernel device list (the genuine late-add +// window — `vd c` added it to the desired set but the kernel has not brought +// it up yet) is simply not in the set, so the reconciler sees it as +// unattached and runs the metadata pass for it. +func TestAttachedVolumes_AbsentVolumeNotInSet(t *testing.T) { + adm := attachedVolumesAdm(t, `[{ + "name":"pvc-av","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"disk-state":"UpToDate"} + ] + }]`) + + got := adm.AttachedVolumes(t.Context(), "pvc-av") + assertAttachedSet(t, got, 0) + + if _, ok := got[1]; ok { + t.Errorf("volume 1 is absent from the device list and must NOT be reported attached; got %v", got) + } +} + +// Conservative toward FIRING (fail-toward-correctness): on a probe failure, +// empty output, or an unloaded slot the set is empty, so the reconciler treats +// every desired volume as possibly-unattached and runs the idempotent metadata +// pass — the pre-#164 behaviour. The deadlock-relevant skip only happens once +// the kernel positively reports volumes attached. +func TestAttachedVolumes_ConservativeEmptyOnNoKernelSlot(t *testing.T) { + fx := storage.NewFakeExec() + // No registered response → FakeExec returns empty stdout, nil error + // (the "kernel slot present but status empty" / blank-output shape). + adm := drbd.NewAdm(fx) + + if got := adm.AttachedVolumes(t.Context(), "pvc-missing"); len(got) != 0 { + t.Errorf("expected empty attached set on no kernel data; got %v", got) + } +} diff --git a/pkg/satellite/reconciler_bug048_resize_deadlock_test.go b/pkg/satellite/reconciler_bug048_resize_deadlock_test.go new file mode 100644 index 00000000..d1ad6063 --- /dev/null +++ b/pkg/satellite/reconciler_bug048_resize_deadlock_test.go @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: Apache-2.0 + +/* +Copyright 2026 Cozystack contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package satellite_test + +import ( + "fmt" + "os" + "path/filepath" + "slices" + "strings" + "testing" + + "github.com/cozystack/blockstor/pkg/drbd" + "github.com/cozystack/blockstor/pkg/satellite" + intent "github.com/cozystack/blockstor/pkg/satellite/intent" + "github.com/cozystack/blockstor/pkg/storage" + "github.com/cozystack/blockstor/pkg/storage/lvm" +) + +// TestBUG048SteadyStateDiskfulReconcileIssuesNoMetadataProbeOrSelfHeal is the +// regression pin for the BUG-048 resize deadlock (P0, availability — a +// reboot-proof cluster-wide DRBD deadlock that PR #164 shipped to main with a +// fully-green CI). +// +// ROOT CAUSE: PR #164 widened the ensurePerVolumeMetadata trigger gate from +// the .res-count hasLateAddedVolume() to the unconditional +// dr.GetMetadataCreated(), so EVERY post-activation diskful reconcile ran a +// per-volume `drbdadm dump-md` (an md_buffer consumer) plus the cluster-wide +// late-add self-heals. During a `vd s` resize DRBD holds md_buffer for the +// whole cluster-wide size change (change_cluster_wide_device_size / +// drbd_determine_dev_size); a per-reconcile dump-md + the self-heals' cluster- +// wide state-change actions then perpetually lose the cluster-wide state- +// change arbitration, the resize never completes, md_buffer is never +// released, and the resource deadlocks cluster-wide, reboot-proof. +// +// THE FIX: the per-volume metadata pass fires ONLY when some DESIRED volume +// is NOT present-and-attached in the kernel (an attached volume already has +// metadata — the kernel cannot attach a lower disk without it — so dump-md on +// it is pointless and is the md_buffer consumer that contends with the +// resize). The late-add self-heals are also gated out of a resize pass. +// +// WHAT THIS TEST OBSERVES: on a converged steady-state diskful reconcile where +// EVERY desired volume is present-and-attached (UpToDate), the reconciler must +// issue NO `drbdadm dump-md` and NO cluster-wide self-heal command +// (disconnect / new-current-uuid --clear-bitmap / invalidate). The mock +// records every issued command. +// +// FAIL-ON-BUG PROOF: under the pre-fix `dr.GetMetadataCreated()` gate this +// reconcile fired ensurePerVolumeMetadata unconditionally, so `drbdadm +// dump-md pvc-bug048-steady/0` (and /1) WAS issued — this test's +// "no dump-md" assertion FAILS on pre-fix and PASSES on the fix. See the +// PROOF note in the PR / task report. +func TestBUG048SteadyStateDiskfulReconcileIssuesNoMetadataProbeOrSelfHeal(t *testing.T) { + const rd = "pvc-bug048-steady" + + dir := t.TempDir() + fx := storage.NewFakeExec() + + // Both volumes already provisioned at the desired size (no grow → no + // resize): the converged steady-state shape `vd s` settles into and that + // the reconcile loop revisits on every Status/event tick. + for _, vn := range []string{"00000", "00001"} { + fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/%s_%s", rd, vn), + storage.FakeResponse{Stdout: []byte(rd + "_" + vn + "\n")}) + fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/%s_%s", rd, vn), + storage.FakeResponse{Stdout: []byte(fmt.Sprintf("/dev/vg/%s_%s|1048576\n", rd, vn))}) + } + + // KERNEL TRUTH: both desired volumes are present-and-attached UpToDate — + // the converged state. The new AttachedVolumes probe reads THIS, sees + // {0,1} all attached, and the gate finds no unattached desired volume. + fx.Expect("drbdsetup status "+rd+" --json", storage.FakeResponse{Stdout: []byte(`[{ + "name":"` + rd + `","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"minor":1000,"disk-state":"UpToDate"}, + {"volume":1,"minor":1001,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`)}) + + thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) + rec := satellite.NewReconciler(satellite.ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": thin}, + Adm: drbd.NewAdm(fx), + StateDir: dir, + NodeName: "n1", + }) + + // Converged on-disk state: .res + .md-created already stamped from the + // first activation, both volumes in the .res. MetadataCreated=true. + seedConvergedTwoVolumeRD(t, dir, rd) + + dr := []*intent.DesiredResource{convergedTwoVolumeDR(rd)} + + results, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if len(results) != 1 || !results[0].GetOk() { + t.Fatalf("Apply: expected Ok=true; got results=%+v", results) + } + + assertNoMetadataProbe(t, fx.CommandLines(), rd) + assertNoLateAddSelfHeal(t, fx.CommandLines(), rd) +} + +// TestBUG048ResizeInFlightIssuesNoMetadataProbeOrSelfHeal pins the deadlock- +// shape directly: a pickup-time `vd s` resize (the storage layer grew, so the +// reconcile issues `drbdadm resize`) MUST NOT also fire the per-volume +// dump-md probe nor any cluster-wide late-add self-heal in the same pass — +// those are exactly the md_buffer / cluster-wide-state-change operations that +// contend with DRBD's in-flight change_cluster_wide_device_size and produce +// the reboot-proof deadlock. +// +// The kernel status here shows the grown volume transiently dipped to +// Inconsistent while a sibling stays UpToDate — the worst case, because that +// is precisely the shape NeedsLateAddPromote would misfire on (UpToDate +// sibling + Inconsistent volume + no peer data). The fix's resize guard keeps +// the self-heals out regardless, and the attached-volume gate keeps dump-md +// out (both volumes are attached). +// +// FAIL-ON-BUG PROOF: under the pre-fix gate the resize pass ALSO ran +// ensurePerVolumeMetadata → `drbdadm dump-md` (md_buffer contention), and the +// ungated self-heals would mint a source mid-resize. This test's assertions +// FAIL on pre-fix and PASS on the fix. +func TestBUG048ResizeInFlightIssuesNoMetadataProbeOrSelfHeal(t *testing.T) { + const rd = "pvc-bug048-resize" + + dir := t.TempDir() + fx := storage.NewFakeExec() + + // Both volumes exist but on-disk at 1 GiB while the spec asks 2 GiB → + // applyStorageIfDiskful grows them and pins resized=true → the reconcile + // issues `drbdadm resize` (the in-flight resize the deadlock needs). + for _, vn := range []string{"00000", "00001"} { + fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/%s_%s", rd, vn), + storage.FakeResponse{Stdout: []byte(rd + "_" + vn + "\n")}) + fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/%s_%s", rd, vn), + storage.FakeResponse{Stdout: []byte(fmt.Sprintf("/dev/vg/%s_%s|1048576\n", rd, vn))}) + } + + // KERNEL TRUTH mid-resize: vol-0 UpToDate, vol-1 Inconsistent (grown + // region resyncing / transiently dipped). Both are ATTACHED (Inconsistent + // is an attached, metadata-bearing state). + fx.Expect("drbdsetup status "+rd+" --json", storage.FakeResponse{Stdout: []byte(`[{ + "name":"` + rd + `","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"minor":1000,"disk-state":"UpToDate"}, + {"volume":1,"minor":1001,"disk-state":"Inconsistent"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}, + {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`)}) + + thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) + rec := satellite.NewReconciler(satellite.ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": thin}, + Adm: drbd.NewAdm(fx), + StateDir: dir, + NodeName: "n1", + }) + + seedConvergedTwoVolumeRD(t, dir, rd) + + // Desired 2 GiB per volume → a grow → resized=true. + dr := convergedTwoVolumeDR(rd) + for _, v := range dr.Volumes { + v.SizeKib = 2 * 1024 * 1024 + } + + results, err := rec.Apply(t.Context(), []*intent.DesiredResource{dr}) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if len(results) != 1 || !results[0].GetOk() { + t.Fatalf("Apply: expected Ok=true; got results=%+v", results) + } + + calls := fx.CommandLines() + + // Sanity: the resize MUST actually have fired — otherwise this test is + // not exercising the resize-in-flight path at all. + if !slices.Contains(calls, "drbdadm resize --assume-clean "+rd) { + t.Fatalf("test precondition: expected `drbdadm resize --assume-clean %s` to fire (the in-flight resize); got %v", rd, calls) + } + + assertNoMetadataProbe(t, calls, rd) + assertNoLateAddSelfHeal(t, calls, rd) +} + +// TestBUG048LateAddStillRunsMetadataPassForUnattachedVolume is the BUG-048- +// preserved bookend: when a DESIRED volume is genuinely NOT attached in the +// kernel (a real late `vd c` — the kernel device list lacks it), the per- +// volume metadata pass MUST still fire (dump-md + create-md for the missing +// volume), so the fix does not reintroduce the original BUG-048 wedge it was +// shipped to fix. This is the same invariant +// TestApplyLateAddedVolumePerVolumeMetadataRunsDespitePreRenderedRes pins, +// proven here through the NEW attached-volume gate (drbdsetup status --json +// shows only vol-0). +func TestBUG048LateAddStillRunsMetadataPassForUnattachedVolume(t *testing.T) { + const rd = "pvc-bug048-lateadd" + + dir := t.TempDir() + fx := storage.NewFakeExec() + + for _, vn := range []string{"00000", "00001"} { + fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/%s_%s", rd, vn), + storage.FakeResponse{Stdout: []byte(rd + "_" + vn + "\n")}) + fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/%s_%s", rd, vn), + storage.FakeResponse{Stdout: []byte(fmt.Sprintf("/dev/vg/%s_%s|1048576\n", rd, vn))}) + } + + // vol-0 stamped, vol-1 has no metadata yet (the late-added volume). + fx.Expect("drbdadm dump-md "+rd+"/0", + storage.FakeResponse{Stdout: []byte("version \"v09\";\nla-size-sect 2048;\n")}) + fx.Expect("drbdadm dump-md "+rd+"/1", + storage.FakeResponse{Err: errDrbdadmDumpMdNoMeta}) + fx.Expect(fmt.Sprintf("drbdadm create-md --force --max-peers=%d %s/1", drbd.MaxPeers-1, rd), + storage.FakeResponse{}) + + // KERNEL TRUTH: only vol-0 is attached; vol-1 is absent from the device + // list (the genuine late-add window). AttachedVolumes returns {0}, so the + // gate finds vol-1 unattached and the metadata pass fires. + fx.Expect("drbdsetup status "+rd+" --json", storage.FakeResponse{Stdout: []byte(`[{ + "name":"` + rd + `","node-id":0,"role":"Secondary", + "devices":[ + {"volume":0,"minor":1000,"disk-state":"UpToDate"} + ], + "connections":[{ + "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary", + "peer_devices":[ + {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"} + ] + }] + }]`)}) + + thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx) + rec := satellite.NewReconciler(satellite.ReconcilerConfig{ + Providers: map[string]storage.Provider{"thin1": thin}, + Adm: drbd.NewAdm(fx), + StateDir: dir, + NodeName: "n1", + }) + + seedConvergedTwoVolumeRD(t, dir, rd) + + dr := []*intent.DesiredResource{convergedTwoVolumeDR(rd)} + + results, err := rec.Apply(t.Context(), dr) + if err != nil { + t.Fatalf("Apply: %v", err) + } + if len(results) != 1 || !results[0].GetOk() { + t.Fatalf("Apply: expected Ok=true; got results=%+v", results) + } + + calls := fx.CommandLines() + + wantCreateMD := fmt.Sprintf("drbdadm create-md --force --max-peers=%d %s/1", drbd.MaxPeers-1, rd) + if !slices.Contains(calls, wantCreateMD) { + t.Errorf("BUG-048 REGRESSED: late-added unattached vol-1 got NO metadata pass — it would come up Diskless / Inconsistent with no winner seed and wedge; want %q in %v", wantCreateMD, calls) + } + + // The dump-md probe MUST have run for the unattached volume. + if !slices.Contains(calls, "drbdadm dump-md "+rd+"/1") { + t.Errorf("expected the per-volume dump-md probe to run for the unattached vol-1; got %v", calls) + } + + // vol-0 (attached) must NOT be re-created (would wipe its GI + bitmap). + forbidden := fmt.Sprintf("drbdadm create-md --force --max-peers=%d %s/0", drbd.MaxPeers-1, rd) + if slices.Contains(calls, forbidden) { + t.Errorf("re-ran create-md on the attached vol-0 (would wipe operator-stamped metadata): %v", calls) + } +} + +// --- shared fixtures ------------------------------------------------------- + +// seedConvergedTwoVolumeRD writes the on-disk state a 2-volume RD settles +// into after first activation: a .res carrying both volume blocks and the +// .md-created marker (so dr.GetMetadataCreated()==true drives firstActivation +// false — the post-activation reconcile shape the deadlock lived in). +func seedConvergedTwoVolumeRD(t *testing.T, dir, rd string) { + t.Helper() + + resBody := "resource " + rd + " {\n" + + " on n1 {\n" + + " volume 0 {\n }\n" + + " volume 1 {\n }\n" + + " }\n}\n" + if err := os.WriteFile(filepath.Join(dir, rd+".res"), []byte(resBody), 0o600); err != nil { + t.Fatalf("seed .res: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, rd+".md-created"), nil, 0o600); err != nil { + t.Fatalf("seed md-created: %v", err) + } +} + +// convergedTwoVolumeDR is the DesiredResource for a past-first-activation +// 2-diskful RD with one peer: MetadataCreated=true, both volumes 1 GiB. +func convergedTwoVolumeDR(rd string) *intent.DesiredResource { + peerID := int32(1) + + return &intent.DesiredResource{ + Name: rd, + NodeName: "n1", + MetadataCreated: true, + SkipInitialSync: skipInitTrue(), + Volumes: []*intent.DesiredVolume{ + {VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + {VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"}, + }, + Peers: []intent.DesiredPeer{{Name: "n2", NodeID: &peerID}}, + DrbdOptions: map[string]string{ + "port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000", + "peer.n2.address": "10.0.0.2", "peer.n2.node-id": "1", "peer.n2.port": "7000", + }, + } +} + +// assertNoMetadataProbe fails the test if the reconcile issued ANY per-volume +// `drbdadm dump-md /` — the md_buffer consumer that deadlocks a resize. +func assertNoMetadataProbe(t *testing.T, calls []string, rd string) { + t.Helper() + + for _, line := range calls { + if strings.HasPrefix(line, "drbdadm dump-md "+rd+"/") { + t.Errorf("BUG-048 resize deadlock: per-volume metadata probe %q fired on a converged/resizing diskful reconcile — "+ + "dump-md is an md_buffer consumer that contends with an in-flight `vd s` resize and deadlocks the cluster reboot-proof; all calls: %v", line, calls) + } + } +} + +// assertNoLateAddSelfHeal fails the test if the reconcile issued any cluster- +// wide late-add self-heal action: MintLateAddSource (disconnect + +// new-current-uuid --clear-bitmap + adjust) or InvalidateVolume. These are +// the cluster-wide state-change operations that contend with an in-flight +// resize's md_buffer hold. +func assertNoLateAddSelfHeal(t *testing.T, calls []string, rd string) { + t.Helper() + + for _, line := range calls { + switch { + case line == "drbdadm disconnect "+rd: + t.Errorf("BUG-048: late-add self-heal `disconnect %s` fired on a converged/resizing reconcile (MintLateAddSource) — would contend with an in-flight resize; calls: %v", rd, calls) + case strings.HasPrefix(line, "drbdsetup new-current-uuid --clear-bitmap "): + t.Errorf("BUG-048: late-add self-heal `%s` fired on a converged/resizing reconcile (MintLateAddSource) — would contend with an in-flight resize; calls: %v", line, calls) + case strings.HasPrefix(line, "drbdadm invalidate "+rd): + t.Errorf("BUG-048: late-add resync-kick `%s` fired on a converged/resizing reconcile (InvalidateVolume) — would contend with an in-flight resize; calls: %v", line, calls) + } + } +} From 77f374d433e13a78a77ad82cf4fd4dca87ee0e88 Mon Sep 17 00:00:00 2001 From: Andrei Kvapil Date: Tue, 16 Jun 2026 10:02:49 +0300 Subject: [PATCH 3/3] test(harness): BUG-048 resize-deadlock L6 cli-matrix cell + L7 replay (zfs-thin 2-diskful+1-diskless) CLI-bug-fix protocol cells for the BUG-048 resize deadlock (the fix touches DRBD resize / satellite paths). L6 cli-matrix bug-048-resize-deadlock-zfs-thin.sh: build the forensic shape (zfs-thin, 2 diskful + 1 diskless-client), drive 3 sequential grows under reconcile pressure, and assert each grow COMPLETES (size lands + diskful replicas re-reach UpToDate within a bounded window) with NO deadlock surface (StandAlone / suspended / a diskful volume stuck below UpToDate). With the bug the first grow stalls in a size-change retry storm and the assert times out. L7 replay bug048-resize.yaml: codify the operator sequence (rd c -> vd c 1G -> 2 diskful + 1 diskless-client -> grow 1G->2G->3G) with vd_size_kib + all_uptodate + replica_diskless convergence asserts; the grow timeouts are deadlock discriminators, not convergence targets. Honest scope: the full kernel md_buffer deadlock needs scale/contention to escalate, so the DETERMINISTIC CI guard for this bug is the L1 unit regression; these cells are best-effort stand reproduction the orchestrator runs on a stand under load. Co-Authored-By: Claude Signed-off-by: Andrei Kvapil --- .../bug-048-resize-deadlock-zfs-thin.sh | 180 ++++++++++++++++++ .../replay/bug048-resize.yaml | 145 ++++++++++++++ 2 files changed, 325 insertions(+) create mode 100755 tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh create mode 100644 tests/operator-harness/replay/bug048-resize.yaml diff --git a/tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh b/tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh new file mode 100755 index 00000000..fdb0cbe7 --- /dev/null +++ b/tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +# +# usage: bug-048-resize-deadlock-zfs-thin.sh WORK_DIR +# +# L6 cli-matrix cell — BUG-048 resize deadlock (P0, availability; a +# reboot-proof cluster-wide DRBD deadlock that PR #164 shipped to main with +# a fully-green CI, including the existing vd-resize cell). +# +# DEADLOCK MECHANISM: PR #164 widened the ensurePerVolumeMetadata trigger gate +# from the .res-count hasLateAddedVolume() to the unconditional +# dr.GetMetadataCreated(), so EVERY post-activation diskful reconcile ran a +# per-volume `drbdadm dump-md` (an md_buffer consumer) plus the cluster-wide +# late-add self-heals (MintLateAddSource: disconnect + new-current-uuid +# --clear-bitmap + adjust; InvalidateVolume). During a `vd s` resize DRBD +# holds md_buffer for the whole cluster-wide size change +# (change_cluster_wide_device_size / drbd_determine_dev_size); the +# per-reconcile md probe + the cluster-wide self-heals then perpetually lose +# the cluster-wide state-change arbitration, the resize never completes, +# md_buffer is never released, and the resource deadlocks cluster-wide, +# reboot-proof (stand forensics: size-change retry count 10,790 vs the normal +# 2-20, on a zfs-thin 2-diskful + 1-diskless-client RD; clean at f3515045a). +# +# THE FIX (pkg/satellite/reconciler.go + pkg/drbd/drbdsetup_show.go): the +# per-volume metadata pass fires ONLY when some desired volume is NOT +# present-and-attached in the kernel (an attached volume already has metadata +# — the kernel cannot attach a lower disk without it — so dump-md on it is +# pointless and is the md_buffer consumer that contends with the resize), and +# the late-add self-heals are gated out of a resize pass. +# +# THE SHAPE THIS CELL REPRODUCES: zfs-thin, 2 diskful + 1 diskless-client, +# repeated grows under reconcile pressure. Each grow MUST complete (size +# lands + diskful replicas re-reach UpToDate) within a bounded window, and the +# resource must NEVER enter the deadlock surface (StandAlone / suspended / +# Failed / a diskful volume stuck below UpToDate forever). +# +# HONEST SCOPE: the full kernel md_buffer deadlock needs scale/contention to +# escalate (the forensic repro took ~10k size-change retries), so the +# DETERMINISTIC CI guard for this bug is the L1 unit regression +# (pkg/satellite/reconciler_bug048_resize_deadlock_test.go — proven to FAIL on +# the pre-fix gate). This L6 cell is the BEST-EFFORT stand reproduction: the +# orchestrator runs it on a live stand under load; a clean pass here is strong +# corroboration, a hang/timeout here is the deadlock. +# +# Unit pin: pkg/satellite/reconciler_bug048_resize_deadlock_test.go +# (no dump-md / no self-heal on a converged or resizing diskful reconcile; +# metadata pass still fires for a genuine unattached late-add). + +set -euo pipefail + +WORK_DIR=${1:?work_dir required} +export KUBECONFIG="$WORK_DIR/kubeconfig" + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=lib.sh +source "$SCRIPT_DIR/lib.sh" + +require_workers 3 + +linstor_cli_setup + +# zfs-thin is the forensic repro substrate (the deadlock was observed on a +# zfs-thin pool). Allow override but default to zfs-thin. +POOL="${POOL:-zfs-thin}" +RD="cli-matrix-bug048-resize-deadlock" +SIZE_2G_KIB=2097152 +SIZE_3G_KIB=3145728 +SIZE_4G_KIB=4194304 + +cleanup() { + [[ -n "${RD:-}" ]] && delete_rd "$RD" + [[ -n "${RD:-}" ]] && assert_no_orphans "$RD" +} +trap 'cleanup; linstor_cli_teardown' EXIT + +echo "============================================================" +echo ">> bug-048-resize-deadlock-zfs-thin :: POOL=$POOL RD=$RD" +echo "============================================================" + +echo ">> pre-flight: $POOL on >=2 nodes + a 3rd worker for the diskless client" +sp_json=$("${LCTL[@]}" --machine-readable storage-pool list --storage-pools "$POOL" 2>/dev/null || echo "[]") +ok_nodes=$(jq -r '[.[]? | .[]? | select(.provider_kind != null) | .node_name] | unique | length' <<<"$sp_json" 2>/dev/null || echo 0) +if (( ok_nodes < 2 )); then + echo "SKIP ($POOL): pool not on >=2 nodes (got $ok_nodes) — BUG-048 deadlock fixture unavailable" + exit 0 +fi + +echo ">> rd c + vd c 1G + r c --auto-place=2 -s $POOL (2 diskful replicas)" +"${LCTL[@]}" resource-definition create "$RD" >/dev/null +"${LCTL[@]}" volume-definition create "$RD" 1G >/dev/null +"${LCTL[@]}" resource create --auto-place=2 --storage-pool="$POOL" "$RD" >/dev/null + +# Resolve the two diskful replicas. +deadline=$(( $(date +%s) + 90 )) +diskful=() +while (( $(date +%s) < deadline )); do + mapfile -t diskful < <(linstor_diskful_nodes "$RD") + if (( ${#diskful[@]} >= 2 )); then break; fi + sleep 2 +done +if (( ${#diskful[@]} < 2 )); then + echo "FAIL: autoplace did not stage 2 diskful replicas (got ${#diskful[@]})" >&2 + exit 1 +fi +N1="${diskful[0]}" +N2="${diskful[1]}" +echo ">> diskful replicas: N1=$N1 N2=$N2" +wait_uptodate "$RD" "$N1" "$N2" + +# Add the diskless CLIENT on a 3rd node — the exact forensic shape +# (2-diskful + 1-diskless-client). Reuse an auto-placed TieBreaker if present. +DLNODE="$(linstor_tiebreaker_node "$RD" || true)" +if [[ -z "$DLNODE" ]]; then + DLNODE="$(linstor_pick_free_node "$RD" "$N1" "$N2" || true)" + if [[ -z "$DLNODE" ]]; then + echo "SKIP: no free 3rd worker for the diskless client" >&2 + exit 0 + fi + echo ">> adding diskless client on $DLNODE" + "${LCTL[@]}" resource create "$DLNODE" "$RD" --drbd-diskless >/dev/null \ + || { echo "FAIL: r c $DLNODE $RD --drbd-diskless" >&2; exit 1; } +else + echo ">> reusing auto-placed TieBreaker on $DLNODE as the diskless client" +fi + +# Deadlock surface probe: a deadlocked RD wedges the kernel connection +# StandAlone or suspends I/O (suspended:*), and a diskful volume sticks below +# UpToDate forever. This helper returns non-zero (and prints) if ANY diskful +# replica is in the deadlock surface — used as a fast-fail after each grow. +assert_no_deadlock_surface() { + local node st + for node in "$N1" "$N2"; do + st=$(on_node "$node" drbdadm status "$RD" 2>&1 || true) + if grep -Eiq 'StandAlone|suspended:[^n]|connection:Timeout|connection:BrokenPipe' <<<"$st"; then + echo "FAIL (BUG-048 deadlock surface): $node shows a wedged/suspended connection for $RD" >&2 + echo "$st" >&2 + return 1 + fi + done + return 0 +} + +# Drive a sequence of grows. Each grow exercises the cluster-wide size change +# (md_buffer hold) under the per-reconcile pressure that PR #164 introduced. +# With the bug, the FIRST grow against this 2-diskful+1-diskless shape stalls +# (size-change retry storm) and the assert below times out. +grow_to() { + local size_arg=$1 expect_kib=$2 + echo ">> linstor vd s $RD 0 $size_arg (grow under reconcile pressure)" + "${LCTL[@]}" volume-definition set-size "$RD" 0 "$size_arg" >/dev/null + + echo " assert vd size == $expect_kib KiB (resize COMPLETED, md_buffer released)" + # Bug => this never lands (resize wedged mid change_cluster_wide_device_size). + wait_vd_size "$RD" 0 "$expect_kib" 180 + + echo " assert both diskful replicas re-reached UpToDate after grow" + wait_uptodate "$RD" "$N1" "$N2" + + echo " assert no deadlock surface (StandAlone / suspended) on either diskful replica" + assert_no_deadlock_surface + + # The diskless client must not have grown a backing dataset (U48 guard, + # asserted here as a free side-check) and must not have wedged. + local dl_state + dl_state="$(status_disk_state "$RD" "$DLNODE" 0)" + case "$dl_state" in + Failed|Inconsistent|Attaching|Detaching|Negotiating|DUnknown) + echo "FAIL: diskless client $DLNODE wedged after grow (state=$dl_state)" >&2 + return 1 ;; + esac + echo " grow to $expect_kib KiB COMPLETE (no deadlock)" +} + +grow_to 2G "$SIZE_2G_KIB" +grow_to 3G "$SIZE_3G_KIB" +grow_to 4G "$SIZE_4G_KIB" + +echo ">> bug-048-resize-deadlock-zfs-thin OK (3 sequential grows on a 2-diskful+1-diskless-client zfs-thin RD all completed; no md_buffer deadlock, no StandAlone/suspended surface)" +cleanup +trap 'linstor_cli_teardown' EXIT +echo ">> bug-048-resize-deadlock-zfs-thin COMPLETE" diff --git a/tests/operator-harness/replay/bug048-resize.yaml b/tests/operator-harness/replay/bug048-resize.yaml new file mode 100644 index 00000000..178f10ff --- /dev/null +++ b/tests/operator-harness/replay/bug048-resize.yaml @@ -0,0 +1,145 @@ +name: bug048-resize +description: | + BUG-048 resize deadlock (P0, availability) — a `vd s` resize on a + 2-diskful + 1-diskless-client RD must COMPLETE; PR #164's widened + ensurePerVolumeMetadata gate (dr.GetMetadataCreated()) ran a per-volume + `drbdadm dump-md` (md_buffer consumer) plus the cluster-wide late-add + self-heals on EVERY post-activation reconcile. During the resize DRBD + holds md_buffer for the cluster-wide size change + (change_cluster_wide_device_size); the per-reconcile md probe + self-heals + perpetually lose the state-change arbitration, the resize never completes, + md_buffer is never released, and the resource deadlocks cluster-wide, + reboot-proof (forensic repro: size-change retry count 10,790 vs 2-20 + normal, zfs-thin 2-diskful + 1-diskless-client). + + Operator sequence: + rd c → vd c 1G → r c node1 + r c node2 (diskful) → + r c --drbd-diskless node3 → grow 1G→2G→3G + + CLI-side contract (each grow): + - the grow returns 0 AND vd size reflects the new size within a bounded + window (with the bug it never lands — the resize wedges) + - every replica reconverges (all_uptodate tolerates the Diskless peer) + - the diskless client STAYS Diskless across the resize, never wedges + + HONEST SCOPE: the full kernel md_buffer deadlock needs scale/contention to + escalate, so the DETERMINISTIC CI guard for this bug is the L1 unit + regression (pkg/satellite/reconciler_bug048_resize_deadlock_test.go, proven + to FAIL on the pre-fix gate). This replay is a BEST-EFFORT stand + reproduction — the orchestrator runs it under load; a hang/timeout here is + the deadlock, a clean pass is strong corroboration. The kernel-truth + no-StandAlone/suspended surface check lives in the L6 cli-matrix cell + tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh. + + If this YAML goes red, no fix in: + - pkg/satellite/reconciler.go (ensurePerVolumeMetadata gate / + maybePromoteSelfHeals resize guard) + - pkg/drbd/drbdsetup_show.go (AttachedVolumes) + may be claimed closed. + +prerequisites: + min_nodes: 3 + storage_pool: stand + +vars: + sp: stand + +steps: + - name: create-rd + cmd: ["resource-definition", "create", "{{rd}}"] + expect_exit: 0 + - name: create-vd-1g + cmd: ["volume-definition", "create", "{{rd}}", "1G"] + expect_exit: 0 + await: + kind: vd_size_kib + rd: "{{rd}}" + vol: 0 + expected_kib: 1048576 + timeout_s: 30 + - name: r-c-node1-diskful + cmd: ["resource", "create", "{{node1}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + - name: r-c-node2-diskful + cmd: ["resource", "create", "{{node2}}", "{{rd}}", "--storage-pool", "{{sp}}"] + expect_exit: 0 + - name: wait-2-diskful-uptodate + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: all_uptodate + rd: "{{rd}}" + # 1G full sync to a fresh peer at ~4 MB/s on this stand => minutes. + timeout_s: 600 + - name: r-c-node3-diskless + # node + rd are positional FIRST, then the flag — the linstor 1.27.1 + # client parses `r c --drbd-diskless ` as =. + cmd: ["resource", "create", "{{node3}}", "{{rd}}", "--drbd-diskless"] + expect_exit: 0 + await: + kind: replica_diskless + rd: "{{rd}}" + node: "{{node3}}" + timeout_s: 120 + + # ---- Grow 1G → 2G (the deadlock window: cluster-wide size change under + # the per-reconcile pressure PR #164 introduced) ---- + - name: grow-2g + cmd: ["volume-definition", "set-size", "{{rd}}", "0", "2G"] + expect_exit: 0 + await: + # Bug => this never lands (resize wedged mid-change_cluster_wide_device_size, + # md_buffer never released). 180s is the deadlock discriminator, NOT a + # convergence target — a healthy grow lands in seconds. + kind: vd_size_kib + rd: "{{rd}}" + vol: 0 + expected_kib: 2097152 + timeout_s: 180 + - name: wait-uptodate-after-2g + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: all_uptodate + rd: "{{rd}}" + timeout_s: 600 + + # ---- Grow 2G → 3G (second cluster-wide size change; the bug's retry + # storm compounds across successive resizes) ---- + - name: grow-3g + cmd: ["volume-definition", "set-size", "{{rd}}", "0", "3G"] + expect_exit: 0 + await: + kind: vd_size_kib + rd: "{{rd}}" + vol: 0 + expected_kib: 3145728 + timeout_s: 180 + - name: wait-uptodate-after-3g + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + kind: all_uptodate + rd: "{{rd}}" + timeout_s: 600 + - name: diskless-client-still-diskless-not-wedged + cmd: ["resource", "list", "--resources", "{{rd}}"] + expect_exit: 0 + await: + # After both grows the diskless client is STILL Diskless — a deadlock + # / resize fan-out would have wedged it to Failed/Inconsistent. + kind: replica_diskless + rd: "{{rd}}" + node: "{{node3}}" + timeout_s: 60 + +teardown: + - cmd: ["resource-definition", "delete", "{{rd}}"] + expect_exit: 0 + await: + kind: rd_absent + rd: "{{rd}}" + timeout_s: 60 + +invariants: + - no_orphans