From c60c7cca227627890d4b0c062dc88d064a803ffc Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Tue, 16 Jun 2026 09:46:37 +0300
Subject: [PATCH 1/3] fix(satellite): fire late-add metadata/self-heal only for
 unattached desired volumes (BUG-048 deadlock)

PR #164 widened the ensurePerVolumeMetadata trigger gate from the
.res-count hasLateAddedVolume() to the unconditional
dr.GetMetadataCreated(), so every post-activation diskful reconcile ran
a per-volume drbdadm dump-md (an md_buffer consumer) plus the cluster-
wide late-add self-heals. During a vd s resize DRBD holds md_buffer for
the whole cluster-wide size change (change_cluster_wide_device_size /
drbd_determine_dev_size); the per-reconcile md probe + self-heals then
perpetually lose the cluster-wide state-change arbitration, the resize
never completes, md_buffer is never released, and the resource deadlocks
cluster-wide, reboot-proof.

Narrow the gate to the race-free, deadlock-safe truth: fire the per-
volume metadata pass ONLY when some desired volume is NOT present-and-
attached in the kernel (absent, or present but Diskless / no-metadata).
An attached volume already has a valid DRBD-9 superblock (the kernel
cannot attach a lower disk without one), so dump-md on it is pointless
and is the md_buffer consumer that contends with the resize. This still
fires for a genuine late-add (the new volume is unattached) so BUG-048
#164 and the pre-rendered-.res regression are preserved, but skips a
converged steady-state and a resize-in-flight (all volumes attached).

Add Adm.AttachedVolumes (parses drbdsetup status --json into the set of
attached, metadata-bearing local volume numbers) as the kernel-truth
backing for the new hasUnattachedDesiredVolume predicate. Also gate the
late-add promote / resync-kick self-heals OUT of a reconcile that just
issued drbdadm resize (defense-in-depth: their recovery actions are
themselves cluster-wide state-change / md_buffer operations).

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 pkg/drbd/drbdsetup_show.go  |  69 ++++++++++++++
 pkg/satellite/reconciler.go | 183 +++++++++++++++++++++++++++---------
 2 files changed, 206 insertions(+), 46 deletions(-)
diff --git a/pkg/drbd/drbdsetup_show.go b/pkg/drbd/drbdsetup_show.go
index a4b702f8..33a9d6e9 100644
--- a/pkg/drbd/drbdsetup_show.go
+++ b/pkg/drbd/drbdsetup_show.go
@@ -324,3 +324,72 @@ func parseShowJSON(out []byte) map[string]KernelSlot {
 
 	return slots
 }
+
+// AttachedVolumes probes `drbdsetup status <res> --json` and returns the
+// set of LOCAL volume numbers that are present-and-attached in the kernel —
+// i.e. the kernel has a lower disk attached and a valid DRBD-9 metadata
+// block for them (disk-state UpToDate / Inconsistent / Consistent /
+// Outdated). A volume that is absent from the device list entirely, or
+// present but Diskless / DUnknown / mid-transition (Attaching / Detaching /
+// Failed / Negotiating), is NOT in the returned set.
+//
+// DRBD invariant exploited by the BUG-048 resize-deadlock fix: the kernel
+// CANNOT attach a lower disk without a valid metadata block, so an attached
+// volume ALREADY has metadata — there is nothing for ensurePerVolumeMetadata
+// to create-md and a `drbdadm dump-md` probe on it is pointless. That
+// dump-md is an md_buffer consumer; firing it on every post-activation
+// reconcile contends with an in-flight `vd s` resize (DRBD holds md_buffer
+// in change_cluster_wide_device_size / drbd_determine_dev_size), which is
+// the reboot-proof cluster-wide deadlock BUG-048's #164 gate widening
+// introduced. The reconciler gates the per-volume metadata pass on "some
+// DESIRED volume is NOT in this set" so the probe only fires for a genuine
+// late-add (a volume that really still needs metadata) and never on a
+// converged or resizing RD whose volumes are all attached.
+//
+// Conservative on any probe/parse failure (kernel slot absent, status
+// non-zero, malformed JSON, missing node-id) → empty set. An empty set means
+// "nothing is known-attached", so the caller treats every desired volume as
+// possibly needing metadata and runs the (idempotent) per-volume pass — the
+// fail-toward-correctness direction (the pre-BUG-048 behaviour).
+func (a *Adm) AttachedVolumes(ctx context.Context, resource string) map[int32]struct{} {
+	out, err := a.exec.Run(ctx, "drbdsetup", "status", resource, "--json")
+	if err != nil {
+		return nil
+	}
+
+	var status drbdsetupStatusRoot
+
+	err = json.Unmarshal(out, &status)
+	if err != nil || len(status) == 0 {
+		return nil
+	}
+
+	return attachedVolumeSet(status[0].Devices)
+}
+
+// attachedVolumeSet returns the volume numbers from `devices` whose
+// disk-state proves the lower disk is attached with a valid metadata block.
+// Split out of AttachedVolumes so unit tests can drive it from fixtures
+// without the FakeExec round-trip.
+func attachedVolumeSet(devices []drbdsetupStatusDevice) map[int32]struct{} {
+	out := map[int32]struct{}{}
+
+	for _, dev := range devices {
+		switch DiskState(dev.DiskState) {
+		case DiskStateUpToDate, DiskStateInconsistent,
+			DiskStateConsistent, DiskStateOutdated:
+			// Lower disk attached with metadata — DRBD cannot reach any of
+			// these states without a valid DRBD-9 superblock. No create-md /
+			// dump-md needed for this volume.
+			out[dev.VolumeNumber] = struct{}{}
+		case DiskStateDiskless, DiskStateAttaching, DiskStateDetaching,
+			DiskStateFailed, DiskStateNegotiating, DiskStateDUnknown:
+			// Diskless (no lower disk), or a transient / failed state where
+			// the disk is not cleanly attached: treat as still-needing the
+			// metadata pass so a genuine late-add or a re-attach is not
+			// skipped.
+		}
+	}
+
+	return out
+}
diff --git a/pkg/satellite/reconciler.go b/pkg/satellite/reconciler.go
index 8d56cc52..c41de5cb 100644
--- a/pkg/satellite/reconciler.go
+++ b/pkg/satellite/reconciler.go
@@ -2101,25 +2101,43 @@ func (r *Reconciler) applyDRBD(ctx context.Context, dr *intent.DesiredResource,
 	// (MetadataCreated Condition / .md-created marker) rather than the
 	// OLD .res's `volume N {` block count.
 	//
-	// BUG-048 (P1, availability): the previous `.res`-count gate raced
-	// the FSM dispatch's renderResFile preamble. On a concurrent late
-	// vd-add the FSM `adjust` of an EARLIER reconcile already re-rendered
-	// `.res` to include the new volume's `volume N {` block BEFORE this
-	// gate read the file, so hasLateAddedVolume(resPath) flipped FALSE,
-	// ensurePerVolumeMetadata was SKIPPED, and the new volume came up via
-	// the kernel adjust with NO metadata + NO winner-election GI seed —
-	// Inconsistent on every replica, no SyncSource, permanently wedged
-	// (~half of concurrent two-VD adds). The on-disk metadata, not the
-	// rendered .res, is the race-free truth: ensurePerVolumeMetadata
-	// probes `drbdadm dump-md` per volume and only creates+seeds the ones
-	// that actually lack metadata (len(freshlyCreated)==0 → early return),
-	// so running it on every post-activation diskful reconcile is
-	// idempotent and converged passes pay only the cheap per-volume
-	// dump-md probe. Skipped on diskless replicas (no lower disk to
+	// BUG-048 (P1, availability): the original `.res`-count gate
+	// (hasLateAddedVolume) raced the FSM dispatch's renderResFile preamble.
+	// On a concurrent late vd-add the FSM `adjust` of an EARLIER reconcile
+	// already re-rendered `.res` to include the new volume's `volume N {`
+	// block BEFORE this gate read the file, so hasLateAddedVolume(resPath)
+	// flipped FALSE, ensurePerVolumeMetadata was SKIPPED, and the new volume
+	// came up via the kernel adjust with NO metadata + NO winner-election GI
+	// seed — Inconsistent on every replica, no SyncSource, permanently
+	// wedged (~half of concurrent two-VD adds). PR #164 widened the gate to
+	// the race-free `dr.GetMetadataCreated()` — but that fires the per-volume
+	// pass on EVERY post-activation diskful reconcile.
+	//
+	// BUG-048 RESIZE DEADLOCK (this fix): even though ensurePerVolumeMetadata
+	// only create-md's volumes that lack metadata, it still runs a per-volume
+	// `drbdadm dump-md` probe on EVERY volume first — and dump-md is an
+	// md_buffer consumer. During a `vd s` resize DRBD holds md_buffer for the
+	// whole cluster-wide size change (change_cluster_wide_device_size /
+	// drbd_determine_dev_size); a per-reconcile dump-md then perpetually
+	// loses the cluster-wide state-change arbitration, the resize never
+	// completes, md_buffer is never released, and the resource deadlocks
+	// cluster-wide, reboot-proof. The unconditional `GetMetadataCreated()`
+	// gate fired this on every reconcile of a converged/resizing RD.
+	//
+	// Narrow the gate to the race-free, deadlock-safe truth: fire the
+	// per-volume metadata pass ONLY when some DESIRED volume is NOT
+	// present-and-attached in the kernel (absent entirely, or present but
+	// Diskless / no-metadata). A volume that IS attached already has a valid
+	// DRBD-9 superblock (the kernel cannot attach a lower disk without one),
+	// so dump-md on it is pointless AND is the md_buffer consumer that
+	// contends with a resize. This still fires on a genuine late-add (the new
+	// volume is unattached → BUG-048 #164 / the pre-rendered-.res test
+	// preserved) and on a re-attach, but skips a converged steady-state and a
+	// resize-in-flight (every desired volume attached) → no dump-md → no
+	// md_buffer contention. Skipped on diskless replicas (no lower disk to
 	// stamp) and on the diskless→diskful flip path (Bug 319 owns that
 	// re-stamp).
-	if !diskless && dr.GetMetadataCreated() &&
-		!r.isDisklessToDiskfulFlip(ctx, dr, diskless) {
+	if r.shouldEnsurePerVolumeMetadata(ctx, dr, diskless) {
 		err = r.ensurePerVolumeMetadata(ctx, dr, devices, diskless)
 		if err != nil {
 			return err
@@ -2515,19 +2533,32 @@ func (r *Reconciler) finishDRBDApply(ctx context.Context, dr *intent.DesiredReso
 
 	// Steady-state promote self-heals — run after the firstActivation
 	// promote so a wedged replica still converges. Folded into one helper
-	// so finishDRBDApply stays under the gocyclo budget.
-	return r.maybePromoteSelfHeals(ctx, dr, diskless, autoPromote, autoPrimaryReplica)
+	// so finishDRBDApply stays under the gocyclo budget. `resized` gates the
+	// BUG-048 late-add self-heals OUT for this pass (resize-deadlock guard).
+	return r.maybePromoteSelfHeals(ctx, dr, diskless, autoPromote, autoPrimaryReplica, resized)
 }
 
-// maybePromoteSelfHeals runs the two steady-state force-primary self-heals
-// in order: the Bug 366 recovery-promote (a fresh RD whose late replica
-// wedged Inconsistent with a data-bearing peer and no Primary) and the
-// solo-promote (a lone, peerless diskful replica wedged below UpToDate
-// after a diskless→diskful toggle). Both read live kernel state and are
-// self-limiting; their predicates are mutually exclusive (recovery needs a
-// peer, solo needs none), so at most one acts per pass. Extracted from
-// finishDRBDApply to keep that function under the gocyclo budget.
-func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.DesiredResource, diskless, autoPromote, autoPrimaryReplica bool) error {
+// maybePromoteSelfHeals runs the steady-state force-primary / resync self-
+// heals in order: the Bug 366 recovery-promote (a fresh RD whose late replica
+// wedged Inconsistent with a data-bearing peer and no Primary), the BUG-048
+// late-add promote + resync-kick, and the solo-promote (a lone, peerless
+// diskful replica wedged below UpToDate after a diskless→diskful toggle). All
+// read live kernel state and are self-limiting. Extracted from finishDRBDApply
+// to keep that function under the gocyclo budget.
+//
+// `resizing` is true when THIS reconcile pass just issued a `drbdadm resize`
+// (pickup-time `vd s`). The BUG-048 late-add self-heals are gated OUT in that
+// case as a resize-deadlock guard: their kernel-truth probes only act on an
+// Inconsistent volume and would already veto a normal converged resize (the
+// volume stays UpToDate while only the grown region resyncs), but the
+// cluster-wide size change can transiently dip a volume below UpToDate, and
+// the heals' recovery actions (MintLateAddSource = disconnect +
+// new-current-uuid --clear-bitmap + adjust; InvalidateVolume) are exactly the
+// cluster-wide state-change / md_buffer operations that contend with DRBD's
+// in-flight change_cluster_wide_device_size and reintroduce the BUG-048
+// reboot-proof deadlock. A genuine late-add wedge is recovered on the very
+// next non-resize reconcile.
+func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.DesiredResource, diskless, autoPromote, autoPrimaryReplica, resizing bool) error {
 	// Bug 366 recovery-promote self-heal — re-arm the auto-primary seed
 	// when a fresh RD wedged with the late replica stuck Inconsistent and
 	// no Primary anywhere. See maybeRecoveryPromote for the full why.
@@ -2541,25 +2572,29 @@ func (r *Reconciler) maybePromoteSelfHeals(ctx context.Context, dr *intent.Desir
 	// holds data) to UpToDate. Distinct from maybeRecoveryPromote: that
 	// path needs the local already-UpToDate + the dispatcher's
 	// auto-primary, BOTH false for a late-add to an initialized RD. See
-	// maybeLateAddPromote / Adm.NeedsLateAddPromote.
-	err = r.maybeLateAddPromote(ctx, dr, diskless)
-	if err != nil {
-		return err
-	}
+	// maybeLateAddPromote / Adm.NeedsLateAddPromote. Skipped during a resize
+	// (see `resizing` note above).
+	if !resizing {
+		err = r.maybeLateAddPromote(ctx, dr, diskless)
+		if err != nil {
+			return err
+		}
 
-	// BUG-048 late-add resync-kick self-heal — unstick a late-added
-	// volume whose resync EXISTS but wedged in a paused/bitmap-exchange
-	// state that never advances (the ≥3-replica concurrent-add
-	// convergence wedge: a SyncSource is elected but its resync sits
-	// PausedSyncS/resync-suspended:dependency while partner peers wait in
-	// WFBitMapT/resync-suspended:peer, or one peer reaches UpToDate while
-	// a second stalls WFBitMapT/Outdated forever). Distinct from
-	// maybeLateAddPromote: that mints a source when NONE exists; this
-	// re-handshakes an EXISTING-but-stalled resync via disconnect+adjust.
-	// See maybeKickLateAddResync / Adm.NeedsLateAddResyncKick.
-	err = r.maybeKickLateAddResync(ctx, dr, diskless)
-	if err != nil {
-		return err
+		// BUG-048 late-add resync-kick self-heal — unstick a late-added
+		// volume whose resync EXISTS but wedged in a paused/bitmap-exchange
+		// state that never advances (the ≥3-replica concurrent-add
+		// convergence wedge: a SyncSource is elected but its resync sits
+		// PausedSyncS/resync-suspended:dependency while partner peers wait in
+		// WFBitMapT/resync-suspended:peer, or one peer reaches UpToDate while
+		// a second stalls WFBitMapT/Outdated forever). Distinct from
+		// maybeLateAddPromote: that mints a source when NONE exists; this
+		// re-handshakes an EXISTING-but-stalled resync via disconnect+adjust.
+		// See maybeKickLateAddResync / Adm.NeedsLateAddResyncKick. Skipped
+		// during a resize (see `resizing` note above).
+		err = r.maybeKickLateAddResync(ctx, dr, diskless)
+		if err != nil {
+			return err
+		}
 	}
 
 	// Solo diskless→diskful toggle self-heal — force-promote a lone,
@@ -3295,6 +3330,62 @@ func (r *Reconciler) isDisklessToDiskfulFlip(ctx context.Context, dr *intent.Des
 	return disklessVol
 }
 
+// hasUnattachedDesiredVolume reports whether some DESIRED volume of dr is
+// NOT present-and-attached in the kernel — i.e. a volume that genuinely
+// still needs DRBD-9 metadata (absent from the kernel device list entirely,
+// or present but Diskless / mid-transition / no-metadata). It is the gate
+// for ensurePerVolumeMetadata's per-volume `drbdadm dump-md` + create-md +
+// winner-seed pass.
+//
+// BUG-048 resize deadlock (this fix): a volume that IS attached already has
+// a valid DRBD-9 superblock (the kernel cannot attach a lower disk without
+// one), so dump-md on it is pointless — and dump-md is an md_buffer consumer
+// that, fired on every post-activation reconcile by PR #164's
+// `GetMetadataCreated()` gate, contends with an in-flight `vd s` resize's
+// cluster-wide md_buffer hold and deadlocks the cluster reboot-proof. Firing
+// the pass ONLY when a desired volume is still unattached:
+//   - preserves BUG-048 #164: a genuine late-add volume is unattached (or
+//     comes up Diskless without metadata) → the pass fires → it gets
+//     create-md + the winner-election seed → converges;
+//   - skips a converged steady-state and a resize-in-flight (every desired
+//     volume attached, just resyncing the grown region) → no dump-md → no
+//     md_buffer contention → no deadlock.
+//
+// Conservative toward FIRING (fail-toward-correctness): AttachedVolumes
+// returns an empty set on any probe/parse failure AND when the kernel slot
+// is not loaded yet (cold start / transient down window). An empty set means
+// "nothing is known-attached", so every desired volume looks unattached and
+// the (idempotent, HasMD-gated) pass runs — exactly the pre-#164 behaviour.
+// The deadlock-relevant skip only happens once the kernel positively reports
+// every desired volume attached, which is precisely the converged/resizing
+// state where dump-md must NOT run.
+func (r *Reconciler) hasUnattachedDesiredVolume(ctx context.Context, dr *intent.DesiredResource) bool {
+	attached := r.cfg.Adm.AttachedVolumes(ctx, dr.GetName())
+
+	for _, vol := range dr.GetVolumes() {
+		if _, ok := attached[vol.GetVolumeNumber()]; !ok {
+			return true
+		}
+	}
+
+	return false
+}
+
+// shouldEnsurePerVolumeMetadata gates the per-volume create-md + winner-seed
+// pass (ensurePerVolumeMetadata). It fires only for a diskful replica of an
+// RD that is past first activation (MetadataCreated) which still has a
+// genuinely-unattached desired volume (hasUnattachedDesiredVolume — the
+// BUG-048 resize-deadlock guard that keeps the md_buffer-consuming dump-md
+// off a converged / resizing RD), excluding the diskless→diskful flip path
+// (Bug 319 owns that re-stamp). Extracted from applyDRBD so the gate's
+// reasoning lives in one place and applyDRBD stays under the gocyclo budget.
+func (r *Reconciler) shouldEnsurePerVolumeMetadata(ctx context.Context, dr *intent.DesiredResource, diskless bool) bool {
+	return !diskless &&
+		dr.GetMetadataCreated() &&
+		r.hasUnattachedDesiredVolume(ctx, dr) &&
+		!r.isDisklessToDiskfulFlip(ctx, dr, diskless)
+}
+
 // ensureMetadata is the upstream-aligned create-md entry point. It
 // runs in three cases:
 //

From 06cf0c7125d3c75bcbbe91a945e1847a5bc6737c Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Tue, 16 Jun 2026 09:58:44 +0300
Subject: [PATCH 2/3] test(satellite,drbd): regression pin for BUG-048 resize
 deadlock (no dump-md/self-heal on attached volumes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

L1 regression for the BUG-048 resize deadlock. The satellite test drives
a full rec.Apply() and observes the mock-recorded command stream:

  - converged steady-state (all desired volumes attached UpToDate) →
    NO drbdadm dump-md, NO cluster-wide self-heal (disconnect /
    new-current-uuid --clear-bitmap / invalidate);
  - resize-in-flight (drbdadm resize issued, grown volume transiently
    Inconsistent) → same: no dump-md, no self-heal;
  - BUG-048 preserved: a desired volume NOT attached in the kernel
    (genuine late-add) → the per-volume metadata pass DOES fire
    (dump-md + create-md for the missing volume).

Proven to FAIL on the pre-fix dr.GetMetadataCreated() gate (dump-md
fired on the converged reconcile; MintLateAddSource disconnect +
new-current-uuid fired mid-resize) and PASS with the fix.

drbd AttachedVolumes unit pins the kernel-truth parse: only
metadata-bearing disk-states (UpToDate/Inconsistent/Consistent/Outdated)
count as attached; Diskless / transient / absent volumes do not; empty
on no kernel slot (fail-toward-firing).

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 pkg/drbd/attached_volumes_test.go             | 114 ++++++
 .../reconciler_bug048_resize_deadlock_test.go | 378 ++++++++++++++++++
 2 files changed, 492 insertions(+)
 create mode 100644 pkg/drbd/attached_volumes_test.go
 create mode 100644 pkg/satellite/reconciler_bug048_resize_deadlock_test.go

diff --git a/pkg/drbd/attached_volumes_test.go b/pkg/drbd/attached_volumes_test.go
new file mode 100644
index 00000000..43f5efe7
--- /dev/null
+++ b/pkg/drbd/attached_volumes_test.go
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+Copyright 2026 Cozystack contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package drbd_test
+
+import (
+	"testing"
+
+	"github.com/cozystack/blockstor/pkg/drbd"
+	"github.com/cozystack/blockstor/pkg/storage"
+)
+
+// AttachedVolumes is the kernel-truth backing for the BUG-048 resize-deadlock
+// gate: it returns the LOCAL volumes that are present-and-attached (have a
+// valid DRBD-9 metadata block — disk-state UpToDate / Inconsistent /
+// Consistent / Outdated). A volume absent from the device list, or Diskless /
+// mid-transition, is excluded so the reconciler still runs the metadata pass
+// for a genuine late-add but never for a converged/resizing RD.
+
+func attachedVolumesAdm(t *testing.T, json string) *drbd.Adm {
+	t.Helper()
+
+	fx := storage.NewFakeExec()
+	fx.Responses["drbdsetup status pvc-av --json"] = storage.FakeResponse{Stdout: []byte(json)}
+
+	return drbd.NewAdm(fx)
+}
+
+func assertAttachedSet(t *testing.T, got map[int32]struct{}, want ...int32) {
+	t.Helper()
+
+	if len(got) != len(want) {
+		t.Fatalf("attached set size = %d, want %d (got %v, want %v)", len(got), len(want), got, want)
+	}
+
+	for _, v := range want {
+		if _, ok := got[v]; !ok {
+			t.Errorf("expected volume %d in attached set %v", v, got)
+		}
+	}
+}
+
+// Every metadata-bearing disk-state counts as attached. Diskless and
+// transient states do NOT — those are volumes that still need the metadata
+// pass (a genuine late-add or a re-attach).
+func TestAttachedVolumes_OnlyMetadataBearingStatesCount(t *testing.T) {
+	adm := attachedVolumesAdm(t, `[{
+	  "name":"pvc-av","node-id":0,"role":"Secondary",
+	  "devices":[
+	    {"volume":0,"disk-state":"UpToDate"},
+	    {"volume":1,"disk-state":"Inconsistent"},
+	    {"volume":2,"disk-state":"Consistent"},
+	    {"volume":3,"disk-state":"Outdated"},
+	    {"volume":4,"disk-state":"Diskless"},
+	    {"volume":5,"disk-state":"Negotiating"},
+	    {"volume":6,"disk-state":"DUnknown"}
+	  ]
+	}]`)
+
+	// 0..3 are attached (metadata-bearing); 4 (Diskless), 5 (Negotiating),
+	// 6 (DUnknown) are not.
+	assertAttachedSet(t, adm.AttachedVolumes(t.Context(), "pvc-av"), 0, 1, 2, 3)
+}
+
+// A volume entirely absent from the kernel device list (the genuine late-add
+// window — `vd c` added it to the desired set but the kernel has not brought
+// it up yet) is simply not in the set, so the reconciler sees it as
+// unattached and runs the metadata pass for it.
+func TestAttachedVolumes_AbsentVolumeNotInSet(t *testing.T) {
+	adm := attachedVolumesAdm(t, `[{
+	  "name":"pvc-av","node-id":0,"role":"Secondary",
+	  "devices":[
+	    {"volume":0,"disk-state":"UpToDate"}
+	  ]
+	}]`)
+
+	got := adm.AttachedVolumes(t.Context(), "pvc-av")
+	assertAttachedSet(t, got, 0)
+
+	if _, ok := got[1]; ok {
+		t.Errorf("volume 1 is absent from the device list and must NOT be reported attached; got %v", got)
+	}
+}
+
+// Conservative toward FIRING (fail-toward-correctness): on a probe failure,
+// empty output, or an unloaded slot the set is empty, so the reconciler treats
+// every desired volume as possibly-unattached and runs the idempotent metadata
+// pass — the pre-#164 behaviour. The deadlock-relevant skip only happens once
+// the kernel positively reports volumes attached.
+func TestAttachedVolumes_ConservativeEmptyOnNoKernelSlot(t *testing.T) {
+	fx := storage.NewFakeExec()
+	// No registered response → FakeExec returns empty stdout, nil error
+	// (the "kernel slot present but status empty" / blank-output shape).
+	adm := drbd.NewAdm(fx)
+
+	if got := adm.AttachedVolumes(t.Context(), "pvc-missing"); len(got) != 0 {
+		t.Errorf("expected empty attached set on no kernel data; got %v", got)
+	}
+}
diff --git a/pkg/satellite/reconciler_bug048_resize_deadlock_test.go b/pkg/satellite/reconciler_bug048_resize_deadlock_test.go
new file mode 100644
index 00000000..d1ad6063
--- /dev/null
+++ b/pkg/satellite/reconciler_bug048_resize_deadlock_test.go
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+Copyright 2026 Cozystack contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package satellite_test
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/cozystack/blockstor/pkg/drbd"
+	"github.com/cozystack/blockstor/pkg/satellite"
+	intent "github.com/cozystack/blockstor/pkg/satellite/intent"
+	"github.com/cozystack/blockstor/pkg/storage"
+	"github.com/cozystack/blockstor/pkg/storage/lvm"
+)
+
+// TestBUG048SteadyStateDiskfulReconcileIssuesNoMetadataProbeOrSelfHeal is the
+// regression pin for the BUG-048 resize deadlock (P0, availability — a
+// reboot-proof cluster-wide DRBD deadlock that PR #164 shipped to main with a
+// fully-green CI).
+//
+// ROOT CAUSE: PR #164 widened the ensurePerVolumeMetadata trigger gate from
+// the .res-count hasLateAddedVolume() to the unconditional
+// dr.GetMetadataCreated(), so EVERY post-activation diskful reconcile ran a
+// per-volume `drbdadm dump-md` (an md_buffer consumer) plus the cluster-wide
+// late-add self-heals. During a `vd s` resize DRBD holds md_buffer for the
+// whole cluster-wide size change (change_cluster_wide_device_size /
+// drbd_determine_dev_size); a per-reconcile dump-md + the self-heals' cluster-
+// wide state-change actions then perpetually lose the cluster-wide state-
+// change arbitration, the resize never completes, md_buffer is never
+// released, and the resource deadlocks cluster-wide, reboot-proof.
+//
+// THE FIX: the per-volume metadata pass fires ONLY when some DESIRED volume
+// is NOT present-and-attached in the kernel (an attached volume already has
+// metadata — the kernel cannot attach a lower disk without it — so dump-md on
+// it is pointless and is the md_buffer consumer that contends with the
+// resize). The late-add self-heals are also gated out of a resize pass.
+//
+// WHAT THIS TEST OBSERVES: on a converged steady-state diskful reconcile where
+// EVERY desired volume is present-and-attached (UpToDate), the reconciler must
+// issue NO `drbdadm dump-md` and NO cluster-wide self-heal command
+// (disconnect / new-current-uuid --clear-bitmap / invalidate). The mock
+// records every issued command.
+//
+// FAIL-ON-BUG PROOF: under the pre-fix `dr.GetMetadataCreated()` gate this
+// reconcile fired ensurePerVolumeMetadata unconditionally, so `drbdadm
+// dump-md pvc-bug048-steady/0` (and /1) WAS issued — this test's
+// "no dump-md" assertion FAILS on pre-fix and PASSES on the fix. See the
+// PROOF note in the PR / task report.
+func TestBUG048SteadyStateDiskfulReconcileIssuesNoMetadataProbeOrSelfHeal(t *testing.T) {
+	const rd = "pvc-bug048-steady"
+
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+
+	// Both volumes already provisioned at the desired size (no grow → no
+	// resize): the converged steady-state shape `vd s` settles into and that
+	// the reconcile loop revisits on every Status/event tick.
+	for _, vn := range []string{"00000", "00001"} {
+		fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/%s_%s", rd, vn),
+			storage.FakeResponse{Stdout: []byte(rd + "_" + vn + "\n")})
+		fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/%s_%s", rd, vn),
+			storage.FakeResponse{Stdout: []byte(fmt.Sprintf("/dev/vg/%s_%s|1048576\n", rd, vn))})
+	}
+
+	// KERNEL TRUTH: both desired volumes are present-and-attached UpToDate —
+	// the converged state. The new AttachedVolumes probe reads THIS, sees
+	// {0,1} all attached, and the gate finds no unattached desired volume.
+	fx.Expect("drbdsetup status "+rd+" --json", storage.FakeResponse{Stdout: []byte(`[{
+	  "name":"` + rd + `","node-id":0,"role":"Secondary",
+	  "devices":[
+	    {"volume":0,"minor":1000,"disk-state":"UpToDate"},
+	    {"volume":1,"minor":1001,"disk-state":"UpToDate"}
+	  ],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary",
+	    "peer_devices":[
+	      {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"},
+	      {"volume":1,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}
+	    ]
+	  }]
+	}]`)})
+
+	thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx)
+	rec := satellite.NewReconciler(satellite.ReconcilerConfig{
+		Providers: map[string]storage.Provider{"thin1": thin},
+		Adm:       drbd.NewAdm(fx),
+		StateDir:  dir,
+		NodeName:  "n1",
+	})
+
+	// Converged on-disk state: .res + .md-created already stamped from the
+	// first activation, both volumes in the .res. MetadataCreated=true.
+	seedConvergedTwoVolumeRD(t, dir, rd)
+
+	dr := []*intent.DesiredResource{convergedTwoVolumeDR(rd)}
+
+	results, err := rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+	if len(results) != 1 || !results[0].GetOk() {
+		t.Fatalf("Apply: expected Ok=true; got results=%+v", results)
+	}
+
+	assertNoMetadataProbe(t, fx.CommandLines(), rd)
+	assertNoLateAddSelfHeal(t, fx.CommandLines(), rd)
+}
+
+// TestBUG048ResizeInFlightIssuesNoMetadataProbeOrSelfHeal pins the deadlock-
+// shape directly: a pickup-time `vd s` resize (the storage layer grew, so the
+// reconcile issues `drbdadm resize`) MUST NOT also fire the per-volume
+// dump-md probe nor any cluster-wide late-add self-heal in the same pass —
+// those are exactly the md_buffer / cluster-wide-state-change operations that
+// contend with DRBD's in-flight change_cluster_wide_device_size and produce
+// the reboot-proof deadlock.
+//
+// The kernel status here shows the grown volume transiently dipped to
+// Inconsistent while a sibling stays UpToDate — the worst case, because that
+// is precisely the shape NeedsLateAddPromote would misfire on (UpToDate
+// sibling + Inconsistent volume + no peer data). The fix's resize guard keeps
+// the self-heals out regardless, and the attached-volume gate keeps dump-md
+// out (both volumes are attached).
+//
+// FAIL-ON-BUG PROOF: under the pre-fix gate the resize pass ALSO ran
+// ensurePerVolumeMetadata → `drbdadm dump-md` (md_buffer contention), and the
+// ungated self-heals would mint a source mid-resize. This test's assertions
+// FAIL on pre-fix and PASS on the fix.
+func TestBUG048ResizeInFlightIssuesNoMetadataProbeOrSelfHeal(t *testing.T) {
+	const rd = "pvc-bug048-resize"
+
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+
+	// Both volumes exist but on-disk at 1 GiB while the spec asks 2 GiB →
+	// applyStorageIfDiskful grows them and pins resized=true → the reconcile
+	// issues `drbdadm resize` (the in-flight resize the deadlock needs).
+	for _, vn := range []string{"00000", "00001"} {
+		fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/%s_%s", rd, vn),
+			storage.FakeResponse{Stdout: []byte(rd + "_" + vn + "\n")})
+		fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/%s_%s", rd, vn),
+			storage.FakeResponse{Stdout: []byte(fmt.Sprintf("/dev/vg/%s_%s|1048576\n", rd, vn))})
+	}
+
+	// KERNEL TRUTH mid-resize: vol-0 UpToDate, vol-1 Inconsistent (grown
+	// region resyncing / transiently dipped). Both are ATTACHED (Inconsistent
+	// is an attached, metadata-bearing state).
+	fx.Expect("drbdsetup status "+rd+" --json", storage.FakeResponse{Stdout: []byte(`[{
+	  "name":"` + rd + `","node-id":0,"role":"Secondary",
+	  "devices":[
+	    {"volume":0,"minor":1000,"disk-state":"UpToDate"},
+	    {"volume":1,"minor":1001,"disk-state":"Inconsistent"}
+	  ],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary",
+	    "peer_devices":[
+	      {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"},
+	      {"volume":1,"peer-disk-state":"Inconsistent","replication-state":"Established","resync-suspended":"no"}
+	    ]
+	  }]
+	}]`)})
+
+	thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx)
+	rec := satellite.NewReconciler(satellite.ReconcilerConfig{
+		Providers: map[string]storage.Provider{"thin1": thin},
+		Adm:       drbd.NewAdm(fx),
+		StateDir:  dir,
+		NodeName:  "n1",
+	})
+
+	seedConvergedTwoVolumeRD(t, dir, rd)
+
+	// Desired 2 GiB per volume → a grow → resized=true.
+	dr := convergedTwoVolumeDR(rd)
+	for _, v := range dr.Volumes {
+		v.SizeKib = 2 * 1024 * 1024
+	}
+
+	results, err := rec.Apply(t.Context(), []*intent.DesiredResource{dr})
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+	if len(results) != 1 || !results[0].GetOk() {
+		t.Fatalf("Apply: expected Ok=true; got results=%+v", results)
+	}
+
+	calls := fx.CommandLines()
+
+	// Sanity: the resize MUST actually have fired — otherwise this test is
+	// not exercising the resize-in-flight path at all.
+	if !slices.Contains(calls, "drbdadm resize --assume-clean "+rd) {
+		t.Fatalf("test precondition: expected `drbdadm resize --assume-clean %s` to fire (the in-flight resize); got %v", rd, calls)
+	}
+
+	assertNoMetadataProbe(t, calls, rd)
+	assertNoLateAddSelfHeal(t, calls, rd)
+}
+
+// TestBUG048LateAddStillRunsMetadataPassForUnattachedVolume is the BUG-048-
+// preserved bookend: when a DESIRED volume is genuinely NOT attached in the
+// kernel (a real late `vd c` — the kernel device list lacks it), the per-
+// volume metadata pass MUST still fire (dump-md + create-md for the missing
+// volume), so the fix does not reintroduce the original BUG-048 wedge it was
+// shipped to fix. This is the same invariant
+// TestApplyLateAddedVolumePerVolumeMetadataRunsDespitePreRenderedRes pins,
+// proven here through the NEW attached-volume gate (drbdsetup status --json
+// shows only vol-0).
+func TestBUG048LateAddStillRunsMetadataPassForUnattachedVolume(t *testing.T) {
+	const rd = "pvc-bug048-lateadd"
+
+	dir := t.TempDir()
+	fx := storage.NewFakeExec()
+
+	for _, vn := range []string{"00000", "00001"} {
+		fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings -o lv_name vg/%s_%s", rd, vn),
+			storage.FakeResponse{Stdout: []byte(rd + "_" + vn + "\n")})
+		fx.Expect(fmt.Sprintf("lvs --config devices { filter=['r|^/dev/drbd|','r|^/dev/zd|'] } --noheadings --separator | -o lv_path,lv_size --units k --nosuffix vg/%s_%s", rd, vn),
+			storage.FakeResponse{Stdout: []byte(fmt.Sprintf("/dev/vg/%s_%s|1048576\n", rd, vn))})
+	}
+
+	// vol-0 stamped, vol-1 has no metadata yet (the late-added volume).
+	fx.Expect("drbdadm dump-md "+rd+"/0",
+		storage.FakeResponse{Stdout: []byte("version \"v09\";\nla-size-sect 2048;\n")})
+	fx.Expect("drbdadm dump-md "+rd+"/1",
+		storage.FakeResponse{Err: errDrbdadmDumpMdNoMeta})
+	fx.Expect(fmt.Sprintf("drbdadm create-md --force --max-peers=%d %s/1", drbd.MaxPeers-1, rd),
+		storage.FakeResponse{})
+
+	// KERNEL TRUTH: only vol-0 is attached; vol-1 is absent from the device
+	// list (the genuine late-add window). AttachedVolumes returns {0}, so the
+	// gate finds vol-1 unattached and the metadata pass fires.
+	fx.Expect("drbdsetup status "+rd+" --json", storage.FakeResponse{Stdout: []byte(`[{
+	  "name":"` + rd + `","node-id":0,"role":"Secondary",
+	  "devices":[
+	    {"volume":0,"minor":1000,"disk-state":"UpToDate"}
+	  ],
+	  "connections":[{
+	    "peer-node-id":1,"name":"n2","connection-state":"Connected","peer-role":"Secondary",
+	    "peer_devices":[
+	      {"volume":0,"peer-disk-state":"UpToDate","replication-state":"Established","resync-suspended":"no"}
+	    ]
+	  }]
+	}]`)})
+
+	thin := lvm.NewThin(lvm.ThinConfig{VolumeGroup: "vg", ThinPool: "tp"}, fx)
+	rec := satellite.NewReconciler(satellite.ReconcilerConfig{
+		Providers: map[string]storage.Provider{"thin1": thin},
+		Adm:       drbd.NewAdm(fx),
+		StateDir:  dir,
+		NodeName:  "n1",
+	})
+
+	seedConvergedTwoVolumeRD(t, dir, rd)
+
+	dr := []*intent.DesiredResource{convergedTwoVolumeDR(rd)}
+
+	results, err := rec.Apply(t.Context(), dr)
+	if err != nil {
+		t.Fatalf("Apply: %v", err)
+	}
+	if len(results) != 1 || !results[0].GetOk() {
+		t.Fatalf("Apply: expected Ok=true; got results=%+v", results)
+	}
+
+	calls := fx.CommandLines()
+
+	wantCreateMD := fmt.Sprintf("drbdadm create-md --force --max-peers=%d %s/1", drbd.MaxPeers-1, rd)
+	if !slices.Contains(calls, wantCreateMD) {
+		t.Errorf("BUG-048 REGRESSED: late-added unattached vol-1 got NO metadata pass — it would come up Diskless / Inconsistent with no winner seed and wedge; want %q in %v", wantCreateMD, calls)
+	}
+
+	// The dump-md probe MUST have run for the unattached volume.
+	if !slices.Contains(calls, "drbdadm dump-md "+rd+"/1") {
+		t.Errorf("expected the per-volume dump-md probe to run for the unattached vol-1; got %v", calls)
+	}
+
+	// vol-0 (attached) must NOT be re-created (would wipe its GI + bitmap).
+	forbidden := fmt.Sprintf("drbdadm create-md --force --max-peers=%d %s/0", drbd.MaxPeers-1, rd)
+	if slices.Contains(calls, forbidden) {
+		t.Errorf("re-ran create-md on the attached vol-0 (would wipe operator-stamped metadata): %v", calls)
+	}
+}
+
+// --- shared fixtures -------------------------------------------------------
+
+// seedConvergedTwoVolumeRD writes the on-disk state a 2-volume RD settles
+// into after first activation: a .res carrying both volume blocks and the
+// .md-created marker (so dr.GetMetadataCreated()==true drives firstActivation
+// false — the post-activation reconcile shape the deadlock lived in).
+func seedConvergedTwoVolumeRD(t *testing.T, dir, rd string) {
+	t.Helper()
+
+	resBody := "resource " + rd + " {\n" +
+		"  on n1 {\n" +
+		"    volume 0 {\n    }\n" +
+		"    volume 1 {\n    }\n" +
+		"  }\n}\n"
+	if err := os.WriteFile(filepath.Join(dir, rd+".res"), []byte(resBody), 0o600); err != nil {
+		t.Fatalf("seed .res: %v", err)
+	}
+	if err := os.WriteFile(filepath.Join(dir, rd+".md-created"), nil, 0o600); err != nil {
+		t.Fatalf("seed md-created: %v", err)
+	}
+}
+
+// convergedTwoVolumeDR is the DesiredResource for a past-first-activation
+// 2-diskful RD with one peer: MetadataCreated=true, both volumes 1 GiB.
+func convergedTwoVolumeDR(rd string) *intent.DesiredResource {
+	peerID := int32(1)
+
+	return &intent.DesiredResource{
+		Name:            rd,
+		NodeName:        "n1",
+		MetadataCreated: true,
+		SkipInitialSync: skipInitTrue(),
+		Volumes: []*intent.DesiredVolume{
+			{VolumeNumber: 0, SizeKib: 1024 * 1024, StoragePool: "thin1"},
+			{VolumeNumber: 1, SizeKib: 1024 * 1024, StoragePool: "thin1"},
+		},
+		Peers: []intent.DesiredPeer{{Name: "n2", NodeID: &peerID}},
+		DrbdOptions: map[string]string{
+			"port": "7000", "node-id": "0", "address": "10.0.0.1", "minor": "1000",
+			"peer.n2.address": "10.0.0.2", "peer.n2.node-id": "1", "peer.n2.port": "7000",
+		},
+	}
+}
+
+// assertNoMetadataProbe fails the test if the reconcile issued ANY per-volume
+// `drbdadm dump-md <rd>/<n>` — the md_buffer consumer that deadlocks a resize.
+func assertNoMetadataProbe(t *testing.T, calls []string, rd string) {
+	t.Helper()
+
+	for _, line := range calls {
+		if strings.HasPrefix(line, "drbdadm dump-md "+rd+"/") {
+			t.Errorf("BUG-048 resize deadlock: per-volume metadata probe %q fired on a converged/resizing diskful reconcile — "+
+				"dump-md is an md_buffer consumer that contends with an in-flight `vd s` resize and deadlocks the cluster reboot-proof; all calls: %v", line, calls)
+		}
+	}
+}
+
+// assertNoLateAddSelfHeal fails the test if the reconcile issued any cluster-
+// wide late-add self-heal action: MintLateAddSource (disconnect +
+// new-current-uuid --clear-bitmap + adjust) or InvalidateVolume. These are
+// the cluster-wide state-change operations that contend with an in-flight
+// resize's md_buffer hold.
+func assertNoLateAddSelfHeal(t *testing.T, calls []string, rd string) {
+	t.Helper()
+
+	for _, line := range calls {
+		switch {
+		case line == "drbdadm disconnect "+rd:
+			t.Errorf("BUG-048: late-add self-heal `disconnect %s` fired on a converged/resizing reconcile (MintLateAddSource) — would contend with an in-flight resize; calls: %v", rd, calls)
+		case strings.HasPrefix(line, "drbdsetup new-current-uuid --clear-bitmap "):
+			t.Errorf("BUG-048: late-add self-heal `%s` fired on a converged/resizing reconcile (MintLateAddSource) — would contend with an in-flight resize; calls: %v", line, calls)
+		case strings.HasPrefix(line, "drbdadm invalidate "+rd):
+			t.Errorf("BUG-048: late-add resync-kick `%s` fired on a converged/resizing reconcile (InvalidateVolume) — would contend with an in-flight resize; calls: %v", line, calls)
+		}
+	}
+}

From 77f374d433e13a78a77ad82cf4fd4dca87ee0e88 Mon Sep 17 00:00:00 2001
From: Andrei Kvapil <kvapss@gmail.com>
Date: Tue, 16 Jun 2026 10:02:49 +0300
Subject: [PATCH 3/3] test(harness): BUG-048 resize-deadlock L6 cli-matrix cell
 + L7 replay (zfs-thin 2-diskful+1-diskless)

CLI-bug-fix protocol cells for the BUG-048 resize deadlock (the fix
touches DRBD resize / satellite paths).

L6 cli-matrix bug-048-resize-deadlock-zfs-thin.sh: build the forensic
shape (zfs-thin, 2 diskful + 1 diskless-client), drive 3 sequential
grows under reconcile pressure, and assert each grow COMPLETES (size
lands + diskful replicas re-reach UpToDate within a bounded window) with
NO deadlock surface (StandAlone / suspended / a diskful volume stuck
below UpToDate). With the bug the first grow stalls in a size-change
retry storm and the assert times out.

L7 replay bug048-resize.yaml: codify the operator sequence
(rd c -> vd c 1G -> 2 diskful + 1 diskless-client -> grow 1G->2G->3G)
with vd_size_kib + all_uptodate + replica_diskless convergence asserts;
the grow timeouts are deadlock discriminators, not convergence targets.

Honest scope: the full kernel md_buffer deadlock needs scale/contention
to escalate, so the DETERMINISTIC CI guard for this bug is the L1 unit
regression; these cells are best-effort stand reproduction the
orchestrator runs on a stand under load.

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
---
 .../bug-048-resize-deadlock-zfs-thin.sh       | 180 ++++++++++++++++++
 .../replay/bug048-resize.yaml                 | 145 ++++++++++++++
 2 files changed, 325 insertions(+)
 create mode 100755 tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh
 create mode 100644 tests/operator-harness/replay/bug048-resize.yaml

diff --git a/tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh b/tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh
new file mode 100755
index 00000000..fdb0cbe7
--- /dev/null
+++ b/tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh
@@ -0,0 +1,180 @@
+#!/usr/bin/env bash
+#
+# usage: bug-048-resize-deadlock-zfs-thin.sh WORK_DIR
+#
+# L6 cli-matrix cell — BUG-048 resize deadlock (P0, availability; a
+# reboot-proof cluster-wide DRBD deadlock that PR #164 shipped to main with
+# a fully-green CI, including the existing vd-resize cell).
+#
+# DEADLOCK MECHANISM: PR #164 widened the ensurePerVolumeMetadata trigger gate
+# from the .res-count hasLateAddedVolume() to the unconditional
+# dr.GetMetadataCreated(), so EVERY post-activation diskful reconcile ran a
+# per-volume `drbdadm dump-md` (an md_buffer consumer) plus the cluster-wide
+# late-add self-heals (MintLateAddSource: disconnect + new-current-uuid
+# --clear-bitmap + adjust; InvalidateVolume). During a `vd s` resize DRBD
+# holds md_buffer for the whole cluster-wide size change
+# (change_cluster_wide_device_size / drbd_determine_dev_size); the
+# per-reconcile md probe + the cluster-wide self-heals then perpetually lose
+# the cluster-wide state-change arbitration, the resize never completes,
+# md_buffer is never released, and the resource deadlocks cluster-wide,
+# reboot-proof (stand forensics: size-change retry count 10,790 vs the normal
+# 2-20, on a zfs-thin 2-diskful + 1-diskless-client RD; clean at f3515045a).
+#
+# THE FIX (pkg/satellite/reconciler.go + pkg/drbd/drbdsetup_show.go): the
+# per-volume metadata pass fires ONLY when some desired volume is NOT
+# present-and-attached in the kernel (an attached volume already has metadata
+# — the kernel cannot attach a lower disk without it — so dump-md on it is
+# pointless and is the md_buffer consumer that contends with the resize), and
+# the late-add self-heals are gated out of a resize pass.
+#
+# THE SHAPE THIS CELL REPRODUCES: zfs-thin, 2 diskful + 1 diskless-client,
+# repeated grows under reconcile pressure. Each grow MUST complete (size
+# lands + diskful replicas re-reach UpToDate) within a bounded window, and the
+# resource must NEVER enter the deadlock surface (StandAlone / suspended /
+# Failed / a diskful volume stuck below UpToDate forever).
+#
+# HONEST SCOPE: the full kernel md_buffer deadlock needs scale/contention to
+# escalate (the forensic repro took ~10k size-change retries), so the
+# DETERMINISTIC CI guard for this bug is the L1 unit regression
+# (pkg/satellite/reconciler_bug048_resize_deadlock_test.go — proven to FAIL on
+# the pre-fix gate). This L6 cell is the BEST-EFFORT stand reproduction: the
+# orchestrator runs it on a live stand under load; a clean pass here is strong
+# corroboration, a hang/timeout here is the deadlock.
+#
+# Unit pin: pkg/satellite/reconciler_bug048_resize_deadlock_test.go
+#   (no dump-md / no self-heal on a converged or resizing diskful reconcile;
+#    metadata pass still fires for a genuine unattached late-add).
+
+set -euo pipefail
+
+WORK_DIR=${1:?work_dir required}
+export KUBECONFIG="$WORK_DIR/kubeconfig"
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=lib.sh
+source "$SCRIPT_DIR/lib.sh"
+
+require_workers 3
+
+linstor_cli_setup
+
+# zfs-thin is the forensic repro substrate (the deadlock was observed on a
+# zfs-thin pool). Allow override but default to zfs-thin.
+POOL="${POOL:-zfs-thin}"
+RD="cli-matrix-bug048-resize-deadlock"
+SIZE_2G_KIB=2097152
+SIZE_3G_KIB=3145728
+SIZE_4G_KIB=4194304
+
+cleanup() {
+    [[ -n "${RD:-}" ]] && delete_rd "$RD"
+    [[ -n "${RD:-}" ]] && assert_no_orphans "$RD"
+}
+trap 'cleanup; linstor_cli_teardown' EXIT
+
+echo "============================================================"
+echo ">> bug-048-resize-deadlock-zfs-thin :: POOL=$POOL RD=$RD"
+echo "============================================================"
+
+echo ">> pre-flight: $POOL on >=2 nodes + a 3rd worker for the diskless client"
+sp_json=$("${LCTL[@]}" --machine-readable storage-pool list --storage-pools "$POOL" 2>/dev/null || echo "[]")
+ok_nodes=$(jq -r '[.[]? | .[]? | select(.provider_kind != null) | .node_name] | unique | length' <<<"$sp_json" 2>/dev/null || echo 0)
+if (( ok_nodes < 2 )); then
+    echo "SKIP ($POOL): pool not on >=2 nodes (got $ok_nodes) — BUG-048 deadlock fixture unavailable"
+    exit 0
+fi
+
+echo ">> rd c + vd c 1G + r c --auto-place=2 -s $POOL (2 diskful replicas)"
+"${LCTL[@]}" resource-definition create "$RD" >/dev/null
+"${LCTL[@]}" volume-definition create "$RD" 1G >/dev/null
+"${LCTL[@]}" resource create --auto-place=2 --storage-pool="$POOL" "$RD" >/dev/null
+
+# Resolve the two diskful replicas.
+deadline=$(( $(date +%s) + 90 ))
+diskful=()
+while (( $(date +%s) < deadline )); do
+    mapfile -t diskful < <(linstor_diskful_nodes "$RD")
+    if (( ${#diskful[@]} >= 2 )); then break; fi
+    sleep 2
+done
+if (( ${#diskful[@]} < 2 )); then
+    echo "FAIL: autoplace did not stage 2 diskful replicas (got ${#diskful[@]})" >&2
+    exit 1
+fi
+N1="${diskful[0]}"
+N2="${diskful[1]}"
+echo ">> diskful replicas: N1=$N1 N2=$N2"
+wait_uptodate "$RD" "$N1" "$N2"
+
+# Add the diskless CLIENT on a 3rd node — the exact forensic shape
+# (2-diskful + 1-diskless-client). Reuse an auto-placed TieBreaker if present.
+DLNODE="$(linstor_tiebreaker_node "$RD" || true)"
+if [[ -z "$DLNODE" ]]; then
+    DLNODE="$(linstor_pick_free_node "$RD" "$N1" "$N2" || true)"
+    if [[ -z "$DLNODE" ]]; then
+        echo "SKIP: no free 3rd worker for the diskless client" >&2
+        exit 0
+    fi
+    echo ">> adding diskless client on $DLNODE"
+    "${LCTL[@]}" resource create "$DLNODE" "$RD" --drbd-diskless >/dev/null \
+        || { echo "FAIL: r c $DLNODE $RD --drbd-diskless" >&2; exit 1; }
+else
+    echo ">> reusing auto-placed TieBreaker on $DLNODE as the diskless client"
+fi
+
+# Deadlock surface probe: a deadlocked RD wedges the kernel connection
+# StandAlone or suspends I/O (suspended:*), and a diskful volume sticks below
+# UpToDate forever. This helper returns non-zero (and prints) if ANY diskful
+# replica is in the deadlock surface — used as a fast-fail after each grow.
+assert_no_deadlock_surface() {
+    local node st
+    for node in "$N1" "$N2"; do
+        st=$(on_node "$node" drbdadm status "$RD" 2>&1 || true)
+        if grep -Eiq 'StandAlone|suspended:[^n]|connection:Timeout|connection:BrokenPipe' <<<"$st"; then
+            echo "FAIL (BUG-048 deadlock surface): $node shows a wedged/suspended connection for $RD" >&2
+            echo "$st" >&2
+            return 1
+        fi
+    done
+    return 0
+}
+
+# Drive a sequence of grows. Each grow exercises the cluster-wide size change
+# (md_buffer hold) under the per-reconcile pressure that PR #164 introduced.
+# With the bug, the FIRST grow against this 2-diskful+1-diskless shape stalls
+# (size-change retry storm) and the assert below times out.
+grow_to() {
+    local size_arg=$1 expect_kib=$2
+    echo ">> linstor vd s $RD 0 $size_arg (grow under reconcile pressure)"
+    "${LCTL[@]}" volume-definition set-size "$RD" 0 "$size_arg" >/dev/null
+
+    echo "   assert vd size == $expect_kib KiB (resize COMPLETED, md_buffer released)"
+    # Bug => this never lands (resize wedged mid change_cluster_wide_device_size).
+    wait_vd_size "$RD" 0 "$expect_kib" 180
+
+    echo "   assert both diskful replicas re-reached UpToDate after grow"
+    wait_uptodate "$RD" "$N1" "$N2"
+
+    echo "   assert no deadlock surface (StandAlone / suspended) on either diskful replica"
+    assert_no_deadlock_surface
+
+    # The diskless client must not have grown a backing dataset (U48 guard,
+    # asserted here as a free side-check) and must not have wedged.
+    local dl_state
+    dl_state="$(status_disk_state "$RD" "$DLNODE" 0)"
+    case "$dl_state" in
+        Failed|Inconsistent|Attaching|Detaching|Negotiating|DUnknown)
+            echo "FAIL: diskless client $DLNODE wedged after grow (state=$dl_state)" >&2
+            return 1 ;;
+    esac
+    echo "   grow to $expect_kib KiB COMPLETE (no deadlock)"
+}
+
+grow_to 2G "$SIZE_2G_KIB"
+grow_to 3G "$SIZE_3G_KIB"
+grow_to 4G "$SIZE_4G_KIB"
+
+echo ">> bug-048-resize-deadlock-zfs-thin OK (3 sequential grows on a 2-diskful+1-diskless-client zfs-thin RD all completed; no md_buffer deadlock, no StandAlone/suspended surface)"
+cleanup
+trap 'linstor_cli_teardown' EXIT
+echo ">> bug-048-resize-deadlock-zfs-thin COMPLETE"
diff --git a/tests/operator-harness/replay/bug048-resize.yaml b/tests/operator-harness/replay/bug048-resize.yaml
new file mode 100644
index 00000000..178f10ff
--- /dev/null
+++ b/tests/operator-harness/replay/bug048-resize.yaml
@@ -0,0 +1,145 @@
+name: bug048-resize
+description: |
+  BUG-048 resize deadlock (P0, availability) — a `vd s` resize on a
+  2-diskful + 1-diskless-client RD must COMPLETE; PR #164's widened
+  ensurePerVolumeMetadata gate (dr.GetMetadataCreated()) ran a per-volume
+  `drbdadm dump-md` (md_buffer consumer) plus the cluster-wide late-add
+  self-heals on EVERY post-activation reconcile. During the resize DRBD
+  holds md_buffer for the cluster-wide size change
+  (change_cluster_wide_device_size); the per-reconcile md probe + self-heals
+  perpetually lose the state-change arbitration, the resize never completes,
+  md_buffer is never released, and the resource deadlocks cluster-wide,
+  reboot-proof (forensic repro: size-change retry count 10,790 vs 2-20
+  normal, zfs-thin 2-diskful + 1-diskless-client).
+
+  Operator sequence:
+    rd c → vd c 1G → r c node1 + r c node2 (diskful) →
+    r c --drbd-diskless node3 → grow 1G→2G→3G
+
+  CLI-side contract (each grow):
+    - the grow returns 0 AND vd size reflects the new size within a bounded
+      window (with the bug it never lands — the resize wedges)
+    - every replica reconverges (all_uptodate tolerates the Diskless peer)
+    - the diskless client STAYS Diskless across the resize, never wedges
+
+  HONEST SCOPE: the full kernel md_buffer deadlock needs scale/contention to
+  escalate, so the DETERMINISTIC CI guard for this bug is the L1 unit
+  regression (pkg/satellite/reconciler_bug048_resize_deadlock_test.go, proven
+  to FAIL on the pre-fix gate). This replay is a BEST-EFFORT stand
+  reproduction — the orchestrator runs it under load; a hang/timeout here is
+  the deadlock, a clean pass is strong corroboration. The kernel-truth
+  no-StandAlone/suspended surface check lives in the L6 cli-matrix cell
+  tests/e2e/cli-matrix/bug-048-resize-deadlock-zfs-thin.sh.
+
+  If this YAML goes red, no fix in:
+    - pkg/satellite/reconciler.go (ensurePerVolumeMetadata gate /
+      maybePromoteSelfHeals resize guard)
+    - pkg/drbd/drbdsetup_show.go (AttachedVolumes)
+  may be claimed closed.
+
+prerequisites:
+  min_nodes: 3
+  storage_pool: stand
+
+vars:
+  sp: stand
+
+steps:
+  - name: create-rd
+    cmd: ["resource-definition", "create", "{{rd}}"]
+    expect_exit: 0
+  - name: create-vd-1g
+    cmd: ["volume-definition", "create", "{{rd}}", "1G"]
+    expect_exit: 0
+    await:
+      kind: vd_size_kib
+      rd: "{{rd}}"
+      vol: 0
+      expected_kib: 1048576
+      timeout_s: 30
+  - name: r-c-node1-diskful
+    cmd: ["resource", "create", "{{node1}}", "{{rd}}", "--storage-pool", "{{sp}}"]
+    expect_exit: 0
+  - name: r-c-node2-diskful
+    cmd: ["resource", "create", "{{node2}}", "{{rd}}", "--storage-pool", "{{sp}}"]
+    expect_exit: 0
+  - name: wait-2-diskful-uptodate
+    cmd: ["resource", "list", "--resources", "{{rd}}"]
+    expect_exit: 0
+    await:
+      kind: all_uptodate
+      rd: "{{rd}}"
+      # 1G full sync to a fresh peer at ~4 MB/s on this stand => minutes.
+      timeout_s: 600
+  - name: r-c-node3-diskless
+    # node + rd are positional FIRST, then the flag — the linstor 1.27.1
+    # client parses `r c --drbd-diskless <rd> <node>` as <rd>=<node>.
+    cmd: ["resource", "create", "{{node3}}", "{{rd}}", "--drbd-diskless"]
+    expect_exit: 0
+    await:
+      kind: replica_diskless
+      rd: "{{rd}}"
+      node: "{{node3}}"
+      timeout_s: 120
+
+  # ---- Grow 1G → 2G (the deadlock window: cluster-wide size change under
+  #      the per-reconcile pressure PR #164 introduced) ----
+  - name: grow-2g
+    cmd: ["volume-definition", "set-size", "{{rd}}", "0", "2G"]
+    expect_exit: 0
+    await:
+      # Bug => this never lands (resize wedged mid-change_cluster_wide_device_size,
+      # md_buffer never released). 180s is the deadlock discriminator, NOT a
+      # convergence target — a healthy grow lands in seconds.
+      kind: vd_size_kib
+      rd: "{{rd}}"
+      vol: 0
+      expected_kib: 2097152
+      timeout_s: 180
+  - name: wait-uptodate-after-2g
+    cmd: ["resource", "list", "--resources", "{{rd}}"]
+    expect_exit: 0
+    await:
+      kind: all_uptodate
+      rd: "{{rd}}"
+      timeout_s: 600
+
+  # ---- Grow 2G → 3G (second cluster-wide size change; the bug's retry
+  #      storm compounds across successive resizes) ----
+  - name: grow-3g
+    cmd: ["volume-definition", "set-size", "{{rd}}", "0", "3G"]
+    expect_exit: 0
+    await:
+      kind: vd_size_kib
+      rd: "{{rd}}"
+      vol: 0
+      expected_kib: 3145728
+      timeout_s: 180
+  - name: wait-uptodate-after-3g
+    cmd: ["resource", "list", "--resources", "{{rd}}"]
+    expect_exit: 0
+    await:
+      kind: all_uptodate
+      rd: "{{rd}}"
+      timeout_s: 600
+  - name: diskless-client-still-diskless-not-wedged
+    cmd: ["resource", "list", "--resources", "{{rd}}"]
+    expect_exit: 0
+    await:
+      # After both grows the diskless client is STILL Diskless — a deadlock
+      # / resize fan-out would have wedged it to Failed/Inconsistent.
+      kind: replica_diskless
+      rd: "{{rd}}"
+      node: "{{node3}}"
+      timeout_s: 60
+
+teardown:
+  - cmd: ["resource-definition", "delete", "{{rd}}"]
+    expect_exit: 0
+    await:
+      kind: rd_absent
+      rd: "{{rd}}"
+      timeout_s: 60
+
+invariants:
+  - no_orphans