Skip to content

Commit c50ede1

Browse files
kvaps and claude authored
fix(e2e): resolve DRBD devices via drbdadm sh-dev in remaining cli-matrix cells (#141)
Migrate the 9 remaining cli-matrix cells that still resolved DRBD devices through the /dev/drbd/by-res symlink (readlink or direct dd) to the shared resolve_drbd_device helper from lib.sh, the same way the snap-restore cell was fixed. The by-res symlink is not reliably present in the satellite mount namespace, so those resolution sites fail or silently no-op on the stand. Assertions are unchanged; only the resolver mechanics moved. The last-resort /dev/drbd* minor-enumeration fallbacks are kept where they already existed.

Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 04aacdd commit c50ede1

9 files changed

Lines changed: 73 additions & 20 deletions

tests/e2e/cli-matrix/r-activate-deactivate-lifecycle.sh

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@
5050
#
5151
# 2. IO during INACTIVE:
5252
# - Promote N1 to Primary if not already
53-
# - Write ~64 MiB random pattern to /dev/drbd/by-res/<RD>/0
53+
# - Write ~64 MiB random pattern to the RD's /dev/drbdN device
5454
# - Must complete without quorum block (single-replica quorum
5555
# override). Capture md5 of pattern.
5656
#
@@ -153,9 +153,14 @@ wait_role "$RD" "$N1" "Primary" 30 \
153153
# written BEFORE the first deactivate so both replicas hold the
154154
# same bytes and the GI baseline is established.
155155
echo ">> seed initial ${IO_MIB} MiB pattern on $N1 (will be the md5 anchor)"
156+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
157+
# /dev/drbd/by-res symlink is not reliably present in the satellite
158+
# mount namespace, so readlink-based resolution aborts on the stand.
159+
# Last-resort minor enumeration kept for stands where sh-dev fails.
160+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
156161
on_node "$N1" bash -c "
157162
set -e
158-
dev=\$(readlink -f /dev/drbd/by-res/${RD}/0 2>/dev/null || true)
163+
dev='$dev'
159164
if [ -z \"\$dev\" ]; then
160165
dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
161166
fi
@@ -289,9 +294,11 @@ for cycle in 1 2 3; do
289294
# the sole voter and DRBD's single-replica quorum override must
290295
# let writes through. A failure here = quorum frame is wrong on
291296
# INACTIVE peers.
297+
# Same portable resolver as the seeding step (sh-dev, not by-res).
298+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
292299
io_out=$(on_node "$N1" bash -c "
293300
set -e
294-
dev=\$(readlink -f /dev/drbd/by-res/${RD}/0 2>/dev/null || true)
301+
dev='$dev'
295302
if [ -z \"\$dev\" ]; then
296303
dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
297304
fi
@@ -317,8 +324,9 @@ for cycle in 1 2 3; do
317324
# — the deactivate path must not have corrupted it. (md5 of the
318325
# FIRST ${IO_MIB} MiB of the device, since the seed lives at
319326
# offset 0 and the new write was at offset ${IO_MIB} MiB.)
327+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
320328
seed_md5_now=$(on_node "$N1" bash -c "
321-
dev=\$(readlink -f /dev/drbd/by-res/${RD}/0 2>/dev/null || true)
329+
dev='$dev'
322330
if [ -z \"\$dev\" ]; then
323331
dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
324332
fi
@@ -424,8 +432,9 @@ for cycle in 1 2 3; do
424432
# the pre-deact pattern. The partial-sync handshake must not
425433
# have written stale bytes onto the surviving replica's data.
426434
echo ">> md5 of seed on $N1 still matches pre-deact pattern"
435+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
427436
seed_md5_after=$(on_node "$N1" bash -c "
428-
dev=\$(readlink -f /dev/drbd/by-res/${RD}/0 2>/dev/null || true)
437+
dev='$dev'
429438
if [ -z \"\$dev\" ]; then
430439
dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
431440
fi

tests/e2e/cli-matrix/r-c-over-tiebreaker-skip-sync.sh

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -224,7 +224,11 @@ fi
224224
# Write 32 MiB; secondary will need to catch up. With skip-sync
225225
# the catch-up is fast but still passes through SyncSource on
226226
# the source side per upstream events2 semantics.
227-
on_node "$prim" bash -c "dd if=/dev/urandom of=/dev/drbd/by-res/$RD/0 bs=1M count=32 status=none oflag=direct 2>/dev/null" || true
227+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
228+
# /dev/drbd/by-res symlink is not reliably present in the satellite
229+
# mount namespace, so the by-res dd silently no-ops on the stand.
230+
dev=$(resolve_drbd_device "$prim" "$RD" 0 2>/dev/null) || dev=""
231+
[ -n "$dev" ] && on_node "$prim" bash -c "dd if=/dev/urandom of=$dev bs=1M count=32 status=none oflag=direct 2>/dev/null" || true
228232

229233
# Capture wire-shape for ~10s post-mutation.
230234
shape_ok=false

tests/e2e/cli-matrix/r-d-last-uptodate-midsync-rejected.sh

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,11 @@ echo ">> Phase 2: write data on $N1 so the second replica must really resync"
8282
# Primary + dd a chunk to bump the GI; a fresh empty volume could
8383
# skip-sync and erase the SyncTarget window.
8484
on_node "$N1" bash -c "drbdadm primary --force $RD 2>/dev/null" || true
85-
on_node "$N1" bash -c "dd if=/dev/urandom of=/dev/drbd/by-res/$RD/0 bs=1M count=256 status=none oflag=direct 2>/dev/null" || true
85+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
86+
# /dev/drbd/by-res symlink is not reliably present in the satellite
87+
# mount namespace, so the by-res dd silently no-ops on the stand.
88+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
89+
[ -n "$dev" ] && on_node "$N1" bash -c "dd if=/dev/urandom of=$dev bs=1M count=256 status=none oflag=direct 2>/dev/null" || true
8690
on_node "$N1" bash -c "drbdadm secondary $RD 2>/dev/null" || true
8791

8892
echo ">> Phase 3: throttle resync (c-max-rate 1024 KiB/s) so the add stays SyncTarget"

tests/e2e/cli-matrix/snap-create-multiple-group-consistency.sh

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -114,9 +114,14 @@ on_node "$N1" drbdadm primary --force "$RD_B" 2>/dev/null || true
114114
# different counter values whenever the writer made progress
115115
# between the two per-RD snap calls.
116116
echo ">> start cross-RD correlated writer on $N1 (counter into rd-a + rd-b)"
117+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
118+
# /dev/drbd/by-res symlink is not reliably present in the satellite
119+
# mount namespace, so readlink-based resolution aborts on the stand.
120+
dev_a=$(resolve_drbd_device "$N1" "$RD_A" 0 2>/dev/null) || dev_a=""
121+
dev_b=$(resolve_drbd_device "$N1" "$RD_B" 0 2>/dev/null) || dev_b=""
117122
on_node "$N1" bash -c "
118-
dev_a=\$(readlink -f /dev/drbd/by-res/$RD_A/0 2>/dev/null || true)
119-
dev_b=\$(readlink -f /dev/drbd/by-res/$RD_B/0 2>/dev/null || true)
123+
dev_a='$dev_a'
124+
dev_b='$dev_b'
120125
if [ -z \"\$dev_a\" ] || [ -z \"\$dev_b\" ]; then
121126
echo 'note: could not resolve drbd device paths'
122127
exit 0

tests/e2e/cli-matrix/snap-create-multiple-lifecycle.sh

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -119,10 +119,16 @@ done
119119
# Phase 2: cross-RD correlated writer (counter into bytes 0-7 of all 3)
120120
# =====================================================================
121121
echo ">> Phase 2: start cross-RD correlated writer on $N1"
122+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
123+
# /dev/drbd/by-res symlink is not reliably present in the satellite
124+
# mount namespace, so readlink-based resolution aborts on the stand.
125+
dev_a=$(resolve_drbd_device "$N1" "$RD_A" 0 2>/dev/null) || dev_a=""
126+
dev_b=$(resolve_drbd_device "$N1" "$RD_B" 0 2>/dev/null) || dev_b=""
127+
dev_c=$(resolve_drbd_device "$N1" "$RD_C" 0 2>/dev/null) || dev_c=""
122128
on_node "$N1" bash -c "
123-
dev_a=\$(readlink -f /dev/drbd/by-res/$RD_A/0 2>/dev/null || true)
124-
dev_b=\$(readlink -f /dev/drbd/by-res/$RD_B/0 2>/dev/null || true)
125-
dev_c=\$(readlink -f /dev/drbd/by-res/$RD_C/0 2>/dev/null || true)
129+
dev_a='$dev_a'
130+
dev_b='$dev_b'
131+
dev_c='$dev_c'
126132
if [ -z \"\$dev_a\" ] || [ -z \"\$dev_b\" ] || [ -z \"\$dev_c\" ]; then
127133
echo 'note: could not resolve all 3 drbd device paths'
128134
exit 0

tests/e2e/cli-matrix/snap-cross-node-consistency.sh

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -91,9 +91,14 @@ on_node "$N1" drbdadm primary --force "$RD" 2>/dev/null || true
9191
# capture (one snap took data at byte N, the other at byte N+delta)
9292
# yields visibly different md5.
9393
echo ">> seed deterministic 256 MiB pattern on $N1's DRBD device"
94+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
95+
# /dev/drbd/by-res symlink is not reliably present in the satellite
96+
# mount namespace, so readlink-based resolution aborts on the stand.
97+
# Last-resort minor enumeration kept for stands where sh-dev fails.
98+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
9499
on_node "$N1" bash -c "
95100
set -e
96-
dev=\$(readlink -f /dev/drbd/by-res/$RD/0 2>/dev/null || true)
101+
dev='$dev'
97102
if [ -z \"\$dev\" ]; then
98103
dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
99104
fi
@@ -111,8 +116,10 @@ wait_uptodate "$RD" "$N1" "$N2"
111116
# AFTER replica $N1 already finished its snapshot — and the two
112117
# resulting snapshots reflect that delta.
113118
echo ">> start continuous writer on $N1 (urandom → DRBD device)"
119+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
114120
on_node "$N1" bash -c "
115-
dev=\$(readlink -f /dev/drbd/by-res/$RD/0 2>/dev/null || ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
121+
dev='$dev'
122+
[ -n \"\$dev\" ] || dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
116123
while true; do
117124
dd if=/dev/urandom of=\$dev bs=4K count=128 oflag=direct status=none 2>/dev/null || break
118125
done >/tmp/cli-matrix-snap-writer.log 2>&1 &

tests/e2e/cli-matrix/snap-full-lifecycle.sh

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -115,9 +115,14 @@ on_node "$N1" drbdadm primary --force "$RD" 2>/dev/null || true
115115
# We therefore read the device's actual byte size and feed dd via
116116
# count_bytes so the write stops exactly at the DRBD boundary.
117117
echo ">> Phase 2: seed deterministic random pattern on $N1 (DRBD-fit bytes), then start writer"
118+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
119+
# /dev/drbd/by-res symlink is not reliably present in the satellite
120+
# mount namespace, so readlink-based resolution aborts on the stand.
121+
# Last-resort minor enumeration kept for stands where sh-dev fails.
122+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
118123
seed_out=$(on_node "$N1" bash -c "
119124
set -e
120-
dev=\$(readlink -f /dev/drbd/by-res/$RD/0 2>/dev/null || true)
125+
dev='$dev'
121126
if [ -z \"\$dev\" ]; then
122127
dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
123128
fi
@@ -136,8 +141,10 @@ echo "$seed_out"
136141
wait_uptodate "$RD" "$N1" "$N2"
137142

138143
echo ">> Phase 2: start continuous writer on $N1"
144+
dev=$(resolve_drbd_device "$N1" "$RD" 0 2>/dev/null) || dev=""
139145
on_node "$N1" bash -c "
140-
dev=\$(readlink -f /dev/drbd/by-res/$RD/0 2>/dev/null || ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
146+
dev='$dev'
147+
[ -n \"\$dev\" ] || dev=\$(ls -1 /dev/drbd* 2>/dev/null | grep -vE 'by-(res|disk)' | head -1)
141148
while true; do
142149
dd if=/dev/urandom of=\$dev bs=4K count=128 oflag=direct status=none 2>/dev/null || break
143150
done >/tmp/cli-matrix-snap-lifecycle-writer.log 2>&1 &

tests/e2e/cli-matrix/snap-r-rst-stamps-resources.sh

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,8 +101,12 @@ wait_uptodate "$SRC" "$N1" "$N2"
101101
# call and asserts Resource CRDs are stamped).
102102
echo ">> seed deterministic pattern on $N1 $SRC"
103103
on_node "$N1" drbdadm primary --force "$SRC" 2>/dev/null || true
104+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
105+
# /dev/drbd/by-res symlink is not reliably present in the satellite
106+
# mount namespace, so readlink-based resolution aborts on the stand.
107+
dev=$(resolve_drbd_device "$N1" "$SRC" 0 2>/dev/null) || dev=""
104108
on_node "$N1" bash -c "
105-
dev=\$(readlink -f /dev/drbd/by-res/$SRC/0 2>/dev/null || true)
109+
dev='$dev'
106110
if [ -n \"\$dev\" ]; then
107111
printf 'BLOCKSTOR-BUG354-MARKER' | dd of=\"\$dev\" bs=1 count=24 conv=fsync status=none
108112
fi
@@ -212,10 +216,13 @@ fi
212216

213217
# ---- Bonus: read the marker on the restored replica ----------------------
214218
echo ">> bonus assert: marker bytes restored from snapshot on $N1 $TGT"
219+
# Same portable resolver as the seeding step: by-res symlinks are
220+
# not reliably present in the satellite mount namespace.
221+
dev=$(resolve_drbd_device "$N1" "$TGT" 0 2>/dev/null) || dev=""
215222
marker_read=$(on_node "$N1" bash -c "
216223
on_node_drbdadm() { drbdadm primary --force \$1 2>/dev/null; }
217224
on_node_drbdadm $TGT
218-
dev=\$(readlink -f /dev/drbd/by-res/$TGT/0 2>/dev/null || true)
225+
dev='$dev'
219226
if [ -n \"\$dev\" ]; then
220227
head -c 24 \"\$dev\" 2>/dev/null
221228
fi

tests/e2e/cli-matrix/snap-suspend-resume-isolation-u138-u52.sh

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -112,10 +112,14 @@ snap_ready() {
112112
# timeout. A still-suspended device makes dd block; `timeout` then kills
113113
# it and we FAIL — that is the U138 outage signal.
114114
write_survives() {
115-
local rd=$1
115+
local rd=$1 dev
116116
on_node "$N1" drbdadm primary --force "$rd" 2>/dev/null || true
117+
# Resolve via `drbdadm sh-dev` (lib.sh resolve_drbd_device): the
118+
# /dev/drbd/by-res symlink is not reliably present in the satellite
119+
# mount namespace, so readlink-based resolution aborts on the stand.
120+
dev=$(resolve_drbd_device "$N1" "$rd" 0 2>/dev/null) || dev=""
117121
if ! on_node "$N1" bash -c "
118-
dev=\$(readlink -f /dev/drbd/by-res/$rd/0 2>/dev/null || true)
122+
dev='$dev'
119123
[ -z \"\$dev\" ] && { echo 'no drbd device node for $rd' >&2; exit 2; }
120124
timeout 20 dd if=/dev/zero of=\"\$dev\" bs=4096 count=16 oflag=direct conv=fsync status=none
121125
"; then

0 commit comments

Comments
 (0)