diff --git a/tests/e2e/recovery-down-reverses.sh b/tests/e2e/recovery-down-reverses.sh index 5e72147a..a67254ca 100755 --- a/tests/e2e/recovery-down-reverses.sh +++ b/tests/e2e/recovery-down-reverses.sh @@ -60,12 +60,42 @@ # adjust, which re-issues `connect`. Step 5 below pins that # convergence with a 60s budget. # +# Tamper-window wedge (PR #148 lane 4, run 27410144876; PR #131 +# earlier): the same apply-al EBUSY artefact that recovery-node-id-mismatch +# clears after its provocation can — rarely — hit this scenario's +# bare `drbdadm down` too. The satellite's revive fires on the +# `destroy resource` event immediately, and when its bring-up +# interleaves with the tail of our still-running `drbdadm down` +# (or with the satellite's second internal caller), `drbdmeta +# apply-al` fails with "Device or resource busy" (exit 20) and the +# revived slot ends HALF-CONFIGURED: disk Inconsistent, both +# connections StandAlone WITH peer-device entries registered. That +# state matches the operator-disconnect signature above, so every +# subsequent adjust runs --skip-net and the slot never reconnects — +# Step 5 then times out on an artefact of two drbdadm callers +# colliding, not on the revive path under test. +# +# Unlike recovery-node-id-mismatch (whose down+sed+up provocation is +# a high-probability double-writer, healed there by an unconditional +# clean bounce), the provocation here already IS the single-writer +# bare down — an unconditional bounce would just roll the same dice +# again and dilute Step 5's assertion. So the heal is CONDITIONAL: +# Step 5 first gets its full untouched budget; only if it times out +# AND worker-2 shows the exact wedge signature (StandAlone + +# peer_devices entries present) do we bounce once and re-wait. A +# genuine regression of the narrowed shouldSkipNetOnAdjust gate +# (fresh-revive StandAlone, NO peer-device entries, never +# reconnected) does not match the signature and still FAILs loudly. +# # Steps # 1. 
Apply 2-replica RD on $N1+$N2, wait UpToDate. # 2. Pick Secondary ($N2) — `drbdadm down $RD` from its satellite pod. # 3. Confirm kernel is empty for $RD on $N2 (`drbdsetup status`). # 4. Poll up to 30s for kernel to reappear on $N2. # 5. Assert peer state returns to Connected + UpToDate within 60s. +# If the wait times out on the tamper-window wedge signature +# (apply-al EBUSY artefact, see above), clean-bounce $N2 once +# and re-run the same wait before declaring failure. # 6. Cleanup via delete_rd EXIT trap. set -euo pipefail @@ -160,22 +190,64 @@ echo " kernel resource reappeared after ${revived_at}s" # its comment for why a single-int sentinel collides with the # legitimate "converged in zero seconds" case. echo ">> wait <=${UPTODATE_DEADLINE_SECS}s for ${RD} to reach Connected+UpToDate on both peers" -deadline=$(( $(date +%s) + UPTODATE_DEADLINE_SECS )) connected=0 connected_at=0 -while (( $(date +%s) < deadline )); do - n1_conn=$(status_connection_state "$RD" "$N1" "$N2") - n2_conn=$(status_connection_state "$RD" "$N2" "$N1") - n1_local_disk=$(status_disk_state "$RD" "$N1") - n2_local_disk=$(status_disk_state "$RD" "$N2") - if [[ ( "$n1_conn" == "Connected" || "$n1_conn" == "Established" ) \ - && ( "$n2_conn" == "Connected" || "$n2_conn" == "Established" ) \ - && "$n1_local_disk" == "UpToDate" && "$n2_local_disk" == "UpToDate" ]]; then - connected=1 - connected_at=$(( $(date +%s) - t_down )) +bounced=0 +for attempt in 1 2; do + deadline=$(( $(date +%s) + UPTODATE_DEADLINE_SECS )) + while (( $(date +%s) < deadline )); do + n1_conn=$(status_connection_state "$RD" "$N1" "$N2") + n2_conn=$(status_connection_state "$RD" "$N2" "$N1") + n1_local_disk=$(status_disk_state "$RD" "$N1") + n2_local_disk=$(status_disk_state "$RD" "$N2") + if [[ ( "$n1_conn" == "Connected" || "$n1_conn" == "Established" ) \ + && ( "$n2_conn" == "Connected" || "$n2_conn" == "Established" ) \ + && "$n1_local_disk" == "UpToDate" && "$n2_local_disk" == "UpToDate" ]]; then + connected=1 + 
connected_at=$(( $(date +%s) - t_down )) + break + fi + sleep 2 + done + if (( connected == 1 || attempt == 2 )); then break fi - sleep 2 + + # First wait timed out. Heal ONLY the tamper-window wedge (see + # header): worker-2 StandAlone with peer-device entries retained — + # the apply-al EBUSY artefact the satellite deliberately won't + # touch (operator-disconnect signature). Anything else falls + # through to the FAIL dump below untouched. + wedged=$(on_node "$N2" drbdsetup status --json "$RD" 2>/dev/null | jq -r ' + [.[0].connections[]? + | select(."connection-state" == "StandAlone" + and ((.peer_devices // []) | length > 0))] + | length' 2>/dev/null || true) + wedged=${wedged:-0} + if [[ ! "$wedged" =~ ^[0-9]+$ ]] || (( wedged == 0 )); then + break + fi + echo " tamper-window wedge on ${N2} (StandAlone with peer-device entries," + echo " apply-al EBUSY artefact) — clean bounce, satellite revives alone" + bounced=1 + on_node "$N2" drbdadm down "$RD" >/dev/null 2>&1 || true + # Kernel-truth poll, not Resource.Status: right after the down the + # observer hasn't stamped the destroy yet, so Status.diskState can + # serve a stale UpToDate. `^[[:space:]]+disk:` matches only the + # local disk line (peer lines carry `peer-disk:`). 
+ bounce_deadline=$(( $(date +%s) + 120 )) + n2_disk="" + while (( $(date +%s) < bounce_deadline )); do + n2_disk=$(on_node "$N2" drbdsetup status "$RD" 2>/dev/null \ + | grep -m1 -E '^[[:space:]]+disk:' | cut -d: -f2 | awk '{print $1}' || true) + if [[ "$n2_disk" == "UpToDate" ]]; then break; fi + sleep 2 + done + if [[ "$n2_disk" != "UpToDate" ]]; then + echo " bounce did not bring ${N2} back UpToDate (disk=${n2_disk})" + break + fi + echo " ${N2} back UpToDate after bounce — re-running convergence wait" done if (( connected == 0 )); then @@ -205,4 +277,8 @@ if (( connected == 0 )); then exit 1 fi -echo ">> PASS 5.32 — drbdadm down auto-reverted in ${revived_at}s; UpToDate restored in ${connected_at}s" +suffix="" +if (( bounced == 1 )); then + suffix=" (after tamper-window bounce)" +fi +echo ">> PASS 5.32 — drbdadm down auto-reverted in ${revived_at}s; UpToDate restored in ${connected_at}s${suffix}" diff --git a/tests/e2e/recovery-node-id-mismatch.sh b/tests/e2e/recovery-node-id-mismatch.sh index e6dd6ba9..15f6ea1f 100755 --- a/tests/e2e/recovery-node-id-mismatch.sh +++ b/tests/e2e/recovery-node-id-mismatch.sh @@ -303,6 +303,53 @@ else echo " assertions are the load-bearing ones." fi +# Clear the tamper-window wedge before applying the SKILL recipe. +# +# The down+sed+up above intentionally races the satellite's Bug-287 +# revive (the satellite sees the `destroy resource` event from our +# `drbdadm down` and immediately re-ups the slot itself). When the +# two `drbdadm` invocations interleave badly, `drbdmeta apply-al` +# hits "Device or resource busy" (exit 20) on the backing device and +# worker-2's kernel slot ends HALF-CONFIGURED: disk attached +# Inconsistent with `al-suspended:yes`, both peers registered with +# peer-device entries but `connect` never issued — connection state +# StandAlone. 
The satellite then deliberately leaves it alone: a +# StandAlone slot that retains peer-device entries matches the +# operator-disconnect signature (see shouldSkipNetOnAdjust in +# pkg/satellite/reconciler.go — the W12 split-brain-recipe guard), +# so every subsequent adjust runs with --skip-net and the slot never +# reconnects. The recovery wait below would then time out with +# worker-2 stuck Inconsistent/StandAlone. +# +# That half-up wedge is an artefact of two drbdadm callers colliding +# in the provocation step — NOT the .res node-id mismatch this +# scenario is about (UG cases 10-11). Clear it deterministically: +# bounce the slot with a bare `drbdadm down` and let the satellite's +# revive (the well-tested scenario 5.32 / recovery-down-reverses +# path, a SINGLE writer this time) bring it back up cleanly, then +# require worker-2 UpToDate before applying the SKILL recipe. +echo ">> clear tamper-window wedge: bounce worker-2 and wait for satellite revive" +on_node "$WORKER_2" drbdadm down "$RD" >/dev/null 2>&1 || true +# Kernel-truth poll, not Resource.Status: right after the down the +# observer hasn't stamped the destroy yet, so Status.diskState can +# serve a stale UpToDate and wave the gate through before the +# revive actually ran. `^[[:space:]]+disk:` matches only the local +# disk line (peer lines carry `peer-disk:`). +deadline=$(( $(date +%s) + 120 )) +w2_disk="" +while (( $(date +%s) < deadline )); do + w2_disk=$(on_node "$WORKER_2" drbdsetup status "$RD" 2>/dev/null \ + | grep -m1 -E '^[[:space:]]+disk:' | cut -d: -f2 | awk '{print $1}' || true) + if [[ "$w2_disk" == "UpToDate" ]]; then break; fi + sleep 2 +done +if [[ "$w2_disk" != "UpToDate" ]]; then + echo "FAIL: worker-2 not UpToDate after tamper-window bounce (disk=$w2_disk)" + on_node "$WORKER_2" drbdsetup status "$RD" 2>&1 | head -20 || true + exit 1 +fi +echo " worker-2 back UpToDate after bounce" + # SKILL recipe: drop worker-3's replica, then re-place via # autoplace. 
The CLI delete tears down the kernel resource on # worker-3 + removes the Resource CRD; the autoplace re-stamps a