fix(e2e): poll kernel ground truth in state-auto-resync 3-peer wait

kvaps · claude · kvaps · commit e9e2ba9c0ca2 · 2026-05-27T14:28:06.000+02:00
state-auto-resync's local wait_uptodate_3 polled only status_disk_state, which reads the CRD .status projection. On a busy CI stand that projection lags tens of seconds behind the kernel, so the 240s wait timed out while the post-fail drbdsetup dump showed all three peers already UpToDate — the resource had converged, only the projection hadn't surfaced. This is the recurring projection-lag flake class, not the SkipInitialSync gate: the controller stamps SkipInitialSync in the same allocation pass as the node-id/port (when the RD is observable, as it is for a normal multi-replica deploy), and the satellite gate uses the existing bounded 5s requeue, so the gate adds no minute-scale latency. Add kernel_all_uptodate (an N-peer generalisation of the existing kernel_pair_uptodate) and accept it as an additional pass in wait_uptodate_3, mirroring the kernel-fallback lib.sh's wait_uptodate gained in c635627. It only ADDS an accept path: a genuinely non-converged RD still reads non-UpToDate, and a lone node with no peers can't falsely pass, so real failures are not masked. Co-Authored-By: Claude <noreply@anthropic.com> Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
diff --git a/tests/e2e/lib.sh b/tests/e2e/lib.sh
@@ -190,6 +190,31 @@ kernel_pair_uptodate() {
         2>/dev/null || true
 }
 
+# kernel_all_uptodate <rd> <node> [vol] [min_peers] — kernel ground truth
+# for an N-peer RD, read from `drbdsetup status <rd> --json` on `node`,
+# independent of the controller's CRD .status projection (the N-peer
+# generalisation of kernel_pair_uptodate). Prints "ok" iff `node`'s own
+# disk-state for volume `vol` (default 0) is UpToDate, it has at least
+# `min_peers` connections (default 1: a lone node can't prove its peers),
+# and EVERY connection reports an UpToDate peer-disk-state for `vol`; a
+# connection with no peer device for `vol` yet counts as not converged.
+# Prints "no" when the frame parses but is not converged, and nothing on
+# empty output / parse failure (node unreachable); callers compare
+# against "ok" and keep waiting. Pass min_peers=2 for a 3-replica RD.
+kernel_all_uptodate() {
+    local rd=$1 node=$2 vol=${3:-0} min_peers=${4:-1}
+    on_node "$node" drbdsetup status "$rd" --json 2>/dev/null | jq -r \
+        --argjson v "$vol" --argjson m "$min_peers" '
+        ([.[0].devices[]? | select(.volume==$v) | ."disk-state"] | first) as $loc
+        | [.[0].connections[]?
+            | [.peer_devices[]? | select(.volume==$v) | ."peer-disk-state"]] as $conns
+        | if ($loc=="UpToDate"
+              and ($conns | length) >= $m
+              and ($conns | all(length > 0 and all(. == "UpToDate"))))
+          then "ok" else "no" end' \
+        2>/dev/null || true
+}
+
 # status_connection_state <rd> <node> <peer> — full kernel connection
 # state string as observed FROM `node` TOWARD `peer`: Connected /
 # Connecting / StandAlone / BrokenPipe / NetworkFailure / Timeout /
diff --git a/tests/e2e/state-auto-resync.sh b/tests/e2e/state-auto-resync.sh
@@ -117,6 +117,19 @@ trap 'cleanup_partition; delete_rd "$RD"' EXIT
 # 5.15 scenario needs all three diskful rows UpToDate before
 # we start poking the kernel; otherwise the disconnect would
 # race the initial-sync the autoplace just kicked off.
+#
+# Accepts kernel ground truth as well as the CRD projection. The
+# CRD .status projection (status_disk_state) can lag tens of seconds
+# behind the kernel on a busy CI stand — observed here as a 240s
+# timeout whose post-fail drbdsetup dump showed all three peers
+# already UpToDate (the row had converged; only the projection hadn't
+# surfaced). kernel_all_uptodate reads `drbdsetup status --json` on
+# $N1 directly: one frame reports $N1's local disk-state plus the
+# peer-disk-state of $N2 and $N3, so it proves all three replicas
+# from kernel truth. This mirrors the kernel fallback that lib.sh's
+# wait_uptodate gained in c63562707 and only ADDS an accept path —
+# a genuinely non-converged RD still shows non-UpToDate, so real
+# failures are not masked.
 wait_uptodate_3() {
     local rd=$1 deadline=$(( $(date +%s) + 240 ))
     while (( $(date +%s) < deadline )); do
@@ -130,9 +143,16 @@ wait_uptodate_3() {
             fi
         done
         if (( ok == 1 )); then return 0; fi
+
+        # Kernel ground truth: independent of the CRD projection lag.
+        if [[ "$(kernel_all_uptodate "$rd" "$N1")" == "ok" ]]; then
+            return 0
+        fi
+
         sleep 2
     done
     echo "FAIL: $rd never reached UpToDate on all 3 peers" >&2
+    echo "   last CRD diskState: $N1=$(status_disk_state "$rd" "$N1") $N2=$(status_disk_state "$rd" "$N2") $N3=$(status_disk_state "$rd" "$N3")" >&2
     on_node "$N1" drbdsetup status "$rd" 2>/dev/null || true
     return 1
 }