roachtest/perturbation: reduce IO pressure and pre-balance for restart (#171103)

trunk-io[bot] · web-flow · commit bb2461a7edec · 2026-06-02T10:33:05.000Z
roachtest/perturbation: reduce IO pressure and pre-balance for restart
diff --git a/pkg/cmd/roachtest/tests/perturbation/framework.go b/pkg/cmd/roachtest/tests/perturbation/framework.go
@@ -987,6 +987,14 @@ func (v variations) runTest(ctx context.Context, t test.Test, c cluster.Cluster)
 
 	// Collect the baseline after the workload has stabilized.
 	baselineInterval := intervalSince(v.validationDuration / 2)
+
+	// Let any in-flight rebalancing settle before perturbing. The fill phase
+	// can leave per-store range counts noticeably skewed: when the perturbed
+	// node returns and the cluster tries to rebalance to its underfull
+	// stores, the snapshot storm concentrates on one store and can push it
+	// into a Pebble write stall.
+	v.waitForRebalanceToStop(ctx, t)
+
 	// Now start the perturbation.
 	t.Status("T3: inducing perturbation")
 	perturbationDuration := v.perturbation.startPerturbation(ctx, t, v)
diff --git a/pkg/cmd/roachtest/tests/perturbation/restart_node.go b/pkg/cmd/roachtest/tests/perturbation/restart_node.go
@@ -27,6 +27,19 @@ func (r restart) setup() variations {
 	r.cleanRestart = true
 	v := setup(r, defaultThresholds())
 
+	// Run with extra IO headroom relative to the cluster-wide default. When the
+	// target node returns it must absorb raft catch-up, rebalancing snapshots,
+	// and re-acquired lease traffic concurrently; at the default ratioOfMax of
+	// 0.5 the recovered store sits at the edge of its sustainable write rate and
+	// the flush/compaction pipeline can fall behind into a Pebble write stall.
+	v.ratioOfMax = 0.3
+
+	// Scatter the workload table at init so initial replica placement isn't
+	// skewed across the per-node stores. A pre-existing per-store imbalance
+	// concentrates the recovery snapshot storm on whichever store appears
+	// underfull, which is what causes the stall described above.
+	v.scatter = true
+
 	// TODO(baptist): Remove this setting once #120073 is fixed.
 	v.clusterSettings["kv.lease.reject_on_leader_unknown.enabled"] = "true"