Skip to content

Commit bb2461a

Browse files
roachtest/perturbation: reduce IO pressure and pre-balance for restart (#171103)
roachtest/perturbation: reduce IO pressure and pre-balance for restart
2 parents 78450ef + 7e08f8c commit bb2461a

2 files changed

Lines changed: 21 additions & 0 deletions

File tree

pkg/cmd/roachtest/tests/perturbation/framework.go

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -987,6 +987,14 @@ func (v variations) runTest(ctx context.Context, t test.Test, c cluster.Cluster)
987987

988988
// Collect the baseline after the workload has stabilized.
989989
baselineInterval := intervalSince(v.validationDuration / 2)
990+
991+
// Let any in-flight rebalancing settle before perturbing. The fill phase
992+
// can leave per-store range counts noticeably skewed: when the perturbed
993+
// node returns and the cluster tries to rebalance onto its underfull
994+
// stores, the snapshot storm concentrates on one store and can push it
995+
// into a Pebble write stall.
996+
v.waitForRebalanceToStop(ctx, t)
997+
990998
// Now start the perturbation.
991999
t.Status("T3: inducing perturbation")
9921000
perturbationDuration := v.perturbation.startPerturbation(ctx, t, v)

pkg/cmd/roachtest/tests/perturbation/restart_node.go

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,19 @@ func (r restart) setup() variations {
2727
r.cleanRestart = true
2828
v := setup(r, defaultThresholds())
2929

30+
// Run with extra IO headroom relative to the cluster-wide default. When the
31+
// target node returns it must absorb raft catch-up, rebalancing snapshots,
32+
// and re-acquired lease traffic concurrently; at the default 0.5 the
33+
// recovered store sits at the edge of its sustainable write rate and the
34+
// flush/compaction pipeline can fall behind into a Pebble write stall.
35+
v.ratioOfMax = 0.3
36+
37+
// Scatter the workload table at init so initial replica placement isn't
38+
// skewed across the per-node stores. A pre-existing per-store imbalance
39+
// concentrates the recovery snapshot storm on whichever store appears
40+
// underfull, which is what causes the stall described above.
41+
v.scatter = true
42+
3043
// TODO(baptist): Remove this setting once #120073 is fixed.
3144
v.clusterSettings["kv.lease.reject_on_leader_unknown.enabled"] = "true"
3245

0 commit comments

Comments
 (0)