Skip to content

Commit d43059f

Browse files
committed
roachtest: re-enable perturbation/full tests
The following tests were previously skipped via #149662 to focus on stabilizing one test at a time:

- perturbation/full/intents
- perturbation/full/decommission
- perturbation/full/elasticWorkload
- perturbation/full/partition
- perturbation/full/slowDisk
- perturbation/full/addNode

Re-enable all of them. The metamorphic and dev variants remain unchanged.

All re-enabled tests use the lenient defaultThresholds() (1.25x throughput floor, p99/p50 disabled) for both the perturbation and recovery intervals, with one exception: the partition test isolates an entire region (4 of 12 nodes) and removes 1/3 of leaseholders, which causes foreground throughput to drop sharply (~2x) while the partition is in effect. The meaningful pass/fail signal for partition is whether the cluster returns to baseline once the partition heals, so the perturbation interval is left ungated (noImpactThresholds()) and the 1.25x floor is enforced only on the recovery interval, via the recoveryImpact field.

While here, add a comment to slowDisk explaining why the default threshold is appropriate for the full variant: with walFailover=true and 2 disks per node, raft log writes fail over to the non-throttled store and foreground throughput stays close to baseline. The lenient 1.25x floor is mainly to absorb noise from the slowLiveness leg.

Resolves: #149662
Epic: none
Release note: None
1 parent 5be101c commit d43059f

3 files changed

Lines changed: 22 additions & 12 deletions

File tree

pkg/cmd/roachtest/tests/perturbation/framework.go

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,6 @@ func register(r registry.Registry, p perturbation, skipReason string) {
419419

420420
func RegisterTests(r registry.Registry) {
421421
const notSkipped = ""
422-
const skippedByBankruptcy = "#149662"
423422

424423
register(r, restart{}, notSkipped)
425424
addLong(r, restart{})
@@ -431,17 +430,13 @@ func RegisterTests(r registry.Registry) {
431430
for _, asleep := range []bool{true, false} {
432431
register(r, splits{asleep: asleep}, notSkipped)
433432
}
434-
435-
// TODO(ssd): We skipped the majority of these tests so that we can focus on
436-
// one at a time. These are vaguely ordered by their previous pass rate
437-
// (highest first).
438-
register(r, intents{}, skippedByBankruptcy)
439-
register(r, decommission{}, skippedByBankruptcy)
440-
register(r, elasticWorkload{}, skippedByBankruptcy)
441-
register(r, partition{}, skippedByBankruptcy)
433+
register(r, intents{}, notSkipped)
434+
register(r, decommission{}, notSkipped)
435+
register(r, elasticWorkload{}, notSkipped)
436+
register(r, partition{}, notSkipped)
442437
register(r, backfill{}, notSkipped)
443-
register(r, &slowDisk{}, skippedByBankruptcy)
444-
register(r, addNode{}, skippedByBankruptcy)
438+
register(r, &slowDisk{}, notSkipped)
439+
register(r, addNode{}, notSkipped)
445440
}
446441

447442
func (v variations) makeClusterSpec() spec.ClusterSpec {

pkg/cmd/roachtest/tests/perturbation/network_partition.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,13 @@ var _ perturbation = partition{}
3131

3232
func (p partition) setup() variations {
3333
p.partitionSite = true
34-
v := setup(p, defaultThresholds())
34+
// The partition test isolates an entire region (4 of 12 nodes), removing
35+
// 1/3 of leaseholders. Foreground throughput naturally drops sharply
36+
// while the partition is in effect, and the meaningful pass/fail signal
37+
// is whether the cluster returns to baseline once the partition heals.
38+
// Skip the perturbation-interval gate; keep the default recovery gate.
39+
v := setup(p, noImpactThresholds())
40+
v.recoveryImpact = defaultThresholds()
3541
v.leaseType = registry.ExpirationLeases
3642
// TODO(baptist): Remove this setting once #120073 is fixed.
3743
v.clusterSettings["kv.lease.reject_on_leader_unknown.enabled"] = "true"

pkg/cmd/roachtest/tests/perturbation/slow_disk.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,15 @@ var _ perturbation = &slowDisk{}
3232
func (s *slowDisk) setup() variations {
3333
s.slowLiveness = true
3434
s.walFailover = true
35+
// With walFailover=true and 2 disks per node (the default for the full
36+
// variant), raft log writes fail over to the non-throttled store, so
37+
// foreground throughput is expected to stay close to baseline even
38+
// while the staller is active. Default thresholds apply to both
39+
// intervals; we keep the 1.25x floor (rather than tightening) only to
40+
// avoid flakes from the slowLiveness leg, which routes liveness
41+
// heartbeats through the slow disk. The metamorphic variant exercises
42+
// configurations where walFailover is off and throughput can drop
43+
// substantially -- those should override impact independently.
3544
return setup(s, defaultThresholds())
3645
}
3746

0 commit comments

Comments
 (0)