From d43059fe44c03e11659e179fa34eff54be3ddef3 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Wed, 27 May 2026 18:25:35 +0200 Subject: [PATCH] roachtest: re-enable perturbation/full tests The following tests were previously skipped via #149662 to focus on stabilizing one test at a time: - perturbation/full/intents - perturbation/full/decommission - perturbation/full/elasticWorkload - perturbation/full/partition - perturbation/full/slowDisk - perturbation/full/addNode Re-enable all of them. The metamorphic and dev variants remain unchanged. All re-enabled tests use the lenient defaultThresholds() (1.25x throughput floor, p99/p50 disabled) for both the perturbation and recovery intervals, with one exception: the partition test isolates an entire region (4 of 12 nodes) and removes 1/3 of leaseholders, which causes foreground throughput to drop sharply (~2x) while the partition is in effect. The meaningful pass/fail signal for partition is whether the cluster returns to baseline once the partition heals, so the perturbation interval is left ungated (noImpactThresholds()) and the 1.25x floor is enforced only on the recovery interval, via the recoveryImpact field. While here, add a comment to slowDisk explaining why the default threshold is appropriate for the full variant: with walFailover=true and 2 disks per node, raft log writes fail over to the non-throttled store and foreground throughput stays close to baseline. The lenient 1.25x floor is mainly to absorb noise from the slowLiveness leg. 
Resolves: #149662 Epic: none Release note: None --- .../roachtest/tests/perturbation/framework.go | 17 ++++++----------- .../tests/perturbation/network_partition.go | 8 +++++++- .../roachtest/tests/perturbation/slow_disk.go | 9 +++++++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pkg/cmd/roachtest/tests/perturbation/framework.go b/pkg/cmd/roachtest/tests/perturbation/framework.go index e6e7e0087583..845ffce63d3e 100644 --- a/pkg/cmd/roachtest/tests/perturbation/framework.go +++ b/pkg/cmd/roachtest/tests/perturbation/framework.go @@ -419,7 +419,6 @@ func register(r registry.Registry, p perturbation, skipReason string) { func RegisterTests(r registry.Registry) { const notSkipped = "" - const skippedByBankruptcy = "#149662" register(r, restart{}, notSkipped) addLong(r, restart{}) @@ -431,17 +430,13 @@ func RegisterTests(r registry.Registry) { for _, asleep := range []bool{true, false} { register(r, splits{asleep: asleep}, notSkipped) } - - // TODO(ssd): We skipped the majority of these tests so that we can focus on - // one at a time. These are vaguely ordered by their previous pass rate - // (highest first). 
- register(r, intents{}, skippedByBankruptcy) - register(r, decommission{}, skippedByBankruptcy) - register(r, elasticWorkload{}, skippedByBankruptcy) - register(r, partition{}, skippedByBankruptcy) + register(r, intents{}, notSkipped) + register(r, decommission{}, notSkipped) + register(r, elasticWorkload{}, notSkipped) + register(r, partition{}, notSkipped) register(r, backfill{}, notSkipped) - register(r, &slowDisk{}, skippedByBankruptcy) - register(r, addNode{}, skippedByBankruptcy) + register(r, &slowDisk{}, notSkipped) + register(r, addNode{}, notSkipped) } func (v variations) makeClusterSpec() spec.ClusterSpec { diff --git a/pkg/cmd/roachtest/tests/perturbation/network_partition.go b/pkg/cmd/roachtest/tests/perturbation/network_partition.go index 6d50abf8669d..1f7122311b8b 100644 --- a/pkg/cmd/roachtest/tests/perturbation/network_partition.go +++ b/pkg/cmd/roachtest/tests/perturbation/network_partition.go @@ -31,7 +31,13 @@ var _ perturbation = partition{} func (p partition) setup() variations { p.partitionSite = true - v := setup(p, defaultThresholds()) + // The partition test isolates an entire region (4 of 12 nodes), removing + // 1/3 of leaseholders. Foreground throughput naturally drops sharply + // while the partition is in effect, and the meaningful pass/fail signal + // is whether the cluster returns to baseline once the partition heals. + // Skip the perturbation-interval gate; keep the default recovery gate. + v := setup(p, noImpactThresholds()) + v.recoveryImpact = defaultThresholds() v.leaseType = registry.ExpirationLeases // TODO(baptist): Remove this setting once #120073 is fixed. 
v.clusterSettings["kv.lease.reject_on_leader_unknown.enabled"] = "true" diff --git a/pkg/cmd/roachtest/tests/perturbation/slow_disk.go b/pkg/cmd/roachtest/tests/perturbation/slow_disk.go index ac0cae8e0a52..2231959bbd6a 100644 --- a/pkg/cmd/roachtest/tests/perturbation/slow_disk.go +++ b/pkg/cmd/roachtest/tests/perturbation/slow_disk.go @@ -32,6 +32,15 @@ var _ perturbation = &slowDisk{} func (s *slowDisk) setup() variations { s.slowLiveness = true s.walFailover = true + // With walFailover=true and 2 disks per node (the default for the full + // variant), raft log writes fail over to the non-throttled store, so + // foreground throughput is expected to stay close to baseline even + // while the staller is active. Default thresholds apply to both + // intervals; we keep the 1.25x floor (rather than tightening) only to + // avoid flakes from the slowLiveness leg, which routes liveness + // heartbeats through the slow disk. The metamorphic variant exercises + // configurations where walFailover is off and throughput can drop + // substantially -- those configurations should set their own thresholds. return setup(s, defaultThresholds()) }