roachtest: replace kv/restart/nodes=12 with perturbation/long/restart (#170089)

trunk-io[bot] · web-flow · commit 18eb14ffe6fd · 2026-05-12T05:26:48.000Z
roachtest: replace kv/restart/nodes=12 with perturbation/long/restart
diff --git a/pkg/cmd/roachtest/tests/kv.go b/pkg/cmd/roachtest/tests/kv.go
@@ -1055,153 +1055,3 @@ func registerKVRangeLookups(r registry.Registry) {
 		})
 	}
 }
-
-// registerKVRestartImpact measures the impact of stopping and then restarting
-// a node during a write-heavy workload. Specifically the Raft log on the node
-// falls behind when the node is down and when it comes back up it goes into IO
-// Overload as it attempts to recover.
-func registerKVRestartImpact(r registry.Registry) {
-	r.Add(registry.TestSpec{
-		Name: "kv/restart/nodes=12",
-		// This test is expensive (104vcpu), we run it weekly. Don't use local SSD
-		// they are faster and less likely to hit a hard bandwidth limit causing
-		// LSM inversion (IO overload).
-		CompatibleClouds: registry.AllExceptAWS,
-		Suites:           registry.Suites(registry.Weekly),
-		Owner:            registry.OwnerAdmissionControl,
-		Timeout:          4 * time.Hour,
-		Cluster: r.MakeClusterSpec(
-			13,
-			spec.CPU(8),
-			spec.WorkloadNode(),
-			spec.WorkloadNodeCPU(8),
-			spec.DisableLocalSSD(),
-			spec.RandomizeVolumeType(),
-			spec.RandomlyUseXfs(),
-		),
-		Leases: registry.MetamorphicLeases,
-		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-			nodes := len(c.CRDBNodes())
-			startOpts := option.NewStartOpts(option.NoBackupSchedule)
-			startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs,
-				"--vmodule=store_rebalancer=2,allocator=2,allocator_scorer=1,replicate_queue=2,lease=2")
-			settings := install.MakeClusterSettings()
-			// Enable continuous Go execution tracing to aid diagnosis of transient
-			// resource spikes (#166364).
-			settings.ClusterSettings["obs.execution_tracer.interval"] = "10s"
-			settings.ClusterSettings["obs.execution_tracer.duration"] = "10s"
-			settings.ClusterSettings["obs.execution_tracer.total_dump_size_limit"] = "10 GiB"
-
-			c.Start(ctx, t.L(), startOpts, settings, c.CRDBNodes())
-
-			// Run long enough to create a large amount of pebble data.
-			testDuration := 3 * time.Hour
-			targetQPS := 5000
-			// Having higher concurrency allows a more consistent QPS.
-			concurrency := 256
-			// We need a lot of ranges so that the individual ranges don't get truncated by Raft.
-			splits := 20000
-
-			if c.IsLocal() {
-				testDuration = 3 * time.Minute
-				targetQPS = 100
-				concurrency = 24
-				splits = 10
-			}
-
-			// We do 90% write and 10% read - this only counts the writes
-			expectedQPS := float64(targetQPS) * 0.9
-			// Ideally this should be closer to 0.9, but until more issues are fixed
-			// we are starting lower. The first 0.9 is for the 10% reads we do.
-			passingQPS := expectedQPS * 0.5
-			fillDuration := testDuration * 2 / 3  // 2/3 of test time. 2 hours for non-local, 4 minutes for local.
-			downtimeDuration := testDuration / 18 // 10 minutes for non-local, 20 sec for local.
-			printInterval := testDuration / 72    // Show 72 point results during the run.
-
-			c.Run(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf("./cockroach workload init kv --splits=%d {pgurl:1}", splits))
-
-			workloadStartTime := timeutil.Now()
-			t.Status(fmt.Sprintf("starting kv workload thread to run for %s", testDuration))
-
-			// Three goroutines run and we wait for all to complete.
-			m := c.NewDeprecatedMonitor(ctx, c.CRDBNodes())
-			m.ExpectDeath()
-			m.Go(func(ctx context.Context) error {
-				// Don't include the last node when starting the workload since
-				// it will stop in the middle. Write enough data per value to
-				// make sure we create a large raft backlog.
-				cmd := fmt.Sprintf("./cockroach workload run kv --min-block-bytes=8192 --max-block-bytes=8192 "+
-					"--duration=%s --concurrency=%d --max-rate=%d --read-percent=10 {pgurl:1-%d}",
-					testDuration.String(), concurrency, targetQPS, nodes-1,
-				)
-
-				return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd)
-			})
-
-			// Begin the monitoring goroutine to track QPS every 5 seconds.
-			m.Go(func(ctx context.Context) error {
-				// Wait until 5 minutes after the workload began to begin asserting on
-				// QPS.
-				select {
-				case <-ctx.Done():
-					return nil
-				case <-time.After(5 * time.Minute):
-				}
-
-				t.Status(fmt.Sprintf("verify QPS is at least %d during the test, expecting %d", int(passingQPS), int(expectedQPS)))
-				lastPrint := timeutil.Now()
-				defer t.WorkerStatus()
-				for {
-					// Measure QPS every few seconds throughout the test. measureQPS takes time
-					// to run, so we don't sleep between invocations.
-					qps := roachtestutil.MeasureQPS(ctx, t, c, 5*time.Second, c.Range(1, nodes-1))
-					if qps < passingQPS {
-						return errors.Newf(
-							"QPS of %.2f at time %v is below minimum allowable QPS of %.2f",
-							qps, timeutil.Now(), passingQPS)
-					}
-					// Periodically print the current value.
-					if timeutil.Since(lastPrint) > printInterval {
-						lastPrint = timeutil.Now()
-						t.Status(fmt.Sprintf("current QPS %.2f", qps))
-					}
-					// Stop measuring 10 seconds before the workload ends.
-					if timeutil.Since(workloadStartTime) > testDuration-10*time.Second {
-						return nil
-					}
-				}
-			})
-
-			// Begin the goroutine which will start and stop the node.
-			m.Go(func(ctx context.Context) error {
-				// Let some data be written to all nodes in the cluster.
-				t.Status(fmt.Sprintf("waiting %s to get sufficient fill", fillDuration))
-				select {
-				case <-ctx.Done():
-					return nil
-				case <-time.After(fillDuration):
-				}
-
-				// Gracefully shut down the last node to let it transfer leases cleanly.
-				// Wait enough time to let it fall behind on Raft. Since there are a lot
-				// of ranges, about half will be upreplicated during this time.
-				gracefulOpts := option.DefaultStopOpts()
-				gracefulOpts.RoachprodOpts.Sig = 15 // SIGTERM for clean shutdown
-				gracefulOpts.RoachprodOpts.Wait = true
-				c.Stop(ctx, t.L(), gracefulOpts, c.Node(nodes))
-				t.Status(fmt.Sprintf("waiting %s after stopping node to allow the node to fall behind", downtimeDuration))
-				select {
-				case <-ctx.Done():
-					return nil
-				case <-time.After(downtimeDuration):
-				}
-
-				// Start the node again. It will go into an IO Overload scenario.
-				return c.StartE(ctx, t.L(), startOpts, settings, c.Node(nodes))
-			})
-
-			// Wait for the workload to finish.
-			m.Wait()
-		},
-	})
-}
diff --git a/pkg/cmd/roachtest/tests/perturbation/framework.go b/pkg/cmd/roachtest/tests/perturbation/framework.go
@@ -393,6 +393,7 @@ func RegisterTests(r registry.Registry) {
 	const skippedByBankruptcy = "#149662"
 
 	register(r, restart{}, notSkipped)
+	addLong(r, restart{})
 	register(r, backup{}, notSkipped)
 
 	// TODO(ssd): We skipped the majority of these tests so that we can focus on
@@ -565,6 +566,67 @@ func addFull(r registry.Registry, p perturbation, skipReason string) {
 	})
 }
 
+// addLong registers a heavyweight variant of a perturbation test that runs
+// significantly longer than addFull. It is intended for cases where the
+// perturbation only becomes interesting at scale (e.g. a restart that needs a
+// non-trivial backlog to surface IO-overload behavior on recovery), and
+// belongs to the Weekly suite to keep the nightly footprint small. Individual
+// callers can override variations fields after p.setup() to tune what "long"
+// means for their perturbation; the timeout below is sized for the longest
+// expected fill-plus-perturbation we currently configure.
+//
+// What the long fill buys, and what it does not:
+//
+// The 2h fill duration grows the LSM and range count on the surviving nodes
+// so that ongoing compactions, snapshot generation, and lease bookkeeping at
+// the time of the perturbation reflect a non-trivial steady-state cluster,
+// rather than a freshly-initialized one. It does NOT enlarge the raft gap
+// the perturbed node has to close on recovery -- that gap is set by
+// perturbationDuration (default 10 minutes) and the cluster's write rate
+// during the measurement phase.
+//
+// Napkin math for the restart variant on the default 12-node, 16 vcpu,
+// localSSD spec with 50/50 r/w (50% follower-reads), 4 KiB blocks, and
+// splits=10000: cluster max throughput measures around 80k ops/s, so at
+// ratioOfMax=0.5 the measurement workload sustains ~20k writes/s. On a
+// 12-node RF=3 cluster each node holds a replica of ~25% of the ranges
+// (~2500 of the 10000 splits), so a 10 minute downtime leaves ~12 GiB
+// of raft data (20k writes/s * 600s * 4 KiB * 25%) owed to the down
+// node, i.e. an average per-range raft log of ~5 MiB. That sits below
+// RaftLogTruncationThreshold (16 MiB; see
+// pkg/base/config.go), so most ranges should recover via log replay
+// rather than raft snapshots; if the per-range write skew, the cluster
+// throughput, or perturbationDuration grow, the average will cross that
+// threshold and the cost mix shifts toward snapshot ingest.
+func addLong(r registry.Registry, p perturbation) {
+	v := p.setup()
+	v.fillDuration = 2 * time.Hour
+	v = v.finishSetup()
+	r.Add(registry.TestSpec{
+		Name:             fmt.Sprintf("perturbation/long/%s", v.perturbationName()),
+		CompatibleClouds: v.cloud,
+		Suites:           registry.Suites(registry.Weekly),
+		Owner:            registry.OwnerKV,
+		Cluster:          v.makeClusterSpec(),
+		Leases:           v.leaseType,
+		Benchmark:        true,
+		// The expected runtime is around 2h45m (2h fill + 10m perturbation +
+		// validation/recovery windows + teardown). The timeout is set well
+		// above that — it is a backstop for pathological cases (everything
+		// seizing up in a way that does not surface as a test failure) and
+		// not a target.
+		Timeout: 6 * time.Hour,
+		// The 2h fill produces enough data that the post-test replica
+		// divergence check exceeds its 20m budget, mirroring the carve-out
+		// for large block sizes in pkg/cmd/roachtest/tests/kv.go (see
+		// #141007). The divergence-check timeout is logged but does not
+		// fail the test; we skip it explicitly to keep the artifact clean.
+		SkipPostValidations:    registry.PostValidationReplicaDivergence,
+		PostProcessPerfMetrics: perturbationDefaultProcessFunction,
+		Run:                    v.runTest,
+	})
+}
+
 func addDev(r registry.Registry, p perturbation) {
 	v := p.setup()
 	// Dev tests never fail on latency increases.
diff --git a/pkg/cmd/roachtest/tests/registry.go b/pkg/cmd/roachtest/tests/registry.go
@@ -94,7 +94,6 @@ func RegisterTests(r registry.Registry) {
 	registerKVRangeLookups(r)
 	registerKVScalability(r)
 	registerKVSplits(r)
-	registerKVRestartImpact(r)
 	registerKVStopAndCopy(r)
 	registerKnex(r)
 	registerLOQRecovery(r)