Skip to content

Commit 18eb14f

Browse files
roachtest: replace kv/restart/nodes=12 with perturbation/long/restart (#170089)
roachtest: replace kv/restart/nodes=12 with perturbation/long/restart
2 parents 6cf0a87 + 9bdf947 commit 18eb14f

3 files changed

Lines changed: 62 additions & 151 deletions

File tree

pkg/cmd/roachtest/tests/kv.go

Lines changed: 0 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,153 +1055,3 @@ func registerKVRangeLookups(r registry.Registry) {
10551055
})
10561056
}
10571057
}
1058-
1059-
// registerKVRestartImpact measures the impact of stopping and then restarting
1060-
// a node during a write-heavy workload. Specifically the Raft log on the node
1061-
// falls behind when the node is down and when it comes back up it goes into IO
1062-
// Overload as it attempts to recover.
1063-
func registerKVRestartImpact(r registry.Registry) {
1064-
r.Add(registry.TestSpec{
1065-
Name: "kv/restart/nodes=12",
1066-
// This test is expensive (104 vCPUs), so we run it weekly. Don't use local SSDs:
1067-
// they are faster and less likely to hit a hard bandwidth limit causing
1068-
// LSM inversion (IO overload).
1069-
CompatibleClouds: registry.AllExceptAWS,
1070-
Suites: registry.Suites(registry.Weekly),
1071-
Owner: registry.OwnerAdmissionControl,
1072-
Timeout: 4 * time.Hour,
1073-
Cluster: r.MakeClusterSpec(
1074-
13,
1075-
spec.CPU(8),
1076-
spec.WorkloadNode(),
1077-
spec.WorkloadNodeCPU(8),
1078-
spec.DisableLocalSSD(),
1079-
spec.RandomizeVolumeType(),
1080-
spec.RandomlyUseXfs(),
1081-
),
1082-
Leases: registry.MetamorphicLeases,
1083-
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
1084-
nodes := len(c.CRDBNodes())
1085-
startOpts := option.NewStartOpts(option.NoBackupSchedule)
1086-
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs,
1087-
"--vmodule=store_rebalancer=2,allocator=2,allocator_scorer=1,replicate_queue=2,lease=2")
1088-
settings := install.MakeClusterSettings()
1089-
// Enable continuous Go execution tracing to aid diagnosis of transient
1090-
// resource spikes (#166364).
1091-
settings.ClusterSettings["obs.execution_tracer.interval"] = "10s"
1092-
settings.ClusterSettings["obs.execution_tracer.duration"] = "10s"
1093-
settings.ClusterSettings["obs.execution_tracer.total_dump_size_limit"] = "10 GiB"
1094-
1095-
c.Start(ctx, t.L(), startOpts, settings, c.CRDBNodes())
1096-
1097-
// Run long enough to create a large amount of pebble data.
1098-
testDuration := 3 * time.Hour
1099-
targetQPS := 5000
1100-
// Having higher concurrency allows a more consistent QPS.
1101-
concurrency := 256
1102-
// We need a lot of ranges so that the individual ranges don't get truncated by Raft.
1103-
splits := 20000
1104-
1105-
if c.IsLocal() {
1106-
testDuration = 3 * time.Minute
1107-
targetQPS = 100
1108-
concurrency = 24
1109-
splits = 10
1110-
}
1111-
1112-
// We do 90% write and 10% read - this only counts the writes
1113-
expectedQPS := float64(targetQPS) * 0.9
1114-
// Ideally this should be closer to 0.9, but until more issues are fixed
1115-
// we are starting lower. The first 0.9 is for the 10% reads we do.
1116-
passingQPS := expectedQPS * 0.5
1117-
fillDuration := testDuration * 2 / 3 // 2/3 of test time. 2 hours for non-local, 2 minutes for local.
1118-
downtimeDuration := testDuration / 18 // 10 minutes for non-local, 10 sec for local.
1119-
printInterval := testDuration / 72 // Report the current QPS roughly 72 times over the run.
1120-
1121-
c.Run(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf("./cockroach workload init kv --splits=%d {pgurl:1}", splits))
1122-
1123-
workloadStartTime := timeutil.Now()
1124-
t.Status(fmt.Sprintf("starting kv workload thread to run for %s", testDuration))
1125-
1126-
// Three goroutines run and we wait for all to complete.
1127-
m := c.NewDeprecatedMonitor(ctx, c.CRDBNodes())
1128-
m.ExpectDeath()
1129-
m.Go(func(ctx context.Context) error {
1130-
// Don't include the last node when starting the workload since
1131-
// it will stop in the middle. Write enough data per value to
1132-
// make sure we create a large raft backlog.
1133-
cmd := fmt.Sprintf("./cockroach workload run kv --min-block-bytes=8192 --max-block-bytes=8192 "+
1134-
"--duration=%s --concurrency=%d --max-rate=%d --read-percent=10 {pgurl:1-%d}",
1135-
testDuration.String(), concurrency, targetQPS, nodes-1,
1136-
)
1137-
1138-
return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd)
1139-
})
1140-
1141-
// Begin the monitoring goroutine to track QPS every 5 seconds.
1142-
m.Go(func(ctx context.Context) error {
1143-
// Wait until 5 minutes after the workload started before asserting on
1144-
// QPS.
1145-
select {
1146-
case <-ctx.Done():
1147-
return nil
1148-
case <-time.After(5 * time.Minute):
1149-
}
1150-
1151-
t.Status(fmt.Sprintf("verify QPS is at least %d during the test, expecting %d", int(passingQPS), int(expectedQPS)))
1152-
lastPrint := timeutil.Now()
1153-
defer t.WorkerStatus()
1154-
for {
1155-
// Measure QPS every few seconds throughout the test. measureQPS takes time
1156-
// to run, so we don't sleep between invocations.
1157-
qps := roachtestutil.MeasureQPS(ctx, t, c, 5*time.Second, c.Range(1, nodes-1))
1158-
if qps < passingQPS {
1159-
return errors.Newf(
1160-
"QPS of %.2f at time %v is below minimum allowable QPS of %.2f",
1161-
qps, timeutil.Now(), passingQPS)
1162-
}
1163-
// Periodically print the current value.
1164-
if timeutil.Since(lastPrint) > printInterval {
1165-
lastPrint = timeutil.Now()
1166-
t.Status(fmt.Sprintf("current QPS %.2f", qps))
1167-
}
1168-
// Stop measuring 10 seconds before the workload ends.
1169-
if timeutil.Since(workloadStartTime) > testDuration-10*time.Second {
1170-
return nil
1171-
}
1172-
}
1173-
})
1174-
1175-
// Begin the goroutine which will start and stop the node.
1176-
m.Go(func(ctx context.Context) error {
1177-
// Let some data be written to all nodes in the cluster.
1178-
t.Status(fmt.Sprintf("waiting %s to get sufficient fill", fillDuration))
1179-
select {
1180-
case <-ctx.Done():
1181-
return nil
1182-
case <-time.After(fillDuration):
1183-
}
1184-
1185-
// Gracefully shut down the last node to let it transfer leases cleanly.
1186-
// Wait enough time to let it fall behind on Raft. Since there are a lot
1187-
// of ranges, about half will be upreplicated during this time.
1188-
gracefulOpts := option.DefaultStopOpts()
1189-
gracefulOpts.RoachprodOpts.Sig = 15 // SIGTERM for clean shutdown
1190-
gracefulOpts.RoachprodOpts.Wait = true
1191-
c.Stop(ctx, t.L(), gracefulOpts, c.Node(nodes))
1192-
t.Status(fmt.Sprintf("waiting %s after stopping node to allow the node to fall behind", downtimeDuration))
1193-
select {
1194-
case <-ctx.Done():
1195-
return nil
1196-
case <-time.After(downtimeDuration):
1197-
}
1198-
1199-
// Start the node again. It will go into an IO Overload scenario.
1200-
return c.StartE(ctx, t.L(), startOpts, settings, c.Node(nodes))
1201-
})
1202-
1203-
// Wait for the workload to finish.
1204-
m.Wait()
1205-
},
1206-
})
1207-
}

pkg/cmd/roachtest/tests/perturbation/framework.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ func RegisterTests(r registry.Registry) {
393393
const skippedByBankruptcy = "#149662"
394394

395395
register(r, restart{}, notSkipped)
396+
addLong(r, restart{})
396397
register(r, backup{}, notSkipped)
397398

398399
// TODO(ssd): We skipped the majority of these tests so that we can focus on
@@ -565,6 +566,67 @@ func addFull(r registry.Registry, p perturbation, skipReason string) {
565566
})
566567
}
567568

569+
// addLong registers a heavyweight variant of a perturbation test that runs
570+
// significantly longer than addFull. It is intended for cases where the
571+
// perturbation only becomes interesting at scale (e.g. a restart that needs a
572+
// non-trivial backlog to surface IO-overload behavior on recovery), and
573+
// belongs to the Weekly suite to keep the nightly footprint small. Individual
574+
// callers can override variations fields after p.setup() to tune what "long"
575+
// means for their perturbation; the timeout below is sized for the longest
576+
// expected fill-plus-perturbation we currently configure.
577+
//
578+
// What the long fill buys, and what it does not:
579+
//
580+
// The 2h fill duration grows the LSM and range count on the surviving nodes
581+
// so that ongoing compactions, snapshot generation, and lease bookkeeping at
582+
// the time of the perturbation reflect a non-trivial steady-state cluster,
583+
// rather than a freshly-initialized one. It does NOT enlarge the raft gap
584+
// the perturbed node has to close on recovery -- that gap is set by
585+
// perturbationDuration (default 10 minutes) and the cluster's write rate
586+
// during the measurement phase.
587+
//
588+
// Napkin math for the restart variant on the default 12-node, 16 vcpu,
589+
// localSSD spec with 50/50 r/w (50% follower-reads), 4 KiB blocks, and
590+
// splits=10000: cluster max throughput measures around 80k ops/s, so at
591+
// ratioOfMax=0.5 the measurement workload sustains ~20k writes/s. A 10
592+
// minute downtime therefore produces ~12 GiB of cluster-wide raft data
593+
// destined for the down node's ranges. With ~25% of replicas on a
594+
// 12-node RF=3 cluster across 10000 splits, the down node owns ~2500
595+
// ranges, so the average per-range raft log accumulated against it is
596+
// ~5 MiB. That sits below RaftLogTruncationThreshold (16 MiB; see
597+
// pkg/base/config.go), so most ranges should recover via log replay
598+
// rather than raft snapshots; if the per-range write skew, the cluster
599+
// throughput, or perturbationDuration grow, the average will cross that
600+
// threshold and the cost mix shifts toward snapshot ingest.
601+
func addLong(r registry.Registry, p perturbation) {
602+
v := p.setup()
603+
v.fillDuration = 2 * time.Hour
604+
v = v.finishSetup()
605+
r.Add(registry.TestSpec{
606+
Name: fmt.Sprintf("perturbation/long/%s", v.perturbationName()),
607+
CompatibleClouds: v.cloud,
608+
Suites: registry.Suites(registry.Weekly),
609+
Owner: registry.OwnerKV,
610+
Cluster: v.makeClusterSpec(),
611+
Leases: v.leaseType,
612+
Benchmark: true,
613+
// The expected runtime is around 2h45m (2h fill + 10m perturbation +
614+
// validation/recovery windows + teardown). The timeout is set well
615+
// above that — it is a backstop for pathological cases (everything
616+
// seizing up in a way that does not surface as a test failure) and
617+
// not a target.
618+
Timeout: 6 * time.Hour,
619+
// The 2h fill produces enough data that the post-test replica
620+
// divergence check exceeds its 20m budget, mirroring the carve-out
621+
// for large block sizes in pkg/cmd/roachtest/tests/kv.go (see
622+
// #141007). The divergence-check timeout is logged but does not
623+
// fail the test; we skip it explicitly to keep the artifact clean.
624+
SkipPostValidations: registry.PostValidationReplicaDivergence,
625+
PostProcessPerfMetrics: perturbationDefaultProcessFunction,
626+
Run: v.runTest,
627+
})
628+
}
629+
568630
func addDev(r registry.Registry, p perturbation) {
569631
v := p.setup()
570632
// Dev tests never fail on latency increases.

pkg/cmd/roachtest/tests/registry.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ func RegisterTests(r registry.Registry) {
9494
registerKVRangeLookups(r)
9595
registerKVScalability(r)
9696
registerKVSplits(r)
97-
registerKVRestartImpact(r)
9897
registerKVStopAndCopy(r)
9998
registerKnex(r)
10099
registerLOQRecovery(r)

0 commit comments

Comments
 (0)