Skip to content

Commit 0959def

Browse files
tbg and roachdev-claude committed
roachtest: remove kv/restart/nodes=12
Drop kv/restart/nodes=12. The next commit registers perturbation/long/restart as a replacement using the perturbation framework's measurement infrastructure (baseline/perturbation/recovery with roachperf integration), which is more useful for tracking restart-related regressions than the QPS-floor assertion in this test. Touches #170047. Epic: none Release note: None Co-Authored-By: roachdev-claude <roachdev-claude-bot@cockroachlabs.com>
1 parent 725a600 commit 0959def

2 files changed

Lines changed: 0 additions & 151 deletions

File tree

pkg/cmd/roachtest/tests/kv.go

Lines changed: 0 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,153 +1055,3 @@ func registerKVRangeLookups(r registry.Registry) {
10551055
})
10561056
}
10571057
}
1058-
1059-
// registerKVRestartImpact measures the impact of stopping and then restarting
1060-
// a node during a write-heavy workload. Specifically the Raft log on the node
1061-
// falls behind when the node is down and when it comes back up it goes into IO
1062-
// Overload as it attempts to recover.
1063-
func registerKVRestartImpact(r registry.Registry) {
1064-
r.Add(registry.TestSpec{
1065-
Name: "kv/restart/nodes=12",
1066-
// This test is expensive (104 vCPUs), so we run it weekly. Don't use local
1066-
// SSDs, since they are faster and less likely to hit a hard bandwidth limit
1067-
// causing LSM inversion (IO overload).
1069-
CompatibleClouds: registry.AllExceptAWS,
1070-
Suites: registry.Suites(registry.Weekly),
1071-
Owner: registry.OwnerAdmissionControl,
1072-
Timeout: 4 * time.Hour,
1073-
Cluster: r.MakeClusterSpec(
1074-
13,
1075-
spec.CPU(8),
1076-
spec.WorkloadNode(),
1077-
spec.WorkloadNodeCPU(8),
1078-
spec.DisableLocalSSD(),
1079-
spec.RandomizeVolumeType(),
1080-
spec.RandomlyUseXfs(),
1081-
),
1082-
Leases: registry.MetamorphicLeases,
1083-
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
1084-
nodes := len(c.CRDBNodes())
1085-
startOpts := option.NewStartOpts(option.NoBackupSchedule)
1086-
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs,
1087-
"--vmodule=store_rebalancer=2,allocator=2,allocator_scorer=1,replicate_queue=2,lease=2")
1088-
settings := install.MakeClusterSettings()
1089-
// Enable continuous Go execution tracing to aid diagnosis of transient
1090-
// resource spikes (#166364).
1091-
settings.ClusterSettings["obs.execution_tracer.interval"] = "10s"
1092-
settings.ClusterSettings["obs.execution_tracer.duration"] = "10s"
1093-
settings.ClusterSettings["obs.execution_tracer.total_dump_size_limit"] = "10 GiB"
1094-
1095-
c.Start(ctx, t.L(), startOpts, settings, c.CRDBNodes())
1096-
1097-
// Run long enough to create a large amount of pebble data.
1098-
testDuration := 3 * time.Hour
1099-
targetQPS := 5000
1100-
// Having higher concurrency allows a more consistent QPS.
1101-
concurrency := 256
1102-
// We need a lot of ranges so that the individual ranges don't get truncated by Raft.
1103-
splits := 20000
1104-
1105-
if c.IsLocal() {
1106-
testDuration = 3 * time.Minute
1107-
targetQPS = 100
1108-
concurrency = 24
1109-
splits = 10
1110-
}
1111-
1112-
// We do 90% write and 10% read - this only counts the writes
1113-
expectedQPS := float64(targetQPS) * 0.9
1114-
// Ideally this should be closer to 0.9, but until more issues are fixed
1115-
// we are starting lower. The first 0.9 is for the 10% reads we do.
1116-
passingQPS := expectedQPS * 0.5
1117-
fillDuration := testDuration * 2 / 3 // 2/3 of test time. 2 hours for non-local, 2 minutes for local.
1118-
downtimeDuration := testDuration / 18 // 10 minutes for non-local, 10 sec for local.
1119-
printInterval := testDuration / 72 // Show 72 point results during the run.
1120-
1121-
c.Run(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf("./cockroach workload init kv --splits=%d {pgurl:1}", splits))
1122-
1123-
workloadStartTime := timeutil.Now()
1124-
t.Status(fmt.Sprintf("starting kv workload thread to run for %s", testDuration))
1125-
1126-
// Three goroutines run and we wait for all to complete.
1127-
m := c.NewDeprecatedMonitor(ctx, c.CRDBNodes())
1128-
m.ExpectDeath()
1129-
m.Go(func(ctx context.Context) error {
1130-
// Don't include the last node when starting the workload since
1131-
// it will stop in the middle. Write enough data per value to
1132-
// make sure we create a large raft backlog.
1133-
cmd := fmt.Sprintf("./cockroach workload run kv --min-block-bytes=8192 --max-block-bytes=8192 "+
1134-
"--duration=%s --concurrency=%d --max-rate=%d --read-percent=10 {pgurl:1-%d}",
1135-
testDuration.String(), concurrency, targetQPS, nodes-1,
1136-
)
1137-
1138-
return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd)
1139-
})
1140-
1141-
// Begin the monitoring goroutine to track QPS every 5 seconds.
1142-
m.Go(func(ctx context.Context) error {
1143-
// Wait until 5 minutes after the workload began to begin asserting on
1144-
// QPS.
1145-
select {
1146-
case <-ctx.Done():
1147-
return nil
1148-
case <-time.After(5 * time.Minute):
1149-
}
1150-
1151-
t.Status(fmt.Sprintf("verify QPS is at least %d during the test, expecting %d", int(passingQPS), int(expectedQPS)))
1152-
lastPrint := timeutil.Now()
1153-
defer t.WorkerStatus()
1154-
for {
1155-
// Measure QPS every few seconds throughout the test. MeasureQPS takes time
1156-
// to run, so we don't sleep between invocations.
1157-
qps := roachtestutil.MeasureQPS(ctx, t, c, 5*time.Second, c.Range(1, nodes-1))
1158-
if qps < passingQPS {
1159-
return errors.Newf(
1160-
"QPS of %.2f at time %v is below minimum allowable QPS of %.2f",
1161-
qps, timeutil.Now(), passingQPS)
1162-
}
1163-
// Periodically print the current value.
1164-
if timeutil.Since(lastPrint) > printInterval {
1165-
lastPrint = timeutil.Now()
1166-
t.Status(fmt.Sprintf("current QPS %.2f", qps))
1167-
}
1168-
// Stop measuring 10 seconds before the workload ends.
1169-
if timeutil.Since(workloadStartTime) > testDuration-10*time.Second {
1170-
return nil
1171-
}
1172-
}
1173-
})
1174-
1175-
// Begin the goroutine which will start and stop the node.
1176-
m.Go(func(ctx context.Context) error {
1177-
// Let some data be written to all nodes in the cluster.
1178-
t.Status(fmt.Sprintf("waiting %s to get sufficient fill", fillDuration))
1179-
select {
1180-
case <-ctx.Done():
1181-
return nil
1182-
case <-time.After(fillDuration):
1183-
}
1184-
1185-
// Gracefully shut down the last node to let it transfer leases cleanly.
1186-
// Wait enough time to let it fall behind on Raft. Since there are a lot
1187-
// of ranges, about half will be upreplicated during this time.
1188-
gracefulOpts := option.DefaultStopOpts()
1189-
gracefulOpts.RoachprodOpts.Sig = 15 // SIGTERM for clean shutdown
1190-
gracefulOpts.RoachprodOpts.Wait = true
1191-
c.Stop(ctx, t.L(), gracefulOpts, c.Node(nodes))
1192-
t.Status(fmt.Sprintf("waiting %s after stopping node to allow the node to fall behind", downtimeDuration))
1193-
select {
1194-
case <-ctx.Done():
1195-
return nil
1196-
case <-time.After(downtimeDuration):
1197-
}
1198-
1199-
// Start the node again. It will go into an IO Overload scenario.
1200-
return c.StartE(ctx, t.L(), startOpts, settings, c.Node(nodes))
1201-
})
1202-
1203-
// Wait for the workload to finish.
1204-
m.Wait()
1205-
},
1206-
})
1207-
}

pkg/cmd/roachtest/tests/registry.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ func RegisterTests(r registry.Registry) {
9494
registerKVRangeLookups(r)
9595
registerKVScalability(r)
9696
registerKVSplits(r)
97-
registerKVRestartImpact(r)
9897
registerKVStopAndCopy(r)
9998
registerKnex(r)
10099
registerLOQRecovery(r)

0 commit comments

Comments
 (0)