Skip to content

Commit eb8ae2f

Browse files
authored
Make sure we still do writeback on test failure (#10097)
Uses SIGTERM for `CockroachInstances` that are dropped without being cleaned, to propagate ongoing writes back to durable storage for inspection. Additionally, adds a test which validates this behavior (torn writes should no longer appear with the new SIGTERM-on-drop implementation). Fixes #10085
1 parent 5414efb commit eb8ae2f

2 files changed

Lines changed: 132 additions & 11 deletions

File tree

.config/nextest.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,11 @@ filter = 'binary_id(omicron-nexus::test_all) and test(::schema::)'
7272
threads-required = 4
7373

7474
[[profile.ci.overrides]]
75-
filter = 'binary_id(omicron-nexus::test_all)'
75+
# `omicron-test-utils` is included in this block that uses a longer-than-normal
76+
# timeout because under some conditions it waits for CockroachDB to shut down
77+
# gracefully. Note that `omicron-test-utils` depends on `omicron-nexus`, so this
78+
# is redundant, but we do this for future-proofing.
79+
filter = 'binary_id(omicron-nexus::test_all) + rdeps(omicron-test-utils)'
7680
# As of 2023-01-08, the slowest test in test_all takes 196s on a Ryzen 7950X.
7781
# 900s is a good upper limit that adds a comfortable buffer.
7882
slow-timeout = { period = '60s', terminate-after = 15 }

test-utils/src/dev/db.rs

Lines changed: 127 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -733,24 +733,55 @@ impl CockroachInstance {
733733

734734
impl Drop for CockroachInstance {
735735
fn drop(&mut self) {
736-
// TODO-cleanup Ideally at this point we would run self.cleanup() to
737-
// kill the child process, wait for it to exit, and then clean up the
738-
// temporary directory. However, we don't have an executor here with
739-
// which to run async/await code. We could create one here, but it's
740-
// not clear how safe or sketchy that would be. Instead, we expect that
741-
// the caller has done the cleanup already. This won't always happen,
742-
// particularly for ungraceful failures.
736+
// NOTE: Ideally, we'd share this logic with self.cleanup() to tear down
737+
// the child process. However, there are a couple distinctions from that
738+
// pathway we must consider here:
739+
//
740+
// 1. drop is not asynchronous, so we don't have the benefits of using
741+
// our tokio executor to run async/await code.
742+
// 2. If self.cleanup() has not been invoked, we want to gracefully
743+
// preserve the database for post-mortem debugging. This is often
744+
// the case for e.g. a test which has failed before invoking cleanup,
745+
// and which is bailing out. We can also take this pathway when the
746+
// calling code has forgotten to call "cleanup", and is leaking the
747+
// database, but that's a bug - and choosing to flush and terminate
748+
// via SIGTERM wouldn't be wrong in that case either.
743749
if self.child_process.is_some() || self.temp_dir.is_some() {
744750
eprintln!(
745751
"WARN: dropped CockroachInstance without cleaning it up first \
746752
(there may still be a child process running and a \
747753
temporary directory leaked)"
748754
);
749755

750-
// Still, make a best effort.
751-
#[allow(unused_must_use)]
756+
// Send SIGTERM so CockroachDB can flush its WAL to disk, making the
757+
// leaked temp directory useful for post-mortem debugging.
758+
//
759+
// Background: test CRDB instances might set
760+
// `kv.raft_log.disable_synchronization_unsafe = true`
761+
// which skips per-commit fdatasync for performance. Data is still
762+
// written to the WAL but may only exist in Pebble's user-space
763+
// write buffer. SIGTERM triggers Pebble's DB.Close() which flushes
764+
// and fsyncs the WAL before exiting.
765+
//
766+
// See also:
767+
// - https://github.com/oxidecomputer/cockroach/blob/release-22.1-oxide/pkg/cli/start_unix.go#L37
768+
// - https://github.com/oxidecomputer/omicron/issues/10085
752769
if let Some(child_process) = self.child_process.as_mut() {
753-
child_process.start_kill();
770+
let pid = self.pid as libc::pid_t;
771+
unsafe { libc::kill(pid, libc::SIGTERM) };
772+
773+
// Poll the child_process until it exits.
774+
//
775+
// Note that nextest has a timeout for tests
776+
// with the filter 'rdeps(omicron-test-utils)'.
777+
//
778+
// As a consequence, we can loop "as long as the test runner
779+
// lets us" waiting for CockroachDB to gracefully terminate. In
780+
// reality, this is typically on the order of ~one second after
781+
// a test failure.
782+
while let Ok(None) = child_process.try_wait() {
783+
std::thread::sleep(std::time::Duration::from_millis(50));
784+
}
754785
}
755786
#[allow(unused_must_use)]
756787
if let Some(temp_dir) = self.temp_dir.take() {
@@ -1903,6 +1934,92 @@ mod test {
19031934
assert_eq!(addr.port(), 12345);
19041935
}
19051936

1937+
// Test that data written to CockroachDB survives when the instance is
1938+
// dropped without calling cleanup(). This exercises the Drop impl path
1939+
// that fires during test failures (when cleanup is unreachable due to
1940+
// a panic) or when cleanup is accidentally omitted.
1941+
//
1942+
// With `disable_synchronization_unsafe = true`, committed data may
1943+
// only exist in Pebble's user-space write buffer (not yet write()-ed
1944+
// to the WAL file on disk). A graceful shutdown (SIGTERM) triggers
1945+
// Pebble's DB.Close() which flushes and fsyncs the WAL, making data
1946+
// durable. SIGKILL destroys the process immediately, losing any
1947+
// buffered WAL data.
1948+
//
1949+
// See: https://github.com/oxidecomputer/omicron/issues/10085
1950+
#[tokio::test]
1951+
async fn test_data_survives_drop_without_cleanup() {
1952+
let store_dir = tempdir().expect("failed to create temp dir");
1953+
let data_dir = store_dir.path().join("data");
1954+
1955+
// Start CRDB with a persistent store directory
1956+
let starter = new_builder()
1957+
.store_dir(&data_dir)
1958+
.build()
1959+
.expect("failed to build starter");
1960+
let database = starter.start().await.expect("failed to start CRDB");
1961+
1962+
// Enable the unsafe sync setting (as setup_database does)
1963+
database
1964+
.disable_synchronization()
1965+
.await
1966+
.expect("failed to disable sync");
1967+
1968+
// Write a single canary row — this is the value we check after restart.
1969+
let client = database.connect().await.expect("failed to connect");
1970+
client
1971+
.batch_execute(
1972+
"CREATE DATABASE test_db; \
1973+
USE test_db; \
1974+
CREATE TABLE test_data (id INT PRIMARY KEY, payload STRING); \
1975+
INSERT INTO test_data (id, payload) \
1976+
VALUES (99999, 'canary');",
1977+
)
1978+
.await
1979+
.expect("canary insert failed");
1980+
1981+
// Drop without cleanup — this triggers the Drop impl.
1982+
// Save the temp dir path so we can clean it up after
1983+
// verification (the Drop impl intentionally leaks it).
1984+
let leaked_temp_dir = database.temp_dir().to_owned();
1985+
drop(client);
1986+
drop(database);
1987+
1988+
// Restart CRDB on the same store directory.
1989+
let starter2 = new_builder()
1990+
.store_dir(&data_dir)
1991+
.build()
1992+
.expect("failed to build second starter");
1993+
let mut database2 =
1994+
starter2.start().await.expect("failed to restart CRDB");
1995+
1996+
let client2 = database2.connect().await.expect("failed to reconnect");
1997+
1998+
// Check that the canary row survived.
1999+
let rows = client2
2000+
.query(
2001+
"SELECT payload FROM test_db.test_data WHERE id = 99999",
2002+
&[],
2003+
)
2004+
.await
2005+
.expect("canary query failed");
2006+
assert_eq!(
2007+
rows.len(),
2008+
1,
2009+
"canary row (id=99999) did not survive shutdown"
2010+
);
2011+
let payload: String = rows[0].get(0);
2012+
assert_eq!(payload, "canary", "canary row has wrong payload");
2013+
2014+
client2.cleanup().await.expect("client2 cleanup failed");
2015+
database2.cleanup().await.expect("database2 cleanup failed");
2016+
2017+
// Clean up the temp dir leaked by the first instance's Drop.
2018+
fs::remove_dir_all(&leaked_temp_dir)
2019+
.await
2020+
.expect("failed to clean up leaked temp dir");
2021+
}
2022+
19062023
// Tests the way `process_exited()` checks and interprets the exit status
19072024
// for abnormal process termination (by a signal).
19082025
#[tokio::test]

0 commit comments

Comments
 (0)