@@ -733,24 +733,55 @@ impl CockroachInstance {
733733
734734impl Drop for CockroachInstance {
735735 fn drop ( & mut self ) {
736- // TODO-cleanup Ideally at this point we would run self.cleanup() to
737- // kill the child process, wait for it to exit, and then clean up the
738- // temporary directory. However, we don't have an executor here with
739- // which to run async/await code. We could create one here, but it's
740- // not clear how safe or sketchy that would be. Instead, we expect that
741- // the caller has done the cleanup already. This won't always happen,
742- // particularly for ungraceful failures.
736+ // NOTE: Ideally, we'd share this logic with self.cleanup() to tear down
737+ // the child process. However, there are a couple distinctions from that
738+ // pathway we must consider here:
739+ //
740+ // 1. drop is not asynchronous, so we don't have the benefits of using
741+ // our tokio executor to run async/await code.
742+ // 2. If self.cleanup() has not been invoked, we want to gracefully
743+ // preserve the database for post-mortem debugging. This is often
744+ // the case for e.g. a test which has failed before invoking cleanup,
745+ // and which is bailing out. We can also take this pathway when the
746+ // calling code has forgotten to call "cleanup", and is leaking the
747+ // database, but that's a bug - and choosing to flush and terminate
748+ // via SIGTERM wouldn't be wrong in that case either.
743749 if self . child_process . is_some ( ) || self . temp_dir . is_some ( ) {
744750 eprintln ! (
745751 "WARN: dropped CockroachInstance without cleaning it up first \
746752 (there may still be a child process running and a \
747753 temporary directory leaked)"
748754 ) ;
749755
750- // Still, make a best effort.
751- #[ allow( unused_must_use) ]
756+ // Send SIGTERM so CockroachDB can flush its WAL to disk, making the
757+ // leaked temp directory useful for post-mortem debugging.
758+ //
759+ // Background: test CRDB instances might set
760+ // `kv.raft_log.disable_synchronization_unsafe = true`
761+ // which skips per-commit fdatasync for performance. Data is still
762+ // written to the WAL but may only exist in Pebble's user-space
763+ // write buffer. SIGTERM triggers Pebble's DB.Close() which flushes
764+ // and fsyncs the WAL before exiting.
765+ //
766+ // See also:
767+ // - https://github.com/oxidecomputer/cockroach/blob/release-22.1-oxide/pkg/cli/start_unix.go#L37
768+ // - https://github.com/oxidecomputer/omicron/issues/10085
752769 if let Some ( child_process) = self . child_process . as_mut ( ) {
753- child_process. start_kill ( ) ;
770+ let pid = self . pid as libc:: pid_t ;
771+ unsafe { libc:: kill ( pid, libc:: SIGTERM ) } ;
772+
773+ // Poll the child_process until it exits.
774+ //
775+ // Note that nextest has a timeout for tests
776+ // with the filter 'rdeps(omicron-test-utils)'.
777+ //
778+ // As a consequence, we can loop "as long as the test runner
779+ // lets us" waiting for CockroachDB to gracefully terminate. In
780+ // reality, this is typically on the order of ~one second after
781+ // a test failure.
782+ while let Ok ( None ) = child_process. try_wait ( ) {
783+ std:: thread:: sleep ( std:: time:: Duration :: from_millis ( 50 ) ) ;
784+ }
754785 }
755786 #[ allow( unused_must_use) ]
756787 if let Some ( temp_dir) = self . temp_dir . take ( ) {
@@ -1903,6 +1934,92 @@ mod test {
19031934 assert_eq ! ( addr. port( ) , 12345 ) ;
19041935 }
19051936
1937+ // Test that data written to CockroachDB survives when the instance is
1938+ // dropped without calling cleanup(). This exercises the Drop impl path
1939+ // that fires during test failures (when cleanup is unreachable due to
1940+ // a panic) or when cleanup is accidentally omitted.
1941+ //
1942+ // With `disable_synchronization_unsafe = true`, committed data may
1943+ // only exist in Pebble's user-space write buffer (not yet write()-ed
1944+ // to the WAL file on disk). A graceful shutdown (SIGTERM) triggers
1945+ // Pebble's DB.Close() which flushes and fsyncs the WAL, making data
1946+ // durable. SIGKILL destroys the process immediately, losing any
1947+ // buffered WAL data.
1948+ //
1949+ // See: https://github.com/oxidecomputer/omicron/issues/10085
1950+ #[ tokio:: test]
1951+ async fn test_data_survives_drop_without_cleanup ( ) {
1952+ let store_dir = tempdir ( ) . expect ( "failed to create temp dir" ) ;
1953+ let data_dir = store_dir. path ( ) . join ( "data" ) ;
1954+
1955+ // Start CRDB with a persistent store directory
1956+ let starter = new_builder ( )
1957+ . store_dir ( & data_dir)
1958+ . build ( )
1959+ . expect ( "failed to build starter" ) ;
1960+ let database = starter. start ( ) . await . expect ( "failed to start CRDB" ) ;
1961+
1962+ // Enable the unsafe sync setting (as setup_database does)
1963+ database
1964+ . disable_synchronization ( )
1965+ . await
1966+ . expect ( "failed to disable sync" ) ;
1967+
1968+ // Write a single canary row — this is the value we check after restart.
1969+ let client = database. connect ( ) . await . expect ( "failed to connect" ) ;
1970+ client
1971+ . batch_execute (
1972+ "CREATE DATABASE test_db; \
1973+ USE test_db; \
1974+ CREATE TABLE test_data (id INT PRIMARY KEY, payload STRING); \
1975+ INSERT INTO test_data (id, payload) \
1976+ VALUES (99999, 'canary');",
1977+ )
1978+ . await
1979+ . expect ( "canary insert failed" ) ;
1980+
1981+ // Drop without cleanup — this triggers the Drop impl.
1982+ // Save the temp dir path so we can clean it up after
1983+ // verification (the Drop impl intentionally leaks it).
1984+ let leaked_temp_dir = database. temp_dir ( ) . to_owned ( ) ;
1985+ drop ( client) ;
1986+ drop ( database) ;
1987+
1988+ // Restart CRDB on the same store directory.
1989+ let starter2 = new_builder ( )
1990+ . store_dir ( & data_dir)
1991+ . build ( )
1992+ . expect ( "failed to build second starter" ) ;
1993+ let mut database2 =
1994+ starter2. start ( ) . await . expect ( "failed to restart CRDB" ) ;
1995+
1996+ let client2 = database2. connect ( ) . await . expect ( "failed to reconnect" ) ;
1997+
1998+ // Check that the canary row survived.
1999+ let rows = client2
2000+ . query (
2001+ "SELECT payload FROM test_db.test_data WHERE id = 99999" ,
2002+ & [ ] ,
2003+ )
2004+ . await
2005+ . expect ( "canary query failed" ) ;
2006+ assert_eq ! (
2007+ rows. len( ) ,
2008+ 1 ,
2009+ "canary row (id=99999) did not survive shutdown"
2010+ ) ;
2011+ let payload: String = rows[ 0 ] . get ( 0 ) ;
2012+ assert_eq ! ( payload, "canary" , "canary row has wrong payload" ) ;
2013+
2014+ client2. cleanup ( ) . await . expect ( "client2 cleanup failed" ) ;
2015+ database2. cleanup ( ) . await . expect ( "database2 cleanup failed" ) ;
2016+
2017+ // Clean up the temp dir leaked by the first instance's Drop.
2018+ fs:: remove_dir_all ( & leaked_temp_dir)
2019+ . await
2020+ . expect ( "failed to clean up leaked temp dir" ) ;
2021+ }
2022+
19062023 // Tests the way `process_exited()` checks and interprets the exit status
19072024 // for abnormal process termination (by a signal).
19082025 #[ tokio:: test]
0 commit comments