Skip to content

Commit a9906f0

Browse files
pdillingermeta-codesync[bot]
authored andcommitted
A better approach to clearing DBs for crash test (facebook#14254)
Summary: Clearing DB dir for crash test is currently a hodgepodge of 1. Caller of db_crashtest.py maybe tries to clear the dir 2. db_crashtest.py tries to clear the dir in get_dbname() (but ignoring failure) 3. db_crashtest.py passes --destroy_db_initially to some db_stress calls as needed 4. db_crashtest.py tries to clear the dir between some db_stress calls 5. db_crashtest.py tries to clear the dir after everything is done and successful (no artifacts to investigate or save) (but ignoring failure) 6. Try to add more uniqueness to the directory from facebook#14249 This change reverts or replaces 2, 4, 5, and 6 by doubling-down on (expanding) 3 and a small variant of it: * crash_test.mk passes --destroy_db_initially=1 so that the first run of db_stress clears the db dir. * After each db_stress invocation, db_crashtest.py resets destroy_db_initially=0 so that the next invocation reuses the same DB, except in cases where there is an incompatibility that requires a fresh DB (from cases 3 and 4 above). * On success, uses new `db_stress --destroy_db_and_exit` option to clean up the DB dir without needing a custom cleanup_cmd (now ignored) Note that although case 1 is likely obsolete, it is out of control of an open source PR. Pull Request resolved: facebook#14254 Test Plan: some manual runs Reviewed By: xingbowang Differential Revision: D91164731 Pulled By: pdillinger fbshipit-source-id: 0a66c8c0e130c9eeacc55af411a18a09bc9debdf
1 parent b89d290 commit a9906f0

7 files changed

Lines changed: 72 additions & 45 deletions

File tree

crash_test.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ DB_STRESS_CMD?=./db_stress
88
include common.mk
99

1010
CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
11-
CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
11+
CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' --destroy_db_initially=1
1212

1313
.PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
1414
crash_test_with_wc_txn crash_test_with_wp_txn crash_test_with_wup_txn \

db_stress_tool/db_stress_common.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,5 +877,24 @@ Status DestroyUnverifiedSubdir(const std::string& dirname) {
877877
return s;
878878
}
879879

880+
Status DbStressDestroyDb(const std::string& db_path) {
881+
Status s;
882+
Options options;
883+
// NOTE: using db_stress_listener_env in order to see obsolete MANIFEST files
884+
options.env = db_stress_listener_env;
885+
// Remove DB files in a principled way to avoid issues
886+
if (FLAGS_use_blob_db) {
887+
s = blob_db::DestroyBlobDB(db_path, options, blob_db::BlobDBOptions());
888+
} else {
889+
s = DestroyDB(db_path, options);
890+
}
891+
if (!s.ok()) {
892+
return s;
893+
}
894+
// Remove everything else recursively, only reporting success if able to
895+
// delete everything
896+
return DestroyDir(db_stress_listener_env, db_path);
897+
}
898+
880899
} // namespace ROCKSDB_NAMESPACE
881900
#endif // GFLAGS

db_stress_tool/db_stress_common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ DECLARE_bool(enable_pipelined_write);
100100
DECLARE_bool(verify_before_write);
101101
DECLARE_bool(histogram);
102102
DECLARE_bool(destroy_db_initially);
103+
DECLARE_bool(destroy_db_and_exit);
103104
DECLARE_bool(verbose);
104105
DECLARE_bool(progress_reports);
105106
DECLARE_uint64(db_write_buffer_size);
@@ -820,5 +821,10 @@ Status SaveFilesInDirectory(const std::string& src_dirname,
820821
const std::string& dst_dirname);
821822
Status DestroyUnverifiedSubdir(const std::string& dirname);
822823
Status InitUnverifiedSubdir(const std::string& dirname);
824+
825+
// Destroy the DB at the given path under the env configured for db_stress.
826+
// Handles both regular DB and BlobDB, and cleans and removes the entire dir.
827+
Status DbStressDestroyDb(const std::string& db_path);
828+
823829
} // namespace ROCKSDB_NAMESPACE
824830
#endif // GFLAGS

db_stress_tool/db_stress_gflags.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
135135
DEFINE_bool(destroy_db_initially, true,
136136
"Destroys the database dir before start if this is true");
137137

138+
DEFINE_bool(destroy_db_and_exit, false,
139+
"Destroys the database dir and exits. Useful for cleanup without "
140+
"running stress test. Other options are mostly ignored.");
141+
138142
DEFINE_bool(verbose, false, "Verbose");
139143

140144
DEFINE_bool(progress_reports, true,

db_stress_tool/db_stress_test_base.cc

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -77,22 +77,7 @@ StressTest::StressTest()
7777
secondary_db_(nullptr),
7878
is_db_stopped_(false) {
7979
if (FLAGS_destroy_db_initially) {
80-
std::vector<std::string> files;
81-
db_stress_env->GetChildren(FLAGS_db, &files);
82-
for (unsigned int i = 0; i < files.size(); i++) {
83-
if (Slice(files[i]).starts_with("heap-")) {
84-
db_stress_env->DeleteFile(FLAGS_db + "/" + files[i]);
85-
}
86-
}
87-
88-
Options options;
89-
options.env = db_stress_env;
90-
// Remove files without preserving manfiest files
91-
const Status s = !FLAGS_use_blob_db
92-
? DestroyDB(FLAGS_db, options)
93-
: blob_db::DestroyBlobDB(FLAGS_db, options,
94-
blob_db::BlobDBOptions());
95-
80+
const Status s = DbStressDestroyDb(FLAGS_db);
9681
if (!s.ok()) {
9782
fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str());
9883
exit(1);

db_stress_tool/db_stress_tool.cc

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,19 @@ int db_stress_tool(int argc, char** argv) {
9898
raw_env, std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem()));
9999
db_stress_env = env_wrapper_guard.get();
100100

101+
// Handle --destroy_db_and_exit early, before other option validation
102+
if (FLAGS_destroy_db_and_exit) {
103+
s = DbStressDestroyDb(FLAGS_db);
104+
if (s.ok()) {
105+
fprintf(stdout, "Successfully destroyed db at %s\n", FLAGS_db.c_str());
106+
return 0;
107+
} else {
108+
fprintf(stderr, "Failed to destroy db at %s: %s\n", FLAGS_db.c_str(),
109+
s.ToString().c_str());
110+
return 1;
111+
}
112+
}
113+
101114
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
102115

103116
// The number of background threads should be at least as much the

tools/db_crashtest.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ def early_argument_parsing_before_main():
6060
global per_iteration_random_seed_override
6161
per_iteration_random_seed_override = args.per_iteration_random_seed_override
6262
global is_remote_db
63-
# Set is_remote_db if remain_args has a non-empty --env_uri= argument
63+
# Set is_remote_db if remain_args has a non-empty --env_uri= or --fs_uri= argument
6464
for arg in remain_args:
6565
parts = arg.split("=", 1)
66-
if parts[0] == "--env_uri" and len(parts) > 1 and parts[1]:
66+
if parts[0] in ["--env_uri", "--fs_uri"] and len(parts) > 1 and parts[1]:
6767
is_remote_db = True
6868
break
6969

@@ -454,32 +454,20 @@ def apply_random_seed_per_iteration():
454454
_DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"
455455

456456
stress_cmd = "./db_stress"
457-
cleanup_cmd = None
458457

459458

460459
def is_release_mode():
461460
return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"
462461

463462

464-
# Generate a unique run ID for this script execution. This ensures each run
465-
# gets a unique database directory when TEST_TMPDIR is set, avoiding issues
466-
# with parameter changes (like use_put_entity_one_in) between runs.
467-
run_id = str(random.randint(0, 2**63))
468-
469-
470463
def get_dbname(test_name):
471464
test_dir_name = "rocksdb_crashtest_" + test_name
472465
test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
473466
if test_tmpdir is None or test_tmpdir == "":
474467
dbname = tempfile.mkdtemp(prefix=test_dir_name)
475468
else:
476-
dbname = test_tmpdir + "/" + test_dir_name + "_" + run_id
469+
dbname = test_tmpdir + "/" + test_dir_name
477470
if not is_remote_db:
478-
shutil.rmtree(dbname, True)
479-
if cleanup_cmd is not None:
480-
print("Running DB cleanup command - %s\n" % cleanup_cmd)
481-
# Ignore failure
482-
os.system(cleanup_cmd)
483471
os.makedirs(dbname, exist_ok=True)
484472
return dbname
485473

@@ -1387,13 +1375,18 @@ def print_output_and_exit_on_error(stdout, stderr, print_stderr_separately=False
13871375

13881376

13891377
def cleanup_after_success(dbname):
1390-
if not is_remote_db:
1391-
shutil.rmtree(dbname, True)
1392-
if cleanup_cmd is not None:
1393-
print("Running DB cleanup command - %s\n" % cleanup_cmd)
1394-
ret = os.system(cleanup_cmd)
1395-
if ret != 0:
1396-
print("WARNING: DB cleanup returned error %d\n" % ret)
1378+
# Use db_stress --destroy_db_and_exit, which simplifies remote DB cleanup
1379+
cleanup_cmd_parts = [stress_cmd, "--destroy_db_and_exit=1", "--db=" + dbname]
1380+
# Pass through relevant arguments for remote DB access
1381+
for arg in remain_args:
1382+
parts = arg.split("=", 1)
1383+
if parts[0] in ["--env_uri", "--fs_uri"]:
1384+
cleanup_cmd_parts.append(arg)
1385+
print("Running DB cleanup command - %s\n" % " ".join(cleanup_cmd_parts))
1386+
ret = subprocess.call(cleanup_cmd_parts)
1387+
if ret != 0:
1388+
print("ERROR: DB cleanup returned error %d\n" % ret)
1389+
sys.exit(2)
13971390

13981391

13991392
# This script runs and kills db_stress multiple times. It checks consistency
@@ -1421,6 +1414,10 @@ def blackbox_crash_main(args, unknown_args):
14211414

14221415
hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
14231416

1417+
# Reset destroy_db_initially after each run (it may have been set by
1418+
# command line for first run only)
1419+
cmd_params["destroy_db_initially"] = 0
1420+
14241421
if not hit_timeout:
14251422
print("Exit Before Killing")
14261423
print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)
@@ -1563,7 +1560,7 @@ def whitebox_crash_main(args, unknown_args):
15631560
"`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style."
15641561
+ "\n"
15651562
)
1566-
additional_opts["destroy_db_initially"] = 1
1563+
cmd_params["destroy_db_initially"] = 1
15671564
prev_compaction_style = cur_compaction_style
15681565

15691566
cmd = gen_cmd(
@@ -1588,6 +1585,11 @@ def whitebox_crash_main(args, unknown_args):
15881585
hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
15891586
cmd, exit_time - time.time() + 900
15901587
)
1588+
1589+
# Reset destroy_db_initially after each run (it may have been set by
1590+
# command line for first run, or set for various reasons for a step)
1591+
cmd_params["destroy_db_initially"] = 0
1592+
15911593
msg = "check_mode={}, kill option={}, exitcode={}\n".format(
15921594
check_mode, additional_opts["kill_random_test"], retncode
15931595
)
@@ -1617,7 +1619,8 @@ def whitebox_crash_main(args, unknown_args):
16171619
# First half of the duration, keep doing kill test. For the next half,
16181620
# try different modes.
16191621
if time.time() > half_time:
1620-
cleanup_after_success(dbname)
1622+
# Set next iteration to destroy DB (works for remote DB)
1623+
cmd_params["destroy_db_initially"] = 1
16211624
if expected_values_dir is not None:
16221625
shutil.rmtree(expected_values_dir, True)
16231626
os.mkdir(expected_values_dir)
@@ -1633,7 +1636,6 @@ def whitebox_crash_main(args, unknown_args):
16331636

16341637
def main():
16351638
global stress_cmd
1636-
global cleanup_cmd
16371639

16381640
parser = argparse.ArgumentParser(
16391641
description="This script runs and kills \
@@ -1649,7 +1651,7 @@ def main():
16491651
parser.add_argument("--test_multiops_txn", action="store_true")
16501652
parser.add_argument("--stress_cmd")
16511653
parser.add_argument("--test_tiered_storage", action="store_true")
1652-
parser.add_argument("--cleanup_cmd")
1654+
parser.add_argument("--cleanup_cmd") # ignore old option for now
16531655
parser.add_argument("--print_stderr_separately", action="store_true", default=False)
16541656

16551657
all_params = dict(
@@ -1690,8 +1692,6 @@ def main():
16901692

16911693
if args.stress_cmd:
16921694
stress_cmd = args.stress_cmd
1693-
if args.cleanup_cmd:
1694-
cleanup_cmd = args.cleanup_cmd
16951695
if args.test_type == "blackbox":
16961696
blackbox_crash_main(args, unknown_args)
16971697
if args.test_type == "whitebox":

0 commit comments

Comments
 (0)