Skip to content

Commit 21723bb

Browse files
xingbowang authored and meta-codesync[bot] committed
Add async WAL precreation
Summary:
- Add experimental immutable `DBOptions::async_wal_precreate` to reserve and open one future WAL on a background HIGH-priority task, with sanitization that disables the optimization when WAL recycling is configured.
- Split WAL creation into open/preallocate and start phases so `SwitchMemtable()` can consume a prepared WAL after writing normal WAL metadata, wait for in-flight precreation, fall back to synchronous creation, and delete an unstarted prepared WAL on start failure.
- Keep WAL numbering, close, recovery, and read-only open safe for empty future WAL files left by async precreation; `error_if_wal_file_exists=true` now rejects non-empty WALs while tolerating empty WALs.
- Add public option plumbing for the C API, options parsing/stringification, random option testing, `db_bench`, `db_stress`, and crash-test configuration.
- Add WAL precreate statistics counters plus Java `TickerType`/JNI mappings, and update C++, C, and Java read-only-open documentation for the empty-WAL behavior.
- Add focused WAL/option/C/Java tests for async precreate ready/wait/failure/recovery paths, read-only WAL detection, option sanitization, and API plumbing, plus write-flow docs and unreleased history entries for the new feature and behavior change.

PR #14738

Reviewed By: pdillinger

Differential Revision: D105020559

fbshipit-source-id: 5059b424702e021abb8de65ceeb6d3b975280ffc
1 parent a2c96df commit 21723bb

32 files changed

Lines changed: 823 additions & 32 deletions

db/c.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5286,6 +5286,15 @@ size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) {
52865286
return opt->rep.recycle_log_file_num;
52875287
}
52885288

5289+
// C API setter for the `async_wal_precreate` DB option. Any non-zero `v`
// enables the option; zero disables it.
void rocksdb_options_set_async_wal_precreate(rocksdb_options_t* opt,
                                             unsigned char v) {
  opt->rep.async_wal_precreate = (v != 0);
}
5293+
5294+
// C API getter for the `async_wal_precreate` DB option. Returns 1 when the
// option is set and 0 otherwise.
unsigned char rocksdb_options_get_async_wal_precreate(rocksdb_options_t* opt) {
  return opt->rep.async_wal_precreate ? 1 : 0;
}
5297+
52895298
void rocksdb_options_set_soft_pending_compaction_bytes_limit(
52905299
rocksdb_options_t* opt, size_t v) {
52915300
opt->rep.soft_pending_compaction_bytes_limit = v;

db/c_test.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2869,6 +2869,9 @@ int main(int argc, char** argv) {
28692869
rocksdb_options_set_track_and_verify_wals_in_manifest(o, 42);
28702870
CheckCondition(1 ==
28712871
rocksdb_options_get_track_and_verify_wals_in_manifest(o));
2872+
CheckCondition(0 == rocksdb_options_get_async_wal_precreate(o));
2873+
rocksdb_options_set_async_wal_precreate(o, 1);
2874+
CheckCondition(1 == rocksdb_options_get_async_wal_precreate(o));
28722875

28732876
/* Blob Options */
28742877
rocksdb_options_set_enable_blob_files(o, 1);

db/db_impl/db_impl.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,12 +789,27 @@ Status DBImpl::CloseHelper() {
789789
bg_flush_scheduled_ || bg_purge_scheduled_ ||
790790
bg_pressure_callback_in_progress_ ||
791791
bg_async_file_open_state_ == AsyncFileOpenState::kScheduled ||
792+
async_wal_precreate_state_ == AsyncWALPrecreateState::kScheduled ||
792793
pending_purge_obsolete_files_ ||
793794
error_handler_.IsRecoveryInProgress()) {
794795
TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
795796
bg_cv_.Wait();
796797
}
797798

799+
// Release any opened-but-unpublished WAL writer after the in-flight worker
800+
// has published its result. Clear the DB-owned async slot while holding
801+
// mutex_, but destroy the detached writer after dropping mutex_ because
802+
// log::Writer / WritableFileWriter destruction can flush and close the file.
803+
// The file itself can be left behind as an empty future WAL; recovery already
804+
// tolerates it and marks its file number used if observed.
805+
UnpublishedWAL unused_async_wal = std::move(async_wal_precreate_wal_);
806+
async_wal_precreate_state_ = AsyncWALPrecreateState::kNotScheduled;
807+
if (unused_async_wal.writer) {
808+
mutex_.Unlock();
809+
unused_async_wal.Reset();
810+
mutex_.Lock();
811+
}
812+
798813
// Ensure subclasses don't forget to schedule async file opening
799814
assert(!immutable_db_options_.open_files_async || !opened_successfully_ ||
800815
bg_async_file_open_state_ != AsyncFileOpenState::kNotScheduled);

db/db_impl/db_impl.h

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include <limits>
1616
#include <list>
1717
#include <map>
18+
#include <memory>
19+
#include <optional>
1820
#include <set>
1921
#include <string>
2022
#include <unordered_map>
@@ -2810,7 +2812,68 @@ class DBImpl : public DB {
28102812
size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
28112813
Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
28122814

2813-
IOStatus CreateWAL(const WriteOptions& write_options, uint64_t log_file_num,
2815+
// Returns true when async WAL precreation is enabled and compatible with the
2816+
// active WAL strategy. WAL recycling already avoids file creation latency, so
2817+
// precreation is disabled when recycle_log_file_num is non-zero.
2818+
bool AsyncWALPrecreateEnabled() const;
2819+
2820+
// A WAL file that has a reserved file number and may have an opened writer,
2821+
// but has not been added to DBImpl's in-memory logical WAL tracking lists
2822+
// (logs_ and alive_wal_files_).
2823+
struct UnpublishedWAL {
2824+
uint64_t log_number = 0;
2825+
std::unique_ptr<log::Writer> writer;
2826+
2827+
UnpublishedWAL() = default;
2828+
UnpublishedWAL(const UnpublishedWAL&) = delete;
2829+
UnpublishedWAL& operator=(const UnpublishedWAL&) = delete;
2830+
2831+
UnpublishedWAL(UnpublishedWAL&& other) noexcept {
2832+
*this = std::move(other);
2833+
}
2834+
UnpublishedWAL& operator=(UnpublishedWAL&& other) noexcept {
2835+
if (this != &other) {
2836+
log_number = other.log_number;
2837+
writer = std::move(other.writer);
2838+
other.Reset();
2839+
}
2840+
return *this;
2841+
}
2842+
2843+
void Reset() {
2844+
log_number = 0;
2845+
writer.reset();
2846+
}
2847+
};
2848+
2849+
// Reserves the next WAL file number and schedules a HIGH-priority background
2850+
// task to precreate that WAL file. A precreated WAL is not a logical WAL
2851+
// until a foreground WAL rotation consumes it.
2852+
void MaybeScheduleAsyncWALPrecreate(size_t preallocate_block_size);
2853+
2854+
// Background task for opening the reserved future WAL and publishing the
2855+
// result under mutex_.
2856+
static void BGWorkAsyncWALPrecreate(void* arg);
2857+
2858+
// Waits for an in-flight async WAL precreation and returns a prepared WAL if
2859+
// one is available. If precreation failed, returns an empty WAL and lets the
2860+
// foreground rotation create the WAL synchronously. Caller must hold mutex_.
2861+
UnpublishedWAL WaitForAsyncWALPrecreate();
2862+
2863+
// Opens and preallocates a WAL writer without writing logical WAL records.
2864+
// Used by async WAL precreation and by synchronous WAL creation.
2865+
IOStatus CreateWALWriter(const DBOptions& db_options, uint64_t log_file_num,
2866+
uint64_t recycle_log_number,
2867+
size_t preallocate_block_size,
2868+
UnpublishedWAL* new_wal);
2869+
2870+
// Starts an opened WAL file by writing the initial records required before it
2871+
// can be installed as the current WAL for foreground writes.
2872+
IOStatus StartWALFile(const WriteOptions& write_options,
2873+
const PredecessorWALInfo& predecessor_wal_info,
2874+
log::Writer* new_log);
2875+
IOStatus CreateWAL(const DBOptions& db_options,
2876+
const WriteOptions& write_options, uint64_t log_file_num,
28142877
uint64_t recycle_log_number, size_t preallocate_block_size,
28152878
const PredecessorWALInfo& predecessor_wal_info,
28162879
log::Writer** new_log);
@@ -3306,6 +3369,28 @@ class DBImpl : public DB {
33063369
AsyncFileOpenState bg_async_file_open_state_ =
33073370
AsyncFileOpenState::kNotScheduled;
33083371

3372+
// State machine for the single async WAL precreation slot protected by
3373+
// mutex_. Background precreation failure returns to kNotScheduled; foreground
3374+
// rotation handles it the same as no prepared WAL and creates one
3375+
// synchronously. kScheduled owns a reserved file number; kReady owns an
3376+
// opened writer that has not been started or added to logical WAL tracking.
3377+
enum class AsyncWALPrecreateState : uint8_t {
3378+
kNotScheduled = 0, // No WAL precreate work is in-flight or ready. Also the state after a background precreation failure and after CloseHelper() clears the slot.
3379+
kScheduled, // Background task owns creation of the reserved WAL (a file number is reserved). CloseHelper() waits on bg_cv_ while in this state.
3380+
kReady, // Reserved WAL writer is open but not logically live: not yet started or added to logs_ / alive_wal_files_.
3381+
};
3382+
3383+
// Protected by mutex_. Tracks at most one background precreated WAL. A
3384+
// precreated WAL is only reserved empty storage until SwitchMemtable()
3385+
// consumes it and installs it in DBImpl's in-memory logical WAL tracking
3386+
// lists (logs_ and alive_wal_files_).
3387+
AsyncWALPrecreateState async_wal_precreate_state_ =
3388+
AsyncWALPrecreateState::kNotScheduled;
3389+
3390+
// Reserved in-flight/ready precreated WAL. The writer is populated only while
3391+
// state is kReady.
3392+
UnpublishedWAL async_wal_precreate_wal_;
3393+
33093394
std::deque<ManualCompactionState*> manual_compaction_dequeue_;
33103395

33113396
// shall we disable deletion of obsolete files

0 commit comments

Comments
 (0)