facebook
diff --git a/‎db/c.cc‎
Lines changed: 10 additions & 0 deletions b/‎db/c.cc‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎db/compaction/compaction_job.cc‎
Lines changed: 19 additions & 1 deletion b/‎db/compaction/compaction_job.cc‎
Lines changed: 19 additions & 1 deletion
diff --git a/‎db/db_compaction_test.cc‎
Lines changed: 175 additions & 0 deletions b/‎db/db_compaction_test.cc‎
Lines changed: 175 additions & 0 deletions
diff --git a/‎db/db_impl/db_impl_open.cc‎
Lines changed: 12 additions & 0 deletions b/‎db/db_impl/db_impl_open.cc‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎db/db_options_test.cc‎
Lines changed: 100 additions & 0 deletions b/‎db/db_options_test.cc‎
Lines changed: 100 additions & 0 deletions
@@ -5038,6 +5038,16 @@ unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
   return opt->rep.use_direct_io_for_flush_and_compaction;
 }
 
+// C API setter: stores `v` into the `use_direct_reads_for_compaction` field of
+// the options object wrapped by `opt`.
+void rocksdb_options_set_use_direct_reads_for_compaction(rocksdb_options_t* opt,
+                                                         unsigned char v) {
+  opt->rep.use_direct_reads_for_compaction = v;
+}
+
+// C API getter: returns the current `use_direct_reads_for_compaction` value of
+// the options object wrapped by `opt`, as an unsigned char.
+unsigned char rocksdb_options_get_use_direct_reads_for_compaction(
+    rocksdb_options_t* opt) {
+  return opt->rep.use_direct_reads_for_compaction;
+}
+
 void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt,
                                           unsigned char v) {
   opt->rep.allow_mmap_reads = v;
 
@@ -203,6 +203,12 @@ CompactionJob::CompactionJob(
   assert(job_context);
   assert(job_context->snapshot_context_initialized);
 
+  // Expose the file options used for compaction reads so tests can confirm
+  // that `use_direct_reads_for_compaction` (and related flags) plumb all the
+  // way through to the read path.
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::CompactionJob:FileOptionsForRead",
+                           &file_options_for_read_);
+
   const auto* cfd = compact_->compaction->column_family_data();
   ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
   ThreadStatusUtil::SetColumnFamily(cfd);
@@ -1536,10 +1542,22 @@ InternalIterator* CompactionJob::CreateInputIterator(
 
   // Although the v2 aggregator is what the level iterator(s) know about,
   // the AddTombstones calls will be propagated down to the v1 aggregator.
+  //
+  // When `use_direct_reads_for_compaction` is set while the global
+  // `use_direct_reads` stays off, the shared TableCache is already holding
+  // buffered file handles for these SST files (opened that way for user
+  // reads). Reusing those handles would silently downgrade the compaction
+  // scan back to buffered I/O. Ask the iterator to open ephemeral
+  // O_DIRECT handles instead so the kernel actually bypasses the page
+  // cache for the compaction reads.
+  const bool bypass_cache_for_scan =
+      db_options_.use_direct_reads_for_compaction &&
+      !db_options_.use_direct_reads && file_options_for_read_.use_direct_reads;
   iterators.raw_input =
       std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
           read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
-          file_options_for_read_, boundaries.start, boundaries.end));
+          file_options_for_read_, boundaries.start, boundaries.end,
+          bypass_cache_for_scan));
   InternalIterator* input = iterators.raw_input.get();
 
   if (boundaries.start.has_value() || boundaries.end.has_value()) {
 
@@ -6651,6 +6651,181 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) {
 INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
                         testing::Bool());
 
+// End-to-end check that `use_direct_reads_for_compaction` actually causes
+// compaction-input SST files to be opened with O_DIRECT, even though
+// `use_direct_reads` (the global flag) is left off so user reads stay
+// buffered. The assertion exercises the kernel-level path, not just the
+// FileOptions plumbing: the existing `NewRandomAccessFile:O_DIRECT` sync
+// point in env/fs_posix.cc fires once per fresh open that includes the
+// O_DIRECT flag.
+//
+// This test only runs on platforms that go through the O_DIRECT path
+// (Linux / non-BSD POSIX), since that is the configuration RocksDB users
+// actually deploy with the direct-I/O knobs. On other platforms it is
+// silently bypassed.
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) && \
+    !defined(OS_WIN)
+TEST_F(DBCompactionTest, UseDirectReadsForCompactionEndToEnd) {
+  if (!IsDirectIOSupported()) {
+    ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+    return;
+  }
+
+  Options options = CurrentOptions();
+  Destroy(options);
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  // User reads stay buffered, compaction reads should switch to O_DIRECT.
+  options.use_direct_reads = false;
+  options.use_direct_reads_for_compaction = true;
+  // Isolate the read-side change; leave the compaction write path buffered.
+  options.use_direct_io_for_flush_and_compaction = false;
+
+  int observed_run_starts = 0;
+  int observed_odirect_opens = 0;
+  bool observed_direct_compaction_read = false;
+  int observed_callbacks = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+  // Plumbing-level probe: the compaction-read FileOptions should carry
+  // use_direct_reads = true when the new flag is enabled.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::CompactionJob:FileOptionsForRead", [&](void* arg) {
+        const auto* fo = static_cast<const FileOptions*>(arg);
+        ++observed_callbacks;
+        if (fo != nullptr && fo->use_direct_reads) {
+          observed_direct_compaction_read = true;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start",
+      [&](void* /*arg*/) { ++observed_run_starts; });
+  // Kernel-level probe: this sync point fires only when the OS open() call
+  // is being issued with O_DIRECT in its flags. Hitting it proves we are
+  // actually changing the cache-mode for compaction reads, not just the
+  // in-memory FileOptions struct.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT",
+      [&](void* /*arg*/) { ++observed_odirect_opens; });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (s.IsNotSupported() || s.IsInvalidArgument()) {
+    ROCKSDB_GTEST_BYPASS(
+        "Direct IO reads not supported in this test environment");
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    return;
+  }
+  ASSERT_OK(s);
+
+  // Produce two L0 files with OVERLAPPING key ranges so that CompactRange has
+  // actual merge work to do (otherwise RocksDB performs a trivial file move
+  // and never constructs a CompactionJob).
+  const std::string value(4096, 'v');
+  for (int i = 0; i < 64; ++i) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 64; ++i) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+
+  // Issue user reads before compacting and check the returned values. These
+  // reads are expected to stay on the buffered path.
+  for (int i = 0; i < 8; ++i) {
+    std::string actual;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &actual));
+    ASSERT_EQ(value, actual);
+  }
+  // Snapshot the O_DIRECT open count so that opens recorded during DB open,
+  // flush, or the user reads above cannot satisfy the headline assertion.
+  const int odirect_opens_before_compaction = observed_odirect_opens;
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Wait for compaction to complete and CompactionJob to be constructed.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Diagnostic: confirm that the compaction actually ran. If it didn't, the
+  // missing FileOptions sync-point hits would be a test-infrastructure issue,
+  // not a regression in the new option.
+  ASSERT_GT(observed_run_starts, 0)
+      << "CompactionJob::Run():Start never fired; CompactRange did not "
+         "schedule a compaction.";
+  ASSERT_GT(observed_callbacks, 0);
+  ASSERT_TRUE(observed_direct_compaction_read);
+  // The headline assertion: at least one file open issued after the snapshot
+  // (i.e. while the compaction ran) went through the O_DIRECT path. Without
+  // the TableCache bypass plumbing the count would not move, because
+  // compaction would silently reuse the buffered handles already cached for
+  // user reads.
+  EXPECT_GT(observed_odirect_opens, odirect_opens_before_compaction)
+      << "no compaction-input opens went through O_DIRECT; "
+         "opens recorded before compaction="
+      << odirect_opens_before_compaction;
+
+  // Quick sanity sweep after compaction to confirm data is intact.
+  for (int i = 0; i < 64; ++i) {
+    std::string actual;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &actual));
+    ASSERT_EQ(value, actual);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Destroy(options);
+}
+
+// Confirms that when use_direct_reads_for_compaction is OFF, compaction reads
+// stay on the buffered path: neither the compaction-read FileOptions nor the
+// kernel-level O_DIRECT open should ever be triggered. Pairs with the test
+// above to cover both halves of the on/off switch.
+TEST_F(DBCompactionTest, UseDirectReadsForCompactionOffStaysBuffered) {
+  // Every direct-I/O knob is explicitly off for this run.
+  Options options = CurrentOptions();
+  Destroy(options);
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.use_direct_reads = false;
+  options.use_direct_reads_for_compaction = false;
+  options.use_direct_io_for_flush_and_compaction = false;
+
+  int file_options_probe_hits = 0;
+  bool saw_direct_read_file_options = false;
+  int odirect_open_count = 0;
+
+  auto* const sync_point = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
+  // Records how often a CompactionJob exposed its read FileOptions, and
+  // whether any of them requested direct reads.
+  sync_point->SetCallBack(
+      "CompactionJob::CompactionJob:FileOptionsForRead", [&](void* arg) {
+        ++file_options_probe_hits;
+        if (static_cast<const FileOptions*>(arg)->use_direct_reads) {
+          saw_direct_read_file_options = true;
+        }
+      });
+  // Counts every hit of the O_DIRECT random-access-file sync point.
+  sync_point->SetCallBack("NewRandomAccessFile:O_DIRECT",
+                          [&](void* /*arg*/) { ++odirect_open_count; });
+  sync_point->EnableProcessing();
+
+  ASSERT_OK(TryReopen(options));
+
+  // Write the same 64 keys twice, flushing after each round, so there are two
+  // files with overlapping key ranges for CompactRange to work on.
+  const std::string value(4096, 'v');
+  for (int round = 0; round < 2; ++round) {
+    for (int i = 0; i < 64; ++i) {
+      ASSERT_OK(Put(Key(i), value));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // A CompactionJob must have been constructed, its read FileOptions must not
+  // have asked for direct reads, and no O_DIRECT open may have been recorded.
+  ASSERT_GT(file_options_probe_hits, 0);
+  ASSERT_FALSE(saw_direct_read_file_options);
+  ASSERT_EQ(0, odirect_open_count);
+
+  sync_point->DisableProcessing();
+  sync_point->ClearAllCallBacks();
+  Destroy(options);
+}
+#endif  // !defined(OS_MACOSX) && !defined(OS_OPENBSD) && ...
+
 class CompactionPriTest : public DBTestBase,
                           public testing::WithParamInterface<uint32_t> {
  public:
 
@@ -244,6 +244,18 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
         "then direct I/O reads (use_direct_reads) must be disabled. ");
   }
 
+  if (db_options.allow_mmap_reads &&
+      db_options.use_direct_reads_for_compaction) {
+    // Memory-mapped reads and O_DIRECT reads are incompatible ways of
+    // accessing the same file, so reject the combination up front, the same
+    // way allow_mmap_reads + use_direct_reads is rejected above, rather than
+    // relying on lower-level asserts.
+    return Status::NotSupported(
+        "If memory mapped reads (allow_mmap_reads) are enabled "
+        "then compaction-only direct I/O reads "
+        "(use_direct_reads_for_compaction) must be disabled. ");
+  }
+
   if (db_options.allow_mmap_writes &&
       db_options.use_direct_io_for_flush_and_compaction) {
     return Status::NotSupported(
 
@@ -1729,6 +1729,106 @@ TEST_F(DBOptionsTest, SetOptionsMultipleColumnFamilies) {
   ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions);
 }
 
+// Validates the new option's serialization/parse round trip, default value,
+// and validation against incompatible options. Also exercises the
+// FileSystem::OptimizeForCompactionTableRead / OptimizeForBlobFileRead helpers
+// directly to confirm the new flag truly switches use_direct_reads on for
+// compaction reads.
+TEST_F(DBOptionsTest, UseDirectReadsForCompactionOptionMechanics) {
+  // Default value must remain false to preserve existing semantics.
+  ASSERT_FALSE(DBOptions().use_direct_reads_for_compaction);
+
+  // Round-trip through GetDBOptionsFromString.
+  DBOptions parsed;
+  ConfigOptions config_options;
+  ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(),
+                                   "use_direct_reads_for_compaction=true",
+                                   &parsed));
+  ASSERT_TRUE(parsed.use_direct_reads_for_compaction);
+  ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(),
+                                   "use_direct_reads_for_compaction=false",
+                                   &parsed));
+  ASSERT_FALSE(parsed.use_direct_reads_for_compaction);
+
+  // Confirm the option is reachable through the live DB's options round trip.
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.use_direct_reads_for_compaction = true;
+  // Use a buffered user-read setup so the new flag is the one doing the work.
+  options.use_direct_reads = false;
+  options.use_direct_io_for_flush_and_compaction = false;
+  // Direct I/O may not be supported in every test environment.
+  Status s = TryReopen(options);
+  if (s.IsNotSupported() || s.IsInvalidArgument()) {
+    // Direct I/O is not available here. Reopen with the flag cleared and
+    // check that the live DB reports it as off; the enabled case is still
+    // covered by the GetDBOptionsFromString round trip above.
+    Options buffered = CurrentOptions();
+    buffered.create_if_missing = true;
+    buffered.use_direct_reads_for_compaction = false;
+    Reopen(buffered);
+    ASSERT_FALSE(dbfull()->GetDBOptions().use_direct_reads_for_compaction);
+  } else {
+    ASSERT_OK(s);
+    ASSERT_TRUE(dbfull()->GetDBOptions().use_direct_reads_for_compaction);
+  }
+  Close();
+
+  // mmap_reads + use_direct_reads_for_compaction is rejected at Open time, the
+  // same way mmap_reads + use_direct_reads has always been rejected.
+  Options bad_options = CurrentOptions();
+  bad_options.create_if_missing = true;
+  bad_options.allow_mmap_reads = true;
+  bad_options.use_direct_reads_for_compaction = true;
+  Status bad_status = TryReopen(bad_options);
+  ASSERT_TRUE(bad_status.IsNotSupported()) << bad_status.ToString();
+
+  // Direct test of OptimizeForCompactionTableRead: feeding only the new flag
+  // through ImmutableDBOptions should turn on use_direct_reads in the returned
+  // FileOptions while not touching use_direct_writes. OptimizeForBlobFileRead
+  // intentionally still tracks `use_direct_reads` only -- blob file reads in
+  // production go through BlobFileCache (not OptimizeForBlobFileRead), and
+  // BackupEngine's blob copy path should not be affected by a flag named "for
+  // compaction".
+  Options check_options;
+  check_options.use_direct_reads = false;
+  check_options.use_direct_reads_for_compaction = true;
+  check_options.use_direct_io_for_flush_and_compaction = false;
+  ImmutableDBOptions immutable(check_options);
+  FileOptions in_opts;
+  in_opts.use_direct_reads = false;
+  FileOptions sst_read =
+      env_->GetFileSystem()->OptimizeForCompactionTableRead(in_opts, immutable);
+  FileOptions blob_read =
+      env_->GetFileSystem()->OptimizeForBlobFileRead(in_opts, immutable);
+  ASSERT_TRUE(sst_read.use_direct_reads);
+  ASSERT_FALSE(blob_read.use_direct_reads);
+  ASSERT_FALSE(sst_read.use_direct_writes);
+
+  // When both flags are off, behavior stays exactly as before.
+  Options off_options;
+  off_options.use_direct_reads = false;
+  off_options.use_direct_reads_for_compaction = false;
+  off_options.use_direct_io_for_flush_and_compaction = false;
+  ImmutableDBOptions immutable_off(off_options);
+  FileOptions sst_read_off =
+      env_->GetFileSystem()->OptimizeForCompactionTableRead(in_opts,
+                                                            immutable_off);
+  ASSERT_FALSE(sst_read_off.use_direct_reads);
+
+  // When use_direct_reads is on, the new flag is irrelevant for the returned
+  // FileOptions but must not regress the existing behavior.
+  Options global_on_options;
+  global_on_options.use_direct_reads = true;
+  global_on_options.use_direct_reads_for_compaction = false;
+  global_on_options.use_direct_io_for_flush_and_compaction = false;
+  ImmutableDBOptions immutable_global(global_on_options);
+  FileOptions sst_read_global =
+      env_->GetFileSystem()->OptimizeForCompactionTableRead(in_opts,
+                                                            immutable_global);
+  ASSERT_TRUE(sst_read_global.use_direct_reads);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {