facebook
diff --git a/‎db/c.cc‎
Lines changed: 10 additions & 0 deletions b/‎db/c.cc‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎db/compaction/compaction_job.cc‎
Lines changed: 28 additions & 1 deletion b/‎db/compaction/compaction_job.cc‎
Lines changed: 28 additions & 1 deletion
diff --git a/‎db/db_compaction_test.cc‎
Lines changed: 315 additions & 0 deletions b/‎db/db_compaction_test.cc‎
Lines changed: 315 additions & 0 deletions
@@ -5038,6 +5038,16 @@ unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
   return opt->rep.use_direct_io_for_flush_and_compaction;
 }
 
+// C API setter: stores `v` into the `use_direct_reads_for_compaction` field
+// of the options object wrapped by `opt` (`opt->rep`).
+void rocksdb_options_set_use_direct_reads_for_compaction(rocksdb_options_t* opt,
+                                                         unsigned char v) {
+  opt->rep.use_direct_reads_for_compaction = v;
+}
+
+// C API getter: returns the current `use_direct_reads_for_compaction` field
+// of the options object wrapped by `opt` (`opt->rep`) as an unsigned char.
+unsigned char rocksdb_options_get_use_direct_reads_for_compaction(
+    rocksdb_options_t* opt) {
+  return opt->rep.use_direct_reads_for_compaction;
+}
+
 void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt,
                                           unsigned char v) {
   opt->rep.allow_mmap_reads = v;
 
@@ -203,6 +203,12 @@ CompactionJob::CompactionJob(
   assert(job_context);
   assert(job_context->snapshot_context_initialized);
 
+  // Expose the file options used for compaction reads so tests can confirm
+  // that `use_direct_reads_for_compaction` (and related flags) plumb all the
+  // way through to the read path.
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::CompactionJob:FileOptionsForRead",
+                           &file_options_for_read_);
+
   const auto* cfd = compact_->compaction->column_family_data();
   ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
   ThreadStatusUtil::SetColumnFamily(cfd);
@@ -1536,10 +1542,31 @@ InternalIterator* CompactionJob::CreateInputIterator(
 
   // Although the v2 aggregator is what the level iterator(s) know about,
   // the AddTombstones calls will be propagated down to the v1 aggregator.
+  //
+  // When `use_direct_reads_for_compaction` is set while the global
+  // `use_direct_reads` stays off, the shared TableCache is already holding
+  // buffered file handles for these SST files (opened that way for user
+  // reads). Reusing those handles would silently downgrade the compaction
+  // scan back to buffered I/O. Ask the iterator to open ephemeral
+  // O_DIRECT handles instead so the kernel actually bypasses the page
+  // cache for the compaction reads.
+  //
+  // The third clause (`file_options_for_read_.use_direct_reads`) is
+  // defensive: it confirms that `OptimizeForCompactionTableRead` actually
+  // requested direct I/O on the read FileOptions we will hand to the
+  // iterator. The base FileSystem implementation always sets it when the
+  // flag combination above is true, but a custom FileSystem could override
+  // OptimizeForCompactionTableRead without honoring the new flag -- in
+  // which case bypassing the cache would give us buffered handles anyway,
+  // which is wasteful. Skip the bypass in that case.
+  const bool bypass_cache_for_scan =
+      db_options_.use_direct_reads_for_compaction &&
+      !db_options_.use_direct_reads && file_options_for_read_.use_direct_reads;
   iterators.raw_input =
       std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
           read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
-          file_options_for_read_, boundaries.start, boundaries.end));
+          file_options_for_read_, boundaries.start, boundaries.end,
+          bypass_cache_for_scan));
   InternalIterator* input = iterators.raw_input.get();
 
   if (boundaries.start.has_value() || boundaries.end.has_value()) {
 
@@ -6651,6 +6651,321 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) {
 INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
                         testing::Bool());
 
+// End-to-end check that `use_direct_reads_for_compaction` actually causes
+// compaction-input SST files to be opened with O_DIRECT, even though
+// `use_direct_reads` (the global flag) is left off so user reads stay
+// buffered. The assertion exercises the kernel-level path, not just the
+// FileOptions plumbing: the existing `NewRandomAccessFile:O_DIRECT` sync
+// point in env/fs_posix.cc fires once per fresh open that includes the
+// O_DIRECT flag.
+//
+// This test and the two that follow are compiled only on platforms that go
+// through the O_DIRECT path (everything except macOS, OpenBSD, Solaris and
+// Windows), since that is the configuration RocksDB users actually deploy
+// with the direct-I/O knobs. On the excluded platforms they are not built.
+#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) && \
+    !defined(OS_WIN)
+TEST_F(DBCompactionTest, UseDirectReadsForCompactionEndToEnd) {
+  if (!IsDirectIOSupported()) {
+    ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+    return;
+  }
+
+  Options options = CurrentOptions();
+  Destroy(options);
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  // User reads stay buffered, compaction reads should switch to O_DIRECT.
+  options.use_direct_reads = false;
+  options.use_direct_reads_for_compaction = true;
+  // Isolate the read-side change; leave the compaction write path buffered.
+  options.use_direct_io_for_flush_and_compaction = false;
+
+  // Sync-point callbacks fire on compaction threads while assertions read
+  // these counters on the test thread. Use atomics to avoid a data race
+  // even when (as in this test) the workload is structured so the threads
+  // synchronize on TEST_WaitForCompact before reading.
+  std::atomic<int> observed_run_starts{0};
+  std::atomic<int> observed_odirect_opens{0};
+  std::atomic<bool> observed_direct_compaction_read{false};
+  std::atomic<int> observed_callbacks{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+  // Plumbing-level probe: the compaction-read FileOptions should carry
+  // use_direct_reads = true when the new flag is enabled.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::CompactionJob:FileOptionsForRead", [&](void* arg) {
+        const auto* fo = static_cast<const FileOptions*>(arg);
+        observed_callbacks.fetch_add(1, std::memory_order_relaxed);
+        if (fo != nullptr && fo->use_direct_reads) {
+          observed_direct_compaction_read.store(true,
+                                                std::memory_order_relaxed);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        observed_run_starts.fetch_add(1, std::memory_order_relaxed);
+      });
+  // Kernel-level probe: this sync point fires only when the OS open() call
+  // is being issued with O_DIRECT in its flags. Hitting it proves we are
+  // actually changing the cache-mode for compaction reads, not just the
+  // in-memory FileOptions struct.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT", [&](void* /*arg*/) {
+        observed_odirect_opens.fetch_add(1, std::memory_order_relaxed);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (s.IsNotSupported() || s.IsInvalidArgument()) {
+    ROCKSDB_GTEST_BYPASS(
+        "Direct IO reads not supported in this test environment");
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    return;
+  }
+  ASSERT_OK(s);
+
+  // Produce two L0 files with OVERLAPPING key ranges so that CompactRange has
+  // actual merge work to do (otherwise RocksDB performs a trivial file move
+  // and never constructs a CompactionJob).
+  const std::string value(4096, 'v');
+  for (int i = 0; i < 64; ++i) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+  for (int i = 0; i < 64; ++i) {
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+
+  // User reads must keep working with the compaction-only flag enabled. This
+  // loop only checks the values returned by Get(); it does not inspect how
+  // the files were opened.
+  for (int i = 0; i < 8; ++i) {
+    std::string actual;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &actual));
+    ASSERT_EQ(value, actual);
+  }
+
+  // Baseline taken after the open, the flushes and the user reads, so the
+  // headline assertion below counts only O_DIRECT opens issued from here on,
+  // i.e. while the compaction runs.
+  const int odirect_before = observed_odirect_opens.load();
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Wait for compaction to complete and CompactionJob to be constructed.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Diagnostic: confirm that the compaction actually ran. If it didn't, the
+  // missing FileOptions sync-point hits would be a test-infrastructure issue,
+  // not a regression in the new option.
+  ASSERT_GT(observed_run_starts.load(), 0)
+      << "CompactionJob::Run():Start never fired; CompactRange did not "
+         "schedule a compaction.";
+  ASSERT_GT(observed_callbacks.load(), 0);
+  ASSERT_TRUE(observed_direct_compaction_read.load());
+  // The headline assertion: at least one file open issued after the baseline
+  // went through the O_DIRECT path. Without the TableCache bypass plumbing
+  // the counter would not grow, because compaction would silently reuse the
+  // buffered handles already cached for user reads.
+  EXPECT_GT(observed_odirect_opens.load(), odirect_before)
+      << "no compaction-input opens went through O_DIRECT; "
+         "observed_odirect_opens="
+      << observed_odirect_opens.load() << " (baseline before compaction "
+      << odirect_before << ")";
+
+  // Quick sanity sweep after compaction to confirm data is intact.
+  for (int i = 0; i < 64; ++i) {
+    std::string actual;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &actual));
+    ASSERT_EQ(value, actual);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Destroy(options);
+}
+
+// Confirms that when use_direct_reads_for_compaction is OFF, compaction reads
+// stay on the buffered path: neither the compaction-read FileOptions nor the
+// kernel-level O_DIRECT open should ever be triggered. Pairs with the test
+// above to cover both halves of the on/off switch.
+TEST_F(DBCompactionTest, UseDirectReadsForCompactionOffStaysBuffered) {
+  // Every direct-I/O knob is switched off for this run.
+  Options options = CurrentOptions();
+  Destroy(options);
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.use_direct_reads = false;
+  options.use_direct_reads_for_compaction = false;
+  options.use_direct_io_for_flush_and_compaction = false;
+
+  auto* const sync_point = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
+
+  // Updated from the sync-point callbacks, inspected by the assertions at
+  // the end of the test.
+  std::atomic<bool> saw_direct_file_options{false};
+  std::atomic<int> file_options_probe_hits{0};
+  std::atomic<int> odirect_open_count{0};
+
+  // Records every FileOptions handed to the probe and whether any of them
+  // requested direct reads.
+  sync_point->SetCallBack(
+      "CompactionJob::CompactionJob:FileOptionsForRead", [&](void* arg) {
+        const auto* read_opts = static_cast<const FileOptions*>(arg);
+        file_options_probe_hits.fetch_add(1, std::memory_order_relaxed);
+        if (read_opts->use_direct_reads) {
+          saw_direct_file_options.store(true, std::memory_order_relaxed);
+        }
+      });
+  // Counts hits on the O_DIRECT open sync point.
+  sync_point->SetCallBack("NewRandomAccessFile:O_DIRECT", [&](void* /*arg*/) {
+    odirect_open_count.fetch_add(1, std::memory_order_relaxed);
+  });
+  sync_point->EnableProcessing();
+
+  ASSERT_OK(TryReopen(options));
+
+  // Write the same 64 keys twice, flushing after each pass, so the manual
+  // compaction below gets two flushed files covering one key range.
+  const std::string payload(4096, 'v');
+  for (int pass = 0; pass < 2; ++pass) {
+    for (int k = 0; k < 64; ++k) {
+      ASSERT_OK(Put(Key(k), payload));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // The probe must have fired, must never have seen use_direct_reads set,
+  // and the O_DIRECT open sync point must never have been hit.
+  ASSERT_GT(file_options_probe_hits.load(), 0);
+  ASSERT_FALSE(saw_direct_file_options.load());
+  ASSERT_EQ(0, odirect_open_count.load());
+
+  sync_point->DisableProcessing();
+  sync_point->ClearAllCallBacks();
+  Destroy(options);
+}
+
+// Exercise the LevelIterator bypass path (compactions with non-L0 inputs)
+// with range tombstones present, which is where the ephemeral TableReader's
+// lifetime is non-trivially coupled to the range_tombstone_iter the file
+// iterator hands back. The end-to-end test above only constructs two L0
+// files, which compact via the direct NewIterator path in MakeInputIterator
+// and never go through LevelIterator. This test first compacts a wave of L0
+// files (each written with a range tombstone) out of L0, then writes a
+// second, overlapping wave and compacts everything again, so the second
+// compaction has lower-level inputs and LevelIterator::NewFileIterator is
+// the one driving the O_DIRECT bypass. If the TableReader lifetime were tied
+// incorrectly to the file iterator, the range-tombstone iterator created
+// from the same reader would either crash or be flagged by sanitizers when
+// LevelIterator transitions between files. The final contents of the DB are
+// checked key by key against a model that is updated as the writes happen.
+TEST_F(DBCompactionTest,
+       UseDirectReadsForCompactionLevelIteratorWithTombstones) {
+  // Same up-front bypass as UseDirectReadsForCompactionEndToEnd.
+  if (!IsDirectIOSupported()) {
+    ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+    return;
+  }
+
+  Options options = CurrentOptions();
+  Destroy(options);
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.use_direct_reads = false;
+  options.use_direct_reads_for_compaction = true;
+  options.use_direct_io_for_flush_and_compaction = false;
+  // Small files / small level base so we can pack data into the lower levels
+  // with a few flushes and CompactRange calls instead of needing millions of
+  // keys.
+  options.write_buffer_size = 64 * 1024;
+  options.target_file_size_base = 64 * 1024;
+  options.max_bytes_for_level_base = 256 * 1024;
+  options.level0_file_num_compaction_trigger = 100;  // never auto-trigger
+
+  std::atomic<int> observed_odirect_opens{0};
+  std::atomic<int> observed_run_starts{0};
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewRandomAccessFile:O_DIRECT", [&](void* /*arg*/) {
+        observed_odirect_opens.fetch_add(1, std::memory_order_relaxed);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
+        observed_run_starts.fetch_add(1, std::memory_order_relaxed);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+  if (s.IsNotSupported() || s.IsInvalidArgument()) {
+    ROCKSDB_GTEST_BYPASS(
+        "Direct IO reads not supported in this test environment");
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    return;
+  }
+  ASSERT_OK(s);
+
+  const std::string value(1024, 'v');
+
+  // Model of the expected final contents: expected_present[i] is true iff
+  // the most recent operation applied to Key(i) was a Put. It is updated in
+  // lock-step with the writes issued by write_batch below.
+  constexpr int kNumKeys = 4 * 200;
+  bool expected_present[kNumKeys] = {};
+
+  // Writes Key(begin)..Key(end - 1), optionally range-deletes the middle
+  // half of that span, then flushes.
+  auto write_batch = [&](int begin, int end, bool with_range_tombstone) {
+    ASSERT_LE(end, kNumKeys);
+    for (int i = begin; i < end; ++i) {
+      ASSERT_OK(Put(Key(i), value));
+      expected_present[i] = true;
+    }
+    if (with_range_tombstone) {
+      // Drop a slice in the middle of the just-written range so the
+      // compaction inputs carry range tombstones to iterate over.
+      const int del_begin = begin + (end - begin) / 4;
+      const int del_end = begin + 3 * (end - begin) / 4;
+      ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                                 Key(del_begin), Key(del_end)));
+      // DeleteRange covers [del_begin, del_end).
+      for (int i = del_begin; i < del_end; ++i) {
+        expected_present[i] = false;
+      }
+    }
+    ASSERT_OK(Flush());
+  };
+
+  // Build up data in L0 with a range tombstone per batch. Each batch ends
+  // with an explicit flush.
+  for (int batch = 0; batch < 4; ++batch) {
+    write_batch(batch * 200, batch * 200 + 200, /*with_range_tombstone=*/true);
+  }
+  // Push the first wave out of L0 with one manual CompactRange so that the
+  // next compaction has lower-level files among its inputs.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Add another wave at L0 that overlaps with what is now at the lower
+  // levels, each batch again carrying a range tombstone.
+  for (int batch = 0; batch < 2; ++batch) {
+    write_batch(batch * 300 + 50, batch * 300 + 250,
+                /*with_range_tombstone=*/true);
+  }
+
+  const int run_starts_before = observed_run_starts.load();
+  const int odirect_before = observed_odirect_opens.load();
+
+  // The big one: compact everything together. This is expected to construct
+  // a LevelIterator over the existing lower-level files with the bypass
+  // path. If the ephemeral TableReader / range-tombstone iter lifetimes
+  // are wrong, sanitizers should catch it here.
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_GT(observed_run_starts.load(), run_starts_before)
+      << "expected at least one compaction to run during the L1+ phase";
+  // Only opens issued after the baseline snapshot count: the earlier
+  // compaction may already have produced O_DIRECT opens, so comparing
+  // against zero would not say anything about this compaction.
+  EXPECT_GT(observed_odirect_opens.load(), odirect_before)
+      << "no compaction-input opens went through O_DIRECT during L1+ "
+         "compaction; LevelIterator bypass path may be broken";
+
+  // Every key whose latest operation was a Put must be readable with the
+  // written value; every key whose latest operation was the range delete
+  // must be gone.
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::string actual;
+    const Status get_s = db_->Get(ReadOptions(), Key(i), &actual);
+    if (expected_present[i]) {
+      ASSERT_OK(get_s) << "key " << i << " should have survived compaction";
+      ASSERT_EQ(value, actual) << "wrong value for key " << i;
+    } else {
+      ASSERT_TRUE(get_s.IsNotFound())
+          << "key " << i
+          << " should be covered by a range tombstone: " << get_s.ToString();
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  Destroy(options);
+}
+#endif  // !defined(OS_MACOSX) && !defined(OS_OPENBSD) && ...
+
 class CompactionPriTest : public DBTestBase,
                           public testing::WithParamInterface<uint32_t> {
  public: