Skip to content

Commit 005f4e1

Browse files
CASSANDRA-21134: Direct I/O for background SSTable writes
Adds an opt-in O_DIRECT write path for background SSTable producers, bypassing the OS page cache for data that is unlikely to be re-read soon after being written. Memtable flushes remain buffered.

Enabled via two new YAML knobs:
- background_write_disk_access_mode: standard (default) | direct
- direct_write_buffer_size: 256KiB (default; aligned up to FS block size, auto-grown to chunk_length)

The path is gated by config, table compression being enabled, and an OperationType allowlist in DataComponent. The allowlist is exhaustive: any new OperationType with writesData=true that is not classified will fail static initialization.

Operations on the DIO path: COMPACTION, MAJOR_COMPACTION, TOMBSTONE_COMPACTION, ANTICOMPACTION, GARBAGE_COLLECT, CLEANUP, UPGRADE_SSTABLES, WRITE, STREAM (chunked receiver only).

Operations off the DIO path:
- FLUSH (policy: just-flushed data is hot, keep in page cache)
- SCRUB (correctness: tryAppend needs mark/resetAndTruncate)
- Zero-Copy Streaming (bypasses DataComponent.buildWriter)
- Uncompressed writers (only CompressedSequentialWriter has a DIO subclass in this change)

StartupChecks fails fast if 'direct' is requested on a platform/FS that does not support O_DIRECT.

patch by Sam Lightfoot; reviewed by <reviewers> for CASSANDRA-21134
1 parent a0dc6f8 commit 005f4e1

16 files changed

Lines changed: 2056 additions & 72 deletions

File tree

conf/cassandra.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,21 @@ commitlog_disk_access_mode: legacy
693693
# - direct: use direct I/O for compaction reads, bypassing the OS page cache
694694
# compaction_read_disk_access_mode: auto
695695

696+
# Set the disk access mode for writing compressed SSTables during background operations
697+
# (compaction, cleanup, streaming, repair, etc.). Scrub always uses buffered I/O. The allowed values are:
698+
# - standard: use buffered I/O (default)
699+
# - direct: use direct I/O, bypassing the OS page cache
700+
# Note: Only applies to compressed tables. Uncompressed tables always use buffered I/O.
701+
# Note: Memtable flushes always use buffered I/O regardless of this setting, as flushed
702+
# data benefits from page cache for subsequent reads.
703+
# background_write_disk_access_mode: standard
704+
705+
# Preferred buffer size for Direct IO background writes. Will be aligned up to filesystem
706+
# block size. If a table's compression chunk_length exceeds this value, the buffer will
707+
# auto-expand to fit. Larger buffers reduce syscall overhead but increase memory usage
708+
# per compaction thread.
709+
# direct_write_buffer_size: 256KiB
710+
696711
# Compression to apply to SSTables as they flush for compressed tables.
697712
# Note that tables without compression enabled do not respect this flag.
698713
#

src/java/org/apache/cassandra/config/Config.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,17 @@ public MemtableOptions()
362362

363363
public DataStorageSpec.IntKibibytesBound compressed_read_ahead_buffer_size = new DataStorageSpec.IntKibibytesBound("256KiB");
364364

365+
// Direct IO for background SSTable writes (compaction, streaming, cleanup, etc.; scrub is excluded and stays buffered)
366+
// When 'direct' is set, background writes of compressed SSTables bypass the OS page cache using O_DIRECT.
367+
// Memtable flushes always use buffered I/O regardless of this setting.
368+
// Default is 'standard' (buffered I/O) - users must opt-in to Direct IO
369+
public DiskAccessMode background_write_disk_access_mode = DiskAccessMode.standard;
370+
371+
// Preferred buffer size for Direct IO background writes. Will be aligned up to filesystem block size.
372+
// If a table's compression chunk_length exceeds this value, the buffer will auto-expand to fit.
373+
// Larger buffers reduce syscall overhead but increase memory usage per compaction thread.
374+
public DataStorageSpec.IntKibibytesBound direct_write_buffer_size = new DataStorageSpec.IntKibibytesBound("256KiB");
375+
365376
// fraction of free disk space available for compaction after min free space is subtracted
366377
public volatile Double max_space_usable_for_compactions_in_percentage = .95;
367378

src/java/org/apache/cassandra/config/DatabaseDescriptor.java

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,8 @@ public class DatabaseDescriptor
224224

225225
private static DiskAccessMode compactionReadDiskAccessMode;
226226

227+
private static DiskAccessMode backgroundWriteDiskAccessMode;
228+
227229
private static AbstractCryptoProvider cryptoProvider;
228230
private static IAuthenticator authenticator;
229231
private static IAuthorizer authorizer;
@@ -897,6 +899,10 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
897899
if (conf.hints_directory.equals(conf.saved_caches_directory))
898900
throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false);
899901

902+
initializeBackgroundWriteDiskAccessMode();
903+
if (backgroundWriteDiskAccessMode != conf.background_write_disk_access_mode)
904+
logger.info("background_write_disk_access_mode resolved to: {}", backgroundWriteDiskAccessMode);
905+
900906
if (conf.memtable_flush_writers == 0)
901907
{
902908
conf.memtable_flush_writers = conf.data_file_directories.length == 1 ? 2 : 1;
@@ -3406,6 +3412,85 @@ public static void initializeCommitLogDiskAccessMode()
34063412
commitLogWriteDiskAccessMode = accessModeDirectIoPair.left;
34073413
}
34083414

3415+
/**
3416+
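* Returns the background write disk access mode as resolved by {@link #initializeBackgroundWriteDiskAccessMode()} (a configured 'auto' resolves to 'standard').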
3417+
*/
3418+
public static DiskAccessMode getBackgroundWriteDiskAccessMode()
3419+
{
3420+
return backgroundWriteDiskAccessMode;
3421+
}
3422+
3423+
@VisibleForTesting
3424+
public static void setBackgroundWriteDiskAccessMode(DiskAccessMode diskAccessMode)
3425+
{
3426+
backgroundWriteDiskAccessMode = diskAccessMode;
3427+
conf.background_write_disk_access_mode = diskAccessMode;
3428+
}
3429+
3430+
public static DataStorageSpec.IntKibibytesBound getDirectWriteBufferSize()
3431+
{
3432+
return conf.direct_write_buffer_size;
3433+
}
3434+
3435+
@VisibleForTesting
3436+
public static void initializeBackgroundWriteDiskAccessMode()
3437+
{
3438+
DiskAccessMode providedMode = conf.background_write_disk_access_mode;
3439+
3440+
// For 'auto', default to standard (conservative, safe default)
3441+
if (providedMode == DiskAccessMode.auto)
3442+
{
3443+
providedMode = DiskAccessMode.standard;
3444+
}
3445+
3446+
// Validate Direct IO is supported on ALL data directories if requested
3447+
if (providedMode == DiskAccessMode.direct)
3448+
{
3449+
// DataStorageSpec already rejects negatives at parse time; zero is the remaining
3450+
// nonsense value. The writer's Math.max would silently coerce it to minRequiredSize,
3451+
// which masks a likely operator mistake — fail fast instead.
3452+
if (conf.direct_write_buffer_size.toBytes() <= 0)
3453+
throw new ConfigurationException("direct_write_buffer_size must be > 0 when background_write_disk_access_mode is 'direct'. " +
3454+
"Got: " + conf.direct_write_buffer_size, false);
3455+
3456+
// Only check Direct IO support if not running as a tool
3457+
if (!toolInitialized)
3458+
{
3459+
List<String> unsupportedLocations = new ArrayList<>();
3460+
3461+
for (String dataDir : conf.data_file_directories)
3462+
{
3463+
try
3464+
{
3465+
File dataDirFile = new File(dataDir);
3466+
PathUtils.createDirectoriesIfNotExists(dataDirFile.toPath());
3467+
3468+
if (!FileUtils.isDirectIOSupported(dataDirFile))
3469+
{
3470+
unsupportedLocations.add(dataDir);
3471+
}
3472+
}
3473+
catch (RuntimeException e)
3474+
{
3475+
logger.warn("Unable to determine Direct IO support for data directory {}: {}", dataDir, e.getMessage());
3476+
unsupportedLocations.add(dataDir + " (check failed: " + e.getMessage() + ")");
3477+
}
3478+
}
3479+
3480+
if (!unsupportedLocations.isEmpty())
3481+
{
3482+
throw new ConfigurationException(
3483+
String.format("background_write_disk_access_mode is set to 'direct', but the following data directories " +
3484+
"do not support Direct I/O: %s. Either change background_write_disk_access_mode to 'standard' " +
3485+
"in cassandra.yaml, or ensure all data directories are on filesystems that support Direct I/O.",
3486+
unsupportedLocations), false);
3487+
}
3488+
}
3489+
}
3490+
3491+
backgroundWriteDiskAccessMode = providedMode;
3492+
}
3493+
34093494
public static String getSavedCachesLocation()
34103495
{
34113496
return conf.saved_caches_directory;
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.cassandra.io;
19+
20+
/**
21+
* Classifies an operation's eligibility for a direct-IO (O_DIRECT) data path, encoding both
22+
* the answer and the rationale class. Consumers maintain their own per-operation classification
23+
* and apply this alongside their own gates (e.g. compression, configuration mode);
24+
* {@link #SUPPORTED} is necessary but not sufficient.
25+
*/
26+
public enum DirectIoSupport
27+
{
28+
/** Eligible for the direct-IO data path. */
29+
SUPPORTED,
30+
31+
/**
32+
* The direct-IO path is mechanically incompatible with this operation. Removing this
33+
* exclusion requires code changes, not policy.
34+
*/
35+
UNSUPPORTED_CORRECTNESS,
36+
37+
/**
38+
* Direct IO would work, but is deliberately disabled for performance or cache-residency
39+
* reasons. Removing this exclusion requires re-evaluating the policy, not code changes.
40+
*/
41+
UNSUPPORTED_POLICY,
42+
43+
/**
44+
* The operation does not exercise this data path (e.g. a sentinel, or an op that does not
45+
* read or write through the consuming component). The gate is moot.
46+
*/
47+
NOT_APPLICABLE;
48+
49+
public boolean isSupported()
50+
{
51+
return this == SUPPORTED;
52+
}
53+
}

0 commit comments

Comments
 (0)