CASSANDRA-21134: Fix O_DIRECT short read on preemptive early-open of compaction SSTables

samueldlightfoot · samueldlightfoot · commit 5bba52ff62fe · 2026-06-03T13:31:31.000+01:00
The preemptive openEarly path (SSTableRewriter.maybeReopenEarly) publishes a
reader incrementally as the data writer's post-flush listener reports synced
offsets, without a sync() call. Under O_DIRECT a chunk's compressed bytes stay
in the aligned writeBuffer until its block reaches disk, so reporting the staged
uncompressed offset exposes chunks not yet durable and the early reader short-
reads past EOF (CorruptBlockException).

Report the durable uncompressed offset instead: track each staged chunk's
{compressedEnd, uncompressedEnd} and, in the post-flush listener, advance over
chunks whose compressed bytes now sit below fchannel.position(). An early-open
boundary that falls inside a still-buffered block simply waits for the next
block flush. This adds no extra I/O and stays inside the DIO subclass, so both
SSTable formats are covered.
diff --git a/src/java/org/apache/cassandra/io/compress/DirectCompressedSequentialWriter.java b/src/java/org/apache/cassandra/io/compress/DirectCompressedSequentialWriter.java
@@ -19,8 +19,10 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.ArrayDeque;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.IntConsumer;
+import java.util.function.LongConsumer;
 
 import javax.annotation.Nullable;
 
@@ -34,6 +36,7 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.compression.CompressionDictionaryManager;
+import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.util.ChecksumWriter;
@@ -78,6 +81,12 @@ public class DirectCompressedSequentialWriter extends CompressedSequentialWriter
 
     private final int blockSize;
 
+    // Chunks staged in writeBuffer but not yet on disk, in write order, each as {compressedEnd, uncompressedEnd}.
+    // Drained as whole blocks reach disk so the post-flush listener can report a durable offset (see
+    // setPostFlushListener). A chunk costs one small array per ~chunkLength written — negligible churn.
+    private final ArrayDeque<long[]> stagedChunkBoundaries = new ArrayDeque<>();
+    private long durableUncompressedOffset = 0;
+
     public DirectCompressedSequentialWriter(File file,
                                             File offsetsFile,
                                             @Nullable File digestFile,
@@ -159,6 +168,16 @@ protected void syncInternal()
         syncDataOnlyInternal();
     }
 
+    // Under O_DIRECT a chunk is readable only once its block reaches disk, yet flushData fires the listener
+    // per chunk with the staged (uncompressed) offset. Preemptive early-open (markDataSynced) publishes a
+    // reader once that offset covers its boundary, so forward the durable offset and ignore the staged one.
+    @Override
+    public void setPostFlushListener(LongConsumer postFlush)
+    {
+        LongConsumer durableOnly = ignoredStagedOffset -> postFlush.accept(durableUncompressedOffset());
+        super.setPostFlushListener(durableOnly);
+    }
+
     @Override
     protected void writeChunk(ByteBuffer toWrite)
     {
@@ -175,6 +194,7 @@ protected void writeChunk(ByteBuffer toWrite)
         crcMetadata.appendDirect(toWrite, true);
 
         actualDataSize = chunkOffset + chunkLength + CRC_LENGTH;
+        stagedChunkBoundaries.add(new long[]{ actualDataSize, uncompressedSize });
     }
 
     private void writeToAlignedBuffer(ByteBuffer data)
@@ -264,6 +284,30 @@ private void flushBufferedTailForEarlyOpen()
         }
     }
 
+    // Advance over chunks whose compressed bytes now sit entirely below the on-disk boundary
+    // (fchannel.position() — only whole blocks are written). Chunks flush in order, so a single drain
+    // suffices and the offset is monotonic.
+    private long durableUncompressedOffset()
+    {
+        long onDisk;
+        try
+        {
+            onDisk = fchannel.position();
+        }
+        catch (IOException e)
+        {
+            throw new FSReadError(e, getPath());
+        }
+
+        long[] chunk;
+        while ((chunk = stagedChunkBoundaries.peek()) != null && chunk[0] <= onDisk)
+        {
+            durableUncompressedOffset = chunk[1];
+            stagedChunkBoundaries.poll();
+        }
+        return durableUncompressedOffset;
+    }
+
     private void flushFinalWithPadding()
     {
         int logicalPos = writeBuffer.position();
diff --git a/test/unit/org/apache/cassandra/io/compress/DirectCompressedSequentialWriterTest.java b/test/unit/org/apache/cassandra/io/compress/DirectCompressedSequentialWriterTest.java
@@ -867,6 +867,76 @@ private static void assertEarlyOpenReadsBack(byte[] payload, CompressionParams p
         }
     }
 
+    /**
+     * Exercises the preemptive early-open read path (SSTableRewriter.maybeReopenEarly), which — unlike
+     * openFinalEarly — never calls sync(): the partition index advances its readable boundary from the
+     * writer's post-flush offset and a reader is published over that boundary mid-compaction. Under O_DIRECT
+     * only whole blocks reach disk during flushData, so the reported offset must be the durable one, not the
+     * staged one. testEarlyOpenAfterSyncReadsBackData cannot catch this because its sync() flushes the tail.
+     */
+    @Test
+    public void testPreemptiveOpenReadsBackSyncedData() throws IOException
+    {
+        // Many small chunks compressing far below one block (the prod system/IndexInfo shape): the writer is
+        // expected to report 0 here; that value is not asserted, only that whatever it reports reads back.
+        byte[] subBlockPayload = compressible(DEFAULT_CHUNK_LENGTH * 4 + 137);
+        assertSyncedOffsetIsReadable(subBlockPayload, CompressionParams.lz4());
+
+        // Several MiB of incompressible data forces mid-write block flushes, so the boundary must pass zero.
+        byte[] multiBlockPayload = incompressible(4 << 20);
+        long reportedBoundary = assertSyncedOffsetIsReadable(multiBlockPayload, CompressionParams.lz4());
+        assertTrue("expected the synced boundary to advance for multi-MiB data", reportedBoundary > 0);
+    }
+
+    // Writes payload through a DirectCompressedSequentialWriter with no sync()/finish(), then opens the data
+    // file bounded by the last offset the post-flush listener reported (the value that would drive
+    // PartitionIndexBuilder.markDataSynced) and, when it is positive, asserts those bytes read back intact.
+    private static long assertSyncedOffsetIsReadable(byte[] payload, CompressionParams params) throws IOException
+    {
+        File dataFile = FileUtils.createTempFile("preemptive_open_direct", ".db");
+        File metadataFile = new File(dataFile.absolutePath() + ".metadata");
+        try
+        {
+            MetadataCollector collector = newCollector();
+            try (DirectCompressedSequentialWriter writer = new DirectCompressedSequentialWriter(
+                dataFile, metadataFile, null, SequentialWriterOption.DEFAULT, params, collector, null))
+            {
+                long[] syncedOffset = { 0 };
+                writer.setPostFlushListener(offset -> syncedOffset[0] = offset);
+
+                writer.write(payload);
+                // toIntExact rather than an (int) cast: a bogus offset must fail, not wrap and skip the check.
+                int readable = Math.toIntExact(syncedOffset[0]);
+                if (readable > 0)
+                {
+                    try (CompressionMetadata md = writer.open(readable);
+                         FileHandle fh = new FileHandle.Builder(dataFile).withCompressionMetadata(md).complete();
+                         RandomAccessReader reader = fh.createReader())
+                    {
+                        byte[] readBack = new byte[readable];
+                        reader.readFully(readBack);
+                        assertArrayEquals("preemptive-open read-back mismatch", Arrays.copyOf(payload, readable), readBack);
+                    }
+                }
+                // finish() runs only after the read-back, so the check above saw the writer before any sync.
+                writer.finish();
+                return syncedOffset[0];
+            }
+        }
+        finally
+        {
+            dataFile.tryDelete();
+            metadataFile.tryDelete();
+        }
+    }
+
+    private static byte[] incompressible(int size)
+    {
+        byte[] noise = new byte[size];
+        new Random(42).nextBytes(noise); // fixed seed: the same pseudo-random bytes on every run
+        return noise;
+    }
+
     private void testWriteAndRead(String testName, int dataSize, CompressionParams params) throws IOException
     {
         File dataFile = FileUtils.createTempFile(testName + "_direct", ".db");