diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/CLPEncodingRealtimeTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/CLPEncodingRealtimeTest.java index eec8b1280458..d10e46535745 100644 --- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/CLPEncodingRealtimeTest.java +++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/custom/CLPEncodingRealtimeTest.java @@ -28,6 +28,7 @@ import org.apache.pinot.spi.config.table.ingestion.TransformConfig; import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.data.Schema; +import org.apache.pinot.util.TestUtils; import org.testng.Assert; import org.testng.annotations.Test; @@ -114,6 +115,20 @@ protected List getFieldConfigs() { return fieldConfigs; } + @Override + public void setUp() + throws Exception { + LOGGER.warn("Setting up integration test class: {}", getClass().getSimpleName()); + initControllerRequestURLBuilder(); + TestUtils.ensureDirectoriesExistAndEmpty(_tempDir, _segmentDir, _tarDir); + + setUpTable(); + + // CLP segment conversion can be slow in CI; use a longer timeout than the default 60s. + waitForAllDocsLoaded(600_000); + LOGGER.warn("Finished setting up integration test class: {}", getClass().getSimpleName()); + } + @Override protected IngestionConfig getIngestionConfig() { List transforms = new ArrayList<>(); diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java index 75ddcf386631..3e03384daccd 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryIntegerCompression.java @@ -24,7 +24,7 @@ import java.util.Random; import java.util.concurrent.TimeUnit; import net.jpountz.lz4.LZ4Factory; -import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.ChunkCompressor; import org.apache.pinot.segment.spi.compression.ChunkDecompressor; diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java index d7773b6eeae7..dedd1cccb033 100644 --- a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryLongCompression.java @@ -24,7 +24,7 @@ import java.util.Random; import java.util.concurrent.TimeUnit; import net.jpountz.lz4.LZ4Factory; -import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.ChunkCompressor; import org.apache.pinot.segment.spi.compression.ChunkDecompressor; diff --git a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java index 50f7687c9adb..11847d2ebb82 100644 --- 
a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java +++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNoDictionaryStringCompression.java @@ -24,7 +24,7 @@ import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.RandomStringUtils; -import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.ChunkCompressor; import org.apache.pinot.segment.spi.compression.ChunkDecompressor; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/PipelineChunkCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/PipelineChunkCompressor.java new file mode 100644 index 000000000000..3770dba2760c --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/PipelineChunkCompressor.java @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.codec; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.local.io.codec.transform.ChunkTransformFactory; +import org.apache.pinot.segment.spi.codec.ChunkCodec; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; +import org.apache.pinot.segment.spi.codec.ChunkTransform; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.compression.ChunkCompressor; + + +/** + * A {@link ChunkCompressor} that applies a pipeline of codec stages: first all + * {@link ChunkCodec.CodecKind#TRANSFORM TRANSFORM} stages in order (left-to-right), + * then the terminal {@link ChunkCodec.CodecKind#COMPRESSOR COMPRESSOR}. + * + *

This is the write-path counterpart of {@link PipelineChunkDecompressor}.
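A minimal write-path sketch, assuming a two-stage DELTA -> LZ4 pipeline. The ChunkCodecPipeline.of(...) factory call is hypothetical (this diff does not show how pipelines are constructed), and error handling is omitted:

  // Hypothetical construction of a DELTA -> LZ4 pipeline; illustrative only.
  ChunkCodecPipeline pipeline = ChunkCodecPipeline.of(ChunkCodec.DELTA, ChunkCodec.LZ4);
  try (ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(pipeline, Integer.BYTES)) {
    ByteBuffer raw = ByteBuffer.allocateDirect(4096);    // one chunk of INT values, filled by the writer
    ByteBuffer out = ByteBuffer.allocateDirect(compressor.maxCompressedSize(4096));
    int compressedSize = compressor.compress(raw, out);  // DELTA encodes 'raw' in place, then LZ4 fills 'out'
  }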
+ */ +public class PipelineChunkCompressor implements ChunkCompressor { + + private final ChunkCodecPipeline _pipeline; + private final ChunkTransform[] _transforms; + private final ChunkCompressor _terminalCompressor; + private final int _valueSizeInBytes; + + /** + * Creates a pipeline compressor. + * + * @param pipeline the codec pipeline + * @param valueSizeInBytes size of each typed value (4 for INT, 8 for LONG); used by transforms + */ + public PipelineChunkCompressor(ChunkCodecPipeline pipeline, int valueSizeInBytes) { + _pipeline = pipeline; + _valueSizeInBytes = valueSizeInBytes; + + List transformStages = pipeline.getTransforms(); + _transforms = new ChunkTransform[transformStages.size()]; + for (int i = 0; i < transformStages.size(); i++) { + _transforms[i] = ChunkTransformFactory.getTransform(transformStages.get(i)); + } + + _terminalCompressor = ChunkCompressorFactory.getCompressor( + pipeline.getChunkCompressionType()); + } + + @Override + public int compress(ByteBuffer inUncompressed, ByteBuffer outCompressed) + throws IOException { + // Apply transforms left-to-right (in-place on the input buffer) + int numBytes = inUncompressed.remaining(); + for (ChunkTransform transform : _transforms) { + transform.encode(inUncompressed, numBytes, _valueSizeInBytes); + } + + // Apply terminal compression + return _terminalCompressor.compress(inUncompressed, outCompressed); + } + + @Override + public int maxCompressedSize(int uncompressedSize) { + // Transforms are in-place and don't change size; delegate to terminal compressor + return _terminalCompressor.maxCompressedSize(uncompressedSize); + } + + @Override + public ChunkCompressionType compressionType() { + return _terminalCompressor.compressionType(); + } + + @Override + public void close() + throws IOException { + _terminalCompressor.close(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/PipelineChunkDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/PipelineChunkDecompressor.java new file mode 100644 index 000000000000..a839d4f71465 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/PipelineChunkDecompressor.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.segment.local.io.codec; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.local.io.codec.transform.ChunkTransformFactory; +import org.apache.pinot.segment.spi.codec.ChunkCodec; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; +import org.apache.pinot.segment.spi.codec.ChunkTransform; +import org.apache.pinot.segment.spi.compression.ChunkDecompressor; + + +/** + * A {@link ChunkDecompressor} that reverses a codec pipeline: first decompresses using the + * terminal {@link ChunkCodec.CodecKind#COMPRESSOR COMPRESSOR}, then applies all + * {@link ChunkCodec.CodecKind#TRANSFORM TRANSFORM} stages in reverse order (right-to-left). + * + *

This is the read-path counterpart of {@link PipelineChunkCompressor}.
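The matching read-path sketch; the pipeline and value width must be identical to what the writer used (buffers as in the compressor sketch above, error handling omitted):

  try (ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(pipeline, Integer.BYTES)) {
    ByteBuffer decompressed = ByteBuffer.allocateDirect(decompressor.decompressedLength(out));
    decompressor.decompress(out, decompressed);  // LZ4 is undone first, then DELTA is decoded in place
  }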
+ */ +public class PipelineChunkDecompressor implements ChunkDecompressor { + + private final ChunkCodecPipeline _pipeline; + private final ChunkTransform[] _transforms; + private final ChunkDecompressor _terminalDecompressor; + private final int _valueSizeInBytes; + + /** + * Creates a pipeline decompressor. + * + * @param pipeline the codec pipeline + * @param valueSizeInBytes size of each typed value (4 for INT, 8 for LONG); used by transforms + */ + public PipelineChunkDecompressor(ChunkCodecPipeline pipeline, int valueSizeInBytes) { + _pipeline = pipeline; + _valueSizeInBytes = valueSizeInBytes; + + List transformStages = pipeline.getTransforms(); + _transforms = new ChunkTransform[transformStages.size()]; + for (int i = 0; i < transformStages.size(); i++) { + _transforms[i] = ChunkTransformFactory.getTransform(transformStages.get(i)); + } + + _terminalDecompressor = ChunkCompressorFactory.getDecompressor( + pipeline.getChunkCompressionType()); + } + + @Override + public int decompress(ByteBuffer compressedInput, ByteBuffer decompressedOutput) + throws IOException { + // Decompress using terminal decompressor. + // Per Pinot convention, after this call the output buffer is flipped: position=0, limit=dataSize. + int decompressedSize = _terminalDecompressor.decompress(compressedInput, decompressedOutput); + + if (_transforms.length > 0) { + // Buffer is already in read mode (flipped). Transforms operate from position=0. + int numBytes = decompressedOutput.remaining(); + + // Apply transforms in reverse order (right-to-left) + for (int i = _transforms.length - 1; i >= 0; i--) { + _transforms[i].decode(decompressedOutput, numBytes, _valueSizeInBytes); + } + // Buffer remains flipped: position=0, limit=numBytes — ready for the caller to read. + } + + return decompressedSize; + } + + @Override + public int decompressedLength(ByteBuffer compressedInput) + throws IOException { + return _terminalDecompressor.decompressedLength(compressedInput); + } + + @Override + public void close() + throws IOException { + _terminalDecompressor.close(); + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ChunkCompressorFactory.java similarity index 66% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ChunkCompressorFactory.java index 33ef239583ac..27ca8282984c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ChunkCompressorFactory.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ChunkCompressorFactory.java @@ -16,8 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; +import org.apache.pinot.segment.local.io.codec.PipelineChunkCompressor; +import org.apache.pinot.segment.local.io.codec.PipelineChunkDecompressor; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.ChunkCompressor; import org.apache.pinot.segment.spi.compression.ChunkDecompressor; @@ -118,4 +121,38 @@ public static ChunkDecompressor getDecompressor(ChunkCompressionType compression throw new IllegalArgumentException("Illegal decompressor name " + compressionType); } } + + /** + * Returns a compressor for a codec pipeline. If the pipeline has transforms, returns a + * {@link PipelineChunkCompressor}; otherwise returns the plain compressor for the terminal codec. + * Legacy compound codecs (DELTA_LZ4, DOUBLE_DELTA_LZ4) are handled by their existing compressor + * implementations. + * + * @param pipeline the codec pipeline + * @param valueSizeInBytes size of each typed value (4 for INT, 8 for LONG); used by transforms + * @return compressor for the pipeline + */ + public static ChunkCompressor getCompressor(ChunkCodecPipeline pipeline, int valueSizeInBytes) { + if (pipeline.hasTransforms()) { + return new PipelineChunkCompressor(pipeline, valueSizeInBytes); + } + return getCompressor(pipeline.getChunkCompressionType()); + } + + /** + * Returns a decompressor for a codec pipeline. If the pipeline has transforms, returns a + * {@link PipelineChunkDecompressor}; otherwise returns the plain decompressor for the terminal codec. + * Legacy compound codecs (DELTA_LZ4, DOUBLE_DELTA_LZ4) are handled by their existing decompressor + * implementations. + * + * @param pipeline the codec pipeline + * @param valueSizeInBytes size of each typed value (4 for INT, 8 for LONG); used by transforms + * @return decompressor for the pipeline + */ + public static ChunkDecompressor getDecompressor(ChunkCodecPipeline pipeline, int valueSizeInBytes) { + if (pipeline.hasTransforms()) { + return new PipelineChunkDecompressor(pipeline, valueSizeInBytes); + } + return getDecompressor(pipeline.getChunkCompressionType()); + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaCompressor.java similarity index 99% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaCompressor.java index 03b8b8fcf50e..be44444ae230 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDecompressor.java similarity index 98% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDecompressor.java index bc873e83320b..ca6b4ca652fd 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaCompressor.java similarity index 99% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaCompressor.java index 394772ac8838..1edc2a245a54 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaDecompressor.java similarity index 98% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaDecompressor.java index 3016a8c6c77e..bd3f5736f8a9 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/GzipCompressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/GzipCompressor.java index 0d6b391206c0..6ae9a6a209e3 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/GzipCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/GzipDecompressor.java similarity index 96% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/GzipDecompressor.java index c681dcb00806..76bec1d59295 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/GzipDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/GzipDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4Compressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4Compressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4Compressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4Compressor.java index 35c5470b53f1..fa79a3708cdc 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4Compressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4Compressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4Decompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4Decompressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4Decompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4Decompressor.java index 3915ca6a7389..72978917b2b5 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4Decompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4Decompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4WithLengthCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4WithLengthCompressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4WithLengthCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4WithLengthCompressor.java index b42443e4963f..cd4a92b033a3 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4WithLengthCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4WithLengthCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4WithLengthDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4WithLengthDecompressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4WithLengthDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4WithLengthDecompressor.java index c1dd8253dee5..dc47cf256f14 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/LZ4WithLengthDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/LZ4WithLengthDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/PassThroughCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/PassThroughCompressor.java similarity index 96% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/PassThroughCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/PassThroughCompressor.java index 7b8e8859543f..bc76fbd95750 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/PassThroughCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/PassThroughCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/PassThroughDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/PassThroughDecompressor.java similarity index 96% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/PassThroughDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/PassThroughDecompressor.java index 20683485e5e6..efa7765f6379 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/PassThroughDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/PassThroughDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.nio.ByteBuffer; import org.apache.pinot.segment.spi.compression.ChunkDecompressor; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/SnappyCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/SnappyCompressor.java similarity index 96% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/SnappyCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/SnappyCompressor.java index b508d65d9c2a..f0d1e8791a67 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/SnappyCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/SnappyCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/SnappyDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/SnappyDecompressor.java similarity index 96% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/SnappyDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/SnappyDecompressor.java index 40e97c870f6e..399d1e6f3d21 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/SnappyDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/SnappyDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ZstandardCompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ZstandardCompressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ZstandardCompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ZstandardCompressor.java index d9f68649cd35..3eeb31c00914 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ZstandardCompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ZstandardCompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import com.github.luben.zstd.Zstd; import java.io.IOException; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ZstandardDecompressor.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ZstandardDecompressor.java similarity index 97% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ZstandardDecompressor.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ZstandardDecompressor.java index 16b9a099b988..67e812e9e846 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/ZstandardDecompressor.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/compression/ZstandardDecompressor.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import com.github.luben.zstd.Zstd; import java.io.IOException; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/ChunkTransformFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/ChunkTransformFactory.java new file mode 100644 index 000000000000..2b0ee0667fd4 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/ChunkTransformFactory.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.codec.transform; + +import org.apache.pinot.segment.spi.codec.ChunkCodec; +import org.apache.pinot.segment.spi.codec.ChunkTransform; + + +/** + * Factory for obtaining {@link ChunkTransform} instances by their {@link ChunkCodec} identifier. + */ +public class ChunkTransformFactory { + + private ChunkTransformFactory() { + } + + /** + * Returns the singleton {@link ChunkTransform} for the given transform codec. + * + * @param codec the transform codec (must be a {@link ChunkCodec.CodecKind#TRANSFORM}) + * @return the corresponding transform instance + * @throws IllegalArgumentException if the codec is not a known transform + */ + public static ChunkTransform getTransform(ChunkCodec codec) { + switch (codec) { + case DELTA: + return DeltaTransform.INSTANCE; + case DOUBLE_DELTA: + return DoubleDeltaTransform.INSTANCE; + case XOR: + return XorTransform.INSTANCE; + default: + throw new IllegalArgumentException("Unknown transform codec: " + codec); + } + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/DeltaTransform.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/DeltaTransform.java new file mode 100644 index 000000000000..7813a28848ba --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/DeltaTransform.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.codec.transform; + +import java.nio.ByteBuffer; +import java.util.EnumSet; +import java.util.Set; +import org.apache.pinot.segment.spi.codec.ChunkTransform; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * Delta encoding transform for integer numeric values. Stores the first value as-is, then each + * subsequent value as the difference from the previous value. Operates in-place on the + * ByteBuffer. + * + *

Supports only INT and LONG stored types. Subtraction on IEEE 754 bit patterns does not produce meaningful deltas, so FLOAT/DOUBLE columns should use {@link XorTransform} instead.
+ * + *

Effective for monotonically increasing data (e.g., timestamps, counters). Java's two's-complement arithmetic guarantees correct wrap-around on overflow.
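A worked round trip with the singleton added in this diff (buffer contents illustrative):

  ByteBuffer buf = ByteBuffer.allocate(3 * Integer.BYTES);
  buf.putInt(1000).putInt(1005).putInt(1012).flip();       // {1000, 1005, 1012}
  DeltaTransform.INSTANCE.encode(buf, 12, Integer.BYTES);  // in place -> {1000, 5, 7}
  DeltaTransform.INSTANCE.decode(buf, 12, Integer.BYTES);  // in place -> {1000, 1005, 1012}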
+ */ +public class DeltaTransform implements ChunkTransform { + + private static final Set SUPPORTED_TYPES = EnumSet.of(DataType.INT, DataType.LONG); + + public static final DeltaTransform INSTANCE = new DeltaTransform(); + + private DeltaTransform() { + } + + @Override + public Set supportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void encode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + encodeInts(buffer, numBytes); + } else { + encodeLongs(buffer, numBytes); + } + } + + @Override + public void decode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + decodeInts(buffer, numBytes); + } else { + decodeLongs(buffer, numBytes); + } + } + + private void encodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + // Encode backwards so each value can be read before being overwritten + int prev = buffer.getInt(pos + (numValues - 2) * Integer.BYTES); + for (int i = numValues - 1; i >= 1; i--) { + int offset = pos + i * Integer.BYTES; + int curr = buffer.getInt(offset); + buffer.putInt(offset, curr - prev); + if (i > 1) { + prev = buffer.getInt(pos + (i - 2) * Integer.BYTES); + } + } + } + + private void encodeLongs(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Long.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + long prev = buffer.getLong(pos + (numValues - 2) * Long.BYTES); + for (int i = numValues - 1; i >= 1; i--) { + int offset = pos + i * Long.BYTES; + long curr = buffer.getLong(offset); + buffer.putLong(offset, curr - prev); + if (i > 1) { + prev = buffer.getLong(pos + (i - 2) * Long.BYTES); + } + } + } + + private void decodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + for (int i = 1; i < numValues; i++) { + int offset = pos + i * Integer.BYTES; + int prevValue = buffer.getInt(pos + (i - 1) * Integer.BYTES); + int delta = buffer.getInt(offset); + buffer.putInt(offset, prevValue + delta); + } + } + + private void decodeLongs(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Long.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + for (int i = 1; i < numValues; i++) { + int offset = pos + i * Long.BYTES; + long prevValue = buffer.getLong(pos + (i - 1) * Long.BYTES); + long delta = buffer.getLong(offset); + buffer.putLong(offset, prevValue + delta); + } + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/DoubleDeltaTransform.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/DoubleDeltaTransform.java new file mode 100644 index 000000000000..37edd0931654 --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/DoubleDeltaTransform.java @@ -0,0 +1,199 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.codec.transform; + +import java.nio.ByteBuffer; +import java.util.EnumSet; +import java.util.Set; +import org.apache.pinot.segment.spi.codec.ChunkTransform; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * Double-delta (delta-of-delta) transform for integer numeric values. Stores the first value + * as-is, the first delta as-is, then each subsequent value as the difference between + * consecutive deltas. Operates in-place on the ByteBuffer. + * + *

Supports only INT and LONG stored types. Subtraction on IEEE 754 bit patterns does not produce meaningful double-deltas, so FLOAT/DOUBLE columns should use {@link XorTransform} instead.
+ * + *

Effective for data with constant or near-constant step sizes (e.g., fixed-interval timestamps).
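For example (illustrative values), fixed 60-second timestamps collapse to zeros after the first two slots, which the downstream compressor then shrinks very effectively:

  ByteBuffer buf = ByteBuffer.allocate(4 * Long.BYTES);
  buf.putLong(60_000L).putLong(120_000L).putLong(180_000L).putLong(240_000L).flip();
  DoubleDeltaTransform.INSTANCE.encode(buf, 32, Long.BYTES);  // in place -> {60000, 60000, 0, 0}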
+ */ +public class DoubleDeltaTransform implements ChunkTransform { + + private static final Set SUPPORTED_TYPES = EnumSet.of(DataType.INT, DataType.LONG); + + public static final DoubleDeltaTransform INSTANCE = new DoubleDeltaTransform(); + + private DoubleDeltaTransform() { + } + + @Override + public Set supportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void encode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + encodeInts(buffer, numBytes); + } else { + encodeLongs(buffer, numBytes); + } + } + + @Override + public void decode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + decodeInts(buffer, numBytes); + } else { + decodeLongs(buffer, numBytes); + } + } + + private void encodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 2) { + // For 0, 1, or 2 values, just apply regular delta (first value stays, second becomes delta) + if (numValues == 2) { + int pos = buffer.position(); + int v0 = buffer.getInt(pos); + int v1 = buffer.getInt(pos + Integer.BYTES); + buffer.putInt(pos + Integer.BYTES, v1 - v0); + } + return; + } + int pos = buffer.position(); + + // Encode in forward order, keeping the previous original value and delta in local variables + // so we can safely overwrite each position in-place as we advance from index 2 onward. + int prevPrevVal = buffer.getInt(pos); + int prevVal = buffer.getInt(pos + Integer.BYTES); + int prevDelta = prevVal - prevPrevVal; + + // Store first delta at index 1 + buffer.putInt(pos + Integer.BYTES, prevDelta); + + int prev = prevVal; + int prevD = prevDelta; + for (int i = 2; i < numValues; i++) { + int offset = pos + i * Integer.BYTES; + int curr = buffer.getInt(offset); + int currDelta = curr - prev; + int doubleDelta = currDelta - prevD; + buffer.putInt(offset, doubleDelta); + prev = curr; + prevD = currDelta; + } + } + + private void encodeLongs(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Long.BYTES; + if (numValues <= 2) { + if (numValues == 2) { + int pos = buffer.position(); + long v0 = buffer.getLong(pos); + long v1 = buffer.getLong(pos + Long.BYTES); + buffer.putLong(pos + Long.BYTES, v1 - v0); + } + return; + } + int pos = buffer.position(); + + long prevPrevVal = buffer.getLong(pos); + long prevVal = buffer.getLong(pos + Long.BYTES); + long prevDelta = prevVal - prevPrevVal; + + buffer.putLong(pos + Long.BYTES, prevDelta); + + long prev = prevVal; + long prevD = prevDelta; + for (int i = 2; i < numValues; i++) { + int offset = pos + i * Long.BYTES; + long curr = buffer.getLong(offset); + long currDelta = curr - prev; + long doubleDelta = currDelta - prevD; + buffer.putLong(offset, doubleDelta); + prev = curr; + prevD = currDelta; + } + } + + private void decodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 2) { + if (numValues == 2) { + int pos = buffer.position(); + int v0 = buffer.getInt(pos); + int d1 = buffer.getInt(pos + Integer.BYTES); + buffer.putInt(pos + Integer.BYTES, v0 + d1); + } + return; + } + int pos = buffer.position(); + + int v0 = buffer.getInt(pos); + int d1 = buffer.getInt(pos + Integer.BYTES); + int v1 = v0 + d1; + buffer.putInt(pos + Integer.BYTES, v1); + + int prev = v1; + int prevDelta = d1; + for (int i = 2; i < numValues; i++) { + int offset = pos + i * Integer.BYTES; + int doubleDelta = buffer.getInt(offset); + int currDelta = prevDelta + doubleDelta; + prev = 
prev + currDelta; + buffer.putInt(offset, prev); + prevDelta = currDelta; + } + } + + private void decodeLongs(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Long.BYTES; + if (numValues <= 2) { + if (numValues == 2) { + int pos = buffer.position(); + long v0 = buffer.getLong(pos); + long d1 = buffer.getLong(pos + Long.BYTES); + buffer.putLong(pos + Long.BYTES, v0 + d1); + } + return; + } + int pos = buffer.position(); + + long v0 = buffer.getLong(pos); + long d1 = buffer.getLong(pos + Long.BYTES); + long v1 = v0 + d1; + buffer.putLong(pos + Long.BYTES, v1); + + long prev = v1; + long prevDelta = d1; + for (int i = 2; i < numValues; i++) { + int offset = pos + i * Long.BYTES; + long doubleDelta = buffer.getLong(offset); + long currDelta = prevDelta + doubleDelta; + prev = prev + currDelta; + buffer.putLong(offset, prev); + prevDelta = currDelta; + } + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/XorTransform.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/XorTransform.java new file mode 100644 index 000000000000..cd21b2017e3e --- /dev/null +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/codec/transform/XorTransform.java @@ -0,0 +1,140 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.codec.transform; + +import java.nio.ByteBuffer; +import java.util.EnumSet; +import java.util.Set; +import org.apache.pinot.segment.spi.codec.ChunkTransform; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * XOR (Gorilla-style) encoding transform for floating-point values. Stores the first value + * as-is, then each subsequent value as the XOR of the current value with the previous value. + * Operates in-place on the ByteBuffer using the IEEE 754 bit patterns. + * + *

Supports only FLOAT and DOUBLE stored types. For integer columns (INT/LONG), use {@link DeltaTransform} or {@link DoubleDeltaTransform} instead, which exploit arithmetic structure rather than bitwise similarity.
+ * + *

Particularly effective for floating-point time series where consecutive values are similar — XOR produces values with many leading/trailing zero bits, which compress extremely well with LZ4 or ZSTANDARD.
+ * + *

This is the encoding scheme from the Facebook Gorilla paper (Pelkonen et al., VLDB 2015), without the variable-length bit packing — the downstream compressor handles that.
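A small illustration (values chosen for the example): equal consecutive doubles XOR to 0L, and near-equal doubles leave only a few mantissa bits set:

  ByteBuffer buf = ByteBuffer.allocate(3 * Long.BYTES);
  buf.putLong(Double.doubleToRawLongBits(21.5))
      .putLong(Double.doubleToRawLongBits(21.5))
      .putLong(Double.doubleToRawLongBits(21.75)).flip();
  XorTransform.INSTANCE.encode(buf, 24, Long.BYTES);
  // Slot 1 is now 0L; slot 2 is 0x0000400000000000L, a single set bit.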
+ */ +public class XorTransform implements ChunkTransform { + + private static final Set SUPPORTED_TYPES = EnumSet.of(DataType.FLOAT, DataType.DOUBLE); + + public static final XorTransform INSTANCE = new XorTransform(); + + private XorTransform() { + } + + @Override + public Set supportedTypes() { + return SUPPORTED_TYPES; + } + + @Override + public void encode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + encodeInts(buffer, numBytes); + } else { + encodeLongs(buffer, numBytes); + } + } + + @Override + public void decode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + // XOR decoding is different from encoding: we must go forward and XOR with the + // already-decoded previous value (not the encoded one). + if (valueSizeInBytes == Integer.BYTES) { + decodeInts(buffer, numBytes); + } else { + decodeLongs(buffer, numBytes); + } + } + + private void encodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + // Encode backwards so each value can be read before being overwritten + int prev = buffer.getInt(pos + (numValues - 2) * Integer.BYTES); + for (int i = numValues - 1; i >= 1; i--) { + int offset = pos + i * Integer.BYTES; + int curr = buffer.getInt(offset); + buffer.putInt(offset, curr ^ prev); + if (i > 1) { + prev = buffer.getInt(pos + (i - 2) * Integer.BYTES); + } + } + } + + private void encodeLongs(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Long.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + long prev = buffer.getLong(pos + (numValues - 2) * Long.BYTES); + for (int i = numValues - 1; i >= 1; i--) { + int offset = pos + i * Long.BYTES; + long curr = buffer.getLong(offset); + buffer.putLong(offset, curr ^ prev); + if (i > 1) { + prev = buffer.getLong(pos + (i - 2) * Long.BYTES); + } + } + } + + private void decodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + for (int i = 1; i < numValues; i++) { + int offset = pos + i * Integer.BYTES; + int prevValue = buffer.getInt(pos + (i - 1) * Integer.BYTES); + int xored = buffer.getInt(offset); + buffer.putInt(offset, prevValue ^ xored); + } + } + + private void decodeLongs(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Long.BYTES; + if (numValues <= 1) { + return; + } + int pos = buffer.position(); + for (int i = 1; i < numValues; i++) { + int offset = pos + i * Long.BYTES; + long prevValue = buffer.getLong(pos + (i - 1) * Long.BYTES); + long xored = buffer.getLong(offset); + buffer.putLong(offset, prevValue ^ xored); + } + } +} diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java index 8b3d22aef406..9c16bbdc3665 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/BaseChunkForwardIndexWriter.java @@ -25,7 +25,10 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; -import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory; +import javax.annotation.Nullable; +import 
org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.spi.codec.ChunkCodec; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.ChunkCompressor; import org.slf4j.Logger; @@ -36,6 +39,22 @@ * Base implementation for chunk-based raw (non-dictionary-encoded) forward index writer where each chunk contains fixed * number of docs. * + *

Forward index file format versions:
• V1 — implicit Snappy compression, no explicit compression type field in the header.
• V2 — added an explicit compression type to the header; chunk offsets stored as int.
• V3 — chunk offsets stored as long (supports >2 GB data sections).
• V4 — fixed-width only; added derive-num-docs-per-chunk support (also the base for VarByte V4 writers).
• V5 — fixed-width only; similar to V4 (also the base for VarByte V5 writers).
• V6 — variable-byte only ({@link VarByteChunkForwardIndexWriterV6}); delta-encodes the chunk header (individual entry sizes instead of cumulative offsets) for better compression.
• V7 — fixed-width only; the codec pipeline header replaces the single compression type with a pipeline length and an array of {@link ChunkCodec} values. Supports pre-compression transforms (DELTA, DOUBLE_DELTA, XOR).
+ * *

The layout of the file is as follows:
• Header Section
@@ -45,7 +64,8 @@
• Number of docs per chunk (int)
• Size of entry in bytes (int)
• Total number of docs (int)
- • Compression type enum value (int)
+ • For V2–V5: compression type enum value (int)
+ • For V7: pipeline length (int) followed by N codec enum values (int each)
• Start offset of data header (int)
• Data header (start offsets for all chunks)
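As a concrete V7 illustration (values hypothetical; codec enum values shown symbolically since this diff does not include them), a fixed-width INT column with a two-stage DELTA -> LZ4 pipeline writes nine header ints, so the chunk-offset section starts at byte 36:

  int 0: version = 7
  int 1: numChunks
  int 2: numDocsPerChunk
  int 3: sizeOfEntry = 4
  int 4: totalDocs
  int 5: pipeline length = 2
  int 6: codec enum value for DELTA
  int 7: codec enum value for LZ4
  int 8: dataHeaderStart = 9 * Integer.BYTES = 36

This matches fixedHeaderInts = 6 + codecPipeline.size() + 1 in writeHeader below; the chunk offsets (8 bytes each for version >= 3) begin at byte 36.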
@@ -88,15 +108,55 @@ public abstract class BaseChunkForwardIndexWriter implements Closeable {
   protected BaseChunkForwardIndexWriter(File file, ChunkCompressionType compressionType, int totalDocs,
       int numDocsPerChunk, long chunkSize, int sizeOfEntry, int version, boolean fixed)
       throws IOException {
-    Preconditions.checkArgument(version == 2 || version == 3 || (fixed && version >= 4),
+    this(file, compressionType, null, sizeOfEntry, totalDocs, numDocsPerChunk, chunkSize, sizeOfEntry, version, fixed);
+  }
+
+  /**
+   * Constructor with optional codec pipeline support.
+   *
+   * @param file Data file to write into
+   * @param compressionType Type of compression (used when pipeline is null)
+   * @param codecPipeline Optional codec pipeline; when non-null, requires version 7
+   * @param valueSizeInBytes Size of each typed value (4 for INT, 8 for LONG); used by pipeline transforms
+   * @param totalDocs Total docs to write
+   * @param numDocsPerChunk Number of docs per data chunk
+   * @param chunkSize Size of chunk
+   * @param sizeOfEntry Size of entry (in bytes)
+   * @param version Version of the file format
+   * @param fixed Whether the data type is fixed-width
+   */
+  protected BaseChunkForwardIndexWriter(File file, ChunkCompressionType compressionType,
+      @Nullable ChunkCodecPipeline codecPipeline, int valueSizeInBytes, int totalDocs,
+      int numDocsPerChunk, long chunkSize, int sizeOfEntry, int version, boolean fixed)
+      throws IOException {
+    boolean hasPipeline = codecPipeline != null;
+    Preconditions.checkArgument(
+        version == 2 || version == 3 || version == 6
+            || (fixed && (version == 4 || version == 5 || version == 7)),
         "Illegal version: %s for %s bytes values", version, fixed ? "fixed" : "variable");
+    if (hasPipeline) {
+      Preconditions.checkArgument(version == 7, "codecPipeline requires writer version 7, got: %s", version);
+    }
+    if (version == 7) {
+      Preconditions.checkArgument(hasPipeline,
+          "Writer version 7 requires a non-null codecPipeline (version 7 header layout is pipeline-only)");
+    }
+    if (hasPipeline && codecPipeline.hasTransforms()) {
+      Preconditions.checkArgument(valueSizeInBytes == Integer.BYTES || valueSizeInBytes == Long.BYTES,
+          "Codec pipeline transforms require valueSizeInBytes to be 4 (INT/FLOAT) or 8 (LONG/DOUBLE), got: %s",
+          valueSizeInBytes);
+    }
     Preconditions.checkArgument(chunkSize <= Integer.MAX_VALUE, "Chunk size limited to 2GB");
     _chunkSize = (int) chunkSize;
-    _chunkCompressor = ChunkCompressorFactory.getCompressor(compressionType);
+    if (hasPipeline) {
+      _chunkCompressor = ChunkCompressorFactory.getCompressor(codecPipeline, valueSizeInBytes);
+    } else {
+      _chunkCompressor = ChunkCompressorFactory.getCompressor(compressionType);
+    }
     _headerEntryChunkOffsetSize = version == 2 ? Integer.BYTES : Long.BYTES;
-    _dataOffset = writeHeader(compressionType, totalDocs, numDocsPerChunk, sizeOfEntry, version);
+    _dataOffset = writeHeader(compressionType, codecPipeline, totalDocs, numDocsPerChunk, sizeOfEntry, version);
     _chunkBuffer = ByteBuffer.allocateDirect(_chunkSize);
-    int maxCompressedChunkSize = _chunkCompressor.maxCompressedSize(_chunkSize); // may exceed original chunk size
+    int maxCompressedChunkSize = _chunkCompressor.maxCompressedSize(_chunkSize);
     _compressedBuffer = ByteBuffer.allocateDirect(maxCompressedChunkSize);
     _dataFile = new RandomAccessFile(file, "rw").getChannel();
   }
@@ -127,36 +187,52 @@ public void close()
    * @param version Version of file
    * @return Size of header
+   *
+   * <p>For version <= 5, the header layout is:
+   * [version][numChunks][numDocsPerChunk][sizeOfEntry][totalDocs][compressionType][dataHeaderStart]
+   *
+   * <p>For version 7 (codec pipeline), the header layout is:
+   * [version][numChunks][numDocsPerChunk][sizeOfEntry][totalDocs][pipelineLength][codec0]...[codecN-1][dataHeaderStart]
    */
-  private int writeHeader(ChunkCompressionType compressionType, int totalDocs, int numDocsPerChunk, int sizeOfEntry,
-      int version) {
+  private int writeHeader(ChunkCompressionType compressionType, @Nullable ChunkCodecPipeline codecPipeline,
+      int totalDocs, int numDocsPerChunk, int sizeOfEntry, int version) {
     int numChunks = (totalDocs + numDocsPerChunk - 1) / numDocsPerChunk;
-    int headerSize = (7 * Integer.BYTES) + (numChunks * _headerEntryChunkOffsetSize);
+
+    // Calculate the fixed header size based on version
+    int fixedHeaderInts;
+    if (version == 7 && codecPipeline != null) {
+      // version + numChunks + numDocsPerChunk + sizeOfEntry + totalDocs + pipelineLength + N codec ints
+      //   + dataHeaderStart
+      fixedHeaderInts = 6 + codecPipeline.size() + 1;
+    } else {
+      // version + numChunks + numDocsPerChunk + sizeOfEntry + totalDocs + compressionType + dataHeaderStart
+      fixedHeaderInts = 7;
+    }
+    int headerSize = (fixedHeaderInts * Integer.BYTES) + (numChunks * _headerEntryChunkOffsetSize);
     _header = ByteBuffer.allocateDirect(headerSize);
-    int offset = 0;
     _header.putInt(version);
-    offset += Integer.BYTES;
-
     _header.putInt(numChunks);
-    offset += Integer.BYTES;
-
     _header.putInt(numDocsPerChunk);
-    offset += Integer.BYTES;
-
     _header.putInt(sizeOfEntry);
-    offset += Integer.BYTES;
-
-    // Write total number of docs.
     _header.putInt(totalDocs);
-    offset += Integer.BYTES;
-
-    // Write the compressor type
-    _header.putInt(compressionType.getValue());
-    offset += Integer.BYTES;
+    if (version == 7 && codecPipeline != null) {
+      // Write pipeline: length + codec values
+      _header.putInt(codecPipeline.size());
+      for (ChunkCodec codec : codecPipeline.getStages()) {
+        _header.putInt(codec.getValue());
+      }
+    } else {
+      // Legacy: single compression type
+      _header.putInt(compressionType.getValue());
+    }
 
-    // Start of chunk offsets.
-    int dataHeaderStart = offset + Integer.BYTES;
+    // dataHeaderStart = current position + sizeof(int) for the dataHeaderStart field itself
+    int dataHeaderStart = _header.position() + Integer.BYTES;
     _header.putInt(dataHeaderStart);
 
     return headerSize;
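To make the V7 header arithmetic above concrete, here is a minimal, self-contained sketch of the size computation writeHeader performs for a three-stage pipeline. The class name and values are invented for illustration; this is not code from the patch:

    // Hypothetical illustration of the V7 fixed-header math in writeHeader above.
    public class V7HeaderSizeSketch {
      public static void main(String[] args) {
        int numChunks = 3;                            // ceil(totalDocs / numDocsPerChunk)
        int headerEntryChunkOffsetSize = Long.BYTES;  // versions > 2 store long chunk offsets
        int pipelineSize = 3;                         // e.g. [DELTA, DOUBLE_DELTA, ZSTANDARD]
        // version + numChunks + numDocsPerChunk + sizeOfEntry + totalDocs + pipelineLength
        //   + one int per stage + dataHeaderStart
        int fixedHeaderInts = 6 + pipelineSize + 1;
        int headerSize = fixedHeaderInts * Integer.BYTES + numChunks * headerEntryChunkOffsetSize;
        System.out.println(headerSize);               // 10 * 4 + 3 * 8 = 64 bytes
      }
    }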
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java
index 8b517a84f9c1..96635b57e958 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/FixedByteChunkForwardIndexWriter.java
@@ -21,7 +21,9 @@
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import javax.annotation.Nullable;
 import javax.annotation.concurrent.NotThreadSafe;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
@@ -53,6 +55,27 @@ public FixedByteChunkForwardIndexWriter(File file, ChunkCompressionType compress
     _chunkDataOffset = 0;
   }
 
+  /**
+   * Constructor with codec pipeline support.
+   *
+   * @param file File to write to
+   * @param compressionType Type of compression (used when pipeline is null)
+   * @param codecPipeline Optional codec pipeline (requires writerVersion 7)
+   * @param totalDocs Total number of docs to write
+   * @param numDocsPerChunk Number of documents per chunk
+   * @param sizeOfEntry Size of entry (in bytes)
+   * @param writerVersion Writer format version
+   */
+  public FixedByteChunkForwardIndexWriter(File file, ChunkCompressionType compressionType,
+      @Nullable ChunkCodecPipeline codecPipeline, int totalDocs, int numDocsPerChunk, int sizeOfEntry,
+      int writerVersion)
+      throws IOException {
+    super(file, compressionType, codecPipeline, sizeOfEntry, totalDocs,
+        normalizeDocsPerChunk(writerVersion, numDocsPerChunk),
+        (long) sizeOfEntry * normalizeDocsPerChunk(writerVersion, numDocsPerChunk), sizeOfEntry, writerVersion, true);
+    _chunkDataOffset = 0;
+  }
+
   public void putInt(int value) {
     _chunkBuffer.putInt(value);
     _chunkDataOffset += Integer.BYTES;
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java
index ec18e9cb764a..cef9e4256075 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/writer/impl/VarByteChunkForwardIndexWriterV4.java
@@ -29,7 +29,7 @@
 import java.nio.charset.StandardCharsets;
 import javax.annotation.concurrent.NotThreadSafe;
 import org.apache.commons.io.FileUtils;
-import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory;
+import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory;
 import org.apache.pinot.segment.local.utils.ArraySerDeUtils;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
 import org.apache.pinot.segment.spi.compression.ChunkCompressor;
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java
index 453519c8a691..e541f661930c 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/fwd/SingleValueFixedByteRawIndexCreator.java
@@ -20,8 +20,10 @@
 import java.io.File;
 import java.io.IOException;
+import javax.annotation.Nullable;
 import org.apache.pinot.segment.local.io.writer.impl.FixedByteChunkForwardIndexWriter;
 import org.apache.pinot.segment.spi.V1Constants;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
 import org.apache.pinot.segment.spi.index.ForwardIndexConfig;
 import org.apache.pinot.segment.spi.index.creator.ForwardIndexCreator;
@@ -67,10 +69,29 @@ public SingleValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionTy
   public SingleValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionType compressionType, String column,
       int totalDocs, DataType valueType, int writerVersion, int targetDocsPerChunk)
       throws IOException {
+    this(baseIndexDir, compressionType, null, column, totalDocs, valueType, writerVersion, targetDocsPerChunk);
+  }
+
+  /**
+   * Constructor with optional codec pipeline.
+   *
+   * @param baseIndexDir Index directory
+   * @param compressionType Type of compression (used when pipeline is null)
+   * @param codecPipeline Optional codec pipeline (requires writerVersion 7)
+   * @param column Name of column to index
+   * @param totalDocs Total number of documents to index
+   * @param valueType Type of the values
+   * @param writerVersion Writer format version (7 required when codecPipeline is set)
+   * @param targetDocsPerChunk Target number of docs per chunk
+   */
+  public SingleValueFixedByteRawIndexCreator(File baseIndexDir, ChunkCompressionType compressionType,
+      @Nullable ChunkCodecPipeline codecPipeline, String column, int totalDocs, DataType valueType, int writerVersion,
+      int targetDocsPerChunk)
+      throws IOException {
     File file = new File(baseIndexDir, column + V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION);
     _indexWriter =
-        new FixedByteChunkForwardIndexWriter(file, compressionType, totalDocs, targetDocsPerChunk, valueType.size(),
-            writerVersion);
+        new FixedByteChunkForwardIndexWriter(file, compressionType, codecPipeline, totalDocs, targetDocsPerChunk,
+            valueType.size(), writerVersion);
     _valueType = valueType;
   }
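Taken together, the writer changes above give callers a single entry point for pipeline-encoded fixed-width indexes. The following sketch shows the intended call pattern, mirroring the end-to-end helpers in ChunkCodecPipelineTest later in this patch; the file path, document counts, and values are made up for the example, and the imports match those of the test file:

    // Illustrative usage of the pipeline-aware writer constructor added above.
    ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD"));
    try (FixedByteChunkForwardIndexWriter writer = new FixedByteChunkForwardIndexWriter(
        new File("/tmp/col.sv.raw.fwd"), pipeline.getChunkCompressionType(), pipeline,
        /* totalDocs */ 1_000, /* numDocsPerChunk */ 1_000, Integer.BYTES, /* writerVersion */ 7)) {
      for (int i = 0; i < 1_000; i++) {
        writer.putInt(1_000 + i * 7); // monotonic values compress well under DELTA
      }
    }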
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java
index 6084c77b4eeb..2b7980feb375 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexCreatorFactory.java
@@ -19,6 +19,7 @@
 package org.apache.pinot.segment.local.segment.index.forward;
 
+import com.google.common.base.Preconditions;
 import java.io.File;
 import java.io.IOException;
 import org.apache.pinot.segment.local.segment.creator.impl.fwd.CLPForwardIndexCreatorV1;
@@ -31,6 +32,7 @@
 import org.apache.pinot.segment.local.segment.creator.impl.fwd.SingleValueSortedForwardIndexCreator;
 import org.apache.pinot.segment.local.segment.creator.impl.fwd.SingleValueUnsortedForwardIndexCreator;
 import org.apache.pinot.segment.local.segment.creator.impl.fwd.SingleValueVarByteRawIndexCreator;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
 import org.apache.pinot.segment.spi.compression.DictIdCompressionType;
 import org.apache.pinot.segment.spi.creator.IndexCreationContext;
@@ -86,19 +88,42 @@ public static ForwardIndexCreator createIndexCreator(IndexCreationContext contex
       if (indexConfig.getCompressionCodec() == FieldConfig.CompressionCodec.CLPV2_LZ4) {
         return new CLPForwardIndexCreatorV2(indexDir, context.getColumnStatistics(), ChunkCompressionType.LZ4);
       }
+      ChunkCodecPipeline codecPipeline = indexConfig.getCodecPipeline();
       ChunkCompressionType chunkCompressionType = indexConfig.getChunkCompressionType();
-      if (chunkCompressionType == null) {
+      if (chunkCompressionType == null && codecPipeline == null) {
         chunkCompressionType = ForwardIndexType.getDefaultCompressionType(fieldSpec.getFieldType());
       }
       boolean deriveNumDocsPerChunk = indexConfig.isDeriveNumDocsPerChunk();
       int writerVersion = indexConfig.getRawIndexWriterVersion();
       int targetMaxChunkSize = indexConfig.getTargetMaxChunkSizeBytes();
       int targetDocsPerChunk = indexConfig.getTargetDocsPerChunk();
+
+      if (codecPipeline != null && codecPipeline.hasTransforms()) {
+        Preconditions.checkState(writerVersion == 7,
+            "codecPipeline with transforms requires rawIndexWriterVersion=7 for column: %s, got: %s", columnName,
+            writerVersion);
+      }
+
       if (fieldSpec.isSingleValueField()) {
+        // Use the pipeline path for fixed-width SV columns when the pipeline has transforms (requires V7)
+        if (codecPipeline != null && codecPipeline.hasTransforms() && storedType.isFixedWidth()) {
+          return new SingleValueFixedByteRawIndexCreator(indexDir,
+              codecPipeline.getChunkCompressionType(), codecPipeline, columnName, numTotalDocs,
+              storedType, writerVersion, targetDocsPerChunk);
+        }
+        // For compression-only pipelines and non-fixed-width types, use the legacy path
+        // (avoids requiring V7 for simple compression changes)
+        if (chunkCompressionType == null && codecPipeline != null) {
+          chunkCompressionType = codecPipeline.getChunkCompressionType();
+        }
         return getRawIndexCreatorForSVColumn(indexDir, chunkCompressionType, columnName, storedType, numTotalDocs,
             context.getLengthOfLongestEntry(), deriveNumDocsPerChunk, writerVersion, targetMaxChunkSize,
             targetDocsPerChunk);
       } else {
+        // For MV columns, pipeline is not supported — derive the compression type from the pipeline if needed
+        if (chunkCompressionType == null && codecPipeline != null) {
+          chunkCompressionType = codecPipeline.getChunkCompressionType();
+        }
         return getRawIndexCreatorForMVColumn(indexDir, chunkCompressionType, columnName, storedType, numTotalDocs,
             context.getMaxNumberOfMultiValueElements(), deriveNumDocsPerChunk, writerVersion,
             context.getMaxRowLengthInBytes(), targetMaxChunkSize, targetDocsPerChunk);
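One subtlety in the factory logic above is worth spelling out: a pipeline whose only stage is a compressor never forces the V7 format. A hedged sketch of that collapse, assuming getChunkCompressionType() maps a terminal compressor to its legacy enum value as the factory and the reload handler rely on:

    // A compression-only pipeline has no transform stages, so the factory above
    // extracts its terminal compressor and continues down the legacy (pre-V7) path.
    ChunkCodecPipeline compressionOnly = ChunkCodecPipeline.fromNames(Collections.singletonList("ZSTANDARD"));
    assert !compressionOnly.hasTransforms();
    ChunkCompressionType legacyEquivalent = compressionOnly.getChunkCompressionType(); // ZSTANDARD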
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java
index 4e5ac20a3ef4..1fcd66645e3d 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java
@@ -26,6 +26,7 @@
 import java.util.List;
 import java.util.Map;
 import javax.annotation.Nullable;
+import org.apache.pinot.segment.local.io.codec.transform.ChunkTransformFactory;
 import org.apache.pinot.segment.local.realtime.impl.forward.CLPMutableForwardIndexV2;
 import org.apache.pinot.segment.local.realtime.impl.forward.FixedByteMVMutableForwardIndex;
 import org.apache.pinot.segment.local.realtime.impl.forward.FixedByteSVMutableForwardIndex;
@@ -34,6 +35,8 @@
 import org.apache.pinot.segment.local.segment.index.loader.ForwardIndexHandler;
 import org.apache.pinot.segment.spi.ColumnMetadata;
 import org.apache.pinot.segment.spi.V1Constants;
+import org.apache.pinot.segment.spi.codec.ChunkCodec;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
 import org.apache.pinot.segment.spi.creator.IndexCreationContext;
 import org.apache.pinot.segment.spi.index.AbstractIndexType;
@@ -109,7 +112,29 @@ private void validateForwardIndexEnabled(ForwardIndexConfig forwardIndexConfig,
       FieldSpec fieldSpec) {
     String column = fieldSpec.getName();
     CompressionCodec compressionCodec = forwardIndexConfig.getCompressionCodec();
+    ChunkCodecPipeline codecPipeline = forwardIndexConfig.getCodecPipeline();
     DictionaryIndexConfig dictionaryConfig = indexConfigs.getConfig(StandardIndexes.dictionary());
+
+    // Validate codec pipeline transform stages.
+    // Compression-only pipelines (e.g., [LZ4], [DELTA_LZ4]) don't restrict column types.
+    // Transform pipelines (e.g., [DELTA, ZSTANDARD], [XOR, LZ4]) require SV, fixed-width columns.
+    if (codecPipeline != null && codecPipeline.hasTransforms()) {
+      Preconditions.checkState(!dictionaryConfig.isEnabled(),
+          "codecPipeline with transforms is not applicable to dictionary encoded column: %s", column);
+      Preconditions.checkState(fieldSpec.isSingleValueField(),
+          "codecPipeline with transforms is only supported on single-value columns, not applicable to column: %s",
+          column);
+      // Delegate per-transform type validation to each transform class
+      FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType();
+      for (ChunkCodec transformCodec : codecPipeline.getTransforms()) {
+        ChunkTransformFactory.getTransform(transformCodec).validateStoredType(storedType, column);
+      }
+      Preconditions.checkState(forwardIndexConfig.getRawIndexWriterVersion() == 7,
+          "codecPipeline with transforms requires rawIndexWriterVersion=7 for column: %s. "
+              + "Transform pipelines emit V7 forward indexes and should only be enabled after all readers in the "
+              + "cluster support V7.", column);
+    }
+
     if (dictionaryConfig.isEnabled()) {
       Preconditions.checkState(compressionCodec == null || compressionCodec.isApplicableToDictEncodedIndex(),
           "Compression codec: %s is not applicable to dictionary encoded column: %s", compressionCodec, column);
@@ -202,7 +227,16 @@ private boolean isDisabled(Map<String, String> props) {
   private ForwardIndexConfig createConfigFromFieldConfig(FieldConfig fieldConfig) {
     ForwardIndexConfig.Builder builder = new ForwardIndexConfig.Builder();
-    builder.withCompressionCodec(fieldConfig.getCompressionCodec());
+
+    // FieldConfig validates mutual exclusivity: only one of compressionCodec/codecPipeline can be set.
+    // ForwardIndexConfig auto-derives the pipeline from compressionCodec when codecPipeline is null.
+    List<String> pipelineNames = fieldConfig.getCodecPipeline();
+    if (pipelineNames != null && !pipelineNames.isEmpty()) {
+      builder.withCodecPipeline(ChunkCodecPipeline.fromNames(pipelineNames));
+    } else {
+      builder.withCompressionCodec(fieldConfig.getCompressionCodec());
+    }
+
     Map<String, String> properties = fieldConfig.getProperties();
     if (properties != null) {
       builder.withLegacyProperties(properties);
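The validation added above fails fast at table-config time rather than at segment-build time. As a hedged sketch of what passes and what throws, with types per the transform tests later in this patch and a made-up column name:

    // DELTA/DOUBLE_DELTA accept INT and LONG stored types; XOR accepts FLOAT and
    // DOUBLE. A transform stage on any other stored type throws, so a misconfigured
    // column is rejected before any segment is built.
    ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("XOR", "LZ4"));
    for (ChunkCodec stage : pipeline.getTransforms()) {
      ChunkTransformFactory.getTransform(stage).validateStoredType(DataType.DOUBLE, "temperature"); // passes
      // ChunkTransformFactory.getTransform(stage).validateStoredType(DataType.INT, "temperature"); // throws
    }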
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
index eb55d3769f3e..7a2af48171c0 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandler.java
@@ -49,6 +49,7 @@
 import org.apache.pinot.segment.local.utils.ClusterConfigForTable;
 import org.apache.pinot.segment.spi.ColumnMetadata;
 import org.apache.pinot.segment.spi.V1Constants;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
 import org.apache.pinot.segment.spi.compression.DictIdCompressionType;
 import org.apache.pinot.segment.spi.creator.IndexCreationContext;
@@ -68,6 +69,7 @@
 import org.apache.pinot.segment.spi.index.reader.ForwardIndexReaderContext;
 import org.apache.pinot.segment.spi.store.SegmentDirectory;
 import org.apache.pinot.segment.spi.utils.SegmentMetadataUtils;
+import org.apache.pinot.spi.config.table.FieldConfig.CompressionCodec;
 import org.apache.pinot.spi.config.table.IndexingConfig;
 import org.apache.pinot.spi.config.table.TableConfig;
 import org.apache.pinot.spi.data.FieldSpec;
@@ -379,24 +381,72 @@ private boolean shouldChangeRawCompressionType(String column, SegmentDirectory.R
     // The compression type for an existing segment can only be determined by reading the forward index header.
     ColumnMetadata existingColMetadata = _segmentDirectory.getSegmentMetadata().getColumnMetadataFor(column);
     ChunkCompressionType existingCompressionType;
+    ChunkCodecPipeline existingPipeline;
     // Get the forward index reader factory and create a reader
     IndexReaderFactory readerFactory = StandardIndexes.forward().getReaderFactory();
     try (ForwardIndexReader fwdIndexReader = readerFactory.createIndexReader(segmentReader,
         _fieldIndexConfigs.get(column), existingColMetadata)) {
       existingCompressionType = fwdIndexReader.getCompressionType();
+      existingPipeline = fwdIndexReader.getCodecPipeline();
       Preconditions.checkState(existingCompressionType != null,
           "Existing compressionType cannot be null for raw forward index column=" + column);
     }
 
-    // Get the new compression type.
-    ChunkCompressionType newCompressionType =
-        _fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward()).getChunkCompressionType();
+    ForwardIndexConfig newConfig = _fieldIndexConfigs.get(column).getConfig(StandardIndexes.forward());
+    return shouldChangeRawCompressionType(existingCompressionType, existingPipeline, newConfig);
+  }
+
+  @VisibleForTesting
+  static boolean shouldChangeRawCompressionType(ChunkCompressionType existingCompressionType,
+      @Nullable ChunkCodecPipeline existingPipeline, ForwardIndexConfig newConfig) {
+    CompressionCodec newCompressionCodec = newConfig.getCompressionCodec();
+
+    // Reload-time forward index rewrites do not have CLP stats in the creation context, so converting an existing
+    // raw forward index into any CLP variant is not supported here. Preserve the existing segment as-is.
+    if (isCLPCodec(newCompressionCodec)) {
+      return false;
+    }
+
+    ChunkCodecPipeline newPipeline = newConfig.getCodecPipeline();
+    // newPipeline is non-null for all non-CLP RAW codecs (auto-derived from compressionCodec).
+    // Compare via pipeline when possible; fall back to compressionType for CLP codecs.
+    if (newPipeline != null) {
+      // For legacy segments (pre-V7), derive a pipeline from their compressionType for comparison.
+      // e.g., existing DELTA segment → pipeline [DELTA_LZ4]; existing LZ4 segment → pipeline [LZ4].
+      if (existingPipeline == null) {
+        existingPipeline = ChunkCodecPipeline.fromCompressionType(existingCompressionType);
+      }
+      return !existingPipeline.equals(newPipeline);
+    }
+
+    // CLP codecs/variants do not have a codec pipeline here and can collapse to PASS_THROUGH, so comparing only
+    // ChunkCompressionType is not sufficient to detect CLP variant changes or a switch from raw PASS_THROUGH to CLP.
     // Note that default compression type (PASS_THROUGH for metric and LZ4 for dimension) is not considered if the
     // compressionType is not explicitly provided in tableConfig. This is to avoid incorrectly rewriting all the
     // forward indexes during segmentReload when the default compressionType changes.
-    return newCompressionType != null && existingCompressionType != newCompressionType;
+    ChunkCompressionType newCompressionType = newConfig.getChunkCompressionType();
+    if (newCompressionType == null) {
+      return false;
+    }
+    if (existingCompressionType != newCompressionType) {
+      return true;
+    }
+    if (newCompressionCodec == null) {
+      return false;
+    }
+
+    // An explicit codec with no derived pipeline is a CLP-style configuration. If the existing index exposes a
+    // pipeline, it is definitely not the same CLP encoding. If it is PASS_THROUGH without a pipeline, the existing
+    // encoding is ambiguous (plain raw PASS_THROUGH or some CLP variant), so rewrite to ensure the requested codec
+    // is applied.
+    return existingPipeline != null || existingCompressionType == ChunkCompressionType.PASS_THROUGH;
+  }
+
+  private static boolean isCLPCodec(@Nullable CompressionCodec codec) {
+    return codec == CompressionCodec.CLP || codec == CompressionCodec.CLPV2
+        || codec == CompressionCodec.CLPV2_ZSTD || codec == CompressionCodec.CLPV2_LZ4;
   }
 
   private boolean shouldChangeDictIdCompressionType(String column, SegmentDirectory.Reader segmentReader)
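The pipeline comparison above hinges on legacy compound codecs deliberately not equaling their two-stage spellings. A hedged sketch of the distinction, grounded in testLegacyDeltaNotEqualToPipelineDelta later in this patch:

    // A pre-V7 segment written with ChunkCompressionType.DELTA maps to the
    // single-stage compound pipeline [DELTA_LZ4], which is intentionally unequal
    // to the V7 two-stage pipeline [DELTA, LZ4]; the reload handler therefore
    // treats switching a column from the legacy form to the pipeline form as a
    // compression change and rewrites the forward index.
    ChunkCodecPipeline legacy = ChunkCodecPipeline.fromCompressionType(ChunkCompressionType.DELTA);
    ChunkCodecPipeline v7Form = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "LZ4"));
    assert !legacy.equals(v7Form);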
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java
index 0ef13d23e63b..5e8f00725071 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/BaseChunkForwardIndexReader.java
@@ -27,7 +27,10 @@
 import java.nio.LongBuffer;
 import java.util.ArrayList;
 import java.util.List;
-import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory;
+import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory;
+import org.apache.pinot.segment.local.io.codec.transform.ChunkTransformFactory;
+import org.apache.pinot.segment.spi.codec.ChunkCodec;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
 import org.apache.pinot.segment.spi.compression.ChunkDecompressor;
 import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
@@ -50,6 +53,8 @@ public abstract class BaseChunkForwardIndexReader implements ForwardIndexReader<
   protected final int _lengthOfLongestEntry;
   protected final boolean _isCompressed;
   protected final ChunkCompressionType _compressionType;
+  // Null for segments written with version < 7 (no pipeline support)
+  protected final ChunkCodecPipeline _codecPipeline;
   protected final ChunkDecompressor _chunkDecompressor;
   protected final PinotDataBuffer _dataHeader;
   protected final int _headerEntryChunkOffsetSize;
@@ -79,11 +84,41 @@ protected BaseChunkForwardIndexReader(PinotDataBuffer dataBuffer, DataType store
     headerOffset += Integer.BYTES;
 
     int dataHeaderStart = headerOffset;
-    if (version > 1) {
+    if (version == 7) {
+      // Version 7: codec pipeline header
+      _dataBuffer.getInt(headerOffset); // Total docs
+      headerOffset += Integer.BYTES;
+
+      int pipelineLength = _dataBuffer.getInt(headerOffset);
+      headerOffset += Integer.BYTES;
+
+      int[] codecValues = new int[pipelineLength];
+      for (int i = 0; i < pipelineLength; i++) {
+        codecValues[i] = _dataBuffer.getInt(headerOffset);
+        headerOffset += Integer.BYTES;
+      }
+      ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromValues(codecValues);
+      _codecPipeline = pipeline;
+      _compressionType = pipeline.getChunkCompressionType();
+
+      int valueSizeInBytes = storedType.isFixedWidth() ? storedType.size() : 0;
+      if (pipeline.hasTransforms()) {
+        // Delegate per-transform type validation to each transform class
+        for (ChunkCodec transformCodec : pipeline.getTransforms()) {
+          ChunkTransformFactory.getTransform(transformCodec).validateStoredType(storedType, "unknown");
+        }
+      }
+      _chunkDecompressor = ChunkCompressorFactory.getDecompressor(pipeline, valueSizeInBytes);
+      _isCompressed = !_compressionType.equals(ChunkCompressionType.PASS_THROUGH) || pipeline.hasTransforms();
+
+      dataHeaderStart = _dataBuffer.getInt(headerOffset);
+    } else if (version > 1) {
+      // Version 2-5: legacy single compression type
       _dataBuffer.getInt(headerOffset); // Total docs
       headerOffset += Integer.BYTES;
 
       _compressionType = ChunkCompressionType.valueOf(_dataBuffer.getInt(headerOffset));
+      _codecPipeline = null;
       _chunkDecompressor = ChunkCompressorFactory.getDecompressor(_compressionType);
       _isCompressed = !_compressionType.equals(ChunkCompressionType.PASS_THROUGH);
 
@@ -92,6 +127,7 @@ protected BaseChunkForwardIndexReader(PinotDataBuffer dataBuffer, DataType store
     } else {
       _isCompressed = true;
       _compressionType = ChunkCompressionType.SNAPPY;
+      _codecPipeline = null;
      _chunkDecompressor = ChunkCompressorFactory.getDecompressor(_compressionType);
     }
 
@@ -277,6 +313,11 @@ public ChunkCompressionType getCompressionType() {
     return _compressionType;
   }
 
+  @Override
+  public ChunkCodecPipeline getCodecPipeline() {
+    return _codecPipeline;
+  }
+
   @Override
   public int getLengthOfLongestEntry() {
     return _lengthOfLongestEntry;
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java
index 49e1c5e8fa05..9927a7fb9707 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/forward/VarByteChunkForwardIndexReaderV4.java
@@ -27,7 +27,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
-import org.apache.pinot.segment.local.io.compression.ChunkCompressorFactory;
+import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory;
 import org.apache.pinot.segment.local.io.writer.impl.VarByteChunkForwardIndexWriterV4;
 import org.apache.pinot.segment.local.utils.ArraySerDeUtils;
 import org.apache.pinot.segment.spi.compression.ChunkCompressionType;
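Decoding on the reader side is the mirror image of encoding: the decompressor obtained from the factory first undoes the terminal compressor, then runs each transform's decode in reverse pipeline order. A hedged sketch of that inverse ordering, matching the chained round-trip helpers in the test file below; the values are made up:

    // Encode left-to-right (what the writer's pipeline compressor does before
    // compressing), then decode right-to-left to recover the original values.
    int[] values = {100, 110, 120, 130};
    ByteBuffer buffer = ByteBuffer.allocate(values.length * Integer.BYTES);
    for (int v : values) {
      buffer.putInt(v);
    }
    buffer.flip();
    ChunkTransform[] transforms = {DeltaTransform.INSTANCE, DoubleDeltaTransform.INSTANCE};
    for (ChunkTransform t : transforms) {
      t.encode(buffer, values.length * Integer.BYTES, Integer.BYTES);
    }
    for (int i = transforms.length - 1; i >= 0; i--) {
      transforms[i].decode(buffer, values.length * Integer.BYTES, Integer.BYTES);
    }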
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java
index acefec3e8ab7..c657dd4c47af 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java
@@ -49,8 +49,11 @@
 import org.apache.pinot.segment.local.aggregator.ValueAggregatorFactory;
 import org.apache.pinot.segment.local.function.FunctionEvaluator;
 import org.apache.pinot.segment.local.function.FunctionEvaluatorFactory;
+import org.apache.pinot.segment.local.io.codec.transform.ChunkTransformFactory;
 import org.apache.pinot.segment.local.recordtransformer.SchemaConformingTransformer;
 import org.apache.pinot.segment.spi.AggregationFunctionType;
+import org.apache.pinot.segment.spi.codec.ChunkCodec;
+import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline;
 import org.apache.pinot.segment.spi.index.DictionaryIndexConfig;
 import org.apache.pinot.segment.spi.index.FieldIndexConfigs;
 import org.apache.pinot.segment.spi.index.FieldIndexConfigsUtil;
@@ -2174,22 +2177,39 @@ public static boolean isRelevantToTenant(TableConfig tableConfig, String tenantN
   }
 
   private static void validateGorillaCompressionCodecIfPresent(FieldConfig fieldConfig, FieldSpec fieldSpec) {
-    if (fieldConfig.getCompressionCodec() == null) {
-      return;
-    }
-    switch (fieldConfig.getCompressionCodec()) {
-      case DELTA:
-      case DELTADELTA:
+    // Validate legacy compressionCodec DELTA/DELTADELTA constraints
+    if (fieldConfig.getCompressionCodec() != null) {
+      switch (fieldConfig.getCompressionCodec()) {
+        case DELTA:
+        case DELTADELTA:
+          Preconditions.checkState(fieldSpec.isSingleValueField(),
+              "Compression codec %s can only be used on single-value columns, found multi-value column: %s",
+              fieldConfig.getCompressionCodec(), fieldConfig.getName());
+          DataType storedType = fieldSpec.getDataType().getStoredType();
+          Preconditions.checkState(storedType == DataType.INT || storedType == DataType.LONG,
+              "Compression codec %s can only be used on INT/LONG data types, found %s for column: %s",
+              fieldConfig.getCompressionCodec(), storedType, fieldConfig.getName());
+          break;
+        default:
+          // no-op for other codecs
+      }
+    }
+
+    // Validate codecPipeline: parse the full pipeline upfront so invalid/misspelled names fail fast,
+    // then delegate per-transform type validation to each transform class.
+    List<String> codecPipeline = fieldConfig.getCodecPipeline();
+    if (codecPipeline != null && !codecPipeline.isEmpty()) {
+      ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(codecPipeline);
+      if (pipeline.hasTransforms()) {
         Preconditions.checkState(fieldSpec.isSingleValueField(),
-            "Compression codec %s can only be used on single-value columns, found multi-value column: %s",
-            fieldConfig.getCompressionCodec(), fieldConfig.getName());
+            "codecPipeline with transforms can only be used on single-value columns, "
+                + "found multi-value column: %s", fieldConfig.getName());
         DataType storedType = fieldSpec.getDataType().getStoredType();
-        Preconditions.checkState(storedType == DataType.INT || storedType == DataType.LONG,
-            "Compression codec %s can only be used on INT/LONG data types, found %s for column: %s",
-            fieldConfig.getCompressionCodec(), storedType, fieldConfig.getName());
-        break;
-      default:
-        // no-op for other codecs
+        for (ChunkCodec transformCodec : pipeline.getTransforms()) {
+          ChunkTransformFactory.getTransform(transformCodec)
+              .validateStoredType(storedType, fieldConfig.getName());
+        }
+      }
     }
   }
 }
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.codec; + +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Collections; +import java.util.Random; +import org.apache.commons.io.FileUtils; +import org.apache.pinot.segment.local.io.codec.compression.ChunkCompressorFactory; +import org.apache.pinot.segment.local.io.codec.transform.DeltaTransform; +import org.apache.pinot.segment.local.io.codec.transform.DoubleDeltaTransform; +import org.apache.pinot.segment.local.io.codec.transform.XorTransform; +import org.apache.pinot.segment.local.io.writer.impl.FixedByteChunkForwardIndexWriter; +import org.apache.pinot.segment.local.segment.index.readers.forward.ChunkReaderContext; +import org.apache.pinot.segment.local.segment.index.readers.forward.FixedBytePower2ChunkSVForwardIndexReader; +import org.apache.pinot.segment.spi.codec.ChunkCodec; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; +import org.apache.pinot.segment.spi.codec.ChunkTransform; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.segment.spi.compression.ChunkCompressor; +import org.apache.pinot.segment.spi.compression.ChunkDecompressor; +import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader; +import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.expectThrows; + + +/** + * Tests for codec pipeline: {@link ChunkCodecPipeline}, {@link PipelineChunkCompressor}, + * {@link PipelineChunkDecompressor}, {@link DeltaTransform}, and {@link DoubleDeltaTransform}. 
+ */ +public class ChunkCodecPipelineTest { + + // ===== ChunkCodecPipeline construction tests ===== + + @Test + public void testSingleCompressor() { + ChunkCodecPipeline pipeline = new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.ZSTANDARD)); + assertEquals(pipeline.size(), 1); + assertEquals(pipeline.getCompressor(), ChunkCodec.ZSTANDARD); + assertFalse(pipeline.hasTransforms()); + assertTrue(pipeline.getTransforms().isEmpty()); + } + + @Test + public void testTransformPlusCompressor() { + ChunkCodecPipeline pipeline = new ChunkCodecPipeline( + Arrays.asList(ChunkCodec.DELTA, ChunkCodec.ZSTANDARD)); + assertEquals(pipeline.size(), 2); + assertEquals(pipeline.getCompressor(), ChunkCodec.ZSTANDARD); + assertTrue(pipeline.hasTransforms()); + assertEquals(pipeline.getTransforms(), Collections.singletonList(ChunkCodec.DELTA)); + } + + @Test + public void testMultipleTransforms() { + ChunkCodecPipeline pipeline = new ChunkCodecPipeline( + Arrays.asList(ChunkCodec.DELTA, ChunkCodec.DOUBLE_DELTA, ChunkCodec.LZ4)); + assertEquals(pipeline.size(), 3); + assertEquals(pipeline.getCompressor(), ChunkCodec.LZ4); + assertEquals(pipeline.getTransforms(), Arrays.asList(ChunkCodec.DELTA, ChunkCodec.DOUBLE_DELTA)); + } + + @Test + public void testTransformOnly() { + // Pipeline with only transforms — compressor defaults to PASS_THROUGH + ChunkCodecPipeline pipeline = new ChunkCodecPipeline( + Collections.singletonList(ChunkCodec.DELTA)); + assertEquals(pipeline.getCompressor(), ChunkCodec.PASS_THROUGH); + assertTrue(pipeline.hasTransforms()); + } + + @Test + public void testCompressorNotLastFails() { + expectThrows(IllegalArgumentException.class, () -> + new ChunkCodecPipeline(Arrays.asList(ChunkCodec.ZSTANDARD, ChunkCodec.DELTA))); + } + + @Test + public void testMultipleCompressorsFails() { + expectThrows(IllegalArgumentException.class, () -> + new ChunkCodecPipeline(Arrays.asList(ChunkCodec.SNAPPY, ChunkCodec.ZSTANDARD))); + } + + @Test + public void testEmptyPipelineFails() { + expectThrows(IllegalArgumentException.class, () -> + new ChunkCodecPipeline(Collections.emptyList())); + } + + @Test + public void testFromNames() { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")); + assertEquals(pipeline.getStages(), Arrays.asList(ChunkCodec.DELTA, ChunkCodec.ZSTANDARD)); + } + + @Test + public void testFromNamesRejectsLegacyCompound() { + // DELTA_LZ4 and DOUBLE_DELTA_LZ4 are internal-only — users must use ["DELTA", "LZ4"] instead + expectThrows(IllegalArgumentException.class, () -> + ChunkCodecPipeline.fromNames(Collections.singletonList("DELTA_LZ4"))); + expectThrows(IllegalArgumentException.class, () -> + ChunkCodecPipeline.fromNames(Collections.singletonList("DOUBLE_DELTA_LZ4"))); + } + + @Test + public void testFromValues() { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromValues(new int[]{100, 2}); + assertEquals(pipeline.getStages(), Arrays.asList(ChunkCodec.DELTA, ChunkCodec.ZSTANDARD)); + } + + @Test + public void testEquality() { + ChunkCodecPipeline a = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")); + ChunkCodecPipeline b = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")); + ChunkCodecPipeline c = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "LZ4")); + assertEquals(a, b); + assertEquals(a.hashCode(), b.hashCode()); + assertFalse(a.equals(c)); + } + + // ===== DeltaTransform tests ===== + + @Test + public void testDeltaEncodeDecodeInts() { + int[] values = {100, 105, 108, 120, 125, 130}; + 
assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaEncodeDecodeLongs() { + long[] values = {1000000L, 1000050L, 1000100L, 1000200L, 1000250L}; + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaEmptyInts() { + assertRoundTripInts(DeltaTransform.INSTANCE, new int[0]); + } + + @Test + public void testDeltaSingleInt() { + assertRoundTripInts(DeltaTransform.INSTANCE, new int[]{42}); + } + + @Test + public void testDeltaNegativeValues() { + int[] values = {-100, -50, 0, 50, 100, -200}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaOverflowInt() { + int[] values = {Integer.MAX_VALUE, Integer.MIN_VALUE, 0, Integer.MAX_VALUE}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaOverflowLong() { + long[] values = {Long.MAX_VALUE, Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaRandomInts() { + Random random = new Random(12345); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaRandomLongs() { + Random random = new Random(12345); + long[] values = new long[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + // ===== DoubleDeltaTransform tests ===== + + @Test + public void testDoubleDeltaEncodeDecodeInts() { + int[] values = {100, 110, 120, 130, 140, 150}; + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaEncodeDecodeLongs() { + long[] values = {1000L, 1100L, 1200L, 1300L, 1400L}; + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaTwoInts() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[]{10, 20}); + } + + @Test + public void testDoubleDeltaOverflowInt() { + int[] values = {Integer.MAX_VALUE, Integer.MIN_VALUE, 0, Integer.MAX_VALUE}; + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaConstantStep() { + int[] values = new int[100]; + for (int i = 0; i < values.length; i++) { + values[i] = 1000 + i * 60; + } + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaRandomInts() { + Random random = new Random(67890); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaRandomLongs() { + Random random = new Random(67890); + long[] values = new long[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + // ===== Pipeline compressor/decompressor round-trip tests ===== + + @Test + public void testPipelineDeltaLz4RoundTripInts() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "LZ4")); + int[] original = {100, 200, 300, 400, 500, 600, 700, 800}; + assertPipelineRoundTripInts(pipeline, original); + } + + @Test + public void testPipelineDeltaZstdRoundTripLongs() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", 
"ZSTANDARD")); + long[] original = {1000L, 2000L, 3000L, 4000L, 5000L}; + assertPipelineRoundTripLongs(pipeline, original); + } + + @Test + public void testPipelineDoubleDeltaSnappyRoundTripInts() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("DOUBLE_DELTA", "SNAPPY")); + int[] original = new int[100]; + for (int i = 0; i < original.length; i++) { + original[i] = 1000 + i * 10; + } + assertPipelineRoundTripInts(pipeline, original); + } + + @Test + public void testPipelineMultiTransformRoundTrip() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames( + Arrays.asList("DELTA", "DOUBLE_DELTA", "ZSTANDARD")); + int[] original = new int[200]; + for (int i = 0; i < original.length; i++) { + original[i] = 1000 + i * 5; + } + assertPipelineRoundTripInts(pipeline, original); + } + + @Test + public void testPipelinePassThroughRoundTripInts() + throws IOException { + ChunkCodecPipeline pipeline = new ChunkCodecPipeline( + Collections.singletonList(ChunkCodec.PASS_THROUGH)); + int[] original = {42, 43, 44, 45}; + assertPipelineRoundTripInts(pipeline, original); + } + + // ===== XOR (Gorilla-style) transform tests ===== + + @Test + public void testXorEncodeDecodeFloats() { + float[] floats = {72.3f, 72.4f, 72.5f, 72.6f, 72.7f}; + int numValues = floats.length; + ByteBuffer buffer = ByteBuffer.allocate(numValues * Float.BYTES); + for (float f : floats) { + buffer.putFloat(f); + } + buffer.flip(); + + XorTransform.INSTANCE.encode(buffer, buffer.remaining(), Float.BYTES); + // First value unchanged, subsequent values should be XOR (small diffs → many zero bits) + buffer.position(0); + assertEquals(Float.intBitsToFloat(buffer.getInt(0)), floats[0]); + + XorTransform.INSTANCE.decode(buffer, numValues * Float.BYTES, Float.BYTES); + buffer.position(0); + for (float expected : floats) { + assertEquals(buffer.getFloat(), expected); + } + } + + @Test + public void testXorEncodeDecodeDoubles() { + double[] doubles = {98.6, 98.7, 98.8, 98.9, 99.0}; + int numValues = doubles.length; + ByteBuffer buffer = ByteBuffer.allocate(numValues * Double.BYTES); + for (double d : doubles) { + buffer.putDouble(d); + } + buffer.flip(); + + XorTransform.INSTANCE.encode(buffer, buffer.remaining(), Double.BYTES); + buffer.position(0); + assertEquals(Double.longBitsToDouble(buffer.getLong(0)), doubles[0]); + + XorTransform.INSTANCE.decode(buffer, numValues * Double.BYTES, Double.BYTES); + buffer.position(0); + for (double expected : doubles) { + assertEquals(buffer.getDouble(), expected); + } + } + + @Test + public void testXorSingleFloat() { + float[] floats = {42.5f}; + ByteBuffer buffer = ByteBuffer.allocate(Float.BYTES); + buffer.putFloat(floats[0]); + buffer.flip(); + XorTransform.INSTANCE.encode(buffer, Float.BYTES, Float.BYTES); + XorTransform.INSTANCE.decode(buffer, Float.BYTES, Float.BYTES); + buffer.position(0); + assertEquals(buffer.getFloat(), floats[0]); + } + + @Test + public void testXorEmptyValues() { + // Empty buffer should not throw + ByteBuffer buffer = ByteBuffer.allocate(0); + XorTransform.INSTANCE.encode(buffer, 0, Float.BYTES); + XorTransform.INSTANCE.decode(buffer, 0, Float.BYTES); + } + + @Test + public void testXorRandomDoubles() { + Random rng = new Random(42); + int numValues = 500; + double[] values = new double[numValues]; + values[0] = rng.nextDouble() * 100; + for (int i = 1; i < numValues; i++) { + // Small perturbations — XOR produces values with many zero bits + values[i] = values[i - 1] + (rng.nextDouble() - 0.5); + 
} + ByteBuffer buffer = ByteBuffer.allocate(numValues * Double.BYTES); + for (double v : values) { + buffer.putDouble(v); + } + buffer.flip(); + XorTransform.INSTANCE.encode(buffer, numValues * Double.BYTES, Double.BYTES); + XorTransform.INSTANCE.decode(buffer, numValues * Double.BYTES, Double.BYTES); + buffer.position(0); + for (double expected : values) { + assertEquals(buffer.getDouble(), expected); + } + } + + @Test + public void testPipelineXorLz4RoundTripFloats() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("XOR", "LZ4")); + float[] floats = {72.3f, 72.4f, 72.5f, 72.6f, 72.7f, 72.8f}; + assertPipelineRoundTripFloats(pipeline, floats); + } + + @Test + public void testPipelineXorZstdRoundTripDoubles() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("XOR", "ZSTANDARD")); + double[] doubles = {98.6, 98.7, 98.8, 98.9, 99.0}; + assertPipelineRoundTripDoubles(pipeline, doubles); + } + + // ===== supportedTypes / validateStoredType tests ===== + + @Test + public void testDeltaSupportedTypes() { + assertTrue(DeltaTransform.INSTANCE.supportedTypes().contains(DataType.INT)); + assertTrue(DeltaTransform.INSTANCE.supportedTypes().contains(DataType.LONG)); + assertFalse(DeltaTransform.INSTANCE.supportedTypes().contains(DataType.FLOAT)); + assertFalse(DeltaTransform.INSTANCE.supportedTypes().contains(DataType.DOUBLE)); + // Should not throw for supported types + DeltaTransform.INSTANCE.validateStoredType(DataType.INT, "col"); + DeltaTransform.INSTANCE.validateStoredType(DataType.LONG, "col"); + // Should throw for unsupported types + expectThrows(IllegalArgumentException.class, + () -> DeltaTransform.INSTANCE.validateStoredType(DataType.FLOAT, "col")); + expectThrows(IllegalArgumentException.class, + () -> DeltaTransform.INSTANCE.validateStoredType(DataType.DOUBLE, "col")); + expectThrows(IllegalArgumentException.class, + () -> DeltaTransform.INSTANCE.validateStoredType(DataType.STRING, "col")); + } + + @Test + public void testDoubleDeltaSupportedTypes() { + assertTrue(DoubleDeltaTransform.INSTANCE.supportedTypes().contains(DataType.INT)); + assertTrue(DoubleDeltaTransform.INSTANCE.supportedTypes().contains(DataType.LONG)); + assertFalse(DoubleDeltaTransform.INSTANCE.supportedTypes().contains(DataType.FLOAT)); + assertFalse(DoubleDeltaTransform.INSTANCE.supportedTypes().contains(DataType.DOUBLE)); + DoubleDeltaTransform.INSTANCE.validateStoredType(DataType.INT, "col"); + DoubleDeltaTransform.INSTANCE.validateStoredType(DataType.LONG, "col"); + expectThrows(IllegalArgumentException.class, + () -> DoubleDeltaTransform.INSTANCE.validateStoredType(DataType.FLOAT, "col")); + } + + @Test + public void testXorSupportedTypes() { + assertTrue(XorTransform.INSTANCE.supportedTypes().contains(DataType.FLOAT)); + assertTrue(XorTransform.INSTANCE.supportedTypes().contains(DataType.DOUBLE)); + assertFalse(XorTransform.INSTANCE.supportedTypes().contains(DataType.INT)); + assertFalse(XorTransform.INSTANCE.supportedTypes().contains(DataType.LONG)); + XorTransform.INSTANCE.validateStoredType(DataType.FLOAT, "col"); + XorTransform.INSTANCE.validateStoredType(DataType.DOUBLE, "col"); + expectThrows(IllegalArgumentException.class, + () -> XorTransform.INSTANCE.validateStoredType(DataType.INT, "col")); + expectThrows(IllegalArgumentException.class, + () -> XorTransform.INSTANCE.validateStoredType(DataType.LONG, "col")); + expectThrows(IllegalArgumentException.class, + () -> 
XorTransform.INSTANCE.validateStoredType(DataType.STRING, "col")); + } + + // ===== ChunkCodec enum tests ===== + + @Test + public void testChunkCodecFromValue() { + assertEquals(ChunkCodec.fromValue(0), ChunkCodec.PASS_THROUGH); + assertEquals(ChunkCodec.fromValue(2), ChunkCodec.ZSTANDARD); + assertEquals(ChunkCodec.fromValue(6), ChunkCodec.DELTA_LZ4); + assertEquals(ChunkCodec.fromValue(7), ChunkCodec.DOUBLE_DELTA_LZ4); + assertEquals(ChunkCodec.fromValue(100), ChunkCodec.DELTA); + assertEquals(ChunkCodec.fromValue(101), ChunkCodec.DOUBLE_DELTA); + } + + @Test + public void testChunkCodecKind() { + assertTrue(ChunkCodec.ZSTANDARD.isCompressor()); + assertFalse(ChunkCodec.ZSTANDARD.isTransform()); + assertTrue(ChunkCodec.DELTA.isTransform()); + assertFalse(ChunkCodec.DELTA.isCompressor()); + // Legacy compound codecs are compressors and internal-only + assertTrue(ChunkCodec.DELTA_LZ4.isCompressor()); + assertFalse(ChunkCodec.DELTA_LZ4.isTransform()); + assertTrue(ChunkCodec.DELTA_LZ4.isInternalOnly()); + assertTrue(ChunkCodec.DOUBLE_DELTA_LZ4.isCompressor()); + assertFalse(ChunkCodec.DOUBLE_DELTA_LZ4.isTransform()); + assertTrue(ChunkCodec.DOUBLE_DELTA_LZ4.isInternalOnly()); + // User-facing codecs are not internal-only + assertFalse(ChunkCodec.LZ4.isInternalOnly()); + assertFalse(ChunkCodec.DELTA.isInternalOnly()); + assertFalse(ChunkCodec.DOUBLE_DELTA.isInternalOnly()); + } + + @Test + public void testChunkCodecInvalidValue() { + expectThrows(IllegalArgumentException.class, () -> ChunkCodec.fromValue(999)); + } + + // ===== fromCompressionType tests for legacy compound codecs ===== + + @Test + public void testFromCompressionTypeDelta() { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromCompressionType(ChunkCompressionType.DELTA); + assertEquals(pipeline.size(), 1); + assertEquals(pipeline.get(0), ChunkCodec.DELTA_LZ4); + assertTrue(pipeline.get(0).isCompressor()); + assertFalse(pipeline.hasTransforms()); + assertEquals(pipeline.getChunkCompressionType(), ChunkCompressionType.DELTA); + } + + @Test + public void testFromCompressionTypeDeltaDelta() { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromCompressionType(ChunkCompressionType.DELTADELTA); + assertEquals(pipeline.size(), 1); + assertEquals(pipeline.get(0), ChunkCodec.DOUBLE_DELTA_LZ4); + assertTrue(pipeline.get(0).isCompressor()); + assertFalse(pipeline.hasTransforms()); + assertEquals(pipeline.getChunkCompressionType(), ChunkCompressionType.DELTADELTA); + } + + @Test + public void testFromCompressionTypeLz4() { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromCompressionType(ChunkCompressionType.LZ4); + assertEquals(pipeline.size(), 1); + assertEquals(pipeline.get(0), ChunkCodec.LZ4); + assertFalse(pipeline.hasTransforms()); + } + + @Test + public void testLegacyDeltaNotEqualToPipelineDelta() { + // Legacy compound [DELTA_LZ4] is NOT the same as pipeline [DELTA, LZ4] + ChunkCodecPipeline legacyDelta = ChunkCodecPipeline.fromCompressionType(ChunkCompressionType.DELTA); + ChunkCodecPipeline pipelineDelta = new ChunkCodecPipeline(Arrays.asList(ChunkCodec.DELTA, ChunkCodec.LZ4)); + assertFalse(legacyDelta.equals(pipelineDelta)); + } + + // ===== Triple-delta (DELTA → DELTA → DELTA) stacking tests ===== + + @Test + public void testTripleDeltaInts() { + // Stacking 3 delta transforms: should round-trip correctly + int[] values = new int[100]; + for (int i = 0; i < values.length; i++) { + values[i] = 1000 + i * 7; + } + assertChainedTransformRoundTripInts( + new ChunkTransform[]{DeltaTransform.INSTANCE, 
DeltaTransform.INSTANCE, DeltaTransform.INSTANCE}, values); + } + + @Test + public void testTripleDeltaLongs() { + long[] values = new long[100]; + for (int i = 0; i < values.length; i++) { + values[i] = 50000L + i * 13L; + } + assertChainedTransformRoundTripLongs( + new ChunkTransform[]{DeltaTransform.INSTANCE, DeltaTransform.INSTANCE, DeltaTransform.INSTANCE}, values); + } + + @Test + public void testTripleDeltaRandomInts() { + Random random = new Random(77777); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertChainedTransformRoundTripInts( + new ChunkTransform[]{DeltaTransform.INSTANCE, DeltaTransform.INSTANCE, DeltaTransform.INSTANCE}, values); + } + + @Test + public void testDeltaDoubleDeltaDeltaMixInts() { + // Mixed: DELTA → DOUBLE_DELTA → DELTA + int[] values = new int[200]; + for (int i = 0; i < values.length; i++) { + values[i] = 1000 + i * 3 + (i % 5); + } + assertChainedTransformRoundTripInts( + new ChunkTransform[]{DeltaTransform.INSTANCE, DoubleDeltaTransform.INSTANCE, DeltaTransform.INSTANCE}, values); + } + + @Test + public void testDoubleDeltaDoubleDeltaInts() { + // DOUBLE_DELTA → DOUBLE_DELTA (stacked) + int[] values = new int[200]; + for (int i = 0; i < values.length; i++) { + values[i] = 5000 + i * i; + } + assertChainedTransformRoundTripInts( + new ChunkTransform[]{DoubleDeltaTransform.INSTANCE, DoubleDeltaTransform.INSTANCE}, values); + } + + // ===== Pipeline compressor/decompressor: triple-delta + compression round-trip ===== + + @Test + public void testPipelineTripleDeltaZstdRoundTripInts() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames( + Arrays.asList("DELTA", "DELTA", "DELTA", "ZSTANDARD")); + int[] original = new int[200]; + for (int i = 0; i < original.length; i++) { + original[i] = 1000 + i * 7; + } + assertPipelineRoundTripInts(pipeline, original); + } + + @Test + public void testPipelineTripleDeltaLz4RoundTripLongs() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames( + Arrays.asList("DELTA", "DELTA", "DELTA", "LZ4")); + long[] original = new long[200]; + for (int i = 0; i < original.length; i++) { + original[i] = 50000L + i * 13L; + } + assertPipelineRoundTripLongs(pipeline, original); + } + + @Test + public void testPipelineTripleDeltaRandomInts() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames( + Arrays.asList("DELTA", "DELTA", "DELTA", "ZSTANDARD")); + Random random = new Random(88888); + int[] original = new int[1000]; + for (int i = 0; i < original.length; i++) { + original[i] = random.nextInt(); + } + assertPipelineRoundTripInts(pipeline, original); + } + + @Test + public void testPipelineDeltaDoubleDeltaDeltaSnappyRoundTripInts() + throws IOException { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames( + Arrays.asList("DELTA", "DOUBLE_DELTA", "DELTA", "SNAPPY")); + int[] original = new int[300]; + for (int i = 0; i < original.length; i++) { + original[i] = 1000 + i * 3 + (i % 5); + } + assertPipelineRoundTripInts(pipeline, original); + } + + // ===== End-to-end: write V7 file with pipeline, read back via reader ===== + + private static final int E2E_NUM_VALUES = 10009; + private static final int E2E_NUM_DOCS_PER_CHUNK = 5003; + private static final String E2E_TEST_FILE = + System.getProperty("java.io.tmpdir") + File.separator + "CodecPipelineE2E"; + + @Test + public void testE2eDeltaZstdInts() + throws Exception { + assertE2eWriteReadInts( + 
ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")), + generateMonotonicInts(E2E_NUM_VALUES, 1000, 7)); + } + + @Test + public void testE2eDoubleDeltaLz4Ints() + throws Exception { + assertE2eWriteReadInts( + ChunkCodecPipeline.fromNames(Arrays.asList("DOUBLE_DELTA", "LZ4")), + generateMonotonicInts(E2E_NUM_VALUES, 5000, 10)); + } + + @Test + public void testE2eTripleDeltaZstdInts() + throws Exception { + assertE2eWriteReadInts( + ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "DELTA", "DELTA", "ZSTANDARD")), + generateMonotonicInts(E2E_NUM_VALUES, 0, 3)); + } + + @Test + public void testE2eDeltaDoubleDeltaSnappyInts() + throws Exception { + assertE2eWriteReadInts( + ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "DOUBLE_DELTA", "SNAPPY")), + generateMonotonicInts(E2E_NUM_VALUES, 100, 5)); + } + + @Test + public void testE2eDeltaZstdLongs() + throws Exception { + assertE2eWriteReadLongs( + ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")), + generateMonotonicLongs(E2E_NUM_VALUES, 1000000L, 60000L)); + } + + @Test + public void testE2eTripleDeltaLz4Longs() + throws Exception { + assertE2eWriteReadLongs( + ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "DELTA", "DELTA", "LZ4")), + generateMonotonicLongs(E2E_NUM_VALUES, 0L, 100L)); + } + + @Test + public void testE2eRandomIntsWithPipeline() + throws Exception { + Random random = new Random(54321); + int[] values = new int[E2E_NUM_VALUES]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertE2eWriteReadInts( + ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")), values); + } + + @Test + public void testE2eRandomLongsWithPipeline() + throws Exception { + Random random = new Random(54321); + long[] values = new long[E2E_NUM_VALUES]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertE2eWriteReadLongs( + ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "LZ4")), values); + } + + @Test + public void testE2ePipelineMetadata() + throws Exception { + ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames( + Arrays.asList("DELTA", "DOUBLE_DELTA", "ZSTANDARD")); + int[] values = generateMonotonicInts(E2E_NUM_VALUES, 0, 1); + + File outFile = new File(E2E_TEST_FILE + "_metadata"); + FileUtils.deleteQuietly(outFile); + try { + try (FixedByteChunkForwardIndexWriter writer = new FixedByteChunkForwardIndexWriter( + outFile, ChunkCompressionType.ZSTANDARD, pipeline, E2E_NUM_VALUES, + E2E_NUM_DOCS_PER_CHUNK, Integer.BYTES, 7)) { + for (int v : values) { + writer.putInt(v); + } + } + try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(outFile); + FixedBytePower2ChunkSVForwardIndexReader reader = + new FixedBytePower2ChunkSVForwardIndexReader(buffer, DataType.INT)) { + // Verify the codec pipeline round-trips through the header + ChunkCodecPipeline readPipeline = reader.getCodecPipeline(); + assertEquals(readPipeline, pipeline); + assertEquals(readPipeline.getStages(), + Arrays.asList(ChunkCodec.DELTA, ChunkCodec.DOUBLE_DELTA, ChunkCodec.ZSTANDARD)); + assertTrue(readPipeline.hasTransforms()); + assertEquals(readPipeline.getCompressor(), ChunkCodec.ZSTANDARD); + } + } finally { + FileUtils.deleteQuietly(outFile); + } + } + + // ===== Helpers ===== + + private void assertRoundTripInts(ChunkTransform transform, int[] original) { + int numBytes = original.length * Integer.BYTES; + ByteBuffer buffer = ByteBuffer.allocate(Math.max(numBytes, 1)); + for (int v : original) { + buffer.putInt(v); + } + 
buffer.flip(); + + transform.encode(buffer, numBytes, Integer.BYTES); + transform.decode(buffer, numBytes, Integer.BYTES); + + buffer.position(0); + for (int i = 0; i < original.length; i++) { + assertEquals(buffer.getInt(), original[i], "Mismatch at index " + i); + } + } + + private void assertRoundTripLongs(ChunkTransform transform, long[] original) { + int numBytes = original.length * Long.BYTES; + ByteBuffer buffer = ByteBuffer.allocate(Math.max(numBytes, 1)); + for (long v : original) { + buffer.putLong(v); + } + buffer.flip(); + + transform.encode(buffer, numBytes, Long.BYTES); + transform.decode(buffer, numBytes, Long.BYTES); + + buffer.position(0); + for (int i = 0; i < original.length; i++) { + assertEquals(buffer.getLong(), original[i], "Mismatch at index " + i); + } + } + + private void assertPipelineRoundTripInts(ChunkCodecPipeline pipeline, int[] original) + throws IOException { + int numBytes = original.length * Integer.BYTES; + int valueSizeInBytes = Integer.BYTES; + + ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(pipeline, valueSizeInBytes); + ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(pipeline, valueSizeInBytes); + + // Fill input buffer + ByteBuffer input = ByteBuffer.allocateDirect(numBytes); + for (int v : original) { + input.putInt(v); + } + input.flip(); + + // Compress + ByteBuffer compressed = ByteBuffer.allocateDirect(compressor.maxCompressedSize(numBytes)); + compressor.compress(input, compressed); + + // Decompress + ByteBuffer decompressed = ByteBuffer.allocateDirect(numBytes); + decompressor.decompress(compressed, decompressed); + + // Verify — decompressor already flips the output buffer per Pinot convention + for (int i = 0; i < original.length; i++) { + assertEquals(decompressed.getInt(), original[i], "Mismatch at index " + i); + } + } + + private void assertPipelineRoundTripLongs(ChunkCodecPipeline pipeline, long[] original) + throws IOException { + int numBytes = original.length * Long.BYTES; + int valueSizeInBytes = Long.BYTES; + + ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(pipeline, valueSizeInBytes); + ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(pipeline, valueSizeInBytes); + + // Fill input buffer + ByteBuffer input = ByteBuffer.allocateDirect(numBytes); + for (long v : original) { + input.putLong(v); + } + input.flip(); + + // Compress + ByteBuffer compressed = ByteBuffer.allocateDirect(compressor.maxCompressedSize(numBytes)); + compressor.compress(input, compressed); + + // Decompress + ByteBuffer decompressed = ByteBuffer.allocateDirect(numBytes); + decompressor.decompress(compressed, decompressed); + + // Verify — decompressor already flips the output buffer per Pinot convention + for (int i = 0; i < original.length; i++) { + assertEquals(decompressed.getLong(), original[i], "Mismatch at index " + i); + } + } + + private void assertPipelineRoundTripFloats(ChunkCodecPipeline pipeline, float[] original) + throws IOException { + int numBytes = original.length * Float.BYTES; + int valueSizeInBytes = Float.BYTES; + + ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(pipeline, valueSizeInBytes); + ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(pipeline, valueSizeInBytes); + + ByteBuffer input = ByteBuffer.allocateDirect(numBytes); + for (float v : original) { + input.putFloat(v); + } + input.flip(); + + ByteBuffer compressed = ByteBuffer.allocateDirect(compressor.maxCompressedSize(numBytes)); + 
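+ // maxCompressedSize(numBytes) is an upper bound; compress(...) may write fewer bytes.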
compressor.compress(input, compressed); + + ByteBuffer decompressed = ByteBuffer.allocateDirect(numBytes); + decompressor.decompress(compressed, decompressed); + + for (int i = 0; i < original.length; i++) { + assertEquals(decompressed.getFloat(), original[i], "Mismatch at index " + i); + } + } + + private void assertPipelineRoundTripDoubles(ChunkCodecPipeline pipeline, double[] original) + throws IOException { + int numBytes = original.length * Double.BYTES; + int valueSizeInBytes = Double.BYTES; + + ChunkCompressor compressor = ChunkCompressorFactory.getCompressor(pipeline, valueSizeInBytes); + ChunkDecompressor decompressor = ChunkCompressorFactory.getDecompressor(pipeline, valueSizeInBytes); + + ByteBuffer input = ByteBuffer.allocateDirect(numBytes); + for (double v : original) { + input.putDouble(v); + } + input.flip(); + + ByteBuffer compressed = ByteBuffer.allocateDirect(compressor.maxCompressedSize(numBytes)); + compressor.compress(input, compressed); + + ByteBuffer decompressed = ByteBuffer.allocateDirect(numBytes); + decompressor.decompress(compressed, decompressed); + + for (int i = 0; i < original.length; i++) { + assertEquals(decompressed.getDouble(), original[i], "Mismatch at index " + i); + } + } + + private void assertChainedTransformRoundTripInts(ChunkTransform[] transforms, int[] original) { + int numBytes = original.length * Integer.BYTES; + ByteBuffer buffer = ByteBuffer.allocate(Math.max(numBytes, 1)); + for (int v : original) { + buffer.putInt(v); + } + buffer.flip(); + + // Encode: apply transforms left-to-right + for (ChunkTransform t : transforms) { + t.encode(buffer, numBytes, Integer.BYTES); + } + // Decode: apply transforms right-to-left + for (int i = transforms.length - 1; i >= 0; i--) { + transforms[i].decode(buffer, numBytes, Integer.BYTES); + } + + buffer.position(0); + for (int i = 0; i < original.length; i++) { + assertEquals(buffer.getInt(), original[i], "Mismatch at index " + i); + } + } + + private void assertChainedTransformRoundTripLongs(ChunkTransform[] transforms, long[] original) { + int numBytes = original.length * Long.BYTES; + ByteBuffer buffer = ByteBuffer.allocate(Math.max(numBytes, 1)); + for (long v : original) { + buffer.putLong(v); + } + buffer.flip(); + + for (ChunkTransform t : transforms) { + t.encode(buffer, numBytes, Long.BYTES); + } + for (int i = transforms.length - 1; i >= 0; i--) { + transforms[i].decode(buffer, numBytes, Long.BYTES); + } + + buffer.position(0); + for (int i = 0; i < original.length; i++) { + assertEquals(buffer.getLong(), original[i], "Mismatch at index " + i); + } + } + + private void assertE2eWriteReadInts(ChunkCodecPipeline pipeline, int[] expected) + throws Exception { + File outFile = new File(E2E_TEST_FILE + "_int"); + FileUtils.deleteQuietly(outFile); + try { + // Write using version 7 writer with codec pipeline + try (FixedByteChunkForwardIndexWriter writer = new FixedByteChunkForwardIndexWriter( + outFile, pipeline.getChunkCompressionType(), pipeline, E2E_NUM_VALUES, + E2E_NUM_DOCS_PER_CHUNK, Integer.BYTES, 7)) { + for (int v : expected) { + writer.putInt(v); + } + } + + // Read back via the standard reader and verify every value + try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(outFile); + ForwardIndexReader reader = + new FixedBytePower2ChunkSVForwardIndexReader(buffer, DataType.INT); + ChunkReaderContext context = reader.createContext()) { + for (int i = 0; i < expected.length; i++) { + assertEquals(reader.getInt(i, context), expected[i], "Mismatch at docId " + i); + } + } + } 
finally { + FileUtils.deleteQuietly(outFile); + } + } + + private void assertE2eWriteReadLongs(ChunkCodecPipeline pipeline, long[] expected) + throws Exception { + File outFile = new File(E2E_TEST_FILE + "_long"); + FileUtils.deleteQuietly(outFile); + try { + try (FixedByteChunkForwardIndexWriter writer = new FixedByteChunkForwardIndexWriter( + outFile, pipeline.getChunkCompressionType(), pipeline, E2E_NUM_VALUES, + E2E_NUM_DOCS_PER_CHUNK, Long.BYTES, 7)) { + for (long v : expected) { + writer.putLong(v); + } + } + + try (PinotDataBuffer buffer = PinotDataBuffer.mapReadOnlyBigEndianFile(outFile); + ForwardIndexReader reader = + new FixedBytePower2ChunkSVForwardIndexReader(buffer, DataType.LONG); + ChunkReaderContext context = reader.createContext()) { + for (int i = 0; i < expected.length; i++) { + assertEquals(reader.getLong(i, context), expected[i], "Mismatch at docId " + i); + } + } + } finally { + FileUtils.deleteQuietly(outFile); + } + } + + private static int[] generateMonotonicInts(int count, int start, int step) { + int[] values = new int[count]; + for (int i = 0; i < count; i++) { + values[i] = start + i * step; + } + return values; + } + + private static long[] generateMonotonicLongs(int count, long start, long step) { + long[] values = new long[count]; + for (int i = 0; i < count; i++) { + values[i] = start + i * step; + } + return values; + } +} diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/DeltaCompressionTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/DeltaCompressionTest.java similarity index 99% rename from pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/DeltaCompressionTest.java rename to pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/DeltaCompressionTest.java index b5bd8eb5ab81..5722a45ca2f3 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/DeltaCompressionTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/DeltaCompressionTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaCompressionTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaCompressionTest.java similarity index 99% rename from pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaCompressionTest.java rename to pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaCompressionTest.java index 5411d5619649..68d12a597a7d 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/DeltaDeltaCompressionTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/DeltaDeltaCompressionTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/TestCompression.java similarity index 99% rename from pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java rename to pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/TestCompression.java index 4199da004281..d111bd9c1998 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/TestCompression.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/codec/compression/TestCompression.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.segment.local.io.compression; +package org.apache.pinot.segment.local.io.codec.compression; import java.io.IOException; import java.nio.ByteBuffer; diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java index f8dc1d01d1f2..dbffd6f1acd7 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/loader/ForwardIndexHandlerTest.java @@ -45,6 +45,7 @@ import org.apache.pinot.segment.local.segment.store.SegmentLocalFSDirectory; import org.apache.pinot.segment.spi.ColumnMetadata; import org.apache.pinot.segment.spi.V1Constants; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.DictIdCompressionType; import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig; @@ -594,6 +595,21 @@ public void testComputeOperationNoOp() } } + @Test + public void testShouldChangeRawCompressionTypeNoOpForClpCodecs() { + ChunkCodecPipeline lz4Pipeline = ChunkCodecPipeline.fromCompressionType(ChunkCompressionType.LZ4); + List clpCodecs = + List.of(CompressionCodec.CLP, CompressionCodec.CLPV2, CompressionCodec.CLPV2_ZSTD, + CompressionCodec.CLPV2_LZ4); + for (CompressionCodec codec : clpCodecs) { + ForwardIndexConfig newConfig = new ForwardIndexConfig.Builder().withCompressionCodec(codec).build(); + assertFalse( + ForwardIndexHandler.shouldChangeRawCompressionType(ChunkCompressionType.PASS_THROUGH, null, newConfig)); + assertFalse( + ForwardIndexHandler.shouldChangeRawCompressionType(ChunkCompressionType.LZ4, lz4Pipeline, newConfig)); + } + } + private IndexLoadingConfig createIndexLoadingConfig() { return new IndexLoadingConfig(createTableConfig(), SCHEMA); } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java index 22a4fd04be90..a495b44cb062 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java @@ -1195,6 +1195,82 @@ 
public void testValidateFieldConfig() { "Compression codec DELTADELTA can only be used on single-value columns, found multi-value column: myCol2"); } + // Validate codecPipeline with transforms on wrong data type (STRING) + try { + FieldConfig fieldConfig = + new FieldConfig("myCol1", FieldConfig.EncodingType.RAW, null, null, null, null, null, + Arrays.asList("DELTA", "ZSTANDARD"), null, null); + tableConfig.setFieldConfigList(Arrays.asList(fieldConfig)); + TableConfigUtils.validate(tableConfig, schema); + fail("Should fail for codecPipeline with DELTA transform on STRING column"); + } catch (Exception e) { + assertTrue(e.getMessage().contains("does not support stored type STRING"), + "Expected per-transform type validation error, got: " + e.getMessage()); + } + + // Validate codecPipeline with transforms on MV column + try { + FieldConfig fieldConfig = + new FieldConfig("myCol2", FieldConfig.EncodingType.RAW, null, null, null, null, null, + Arrays.asList("DELTA", "LZ4"), null, null); + tableConfig.setFieldConfigList(Arrays.asList(fieldConfig)); + TableConfigUtils.validate(tableConfig, schema); + fail("Should fail for codecPipeline with DELTA transform on MV column"); + } catch (Exception e) { + assertEquals(e.getMessage(), + "codecPipeline with transforms can only be used on single-value columns, " + + "found multi-value column: myCol2"); + } + + // Validate XOR transform on INT column fails (XOR only supports FLOAT/DOUBLE) + try { + FieldConfig fieldConfig = + new FieldConfig("intCol", FieldConfig.EncodingType.RAW, null, null, null, null, null, + Arrays.asList("XOR", "ZSTANDARD"), null, null); + tableConfig.setFieldConfigList(Arrays.asList(fieldConfig)); + TableConfigUtils.validate(tableConfig, schema); + fail("Should fail for codecPipeline with XOR transform on INT column"); + } catch (Exception e) { + assertTrue(e.getMessage().contains("does not support stored type INT"), + "Expected XOR type validation error, got: " + e.getMessage()); + } + + // Validate codecPipeline with transforms requires an explicit V7 writer version + try { + FieldConfig fieldConfig = + new FieldConfig("intCol", FieldConfig.EncodingType.RAW, null, null, null, null, null, + Arrays.asList("DELTA", "ZSTANDARD"), null, null); + tableConfig.setFieldConfigList(Arrays.asList(fieldConfig)); + TableConfigUtils.validate(tableConfig, schema); + fail("Should fail for codecPipeline with transforms when rawIndexWriterVersion is not explicitly set to 7"); + } catch (Exception e) { + assertTrue(e.getMessage().contains("rawIndexWriterVersion=7"), + "Expected writer version validation error, got: " + e.getMessage()); + } + + // Validate codecPipeline with transforms on valid SV INT column passes with explicit V7 opt-in + try { + Map properties = Collections.singletonMap(FieldConfig.RAW_INDEX_WRITER_VERSION, "7"); + FieldConfig fieldConfig = + new FieldConfig("intCol", FieldConfig.EncodingType.RAW, null, null, null, null, null, + Arrays.asList("DELTA", "ZSTANDARD"), properties, null); + tableConfig.setFieldConfigList(Arrays.asList(fieldConfig)); + TableConfigUtils.validate(tableConfig, schema); + } catch (Exception e) { + fail("codecPipeline with DELTA transform on SV INT column should pass", e); + } + + // Validate compression-only pipeline on STRING column passes (no transforms) + try { + FieldConfig fieldConfig = + new FieldConfig("myCol1", FieldConfig.EncodingType.RAW, null, null, null, null, null, + Arrays.asList("ZSTANDARD"), null, null); + tableConfig.setFieldConfigList(Arrays.asList(fieldConfig)); + 
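+ // With no transform stages, the single-value / stored-type / writer-version restrictions above do not apply.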
TableConfigUtils.validate(tableConfig, schema); + } catch (Exception e) { + fail("Compression-only codecPipeline on STRING column should pass", e); + } + try { FieldConfig fieldConfig = new FieldConfig("myCol1", FieldConfig.EncodingType.DICTIONARY, FieldConfig.IndexType.FST, null, null); diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkCodec.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkCodec.java new file mode 100644 index 000000000000..fa7121653da4 --- /dev/null +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkCodec.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.codec; + +/** + * Unified codec enum for the forward index codec pipeline. Each codec is classified as either a + * {@link CodecKind#COMPRESSOR} (byte-level, size-changing) or a {@link CodecKind#TRANSFORM} + * (typed, in-place, size-preserving). + * + *
<p>A codec pipeline is an ordered list of {@code ChunkCodec} stages. Transforms are applied + * left-to-right before a terminal compressor on write; the reverse order is used on read. + * + * <p>On-disk, each codec is stored as its {@link #getValue()} int. Compressor values (0–99) + * align with {@link ChunkCompressionType} for backward compatibility. Transform values start + * at 100 to leave room for future compressors.
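+ * + * <p>Illustrative value round-trip, using only the API declared below: + * <pre>{@code + * ChunkCodec delta = ChunkCodec.fromValue(100); // DELTA + * assert delta.isTransform(); + * assert ChunkCodec.fromValue(delta.getValue()) == delta; // on-disk value, not ordinal + * }</pre>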
      + */ +public enum ChunkCodec { + + // --- Compressors (byte-level, size-changing) --- + // Values 0-99 reserved for compressors; aligned with ChunkCompressionType ordinals. + PASS_THROUGH(0, CodecKind.COMPRESSOR, false), + SNAPPY(1, CodecKind.COMPRESSOR, false), + ZSTANDARD(2, CodecKind.COMPRESSOR, false), + LZ4(3, CodecKind.COMPRESSOR, false), + GZIP(5, CodecKind.COMPRESSOR, false), + + // Legacy compound codecs that bundle delta transform + LZ4 in a single compressor/decompressor. + // These use a custom on-disk format (type flag byte, count metadata, first value, LZ4-compressed deltas) + // and are NOT byte-compatible with the pipeline approach (DELTA transform + LZ4 compressor). + // Values aligned with ChunkCompressionType.DELTA(6) and ChunkCompressionType.DELTADELTA(7). + // Internal-only: users must use ["DELTA", "LZ4"] or ["DOUBLE_DELTA", "LZ4"] instead. + DELTA_LZ4(6, CodecKind.COMPRESSOR, true), + DOUBLE_DELTA_LZ4(7, CodecKind.COMPRESSOR, true), + + // --- Transforms (typed, in-place, size-preserving) --- + // Values 100+ reserved for transforms. + DELTA(100, CodecKind.TRANSFORM, false), + DOUBLE_DELTA(101, CodecKind.TRANSFORM, false), + XOR(102, CodecKind.TRANSFORM, false); + + /** Classification of a codec stage. */ + public enum CodecKind { + /** Byte-level compression that may change the data size. */ + COMPRESSOR, + /** Numeric transform that operates in-place without changing data size. */ + TRANSFORM + } + + private static final ChunkCodec[] BY_VALUE; + + static { + int maxValue = 0; + for (ChunkCodec codec : values()) { + maxValue = Math.max(maxValue, codec._value); + } + BY_VALUE = new ChunkCodec[maxValue + 1]; + for (ChunkCodec codec : values()) { + BY_VALUE[codec._value] = codec; + } + } + + private final int _value; + private final CodecKind _kind; + private final boolean _internalOnly; + + ChunkCodec(int value, CodecKind kind, boolean internalOnly) { + _value = value; + _kind = kind; + _internalOnly = internalOnly; + } + + /** Returns the on-disk integer identifier for this codec. */ + public int getValue() { + return _value; + } + + /** Returns whether this codec is a compressor or a transform. */ + public CodecKind getKind() { + return _kind; + } + + /** Returns {@code true} if this is a byte-level compressor. */ + public boolean isCompressor() { + return _kind == CodecKind.COMPRESSOR; + } + + /** Returns {@code true} if this is a numeric transform. */ + public boolean isTransform() { + return _kind == CodecKind.TRANSFORM; + } + + /** + * Returns {@code true} if this codec is internal-only and cannot be used in user-provided + * codec pipelines. Internal codecs are used for backward-compatible auto-derivation from + * legacy {@code compressionCodec} settings. + */ + public boolean isInternalOnly() { + return _internalOnly; + } + + /** + * Validates that this codec is allowed in a user-provided pipeline. + * + * @param userProvidedName the name as provided by the user (for error messages) + * @throws IllegalArgumentException if this codec is internal-only + */ + public void validateUserFacing(String userProvidedName) { + if (_internalOnly) { + throw new IllegalArgumentException( + "'" + userProvidedName + "' is an internal-only legacy codec and cannot be used in codecPipeline. " + + "Use [\"DELTA\", \"LZ4\"] or [\"DOUBLE_DELTA\", \"LZ4\"] instead."); + } + } + + /** + * Look up a {@code ChunkCodec} by its on-disk int value (not ordinal). 
+ * + * @throws IllegalArgumentException if the value does not map to a known codec + */ + public static ChunkCodec fromValue(int value) { + if (value < 0 || value >= BY_VALUE.length || BY_VALUE[value] == null) { + throw new IllegalArgumentException("Invalid ChunkCodec value: " + value); + } + return BY_VALUE[value]; + } +} diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkCodecPipeline.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkCodecPipeline.java new file mode 100644 index 000000000000..45d79f071503 --- /dev/null +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkCodecPipeline.java @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.codec; + +import com.google.common.base.Preconditions; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.pinot.segment.spi.compression.ChunkCompressionType; + + +/** + * An immutable, ordered sequence of {@link ChunkCodec} stages that defines how chunk data is + * encoded before being written to disk and decoded after being read back. + * + *
<p>Invariants enforced at construction time: + * <ul> + *   <li>The pipeline is non-empty.</li> + *   <li>At most one {@link ChunkCodec.CodecKind#COMPRESSOR COMPRESSOR} stage is present, + *       and it must be the last stage.</li> + *   <li>All preceding stages (if any) must be {@link ChunkCodec.CodecKind#TRANSFORM TRANSFORM}.</li> + *   <li>Pipeline length is at most {@value #MAX_PIPELINE_LENGTH}.</li> + * </ul> + * + * <p>On write the stages are applied left-to-right (transforms first, then compression). + * On read the stages are applied right-to-left (decompress first, then reverse transforms).
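+ * + * <p>Illustrative construction and inspection, using the factory and accessors declared below: + * <pre>{@code + * ChunkCodecPipeline pipeline = ChunkCodecPipeline.fromNames(Arrays.asList("DELTA", "ZSTANDARD")); + * pipeline.getTransforms(); // [DELTA] + * pipeline.getCompressor(); // ZSTANDARD + * pipeline.getChunkCompressionType(); // ChunkCompressionType.ZSTANDARD + * }</pre>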
      + */ +public final class ChunkCodecPipeline { + + /** Maximum number of codec stages in a single pipeline. */ + public static final int MAX_PIPELINE_LENGTH = 8; + + /** Default pipeline: a single ZSTANDARD compressor. */ + public static final ChunkCodecPipeline DEFAULT = new ChunkCodecPipeline( + Collections.singletonList(ChunkCodec.ZSTANDARD)); + + private final List _stages; + + /** + * Creates a pipeline from an ordered list of codec stages. + * + * @throws IllegalArgumentException if invariants are violated + */ + public ChunkCodecPipeline(List stages) { + Preconditions.checkArgument(stages != null && !stages.isEmpty(), "Pipeline must have at least one stage"); + Preconditions.checkArgument(stages.size() <= MAX_PIPELINE_LENGTH, + "Pipeline length %s exceeds maximum of %s", stages.size(), MAX_PIPELINE_LENGTH); + + // Validate: all transforms first, at most one compressor at the end + boolean seenCompressor = false; + for (int i = 0; i < stages.size(); i++) { + ChunkCodec codec = stages.get(i); + Preconditions.checkArgument(codec != null, "Pipeline stage at index %s is null", i); + if (codec.isCompressor()) { + Preconditions.checkArgument(!seenCompressor, "Pipeline contains more than one compressor: %s", stages); + Preconditions.checkArgument(i == stages.size() - 1, + "Compressor %s must be the last stage in the pipeline, but found at index %s of %s", + codec, i, stages.size()); + seenCompressor = true; + } + } + + _stages = Collections.unmodifiableList(new ArrayList<>(stages)); + } + + /** Returns the ordered, immutable list of codec stages. */ + public List getStages() { + return _stages; + } + + /** Returns the number of stages in the pipeline. */ + public int size() { + return _stages.size(); + } + + /** Returns the codec at the given index. */ + public ChunkCodec get(int index) { + return _stages.get(index); + } + + /** + * Returns the terminal compressor, or {@link ChunkCodec#PASS_THROUGH} if the pipeline + * contains only transforms. + */ + public ChunkCodec getCompressor() { + ChunkCodec last = _stages.get(_stages.size() - 1); + return last.isCompressor() ? last : ChunkCodec.PASS_THROUGH; + } + + /** Returns the transform stages (all stages except the terminal compressor). */ + public List getTransforms() { + ChunkCodec last = _stages.get(_stages.size() - 1); + if (last.isCompressor()) { + return _stages.subList(0, _stages.size() - 1); + } + return _stages; + } + + /** Returns {@code true} if the pipeline contains any transform stages. */ + public boolean hasTransforms() { + return !getTransforms().isEmpty(); + } + + /** + * Maps the terminal compressor back to a {@link ChunkCompressionType} for backward + * compatibility with readers/writers that still use the legacy enum. + */ + public ChunkCompressionType getChunkCompressionType() { + ChunkCodec compressor = getCompressor(); + switch (compressor) { + case PASS_THROUGH: + return ChunkCompressionType.PASS_THROUGH; + case SNAPPY: + return ChunkCompressionType.SNAPPY; + case ZSTANDARD: + return ChunkCompressionType.ZSTANDARD; + case LZ4: + return ChunkCompressionType.LZ4; + case GZIP: + return ChunkCompressionType.GZIP; + case DELTA_LZ4: + return ChunkCompressionType.DELTA; + case DOUBLE_DELTA_LZ4: + return ChunkCompressionType.DELTADELTA; + default: + throw new IllegalStateException("No ChunkCompressionType mapping for: " + compressor); + } + } + + /** + * Creates a pipeline from codec names (e.g., {@code ["DELTA", "ZSTANDARD"]}). + * Internal-only codecs (see {@link ChunkCodec#isInternalOnly()}) are rejected. 
+ * + * @throws IllegalArgumentException if names contain internal-only codecs + */ + public static ChunkCodecPipeline fromNames(List names) { + Preconditions.checkArgument(names != null && !names.isEmpty(), "Pipeline names must be non-empty"); + List stages = new ArrayList<>(names.size()); + for (int i = 0; i < names.size(); i++) { + String name = names.get(i); + Preconditions.checkArgument(name != null, + "Pipeline codec name at index %s must be non-null", i); + String normalized = name.trim(); + Preconditions.checkArgument(!normalized.isEmpty(), + "Pipeline codec name at index %s must be non-blank (value: '%s')", i, name); + ChunkCodec codec = ChunkCodec.valueOf(normalized.toUpperCase()); + codec.validateUserFacing(name); + stages.add(codec); + } + return new ChunkCodecPipeline(stages); + } + + /** + * Creates a pipeline from on-disk int values. + */ + public static ChunkCodecPipeline fromValues(int[] values) { + List stages = new ArrayList<>(values.length); + for (int v : values) { + stages.add(ChunkCodec.fromValue(v)); + } + return new ChunkCodecPipeline(stages); + } + + /** + * Creates a single-stage pipeline from a legacy {@link ChunkCompressionType}. + */ + public static ChunkCodecPipeline fromCompressionType(ChunkCompressionType compressionType) { + switch (compressionType) { + case PASS_THROUGH: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.PASS_THROUGH)); + case SNAPPY: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.SNAPPY)); + case ZSTANDARD: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.ZSTANDARD)); + case LZ4: + case LZ4_LENGTH_PREFIXED: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.LZ4)); + case GZIP: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.GZIP)); + case DELTA: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.DELTA_LZ4)); + case DELTADELTA: + return new ChunkCodecPipeline(Collections.singletonList(ChunkCodec.DOUBLE_DELTA_LZ4)); + default: + throw new IllegalArgumentException("Unsupported compression type: " + compressionType); + } + } + + /** Returns the pipeline as a list of codec names (for JSON serialization). */ + public List toNames() { + return _stages.stream().map(ChunkCodec::name).collect(Collectors.toList()); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ChunkCodecPipeline)) { + return false; + } + return _stages.equals(((ChunkCodecPipeline) o)._stages); + } + + @Override + public int hashCode() { + return Objects.hash(_stages); + } + + @Override + public String toString() { + return _stages.stream().map(ChunkCodec::name).collect(Collectors.joining(" → ", "[", "]")); + } +} diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkTransform.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkTransform.java new file mode 100644 index 000000000000..10c5637b0753 --- /dev/null +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/codec/ChunkTransform.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.codec; + +import java.nio.ByteBuffer; +import java.util.Set; +import org.apache.pinot.spi.data.FieldSpec.DataType; + + +/** + * Interface for a reversible numeric transform that operates in-place on a {@link ByteBuffer}. + * Transforms reduce entropy in typed numeric data before byte-level compression. + * + *
<p>Implementations must be stateless and thread-safe. A single instance is shared across + * all chunks of all segments that use the same transform. + * + * <p>The transform operates on a window of {@code numBytes} starting at {@code buffer.position()}. + * It must not change the buffer's position or limit.
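+ * + * <p>Expected round-trip contract (illustrative): + * <pre>{@code + * transform.encode(buffer, numBytes, Integer.BYTES); // in-place, size-preserving + * transform.decode(buffer, numBytes, Integer.BYTES); // restores the original bytes + * }</pre>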
      + */ +public interface ChunkTransform { + + /** + * Returns the set of stored {@link DataType}s this transform supports. Callers should check + * this before applying the transform to a column. + */ + Set supportedTypes(); + + /** + * Validates that the given stored type is supported by this transform. + * + * @param storedType the stored data type of the column + * @param columnName the column name (for error messages) + * @throws IllegalArgumentException if the type is not supported + */ + default void validateStoredType(DataType storedType, String columnName) { + if (!supportedTypes().contains(storedType)) { + throw new IllegalArgumentException( + String.format("Transform '%s' does not support stored type %s for column '%s'. Supported types: %s", + getClass().getSimpleName(), storedType, columnName, supportedTypes())); + } + } + + /** + * Applies the forward (encoding) transform in-place. + * + * @param buffer the buffer containing typed values to transform + * @param numBytes the number of bytes in the active region (from current position) + * @param valueSizeInBytes size of each typed value: 4 for INT/FLOAT, 8 for LONG/DOUBLE + */ + void encode(ByteBuffer buffer, int numBytes, int valueSizeInBytes); + + /** + * Applies the reverse (decoding) transform in-place, undoing a prior {@link #encode} call. + * + * @param buffer the buffer containing transformed data to restore + * @param numBytes the number of bytes in the active region (from current position) + * @param valueSizeInBytes size of each typed value: 4 for INT/FLOAT, 8 for LONG/DOUBLE + */ + void decode(ByteBuffer buffer, int numBytes, int valueSizeInBytes); +} diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java index a4324a407119..ee1b8015f70a 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/ForwardIndexConfig.java @@ -23,9 +23,11 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Objects; import javax.annotation.Nullable; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.DictIdCompressionType; import org.apache.pinot.spi.config.table.FieldConfig; @@ -74,11 +76,13 @@ public static ForwardIndexConfig getDefault() { } public static ForwardIndexConfig getDisabled() { - return new ForwardIndexConfig(true, null, null, null, null, null, null, null, null); + return new ForwardIndexConfig(true, null, null, null, null, null, null, null, null, null); } @Nullable private final CompressionCodec _compressionCodec; + @Nullable + private final ChunkCodecPipeline _codecPipeline; private final boolean _deriveNumDocsPerChunk; private final int _rawIndexWriterVersion; private final String _targetMaxChunkSize; @@ -96,6 +100,21 @@ public ForwardIndexConfig(@Nullable Boolean disabled, @Nullable CompressionCodec @Nullable Boolean deriveNumDocsPerChunk, @Nullable Integer rawIndexWriterVersion, @Nullable String targetMaxChunkSize, @Nullable Integer targetDocsPerChunk, @Nullable Map configs) { + this(disabled, compressionCodec, null, deriveNumDocsPerChunk, rawIndexWriterVersion, targetMaxChunkSize, + 
targetDocsPerChunk, configs); + } + + /** + * Primary constructor with codec pipeline support. + * + *
<p>When {@code codecPipeline} is null and {@code compressionCodec} is a standard RAW compression codec + * (i.e., not CLP/MV_ENTRY_DICT), the pipeline is automatically derived from the compression codec. This + * ensures all downstream code can operate through a single pipeline-based code path.
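+ * + * <p>For example, {@code compressionCodec = LZ4} with a null pipeline derives {@code [LZ4]}, while the + * legacy {@code DELTA} codec derives the compound single-stage pipeline {@code [DELTA_LZ4]}.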
      + */ + public ForwardIndexConfig(@Nullable Boolean disabled, @Nullable CompressionCodec compressionCodec, + @Nullable ChunkCodecPipeline codecPipeline, @Nullable Boolean deriveNumDocsPerChunk, + @Nullable Integer rawIndexWriterVersion, @Nullable String targetMaxChunkSize, + @Nullable Integer targetDocsPerChunk, @Nullable Map configs) { super(disabled); _compressionCodec = compressionCodec; _deriveNumDocsPerChunk = Boolean.TRUE.equals(deriveNumDocsPerChunk); @@ -151,6 +170,48 @@ public ForwardIndexConfig(@Nullable Boolean disabled, @Nullable CompressionCodec _dictIdCompressionType = null; _chunkCompressionType = null; } + + // Validate that compressionCodec and codecPipeline are not both set. + Preconditions.checkArgument(compressionCodec == null || codecPipeline == null, + "'compressionCodec' and 'codecPipeline' cannot both be set. " + + "Use 'codecPipeline' for new configs; 'compressionCodec' is deprecated."); + + // Auto-derive codec pipeline from compressionCodec when not explicitly set. + // This ensures all RAW forward index code paths operate through a unified pipeline. + // CLP codecs are excluded because they use completely separate forward index creators/readers. + // MV_ENTRY_DICT is excluded because it operates on dictionary-encoded columns, not RAW. + // Legacy DELTA/DELTADELTA are mapped to DELTA_LZ4/DOUBLE_DELTA_LZ4 single-stage compressor pipelines + // to preserve byte-compatible reads of existing segments. + if (codecPipeline != null) { + _codecPipeline = codecPipeline; + } else if (_chunkCompressionType != null && !isCLPCodec(compressionCodec)) { + _codecPipeline = ChunkCodecPipeline.fromCompressionType(_chunkCompressionType); + } else { + _codecPipeline = null; + } + } + + /** + * Returns {@code true} if the given compression codec is a CLP variant. + * CLP codecs use completely separate forward index creators/readers and cannot be + * represented as a standard codec pipeline. + */ + private static boolean isCLPCodec(@Nullable CompressionCodec codec) { + return codec == CompressionCodec.CLP || codec == CompressionCodec.CLPV2 + || codec == CompressionCodec.CLPV2_ZSTD || codec == CompressionCodec.CLPV2_LZ4; + } + + /** + * Backward-compatible constructor without codecPipeline parameter. Retained for binary + * compatibility with existing compiled code. 
+ */ + public ForwardIndexConfig(@Nullable Boolean disabled, @Nullable CompressionCodec compressionCodec, + @Nullable ChunkCompressionType chunkCompressionType, @Nullable DictIdCompressionType dictIdCompressionType, + @Nullable Boolean deriveNumDocsPerChunk, @Nullable Integer rawIndexWriterVersion, + @Nullable String targetMaxChunkSize, @Nullable Integer targetDocsPerChunk, + @Nullable Map configs) { + this(disabled, compressionCodec, chunkCompressionType, dictIdCompressionType, deriveNumDocsPerChunk, + rawIndexWriterVersion, targetMaxChunkSize, targetDocsPerChunk, null, configs); } @JsonCreator @@ -162,11 +223,22 @@ public ForwardIndexConfig(@JsonProperty("disabled") @Nullable Boolean disabled, @JsonProperty("rawIndexWriterVersion") @Nullable Integer rawIndexWriterVersion, @JsonProperty("targetMaxChunkSize") @Nullable String targetMaxChunkSize, @JsonProperty("targetDocsPerChunk") @Nullable Integer targetDocsPerChunk, + @JsonProperty("codecPipeline") @Nullable List codecPipelineNames, @JsonProperty("configs") @Nullable Map configs) { this(disabled, getActualCompressionCodec(compressionCodec, chunkCompressionType, dictIdCompressionType), + validateAndParseCodecPipeline(codecPipelineNames), deriveNumDocsPerChunk, rawIndexWriterVersion, targetMaxChunkSize, targetDocsPerChunk, configs); } + @Nullable + private static ChunkCodecPipeline validateAndParseCodecPipeline(@Nullable List codecPipelineNames) { + if (codecPipelineNames == null) { + return null; + } + Preconditions.checkArgument(!codecPipelineNames.isEmpty(), "'codecPipeline' must be non-empty if specified"); + return ChunkCodecPipeline.fromNames(codecPipelineNames); + } + public static CompressionCodec getActualCompressionCodec(@Nullable CompressionCodec compressionCodec, @Nullable ChunkCompressionType chunkCompressionType, @Nullable DictIdCompressionType dictIdCompressionType) { if (compressionCodec != null) { @@ -204,11 +276,37 @@ public static CompressionCodec getActualCompressionCodec(@Nullable CompressionCo } } + /** + * @deprecated Use {@link #getCodecPipeline()} instead. The codec pipeline is now the canonical + * representation for RAW forward index compression. This getter is retained for JSON + * serialization backward compatibility. + */ + @Deprecated @Nullable public CompressionCodec getCompressionCodec() { return _compressionCodec; } + /** + * Returns the codec pipeline for this forward index. For all RAW forward indexes (except CLP + * codecs), this is always non-null — auto-derived from {@code compressionCodec} when not + * explicitly configured. + * + *
<p>Legacy compound codecs are mapped to single-stage compressor pipelines: + * {@code compressionCodec: DELTA} → {@code [DELTA_LZ4]}, + * {@code compressionCodec: DELTADELTA} → {@code [DOUBLE_DELTA_LZ4]}. + * These preserve byte-compatible reads of existing segments. For the pure delta transform, + * use an explicit pipeline like {@code ["DELTA", "ZSTANDARD"]}. + * + * <p>Returns {@code null} only for CLP codecs, MV_ENTRY_DICT, or when no compression + * is configured.
      + */ + @JsonIgnore + @Nullable + public ChunkCodecPipeline getCodecPipeline() { + return _codecPipeline; + } + public boolean isDeriveNumDocsPerChunk() { return _deriveNumDocsPerChunk; } @@ -260,20 +358,23 @@ public boolean equals(Object o) { return false; } ForwardIndexConfig that = (ForwardIndexConfig) o; - return _compressionCodec == that._compressionCodec && _deriveNumDocsPerChunk == that._deriveNumDocsPerChunk + return _compressionCodec == that._compressionCodec && Objects.equals(_codecPipeline, that._codecPipeline) + && _deriveNumDocsPerChunk == that._deriveNumDocsPerChunk && _rawIndexWriterVersion == that._rawIndexWriterVersion && Objects.equals(_targetMaxChunkSize, that._targetMaxChunkSize) && _targetDocsPerChunk == that._targetDocsPerChunk; } @Override public int hashCode() { - return Objects.hash(super.hashCode(), _compressionCodec, _deriveNumDocsPerChunk, _rawIndexWriterVersion, - _targetMaxChunkSize, _targetDocsPerChunk); + return Objects.hash(super.hashCode(), _compressionCodec, _codecPipeline, _deriveNumDocsPerChunk, + _rawIndexWriterVersion, _targetMaxChunkSize, _targetDocsPerChunk); } public static class Builder { @Nullable private CompressionCodec _compressionCodec; + @Nullable + private ChunkCodecPipeline _codecPipeline; private boolean _deriveNumDocsPerChunk = false; private int _rawIndexWriterVersion = _defaultRawIndexWriterVersion; private String _targetMaxChunkSize = _defaultTargetMaxChunkSize; @@ -285,6 +386,7 @@ public Builder() { public Builder(ForwardIndexConfig other) { _compressionCodec = other._compressionCodec; + _codecPipeline = other._codecPipeline; _deriveNumDocsPerChunk = other._deriveNumDocsPerChunk; _rawIndexWriterVersion = other._rawIndexWriterVersion; _targetMaxChunkSize = other._targetMaxChunkSize; @@ -297,6 +399,11 @@ public Builder withCompressionCodec(CompressionCodec compressionCodec) { return this; } + public Builder withCodecPipeline(ChunkCodecPipeline codecPipeline) { + _codecPipeline = codecPipeline; + return this; + } + public Builder withDeriveNumDocsPerChunk(boolean deriveNumDocsPerChunk) { _deriveNumDocsPerChunk = deriveNumDocsPerChunk; return this; @@ -376,8 +483,8 @@ public Builder withLegacyProperties(Map properties) { } public ForwardIndexConfig build() { - return new ForwardIndexConfig(false, _compressionCodec, _deriveNumDocsPerChunk, _rawIndexWriterVersion, - _targetMaxChunkSize, _targetDocsPerChunk, _configs); + return new ForwardIndexConfig(false, _compressionCodec, _codecPipeline, _deriveNumDocsPerChunk, + _rawIndexWriterVersion, _targetMaxChunkSize, _targetDocsPerChunk, _configs); } } } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java index 0b44dc8fddc5..8f2a57be69ad 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java @@ -23,6 +23,7 @@ import java.util.Map; import java.util.Objects; import javax.annotation.Nullable; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; import org.apache.pinot.segment.spi.compression.DictIdCompressionType; import org.apache.pinot.segment.spi.index.IndexReader; @@ -66,6 +67,15 @@ default ChunkCompressionType getCompressionType() { return null; } + /** + * Returns the codec pipeline, or {@code 
null} if the segment was not written with a pipeline. + * Only valid for version 7+ RAW forward index columns. + */ + @Nullable + default ChunkCodecPipeline getCodecPipeline() { + return null; + } + /** * Returns the compression type for dictionary encoded forward index. */ diff --git a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/ForwardIndexConfigTest.java b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/ForwardIndexConfigTest.java index 33b1f61f2085..17ec5e8ead5c 100644 --- a/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/ForwardIndexConfigTest.java +++ b/pinot-segment-spi/src/test/java/org/apache/pinot/segment/spi/index/ForwardIndexConfigTest.java @@ -19,7 +19,11 @@ package org.apache.pinot.segment.spi.index; import com.fasterxml.jackson.core.JsonProcessingException; +import java.util.Collections; +import org.apache.pinot.segment.spi.codec.ChunkCodec; +import org.apache.pinot.segment.spi.codec.ChunkCodecPipeline; import org.apache.pinot.segment.spi.compression.ChunkCompressionType; +import org.apache.pinot.spi.config.table.FieldConfig.CompressionCodec; import org.apache.pinot.spi.utils.JsonUtils; import org.testng.annotations.Test; @@ -108,4 +112,93 @@ public void withSomeData() assertEquals(config.getTargetMaxChunkSizeBytes(), 512 * 1024, "Unexpected targetMaxChunkSizeBytes"); assertEquals(config.getTargetDocsPerChunk(), 2000, "Unexpected defaultTargetDocsPerChunk"); } + + // ===== Auto-derivation of codecPipeline from compressionCodec ===== + + @Test + public void testAutoDerivePipelineFromLz4() { + ForwardIndexConfig config = new ForwardIndexConfig(false, CompressionCodec.LZ4, + null, null, null, null, null, null); + assertNotNull(config.getCodecPipeline()); + assertEquals(config.getCodecPipeline().size(), 1); + assertEquals(config.getCodecPipeline().get(0), ChunkCodec.LZ4); + assertEquals(config.getChunkCompressionType(), ChunkCompressionType.LZ4); + } + + @Test + public void testAutoDerivePipelineFromDelta() { + // Legacy DELTA → pipeline [DELTA_LZ4] (single compound compressor) + ForwardIndexConfig config = new ForwardIndexConfig(false, CompressionCodec.DELTA, + null, null, null, null, null, null); + assertNotNull(config.getCodecPipeline()); + assertEquals(config.getCodecPipeline().size(), 1); + assertEquals(config.getCodecPipeline().get(0), ChunkCodec.DELTA_LZ4); + assertFalse(config.getCodecPipeline().hasTransforms()); + assertEquals(config.getChunkCompressionType(), ChunkCompressionType.DELTA); + } + + @Test + public void testAutoDerivePipelineFromDeltaDelta() { + // Legacy DELTADELTA → pipeline [DOUBLE_DELTA_LZ4] (single compound compressor) + ForwardIndexConfig config = new ForwardIndexConfig(false, CompressionCodec.DELTADELTA, + null, null, null, null, null, null); + assertNotNull(config.getCodecPipeline()); + assertEquals(config.getCodecPipeline().size(), 1); + assertEquals(config.getCodecPipeline().get(0), ChunkCodec.DOUBLE_DELTA_LZ4); + assertFalse(config.getCodecPipeline().hasTransforms()); + assertEquals(config.getChunkCompressionType(), ChunkCompressionType.DELTADELTA); + } + + @Test + public void testNoPipelineForCLP() { + // CLP codecs should NOT auto-derive a pipeline + ForwardIndexConfig config = new ForwardIndexConfig(false, CompressionCodec.CLP, + null, null, null, null, null, null); + assertNull(config.getCodecPipeline()); + } + + @Test + public void testNoPipelineForMvEntryDict() { + // MV_ENTRY_DICT should NOT auto-derive a pipeline (chunkCompressionType is null) + ForwardIndexConfig config = new 
ForwardIndexConfig(false, CompressionCodec.MV_ENTRY_DICT, + null, null, null, null, null, null); + assertNull(config.getCodecPipeline()); + } + + @Test + public void testExplicitPipelineOverridesCompression() { + // Explicit pipeline — compressionCodec must be null per mutual exclusivity + ChunkCodecPipeline pipeline = new ChunkCodecPipeline( + Collections.singletonList(ChunkCodec.ZSTANDARD)); + ForwardIndexConfig config = new ForwardIndexConfig(false, null, pipeline, + null, null, null, null, null); + assertNotNull(config.getCodecPipeline()); + assertEquals(config.getCodecPipeline(), pipeline); + assertNull(config.getCompressionCodec()); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testConflictCompressionCodecAndPipeline() { + // Both compressionCodec and codecPipeline set — should throw + ChunkCodecPipeline pipeline = new ChunkCodecPipeline( + Collections.singletonList(ChunkCodec.ZSTANDARD)); + new ForwardIndexConfig(false, CompressionCodec.LZ4, pipeline, + null, null, null, null, null); + } + + @Test + public void testJsonConflictCompressionCodecAndPipeline() + throws JsonProcessingException { + // JSON with both compressionCodec and codecPipeline should throw + String confStr = "{\n" + + " \"compressionCodec\": \"LZ4\",\n" + + " \"codecPipeline\": [\"ZSTANDARD\"]\n" + + "}"; + try { + JsonUtils.stringToObject(confStr, ForwardIndexConfig.class); + fail("Expected exception for conflicting compressionCodec and codecPipeline"); + } catch (Exception e) { + // Expected + } + } } diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java index 064526695d45..ea7dd63b7264 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/FieldConfig.java @@ -77,25 +77,35 @@ public class FieldConfig extends BaseJsonConfig { private final JsonNode _indexes; private final JsonNode _tierOverwrites; private final CompressionCodec _compressionCodec; + private final List _codecPipeline; private final Map _properties; private final TimestampConfig _timestampConfig; @Deprecated public FieldConfig(String name, EncodingType encodingType, @Nullable IndexType indexType, @Nullable CompressionCodec compressionCodec, @Nullable Map properties) { - this(name, encodingType, indexType, null, compressionCodec, null, null, properties, null); + this(name, encodingType, indexType, null, compressionCodec, null, null, null, properties, null); } public FieldConfig(String name, EncodingType encodingType, @Nullable List indexTypes, @Nullable CompressionCodec compressionCodec, @Nullable Map properties) { - this(name, encodingType, null, indexTypes, compressionCodec, null, null, properties, null); + this(name, encodingType, null, indexTypes, compressionCodec, null, null, null, properties, null); } @Deprecated public FieldConfig(String name, EncodingType encodingType, @Nullable IndexType indexType, @Nullable List indexTypes, @Nullable CompressionCodec compressionCodec, @Nullable TimestampConfig timestampConfig, @Nullable Map properties) { - this(name, encodingType, indexType, indexTypes, compressionCodec, timestampConfig, null, properties, null); + this(name, encodingType, indexType, indexTypes, compressionCodec, timestampConfig, null, null, properties, null); + } + + // Backward-compatible 9-arg constructor (without codecPipeline) + public FieldConfig(String name, EncodingType encodingType, @Nullable 
IndexType indexType, + @Nullable List indexTypes, @Nullable CompressionCodec compressionCodec, + @Nullable TimestampConfig timestampConfig, @Nullable JsonNode indexes, + @Nullable Map properties, @Nullable JsonNode tierOverwrites) { + this(name, encodingType, indexType, indexTypes, compressionCodec, timestampConfig, indexes, null, properties, + tierOverwrites); } @JsonCreator @@ -106,14 +116,21 @@ public FieldConfig(@JsonProperty(value = "name", required = true) String name, @JsonProperty(value = "compressionCodec") @Nullable CompressionCodec compressionCodec, @JsonProperty(value = "timestampConfig") @Nullable TimestampConfig timestampConfig, @JsonProperty(value = "indexes") @Nullable JsonNode indexes, + @JsonProperty(value = "codecPipeline") @Nullable List codecPipeline, @JsonProperty(value = "properties") @Nullable Map properties, @JsonProperty(value = "tierOverwrites") @Nullable JsonNode tierOverwrites) { Preconditions.checkArgument(name != null, "'name' must be configured"); + Preconditions.checkArgument(codecPipeline == null || !codecPipeline.isEmpty(), + "'codecPipeline' must be non-empty if specified for column '%s'", name); + Preconditions.checkArgument(compressionCodec == null || codecPipeline == null, + "'compressionCodec' and 'codecPipeline' cannot both be set for column '%s'. " + + "Use 'codecPipeline' for new configs; 'compressionCodec' is deprecated.", name); _name = name; _encodingType = encodingType == null ? EncodingType.DICTIONARY : encodingType; _indexTypes = indexTypes != null ? indexTypes : (indexType == null ? Lists.newArrayList() : Lists.newArrayList(indexType)); _compressionCodec = compressionCodec; + _codecPipeline = codecPipeline; _timestampConfig = timestampConfig; _properties = properties; _indexes = indexes == null ? NullNode.getInstance() : indexes; @@ -197,11 +214,26 @@ public JsonNode getTierOverwrites() { return _tierOverwrites; } + /** + * @deprecated Use {@code codecPipeline} instead. This field is retained for backward compatibility + * with existing table configs. Setting both {@code compressionCodec} and a non-empty + * {@code codecPipeline} is rejected at construction time. + */ + @Deprecated @Nullable public CompressionCodec getCompressionCodec() { return _compressionCodec; } + /** + * Returns the codec pipeline as an ordered list of codec names (e.g., ["DELTA", "ZSTANDARD"]). + * Mutually exclusive with {@link #getCompressionCodec()} — setting both is rejected at construction time. 
+ */ + @Nullable + public List getCodecPipeline() { + return _codecPipeline; + } + @Nullable public TimestampConfig getTimestampConfig() { return _timestampConfig; @@ -218,6 +250,7 @@ public static class Builder { private List _indexTypes; private JsonNode _indexes; private CompressionCodec _compressionCodec; + private List _codecPipeline; private Map _properties; private TimestampConfig _timestampConfig; private JsonNode _tierOverwrites; @@ -232,6 +265,7 @@ public Builder(FieldConfig other) { _indexTypes = other._indexTypes; _indexes = other._indexes; _compressionCodec = other._compressionCodec; + _codecPipeline = other._codecPipeline; _properties = other._properties; _timestampConfig = other._timestampConfig; _tierOverwrites = other._tierOverwrites; @@ -262,6 +296,11 @@ public Builder withCompressionCodec(CompressionCodec compressionCodec) { return this; } + public Builder withCodecPipeline(List codecPipeline) { + _codecPipeline = codecPipeline; + return this; + } + public Builder withProperties(Map properties) { _properties = properties; return this; @@ -279,7 +318,7 @@ public Builder withTierOverwrites(JsonNode tierOverwrites) { public FieldConfig build() { return new FieldConfig(_name, _encodingType, null, _indexTypes, _compressionCodec, _timestampConfig, _indexes, - _properties, _tierOverwrites); + _codecPipeline, _properties, _tierOverwrites); } } }
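As a usage recap, here is a minimal sketch (mirroring the TableConfigUtilsTest case above; the column name is illustrative) of opting a raw single-value INT column into the new transform pipeline:

    // Pipelines with transform stages require the explicit V7 writer opt-in validated above.
    Map<String, String> properties = Collections.singletonMap(FieldConfig.RAW_INDEX_WRITER_VERSION, "7");
    FieldConfig fieldConfig = new FieldConfig("intCol", FieldConfig.EncodingType.RAW, null, null, null, null, null,
        Arrays.asList("DELTA", "ZSTANDARD"), properties, null);
    tableConfig.setFieldConfigList(Arrays.asList(fieldConfig));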