Skip to content

Commit ba5d027

Browse files
committed
GH-3522: Reuse intermediate buffers in RunLengthBitPackingHybridDecoder PACKED path
Allocate the int[] values buffer and byte[] read-staging buffer once per decoder and grow them lazily, instead of allocating fresh arrays on every PACKED run. Resolves the existing "TODO: reuse a buffer" comment.

A new currentBufferLength field tracks the logical length of the active region in packedValuesBuffer, which may now be larger than the current run requires if an earlier, larger run grew it.

Benchmark (RleDictionaryIndexDecodingBenchmark, 100k INT32, BIT_WIDTH=10, JMH -wi 5 -i 10 -f 2):

| Pattern         | master ops/s | optimized ops/s | Improvement |
|-----------------|--------------|-----------------|-------------|
| SEQUENTIAL      | 93,061,521   | 113,856,860     | +22.3%      |
| RANDOM          | 92,929,824   | 114,238,638     | +22.9%      |
| LOW_CARDINALITY | 92,813,229   | 115,271,347     | +24.2%      |

End-to-end FileReadBenchmark sees ~2% improvement (RLE decoding is a small fraction of full file reads).

Validation: 573 parquet-column tests pass. Built with -Dspotless.check.skip=true -Drat.skip=true -Djapicmp.skip=true.
1 parent d96c669 commit ba5d027

1 file changed

Lines changed: 17 additions & 5 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/rle/RunLengthBitPackingHybridDecoder.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ private static enum MODE {
4848
private int currentCount;
4949
private int currentValue;
5050
private int[] currentBuffer;
51+
private int currentBufferLength;
52+
53+
// Reusable buffers to avoid per-run allocation in PACKED mode
54+
private int[] packedValuesBuffer = new int[0];
55+
private byte[] packedBytesBuffer = new byte[0];
5156

5257
public RunLengthBitPackingHybridDecoder(int bitWidth, InputStream in) {
5358
LOG.debug("decoding bitWidth {}", bitWidth);
@@ -69,7 +74,7 @@ public int readInt() throws IOException {
6974
result = currentValue;
7075
break;
7176
case PACKED:
72-
result = currentBuffer[currentBuffer.length - 1 - currentCount];
77+
result = currentBuffer[currentBufferLength - 1 - currentCount];
7378
break;
7479
default:
7580
throw new ParquetDecodingException("not a valid mode " + mode);
@@ -90,17 +95,24 @@ private void readNext() throws IOException {
9095
case PACKED:
9196
int numGroups = header >>> 1;
9297
currentCount = numGroups * 8;
98+
currentBufferLength = currentCount;
9399
LOG.debug("reading {} values BIT PACKED", currentCount);
94-
currentBuffer = new int[currentCount]; // TODO: reuse a buffer
95-
byte[] bytes = new byte[numGroups * bitWidth];
100+
if (packedValuesBuffer.length < currentCount) {
101+
packedValuesBuffer = new int[currentCount];
102+
}
103+
currentBuffer = packedValuesBuffer;
104+
int bytesRequired = numGroups * bitWidth;
105+
if (packedBytesBuffer.length < bytesRequired) {
106+
packedBytesBuffer = new byte[bytesRequired];
107+
}
96108
// At the end of the file RLE data though, there might not be that many bytes left.
97109
int bytesToRead = (int) Math.ceil(currentCount * bitWidth / 8.0);
98110
bytesToRead = Math.min(bytesToRead, in.available());
99-
new DataInputStream(in).readFully(bytes, 0, bytesToRead);
111+
new DataInputStream(in).readFully(packedBytesBuffer, 0, bytesToRead);
100112
for (int valueIndex = 0, byteIndex = 0;
101113
valueIndex < currentCount;
102114
valueIndex += 8, byteIndex += bitWidth) {
103-
packer.unpack8Values(bytes, byteIndex, currentBuffer, valueIndex);
115+
packer.unpack8Values(packedBytesBuffer, byteIndex, currentBuffer, valueIndex);
104116
}
105117
break;
106118
default:

0 commit comments

Comments
 (0)