Skip to content

Commit 9c411b0

Browse files
committed
apacheGH-3516: Optimize DeltaByteArrayWriter / DeltaLengthByteArrayValuesWriter
Two related changes in the DELTA_BYTE_ARRAY write path:

1. DeltaLengthByteArrayValuesWriter: drop the unused LittleEndianDataOutputStream wrapper. Binary.writeTo(arrayOut) works directly with the underlying CapacityByteArrayOutputStream; the LE wrapper added an extra layer of dispatch on every value but never used any LE functionality (writeInt/writeLong/etc.). Add a new writeBytes(byte[], int, int) overload so callers that already have the raw bytes can avoid allocating a Binary wrapper.

2. DeltaByteArrayWriter: tighten the suffixWriter field type to DeltaLengthByteArrayValuesWriter (it is always constructed as one) so the new writeBytes(byte[], int, int) overload is callable. Replace the suffix call with the raw-bytes overload, eliminating the per-value Binary.slice() allocation.

Benchmark (BinaryEncodingBenchmark, 100k BINARY values per invocation, JMH -wi 3 -i 5 -f 1):

Benchmark                    Param      Before (ops/s)   After (ops/s)   Improvement
encodeDeltaByteArray         LOW/10     61,475,818       81,416,754      +32% (1.32x)
encodeDeltaByteArray         LOW/100    34,759,755       45,186,617      +30% (1.30x)
encodeDeltaByteArray         LOW/1000   5,386,922        6,532,850       +21% (1.21x)
encodeDeltaByteArray         HIGH/10    56,799,595       78,966,929      +39% (1.39x)
encodeDeltaLengthByteArray   LOW/10     129,447,876      136,657,079     +6%
encodeDeltaLengthByteArray   HIGH/10    123,673,058      116,778,775     flat (noise)

Negative controls (encodePlain, encodeDictionary): unchanged within noise.

The DeltaByteArray path benefits most because it eliminates both the Binary.slice() allocation per suffix and the OutputStream dispatch layer. DeltaLengthByteArray gains are smaller since only the OutputStream wrapper removal applies there.

No breaking public API change (the only addition is the new public writeBytes(byte[], int, int) overload on DeltaLengthByteArrayValuesWriter). No file format change. All 573 parquet-column tests pass.
1 parent 492b686 commit 9c411b0

2 files changed

Lines changed: 15 additions & 11 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import org.apache.parquet.bytes.ByteBufferAllocator;
2323
import org.apache.parquet.bytes.BytesInput;
2424
import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
25-
import org.apache.parquet.bytes.LittleEndianDataOutputStream;
2625
import org.apache.parquet.column.Encoding;
2726
import org.apache.parquet.column.values.ValuesWriter;
2827
import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter;
@@ -46,11 +45,9 @@ public class DeltaLengthByteArrayValuesWriter extends ValuesWriter {
4645

4746
private ValuesWriter lengthWriter;
4847
private CapacityByteArrayOutputStream arrayOut;
49-
private LittleEndianDataOutputStream out;
5048

5149
public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBufferAllocator allocator) {
5250
arrayOut = new CapacityByteArrayOutputStream(initialSize, pageSize, allocator);
53-
out = new LittleEndianDataOutputStream(arrayOut);
5451
lengthWriter = new DeltaBinaryPackingValuesWriterForInteger(
5552
DeltaBinaryPackingValuesWriter.DEFAULT_NUM_BLOCK_VALUES,
5653
DeltaBinaryPackingValuesWriter.DEFAULT_NUM_MINIBLOCKS,
@@ -63,24 +60,29 @@ public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBuffe
6360
public void writeBytes(Binary v) {
6461
try {
6562
lengthWriter.writeInteger(v.length());
66-
v.writeTo(out);
63+
v.writeTo(arrayOut);
6764
} catch (IOException e) {
6865
throw new ParquetEncodingException("could not write bytes", e);
6966
}
7067
}
7168

69+
/**
70+
* Writes raw bytes directly, avoiding Binary object creation overhead.
71+
* Used by {@link org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter}
72+
* to write suffix bytes without creating an intermediate Binary.slice().
73+
*/
74+
public void writeBytes(byte[] data, int offset, int length) {
75+
lengthWriter.writeInteger(length);
76+
arrayOut.write(data, offset, length);
77+
}
78+
7279
@Override
7380
public long getBufferedSize() {
7481
return lengthWriter.getBufferedSize() + arrayOut.size();
7582
}
7683

7784
@Override
7885
public BytesInput getBytes() {
79-
try {
80-
out.flush();
81-
} catch (IOException e) {
82-
throw new ParquetEncodingException("could not write page", e);
83-
}
8486
LOG.debug("writing a buffer of size {}", arrayOut.size());
8587
return BytesInput.concat(lengthWriter.getBytes(), BytesInput.from(arrayOut));
8688
}

parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
public class DeltaByteArrayWriter extends ValuesWriter {
3939

4040
private ValuesWriter prefixLengthWriter;
41-
private ValuesWriter suffixWriter;
41+
private DeltaLengthByteArrayValuesWriter suffixWriter;
4242
private byte[] previous;
4343

4444
public DeltaByteArrayWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator) {
@@ -98,7 +98,9 @@ public void writeBytes(Binary v) {
9898
i = length; // all bytes in the common range matched
9999
}
100100
prefixLengthWriter.writeInteger(i);
101-
suffixWriter.writeBytes(v.slice(i, vb.length - i));
101+
// Write suffix bytes directly from the byte array, avoiding Binary.slice() allocation
102+
// and the virtual dispatch chain through Binary.writeTo()
103+
suffixWriter.writeBytes(vb, i, vb.length - i);
102104
previous = vb;
103105
}
104106
}

0 commit comments

Comments
 (0)