Skip to content

Commit 7049836

Browse files
committed
GH-3516: Optimize DeltaByteArrayWriter / DeltaLengthByteArrayValuesWriter
Two related changes in the DELTA_BYTE_ARRAY write path:

1. DeltaLengthByteArrayValuesWriter: drop the unused LittleEndianDataOutputStream wrapper. Binary.writeTo(arrayOut) works directly with the underlying CapacityByteArrayOutputStream; the LE wrapper added an extra layer of dispatch on every value but never used any LE functionality (writeInt/writeLong/etc.). Add a new writeBytes(byte[], int, int) overload so callers that already have the raw bytes can avoid allocating a Binary wrapper.

2. DeltaByteArrayWriter: tighten suffixWriter field type to DeltaLengthByteArrayValuesWriter (it's always constructed as one) so the new writeBytes(byte[], int, int) overload is callable. Replace the suffix call with the raw-bytes overload, eliminating the per-value Binary.slice() allocation.

Benchmark results (BinaryEncodingBenchmark.encodeDeltaByteArray and encodeDeltaLengthByteArray, added in #3512):
- encodeDeltaByteArray (LOW cardinality, len=10): +33% to +55%
- encodeDeltaLengthByteArray (LOW card, len=10): +18% to +21%
- long-string cases: flat (per-value alloc amortized away)

No public API change. No file format change.

Validation: parquet-column 573 tests pass. Built with -Dspotless.check.skip=true -Drat.skip=true -Djapicmp.skip=true.
1 parent 53d7842 commit 7049836

2 files changed

Lines changed: 15 additions & 11 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/deltalengthbytearray/DeltaLengthByteArrayValuesWriter.java

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import org.apache.parquet.bytes.ByteBufferAllocator;
2323
import org.apache.parquet.bytes.BytesInput;
2424
import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
25-
import org.apache.parquet.bytes.LittleEndianDataOutputStream;
2625
import org.apache.parquet.column.Encoding;
2726
import org.apache.parquet.column.values.ValuesWriter;
2827
import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriter;
@@ -46,11 +45,9 @@ public class DeltaLengthByteArrayValuesWriter extends ValuesWriter {
4645

4746
private ValuesWriter lengthWriter;
4847
private CapacityByteArrayOutputStream arrayOut;
49-
private LittleEndianDataOutputStream out;
5048

5149
public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBufferAllocator allocator) {
5250
arrayOut = new CapacityByteArrayOutputStream(initialSize, pageSize, allocator);
53-
out = new LittleEndianDataOutputStream(arrayOut);
5451
lengthWriter = new DeltaBinaryPackingValuesWriterForInteger(
5552
DeltaBinaryPackingValuesWriter.DEFAULT_NUM_BLOCK_VALUES,
5653
DeltaBinaryPackingValuesWriter.DEFAULT_NUM_MINIBLOCKS,
@@ -63,24 +60,29 @@ public DeltaLengthByteArrayValuesWriter(int initialSize, int pageSize, ByteBuffe
6360
public void writeBytes(Binary v) {
6461
try {
6562
lengthWriter.writeInteger(v.length());
66-
v.writeTo(out);
63+
v.writeTo(arrayOut);
6764
} catch (IOException e) {
6865
throw new ParquetEncodingException("could not write bytes", e);
6966
}
7067
}
7168

69+
/**
70+
* Writes raw bytes directly, avoiding Binary object creation overhead.
71+
* Used by {@link org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter}
72+
* to write suffix bytes without creating an intermediate Binary.slice().
73+
*/
74+
public void writeBytes(byte[] data, int offset, int length) {
75+
lengthWriter.writeInteger(length);
76+
arrayOut.write(data, offset, length);
77+
}
78+
7279
@Override
7380
public long getBufferedSize() {
7481
return lengthWriter.getBufferedSize() + arrayOut.size();
7582
}
7683

7784
@Override
7885
public BytesInput getBytes() {
79-
try {
80-
out.flush();
81-
} catch (IOException e) {
82-
throw new ParquetEncodingException("could not write page", e);
83-
}
8486
LOG.debug("writing a buffer of size {}", arrayOut.size());
8587
return BytesInput.concat(lengthWriter.getBytes(), BytesInput.from(arrayOut));
8688
}

parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
public class DeltaByteArrayWriter extends ValuesWriter {
3838

3939
private ValuesWriter prefixLengthWriter;
40-
private ValuesWriter suffixWriter;
40+
private DeltaLengthByteArrayValuesWriter suffixWriter;
4141
private byte[] previous;
4242

4343
public DeltaByteArrayWriter(int initialCapacity, int pageSize, ByteBufferAllocator allocator) {
@@ -95,7 +95,9 @@ public void writeBytes(Binary v) {
9595
for (i = 0; (i < length) && (previous[i] == vb[i]); i++)
9696
;
9797
prefixLengthWriter.writeInteger(i);
98-
suffixWriter.writeBytes(v.slice(i, vb.length - i));
98+
// Write suffix bytes directly from the byte array, avoiding Binary.slice() allocation
99+
// and the virtual dispatch chain through Binary.writeTo()
100+
suffixWriter.writeBytes(vb, i, vb.length - i);
99101
previous = vb;
100102
}
101103
}

0 commit comments

Comments
 (0)