Skip to content

Commit 6c6db2e

Browse files
committed
Optimizing Variant read path with lazy caching
1 parent 4c8f4d4 commit 6c6db2e

File tree

3 files changed

+132
-72
lines changed

3 files changed

+132
-72
lines changed

parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java

Lines changed: 111 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,26 @@ public final class Variant {
3636
*/
3737
final ByteBuffer metadata;
3838

39+
/**
40+
* Pre-computed metadata dictionary size
41+
*/
42+
private final int dictSize;
43+
44+
/**
45+
* Lazy cache for metadata dictionary strings.
46+
*/
47+
private String[] metadataCache;
48+
49+
/**
50+
* Lazy cache for the parsed object header.
51+
*/
52+
private VariantUtil.ObjectInfo cachedObjectInfo;
53+
54+
/**
55+
* Lazy cache for the parsed array header.
56+
*/
57+
private VariantUtil.ArrayInfo cachedArrayInfo;
58+
3959
/**
4060
* The threshold to switch from linear search to binary search when looking up a field by key in
4161
* an object. This is a performance optimization to avoid the overhead of binary search for a
@@ -67,6 +87,26 @@ public Variant(ByteBuffer value, ByteBuffer metadata) {
6787
"Unsupported variant metadata version: %d",
6888
metadata.get(metadata.position()) & VariantUtil.VERSION_MASK));
6989
}
90+
91+
// Pre-compute dictionary size for lazy metadata cache allocation.
92+
int pos = this.metadata.position();
93+
int metaOffsetSize = ((this.metadata.get(pos) >> 6) & 0x3) + 1;
94+
if (this.metadata.remaining() > 1) {
95+
this.dictSize = VariantUtil.readUnsigned(this.metadata, pos + 1, metaOffsetSize);
96+
} else {
97+
this.dictSize = 0;
98+
}
99+
this.metadataCache = null;
100+
}
101+
102+
/**
103+
* Package-private constructor that shares pre-parsed metadata state from a parent Variant.
104+
*/
105+
Variant(ByteBuffer value, ByteBuffer metadata, String[] metadataCache, int dictSize) {
106+
this.value = value.asReadOnlyBuffer();
107+
this.metadata = metadata.asReadOnlyBuffer();
108+
this.metadataCache = metadataCache;
109+
this.dictSize = dictSize;
70110
}
71111

72112
public ByteBuffer getValueBuffer() {
@@ -194,7 +234,7 @@ public Type getType() {
194234
* @throws IllegalArgumentException if `getType()` does not return `Type.OBJECT`
195235
*/
196236
public int numObjectElements() {
197-
return VariantUtil.getObjectInfo(value).numElements;
237+
return objectInfo().numElements;
198238
}
199239

200240
/**
@@ -206,22 +246,18 @@ public int numObjectElements() {
206246
* @throws IllegalArgumentException if `getType()` does not return `Type.OBJECT`
207247
*/
208248
public Variant getFieldByKey(String key) {
209-
VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value);
210-
// Use linear search for a short list. Switch to binary search when the length reaches
211-
// `BINARY_SEARCH_THRESHOLD`.
249+
VariantUtil.ObjectInfo info = objectInfo();
250+
int idStart = value.position() + info.idStartOffset;
251+
int offsetStart = value.position() + info.offsetStartOffset;
252+
int dataStart = value.position() + info.dataStartOffset;
253+
212254
if (info.numElements < BINARY_SEARCH_THRESHOLD) {
213255
for (int i = 0; i < info.numElements; ++i) {
214-
ObjectField field = getFieldAtIndex(
215-
i,
216-
value,
217-
metadata,
218-
info.idSize,
219-
info.offsetSize,
220-
value.position() + info.idStartOffset,
221-
value.position() + info.offsetStartOffset,
222-
value.position() + info.dataStartOffset);
223-
if (field.key.equals(key)) {
224-
return field.value;
256+
int id = VariantUtil.readUnsigned(value, idStart + info.idSize * i, info.idSize);
257+
String fieldKey = getMetadataKeyCached(id);
258+
if (fieldKey.equals(key)) {
259+
int offset = VariantUtil.readUnsigned(value, offsetStart + info.offsetSize * i, info.offsetSize);
260+
return childVariant(VariantUtil.slice(value, dataStart + offset));
225261
}
226262
}
227263
} else {
@@ -232,22 +268,16 @@ public Variant getFieldByKey(String key) {
232268
// performance optimization, because it can properly handle the case where `low + high`
233269
// overflows int.
234270
int mid = (low + high) >>> 1;
235-
ObjectField field = getFieldAtIndex(
236-
mid,
237-
value,
238-
metadata,
239-
info.idSize,
240-
info.offsetSize,
241-
value.position() + info.idStartOffset,
242-
value.position() + info.offsetStartOffset,
243-
value.position() + info.dataStartOffset);
244-
int cmp = field.key.compareTo(key);
271+
int midId = VariantUtil.readUnsigned(value, idStart + info.idSize * mid, info.idSize);
272+
String midKey = getMetadataKeyCached(midId);
273+
int cmp = midKey.compareTo(key);
245274
if (cmp < 0) {
246275
low = mid + 1;
247276
} else if (cmp > 0) {
248277
high = mid - 1;
249278
} else {
250-
return field.value;
279+
int offset = VariantUtil.readUnsigned(value, offsetStart + info.offsetSize * mid, info.offsetSize);
280+
return childVariant(VariantUtil.slice(value, dataStart + offset));
251281
}
252282
}
253283
}
@@ -275,35 +305,14 @@ public ObjectField(String key, Variant value) {
275305
* @throws IllegalArgumentException if `getType()` does not return `Type.OBJECT`
276306
*/
277307
public ObjectField getFieldAtIndex(int idx) {
278-
VariantUtil.ObjectInfo info = VariantUtil.getObjectInfo(value);
279-
// Use linear search for a short list. Switch to binary search when the length reaches
280-
// `BINARY_SEARCH_THRESHOLD`.
281-
ObjectField field = getFieldAtIndex(
282-
idx,
283-
value,
284-
metadata,
285-
info.idSize,
286-
info.offsetSize,
287-
value.position() + info.idStartOffset,
288-
value.position() + info.offsetStartOffset,
289-
value.position() + info.dataStartOffset);
290-
return field;
291-
}
292-
293-
static ObjectField getFieldAtIndex(
294-
int index,
295-
ByteBuffer value,
296-
ByteBuffer metadata,
297-
int idSize,
298-
int offsetSize,
299-
int idStart,
300-
int offsetStart,
301-
int dataStart) {
302-
// idStart, offsetStart, and dataStart are absolute positions in the `value` buffer.
303-
int id = VariantUtil.readUnsigned(value, idStart + idSize * index, idSize);
304-
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
305-
String key = VariantUtil.getMetadataKey(metadata, id);
306-
Variant v = new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
308+
VariantUtil.ObjectInfo info = objectInfo();
309+
int idStart = value.position() + info.idStartOffset;
310+
int offsetStart = value.position() + info.offsetStartOffset;
311+
int dataStart = value.position() + info.dataStartOffset;
312+
int id = VariantUtil.readUnsigned(value, idStart + info.idSize * idx, info.idSize);
313+
int offset = VariantUtil.readUnsigned(value, offsetStart + info.offsetSize * idx, info.offsetSize);
314+
String key = getMetadataKeyCached(id);
315+
Variant v = childVariant(VariantUtil.slice(value, dataStart + offset));
307316
return new ObjectField(key, v);
308317
}
309318

@@ -312,7 +321,7 @@ static ObjectField getFieldAtIndex(
312321
* @throws IllegalArgumentException if `getType()` does not return `Type.ARRAY`
313322
*/
314323
public int numArrayElements() {
315-
return VariantUtil.getArrayInfo(value).numElements;
324+
return arrayInfo().numElements;
316325
}
317326

318327
/**
@@ -324,23 +333,56 @@ public int numArrayElements() {
324333
* @throws IllegalArgumentException if `getType()` does not return `Type.ARRAY`
325334
*/
326335
public Variant getElementAtIndex(int index) {
327-
VariantUtil.ArrayInfo info = VariantUtil.getArrayInfo(value);
336+
VariantUtil.ArrayInfo info = arrayInfo();
328337
if (index < 0 || index >= info.numElements) {
329338
return null;
330339
}
331-
return getElementAtIndex(
332-
index,
333-
value,
334-
metadata,
335-
info.offsetSize,
336-
value.position() + info.offsetStartOffset,
337-
value.position() + info.dataStartOffset);
340+
int offsetStart = value.position() + info.offsetStartOffset;
341+
int dataStart = value.position() + info.dataStartOffset;
342+
int offset = VariantUtil.readUnsigned(value, offsetStart + info.offsetSize * index, info.offsetSize);
343+
return childVariant(VariantUtil.slice(value, dataStart + offset));
344+
}
345+
346+
/**
347+
* Creates a child Variant that shares this instance's metadata cache.
348+
*/
349+
private Variant childVariant(ByteBuffer childValue) {
350+
return new Variant(childValue, metadata, metadataCache, dictSize);
338351
}
339352

340-
private static Variant getElementAtIndex(
341-
int index, ByteBuffer value, ByteBuffer metadata, int offsetSize, int offsetStart, int dataStart) {
342-
// offsetStart and dataStart are absolute positions in the `value` buffer.
343-
int offset = VariantUtil.readUnsigned(value, offsetStart + offsetSize * index, offsetSize);
344-
return new Variant(VariantUtil.slice(value, dataStart + offset), metadata);
353+
/**
354+
* Returns the metadata dictionary string for the given ID, caching the result.
355+
*/
356+
String getMetadataKeyCached(int id) {
357+
if (metadataCache == null) {
358+
metadataCache = new String[dictSize];
359+
}
360+
if (id < 0 || id >= dictSize) {
361+
return VariantUtil.getMetadataKey(metadata, id);
362+
}
363+
if (metadataCache[id] == null) {
364+
metadataCache[id] = VariantUtil.getMetadataKey(metadata, id);
365+
}
366+
return metadataCache[id];
367+
}
368+
369+
/**
370+
* Returns the cached object header, parsing it on first access.
371+
*/
372+
private VariantUtil.ObjectInfo objectInfo() {
373+
if (cachedObjectInfo == null) {
374+
cachedObjectInfo = VariantUtil.getObjectInfo(value);
375+
}
376+
return cachedObjectInfo;
377+
}
378+
379+
/**
380+
* Returns the cached array header, parsing it on first access.
381+
*/
382+
private VariantUtil.ArrayInfo arrayInfo() {
383+
if (cachedArrayInfo == null) {
384+
cachedArrayInfo = VariantUtil.getArrayInfo(value);
385+
}
386+
return cachedArrayInfo;
345387
}
346388
}

parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.util.ArrayList;
2525
import java.util.Collections;
2626
import java.util.Set;
27+
import org.apache.parquet.io.api.Binary;
2728

2829
/**
2930
* Builder for creating Variant value and metadata.
@@ -109,7 +110,14 @@ public void appendEncodedValue(ByteBuffer value) {
109110
*/
110111
public void appendString(String str) {
111112
onAppend();
112-
byte[] data = str.getBytes(StandardCharsets.UTF_8);
113+
writeUTF8bytes(str.getBytes(StandardCharsets.UTF_8));
114+
}
115+
116+
/**
117+
* Write bytes as a UTF8 string.
118+
* @param data data to write; this is not modified.
119+
*/
120+
private void writeUTF8bytes(final byte[] data) {
113121
boolean longStr = data.length > VariantUtil.MAX_SHORT_STR_SIZE;
114122
checkCapacity((longStr ? 1 + VariantUtil.U32_SIZE : 1) + data.length);
115123
if (longStr) {
@@ -125,6 +133,16 @@ public void appendString(String str) {
125133
writePos += data.length;
126134
}
127135

136+
/**
137+
* Given a Binary, append it to the variant as a string.
138+
* Avoids intermediate String creation when unmarshalling from shredded string columns.
139+
* @param binary source data.
140+
*/
141+
void appendAsString(Binary binary) {
142+
onAppend();
143+
writeUTF8bytes(binary.getBytesUnsafe());
144+
}
145+
128146
/**
129147
* Appends a null value to the Variant builder.
130148
*/

parquet-variant/src/main/java/org/apache/parquet/variant/VariantConverters.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,14 +233,14 @@ static class PartiallyShreddedFieldsConverter extends GroupConverter {
233233
PartiallyShreddedFieldsConverter(GroupType fieldsType, ParentConverter<VariantBuilder> parent) {
234234
this.converters = new Converter[fieldsType.getFieldCount()];
235235
this.parent = parent;
236+
ParentConverter<VariantObjectBuilder> newParent = converter -> converter.accept(objectBuilder);
236237

237238
for (int index = 0; index < fieldsType.getFieldCount(); index += 1) {
238239
Type field = fieldsType.getType(index);
239240
Preconditions.checkArgument(!field.isPrimitive(), "Invalid field group: " + field);
240241

241242
String name = field.getName();
242243
shreddedFieldNames.add(name);
243-
ParentConverter<VariantObjectBuilder> newParent = converter -> converter.accept(objectBuilder);
244244
converters[index] = new FieldValueConverter(name, field.asGroupType(), newParent);
245245
}
246246
}
@@ -501,7 +501,7 @@ static class VariantStringConverter extends ShreddedScalarConverter {
501501

502502
@Override
503503
public void addBinary(Binary value) {
504-
parent.build(builder -> builder.appendString(value.toStringUsingUTF8()));
504+
parent.build(builder -> builder.appendAsString(value));
505505
}
506506
}
507507

0 commit comments

Comments
 (0)