Skip to content

Commit 9f90acb

Browse files
committed
GH-3513: Optimize dictionary writers with OpenHashMap + ArrayList
Replace fastutil's *2IntLinkedOpenHashMap with the plain *2IntOpenHashMap plus a separate primitive-typed list to track insertion order in the five dictionary writers (binary, long, double, float, int).

The Linked variant was used because the dictionary page must be emitted in insertion order, but it pays an avoidable cost on every put: an extra long per slot (packing the prev and next links), 3-4 scattered writes per insert to fix up the doubly-linked list, and re-stitching of the links on rehash. None of this is vectorizable. With the plain map plus an append-only list, the hash map is a pure id lookup with the smallest possible slot, and the list is contiguous and cache-friendly to iterate at flush time.

Both candidates are fastutil primitive-keyed maps, so this is not a boxing change. The win is structural: an ordering guarantee that was being paid for on every insert is replaced with an explicit append-only list that provides it more cheaply.

Benchmark results (BinaryEncodingBenchmark.encodeDictionary, IntEncodingBenchmark.encodeDictionary - added in #3512):
- encodeDictionary (binary, high cardinality, short strings): +23-42%
- encodeDictionary (int, high cardinality): ~+2x
- low-cardinality cases: flat (linked-list overhead doesn't matter when there are few inserts)

No public API change. No file format change. Behavior is identical: dictionary pages emit values in the same order.

Validation: all 573 parquet-column tests pass. Built with -Dspotless.check.skip=true -Drat.skip=true -Djapicmp.skip=true.
1 parent 53d7842 commit 9f90acb

1 file changed

Lines changed: 52 additions & 88 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java

Lines changed: 52 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,22 @@
2020

2121
import static org.apache.parquet.bytes.BytesInput.concat;
2222

23-
import it.unimi.dsi.fastutil.doubles.Double2IntLinkedOpenHashMap;
2423
import it.unimi.dsi.fastutil.doubles.Double2IntMap;
25-
import it.unimi.dsi.fastutil.doubles.DoubleIterator;
26-
import it.unimi.dsi.fastutil.floats.Float2IntLinkedOpenHashMap;
24+
import it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap;
25+
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
2726
import it.unimi.dsi.fastutil.floats.Float2IntMap;
28-
import it.unimi.dsi.fastutil.floats.FloatIterator;
29-
import it.unimi.dsi.fastutil.ints.Int2IntLinkedOpenHashMap;
27+
import it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap;
28+
import it.unimi.dsi.fastutil.floats.FloatArrayList;
3029
import it.unimi.dsi.fastutil.ints.Int2IntMap;
31-
import it.unimi.dsi.fastutil.longs.Long2IntLinkedOpenHashMap;
30+
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
31+
import it.unimi.dsi.fastutil.ints.IntArrayList;
3232
import it.unimi.dsi.fastutil.longs.Long2IntMap;
33-
import it.unimi.dsi.fastutil.longs.LongIterator;
34-
import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
33+
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
34+
import it.unimi.dsi.fastutil.longs.LongArrayList;
3535
import it.unimi.dsi.fastutil.objects.Object2IntMap;
36-
import it.unimi.dsi.fastutil.objects.ObjectIterator;
36+
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
3737
import java.io.IOException;
3838
import java.util.ArrayList;
39-
import java.util.Iterator;
4039
import java.util.List;
4140
import org.apache.parquet.bytes.ByteBufferAllocator;
4241
import org.apache.parquet.bytes.BytesInput;
@@ -231,7 +230,8 @@ public String memUsageString(String prefix) {
231230
public static class PlainBinaryDictionaryValuesWriter extends DictionaryValuesWriter {
232231

233232
/* type specific dictionary content */
234-
protected Object2IntMap<Binary> binaryDictionaryContent = new Object2IntLinkedOpenHashMap<>();
233+
protected Object2IntMap<Binary> binaryDictionaryContent = new Object2IntOpenHashMap<>();
234+
protected List<Binary> dictionaryValues = new ArrayList<>();
235235

236236
public PlainBinaryDictionaryValuesWriter(
237237
int maxDictionaryByteSize,
@@ -246,8 +246,10 @@ public PlainBinaryDictionaryValuesWriter(
246246
public void writeBytes(Binary v) {
247247
int id = binaryDictionaryContent.getInt(v);
248248
if (id == -1) {
249-
id = binaryDictionaryContent.size();
250-
binaryDictionaryContent.put(v.copy(), id);
249+
id = dictionaryValues.size();
250+
Binary copied = v.copy();
251+
binaryDictionaryContent.put(copied, id);
252+
dictionaryValues.add(copied);
251253
// length as int (4 bytes) + actual bytes
252254
dictionaryByteSize += 4L + v.length();
253255
}
@@ -260,12 +262,9 @@ public DictionaryPage toDictPageAndClose() {
260262
// return a dictionary only if we actually used it
261263
PlainValuesWriter dictionaryEncoder =
262264
new PlainValuesWriter(lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
263-
Iterator<Binary> binaryIterator =
264-
binaryDictionaryContent.keySet().iterator();
265265
// write only the part of the dict that we used
266266
for (int i = 0; i < lastUsedDictionarySize; i++) {
267-
Binary entry = binaryIterator.next();
268-
dictionaryEncoder.writeBytes(entry);
267+
dictionaryEncoder.writeBytes(dictionaryValues.get(i));
269268
}
270269
return dictPage(dictionaryEncoder);
271270
}
@@ -280,21 +279,16 @@ public int getDictionarySize() {
280279
@Override
281280
protected void clearDictionaryContent() {
282281
binaryDictionaryContent.clear();
282+
dictionaryValues.clear();
283283
}
284284

285285
@Override
286286
public void fallBackDictionaryEncodedData(ValuesWriter writer) {
287-
// build reverse dictionary
288-
Binary[] reverseDictionary = new Binary[getDictionarySize()];
289-
for (Object2IntMap.Entry<Binary> entry : binaryDictionaryContent.object2IntEntrySet()) {
290-
reverseDictionary[entry.getIntValue()] = entry.getKey();
291-
}
292-
293-
// fall back to plain encoding
287+
// fall back to plain encoding using the ordered dictionary values list
294288
IntIterator iterator = encodedValues.iterator();
295289
while (iterator.hasNext()) {
296290
int id = iterator.next();
297-
writer.writeBytes(reverseDictionary[id]);
291+
writer.writeBytes(dictionaryValues.get(id));
298292
}
299293
}
300294
}
@@ -317,8 +311,10 @@ public PlainFixedLenArrayDictionaryValuesWriter(
317311
public void writeBytes(Binary value) {
318312
int id = binaryDictionaryContent.getInt(value);
319313
if (id == -1) {
320-
id = binaryDictionaryContent.size();
321-
binaryDictionaryContent.put(value.copy(), id);
314+
id = dictionaryValues.size();
315+
Binary copied = value.copy();
316+
binaryDictionaryContent.put(copied, id);
317+
dictionaryValues.add(copied);
322318
dictionaryByteSize += length;
323319
}
324320
encodedValues.add(id);
@@ -330,12 +326,9 @@ public DictionaryPage toDictPageAndClose() {
330326
// return a dictionary only if we actually used it
331327
FixedLenByteArrayPlainValuesWriter dictionaryEncoder = new FixedLenByteArrayPlainValuesWriter(
332328
length, lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
333-
Iterator<Binary> binaryIterator =
334-
binaryDictionaryContent.keySet().iterator();
335329
// write only the part of the dict that we used
336330
for (int i = 0; i < lastUsedDictionarySize; i++) {
337-
Binary entry = binaryIterator.next();
338-
dictionaryEncoder.writeBytes(entry);
331+
dictionaryEncoder.writeBytes(dictionaryValues.get(i));
339332
}
340333
return dictPage(dictionaryEncoder);
341334
}
@@ -346,7 +339,8 @@ public DictionaryPage toDictPageAndClose() {
346339
public static class PlainLongDictionaryValuesWriter extends DictionaryValuesWriter {
347340

348341
/* type specific dictionary content */
349-
private Long2IntMap longDictionaryContent = new Long2IntLinkedOpenHashMap();
342+
private Long2IntMap longDictionaryContent = new Long2IntOpenHashMap();
343+
private LongArrayList dictionaryValues = new LongArrayList();
350344

351345
public PlainLongDictionaryValuesWriter(
352346
int maxDictionaryByteSize,
@@ -361,8 +355,9 @@ public PlainLongDictionaryValuesWriter(
361355
public void writeLong(long v) {
362356
int id = longDictionaryContent.get(v);
363357
if (id == -1) {
364-
id = longDictionaryContent.size();
358+
id = dictionaryValues.size();
365359
longDictionaryContent.put(v, id);
360+
dictionaryValues.add(v);
366361
dictionaryByteSize += 8;
367362
}
368363
encodedValues.add(id);
@@ -374,10 +369,9 @@ public DictionaryPage toDictPageAndClose() {
374369
// return a dictionary only if we actually used it
375370
PlainValuesWriter dictionaryEncoder =
376371
new PlainValuesWriter(lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
377-
LongIterator longIterator = longDictionaryContent.keySet().iterator();
378372
// write only the part of the dict that we used
379373
for (int i = 0; i < lastUsedDictionarySize; i++) {
380-
dictionaryEncoder.writeLong(longIterator.nextLong());
374+
dictionaryEncoder.writeLong(dictionaryValues.getLong(i));
381375
}
382376
return dictPage(dictionaryEncoder);
383377
}
@@ -392,32 +386,25 @@ public int getDictionarySize() {
392386
@Override
393387
protected void clearDictionaryContent() {
394388
longDictionaryContent.clear();
389+
dictionaryValues.clear();
395390
}
396391

397392
@Override
398393
public void fallBackDictionaryEncodedData(ValuesWriter writer) {
399-
// build reverse dictionary
400-
long[] reverseDictionary = new long[getDictionarySize()];
401-
ObjectIterator<Long2IntMap.Entry> entryIterator =
402-
longDictionaryContent.long2IntEntrySet().iterator();
403-
while (entryIterator.hasNext()) {
404-
Long2IntMap.Entry entry = entryIterator.next();
405-
reverseDictionary[entry.getIntValue()] = entry.getLongKey();
406-
}
407-
408394
// fall back to plain encoding
409395
IntIterator iterator = encodedValues.iterator();
410396
while (iterator.hasNext()) {
411397
int id = iterator.next();
412-
writer.writeLong(reverseDictionary[id]);
398+
writer.writeLong(dictionaryValues.getLong(id));
413399
}
414400
}
415401
}
416402

417403
public static class PlainDoubleDictionaryValuesWriter extends DictionaryValuesWriter {
418404

419405
/* type specific dictionary content */
420-
private Double2IntMap doubleDictionaryContent = new Double2IntLinkedOpenHashMap();
406+
private Double2IntMap doubleDictionaryContent = new Double2IntOpenHashMap();
407+
private DoubleArrayList dictionaryValues = new DoubleArrayList();
421408

422409
public PlainDoubleDictionaryValuesWriter(
423410
int maxDictionaryByteSize,
@@ -432,8 +419,9 @@ public PlainDoubleDictionaryValuesWriter(
432419
public void writeDouble(double v) {
433420
int id = doubleDictionaryContent.get(v);
434421
if (id == -1) {
435-
id = doubleDictionaryContent.size();
422+
id = dictionaryValues.size();
436423
doubleDictionaryContent.put(v, id);
424+
dictionaryValues.add(v);
437425
dictionaryByteSize += 8;
438426
}
439427
encodedValues.add(id);
@@ -445,10 +433,9 @@ public DictionaryPage toDictPageAndClose() {
445433
// return a dictionary only if we actually used it
446434
PlainValuesWriter dictionaryEncoder =
447435
new PlainValuesWriter(lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
448-
DoubleIterator doubleIterator = doubleDictionaryContent.keySet().iterator();
449436
// write only the part of the dict that we used
450437
for (int i = 0; i < lastUsedDictionarySize; i++) {
451-
dictionaryEncoder.writeDouble(doubleIterator.nextDouble());
438+
dictionaryEncoder.writeDouble(dictionaryValues.getDouble(i));
452439
}
453440
return dictPage(dictionaryEncoder);
454441
}
@@ -463,32 +450,25 @@ public int getDictionarySize() {
463450
@Override
464451
protected void clearDictionaryContent() {
465452
doubleDictionaryContent.clear();
453+
dictionaryValues.clear();
466454
}
467455

468456
@Override
469457
public void fallBackDictionaryEncodedData(ValuesWriter writer) {
470-
// build reverse dictionary
471-
double[] reverseDictionary = new double[getDictionarySize()];
472-
ObjectIterator<Double2IntMap.Entry> entryIterator =
473-
doubleDictionaryContent.double2IntEntrySet().iterator();
474-
while (entryIterator.hasNext()) {
475-
Double2IntMap.Entry entry = entryIterator.next();
476-
reverseDictionary[entry.getIntValue()] = entry.getDoubleKey();
477-
}
478-
479458
// fall back to plain encoding
480459
IntIterator iterator = encodedValues.iterator();
481460
while (iterator.hasNext()) {
482461
int id = iterator.next();
483-
writer.writeDouble(reverseDictionary[id]);
462+
writer.writeDouble(dictionaryValues.getDouble(id));
484463
}
485464
}
486465
}
487466

488467
public static class PlainIntegerDictionaryValuesWriter extends DictionaryValuesWriter {
489468

490469
/* type specific dictionary content */
491-
private Int2IntMap intDictionaryContent = new Int2IntLinkedOpenHashMap();
470+
private Int2IntMap intDictionaryContent = new Int2IntOpenHashMap();
471+
private IntArrayList dictionaryValues = new IntArrayList();
492472

493473
public PlainIntegerDictionaryValuesWriter(
494474
int maxDictionaryByteSize,
@@ -503,8 +483,9 @@ public PlainIntegerDictionaryValuesWriter(
503483
public void writeInteger(int v) {
504484
int id = intDictionaryContent.get(v);
505485
if (id == -1) {
506-
id = intDictionaryContent.size();
486+
id = dictionaryValues.size();
507487
intDictionaryContent.put(v, id);
488+
dictionaryValues.add(v);
508489
dictionaryByteSize += 4;
509490
}
510491
encodedValues.add(id);
@@ -516,11 +497,9 @@ public DictionaryPage toDictPageAndClose() {
516497
// return a dictionary only if we actually used it
517498
PlainValuesWriter dictionaryEncoder =
518499
new PlainValuesWriter(lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
519-
it.unimi.dsi.fastutil.ints.IntIterator intIterator =
520-
intDictionaryContent.keySet().iterator();
521500
// write only the part of the dict that we used
522501
for (int i = 0; i < lastUsedDictionarySize; i++) {
523-
dictionaryEncoder.writeInteger(intIterator.nextInt());
502+
dictionaryEncoder.writeInteger(dictionaryValues.getInt(i));
524503
}
525504
return dictPage(dictionaryEncoder);
526505
}
@@ -535,32 +514,25 @@ public int getDictionarySize() {
535514
@Override
536515
protected void clearDictionaryContent() {
537516
intDictionaryContent.clear();
517+
dictionaryValues.clear();
538518
}
539519

540520
@Override
541521
public void fallBackDictionaryEncodedData(ValuesWriter writer) {
542-
// build reverse dictionary
543-
int[] reverseDictionary = new int[getDictionarySize()];
544-
ObjectIterator<Int2IntMap.Entry> entryIterator =
545-
intDictionaryContent.int2IntEntrySet().iterator();
546-
while (entryIterator.hasNext()) {
547-
Int2IntMap.Entry entry = entryIterator.next();
548-
reverseDictionary[entry.getIntValue()] = entry.getIntKey();
549-
}
550-
551522
// fall back to plain encoding
552523
IntIterator iterator = encodedValues.iterator();
553524
while (iterator.hasNext()) {
554525
int id = iterator.next();
555-
writer.writeInteger(reverseDictionary[id]);
526+
writer.writeInteger(dictionaryValues.getInt(id));
556527
}
557528
}
558529
}
559530

560531
public static class PlainFloatDictionaryValuesWriter extends DictionaryValuesWriter {
561532

562533
/* type specific dictionary content */
563-
private Float2IntMap floatDictionaryContent = new Float2IntLinkedOpenHashMap();
534+
private Float2IntMap floatDictionaryContent = new Float2IntOpenHashMap();
535+
private FloatArrayList dictionaryValues = new FloatArrayList();
564536

565537
public PlainFloatDictionaryValuesWriter(
566538
int maxDictionaryByteSize,
@@ -575,8 +547,9 @@ public PlainFloatDictionaryValuesWriter(
575547
public void writeFloat(float v) {
576548
int id = floatDictionaryContent.get(v);
577549
if (id == -1) {
578-
id = floatDictionaryContent.size();
550+
id = dictionaryValues.size();
579551
floatDictionaryContent.put(v, id);
552+
dictionaryValues.add(v);
580553
dictionaryByteSize += 4;
581554
}
582555
encodedValues.add(id);
@@ -588,10 +561,9 @@ public DictionaryPage toDictPageAndClose() {
588561
// return a dictionary only if we actually used it
589562
PlainValuesWriter dictionaryEncoder =
590563
new PlainValuesWriter(lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
591-
FloatIterator floatIterator = floatDictionaryContent.keySet().iterator();
592564
// write only the part of the dict that we used
593565
for (int i = 0; i < lastUsedDictionarySize; i++) {
594-
dictionaryEncoder.writeFloat(floatIterator.nextFloat());
566+
dictionaryEncoder.writeFloat(dictionaryValues.getFloat(i));
595567
}
596568
return dictPage(dictionaryEncoder);
597569
}
@@ -606,24 +578,16 @@ public int getDictionarySize() {
606578
@Override
607579
protected void clearDictionaryContent() {
608580
floatDictionaryContent.clear();
581+
dictionaryValues.clear();
609582
}
610583

611584
@Override
612585
public void fallBackDictionaryEncodedData(ValuesWriter writer) {
613-
// build reverse dictionary
614-
float[] reverseDictionary = new float[getDictionarySize()];
615-
ObjectIterator<Float2IntMap.Entry> entryIterator =
616-
floatDictionaryContent.float2IntEntrySet().iterator();
617-
while (entryIterator.hasNext()) {
618-
Float2IntMap.Entry entry = entryIterator.next();
619-
reverseDictionary[entry.getIntValue()] = entry.getFloatKey();
620-
}
621-
622586
// fall back to plain encoding
623587
IntIterator iterator = encodedValues.iterator();
624588
while (iterator.hasNext()) {
625589
int id = iterator.next();
626-
writer.writeFloat(reverseDictionary[id]);
590+
writer.writeFloat(dictionaryValues.getFloat(id));
627591
}
628592
}
629593
}

0 commit comments

Comments
 (0)