2020
2121import static org .apache .parquet .bytes .BytesInput .concat ;
2222
23- import it .unimi .dsi .fastutil .doubles .Double2IntLinkedOpenHashMap ;
2423import it .unimi .dsi .fastutil .doubles .Double2IntMap ;
25- import it .unimi .dsi .fastutil .doubles .DoubleIterator ;
26- import it .unimi .dsi .fastutil .floats . Float2IntLinkedOpenHashMap ;
24+ import it .unimi .dsi .fastutil .doubles .Double2IntOpenHashMap ;
25+ import it .unimi .dsi .fastutil .doubles . DoubleArrayList ;
2726import it .unimi .dsi .fastutil .floats .Float2IntMap ;
28- import it .unimi .dsi .fastutil .floats .FloatIterator ;
29- import it .unimi .dsi .fastutil .ints . Int2IntLinkedOpenHashMap ;
27+ import it .unimi .dsi .fastutil .floats .Float2IntOpenHashMap ;
28+ import it .unimi .dsi .fastutil .floats . FloatArrayList ;
3029import it .unimi .dsi .fastutil .ints .Int2IntMap ;
31- import it .unimi .dsi .fastutil .longs .Long2IntLinkedOpenHashMap ;
30+ import it .unimi .dsi .fastutil .ints .Int2IntOpenHashMap ;
31+ import it .unimi .dsi .fastutil .ints .IntArrayList ;
3232import it .unimi .dsi .fastutil .longs .Long2IntMap ;
33- import it .unimi .dsi .fastutil .longs .LongIterator ;
34- import it .unimi .dsi .fastutil .objects . Object2IntLinkedOpenHashMap ;
33+ import it .unimi .dsi .fastutil .longs .Long2IntOpenHashMap ;
34+ import it .unimi .dsi .fastutil .longs . LongArrayList ;
3535import it .unimi .dsi .fastutil .objects .Object2IntMap ;
36- import it .unimi .dsi .fastutil .objects .ObjectIterator ;
36+ import it .unimi .dsi .fastutil .objects .Object2IntOpenHashMap ;
3737import java .io .IOException ;
3838import java .util .ArrayList ;
39- import java .util .Iterator ;
4039import java .util .List ;
4140import org .apache .parquet .bytes .ByteBufferAllocator ;
4241import org .apache .parquet .bytes .BytesInput ;
@@ -231,7 +230,8 @@ public String memUsageString(String prefix) {
231230 public static class PlainBinaryDictionaryValuesWriter extends DictionaryValuesWriter {
232231
233232 /* type specific dictionary content */
234- protected Object2IntMap <Binary > binaryDictionaryContent = new Object2IntLinkedOpenHashMap <>();
233+ protected Object2IntMap <Binary > binaryDictionaryContent = new Object2IntOpenHashMap <>();
234+ protected List <Binary > dictionaryValues = new ArrayList <>();
235235
236236 public PlainBinaryDictionaryValuesWriter (
237237 int maxDictionaryByteSize ,
@@ -246,8 +246,10 @@ public PlainBinaryDictionaryValuesWriter(
246246 public void writeBytes (Binary v ) {
247247 int id = binaryDictionaryContent .getInt (v );
248248 if (id == -1 ) {
249- id = binaryDictionaryContent .size ();
250- binaryDictionaryContent .put (v .copy (), id );
249+ id = dictionaryValues .size ();
250+ Binary copied = v .copy ();
251+ binaryDictionaryContent .put (copied , id );
252+ dictionaryValues .add (copied );
251253 // length as int (4 bytes) + actual bytes
252254 dictionaryByteSize += 4L + v .length ();
253255 }
@@ -260,12 +262,9 @@ public DictionaryPage toDictPageAndClose() {
260262 // return a dictionary only if we actually used it
261263 PlainValuesWriter dictionaryEncoder =
262264 new PlainValuesWriter (lastUsedDictionaryByteSize , maxDictionaryByteSize , allocator );
263- Iterator <Binary > binaryIterator =
264- binaryDictionaryContent .keySet ().iterator ();
265265 // write only the part of the dict that we used
266266 for (int i = 0 ; i < lastUsedDictionarySize ; i ++) {
267- Binary entry = binaryIterator .next ();
268- dictionaryEncoder .writeBytes (entry );
267+ dictionaryEncoder .writeBytes (dictionaryValues .get (i ));
269268 }
270269 return dictPage (dictionaryEncoder );
271270 }
@@ -280,21 +279,16 @@ public int getDictionarySize() {
280279 @ Override
281280 protected void clearDictionaryContent () {
282281 binaryDictionaryContent .clear ();
282+ dictionaryValues .clear ();
283283 }
284284
285285 @ Override
286286 public void fallBackDictionaryEncodedData (ValuesWriter writer ) {
287- // build reverse dictionary
288- Binary [] reverseDictionary = new Binary [getDictionarySize ()];
289- for (Object2IntMap .Entry <Binary > entry : binaryDictionaryContent .object2IntEntrySet ()) {
290- reverseDictionary [entry .getIntValue ()] = entry .getKey ();
291- }
292-
293- // fall back to plain encoding
287+ // fall back to plain encoding using the ordered dictionary values list
294288 IntIterator iterator = encodedValues .iterator ();
295289 while (iterator .hasNext ()) {
296290 int id = iterator .next ();
297- writer .writeBytes (reverseDictionary [ id ] );
291+ writer .writeBytes (dictionaryValues . get ( id ) );
298292 }
299293 }
300294 }
@@ -317,8 +311,10 @@ public PlainFixedLenArrayDictionaryValuesWriter(
317311 public void writeBytes (Binary value ) {
318312 int id = binaryDictionaryContent .getInt (value );
319313 if (id == -1 ) {
320- id = binaryDictionaryContent .size ();
321- binaryDictionaryContent .put (value .copy (), id );
314+ id = dictionaryValues .size ();
315+ Binary copied = value .copy ();
316+ binaryDictionaryContent .put (copied , id );
317+ dictionaryValues .add (copied );
322318 dictionaryByteSize += length ;
323319 }
324320 encodedValues .add (id );
@@ -330,12 +326,9 @@ public DictionaryPage toDictPageAndClose() {
330326 // return a dictionary only if we actually used it
331327 FixedLenByteArrayPlainValuesWriter dictionaryEncoder = new FixedLenByteArrayPlainValuesWriter (
332328 length , lastUsedDictionaryByteSize , maxDictionaryByteSize , allocator );
333- Iterator <Binary > binaryIterator =
334- binaryDictionaryContent .keySet ().iterator ();
335329 // write only the part of the dict that we used
336330 for (int i = 0 ; i < lastUsedDictionarySize ; i ++) {
337- Binary entry = binaryIterator .next ();
338- dictionaryEncoder .writeBytes (entry );
331+ dictionaryEncoder .writeBytes (dictionaryValues .get (i ));
339332 }
340333 return dictPage (dictionaryEncoder );
341334 }
@@ -346,7 +339,8 @@ public DictionaryPage toDictPageAndClose() {
346339 public static class PlainLongDictionaryValuesWriter extends DictionaryValuesWriter {
347340
348341 /* type specific dictionary content */
349- private Long2IntMap longDictionaryContent = new Long2IntLinkedOpenHashMap ();
342+ private Long2IntMap longDictionaryContent = new Long2IntOpenHashMap ();
343+ private LongArrayList dictionaryValues = new LongArrayList ();
350344
351345 public PlainLongDictionaryValuesWriter (
352346 int maxDictionaryByteSize ,
@@ -361,8 +355,9 @@ public PlainLongDictionaryValuesWriter(
361355 public void writeLong (long v ) {
362356 int id = longDictionaryContent .get (v );
363357 if (id == -1 ) {
364- id = longDictionaryContent .size ();
358+ id = dictionaryValues .size ();
365359 longDictionaryContent .put (v , id );
360+ dictionaryValues .add (v );
366361 dictionaryByteSize += 8 ;
367362 }
368363 encodedValues .add (id );
@@ -374,10 +369,9 @@ public DictionaryPage toDictPageAndClose() {
374369 // return a dictionary only if we actually used it
375370 PlainValuesWriter dictionaryEncoder =
376371 new PlainValuesWriter (lastUsedDictionaryByteSize , maxDictionaryByteSize , allocator );
377- LongIterator longIterator = longDictionaryContent .keySet ().iterator ();
378372 // write only the part of the dict that we used
379373 for (int i = 0 ; i < lastUsedDictionarySize ; i ++) {
380- dictionaryEncoder .writeLong (longIterator . nextLong ( ));
374+ dictionaryEncoder .writeLong (dictionaryValues . getLong ( i ));
381375 }
382376 return dictPage (dictionaryEncoder );
383377 }
@@ -392,32 +386,25 @@ public int getDictionarySize() {
392386 @ Override
393387 protected void clearDictionaryContent () {
394388 longDictionaryContent .clear ();
389+ dictionaryValues .clear ();
395390 }
396391
397392 @ Override
398393 public void fallBackDictionaryEncodedData (ValuesWriter writer ) {
399- // build reverse dictionary
400- long [] reverseDictionary = new long [getDictionarySize ()];
401- ObjectIterator <Long2IntMap .Entry > entryIterator =
402- longDictionaryContent .long2IntEntrySet ().iterator ();
403- while (entryIterator .hasNext ()) {
404- Long2IntMap .Entry entry = entryIterator .next ();
405- reverseDictionary [entry .getIntValue ()] = entry .getLongKey ();
406- }
407-
408394 // fall back to plain encoding
409395 IntIterator iterator = encodedValues .iterator ();
410396 while (iterator .hasNext ()) {
411397 int id = iterator .next ();
412- writer .writeLong (reverseDictionary [ id ] );
398+ writer .writeLong (dictionaryValues . getLong ( id ) );
413399 }
414400 }
415401 }
416402
417403 public static class PlainDoubleDictionaryValuesWriter extends DictionaryValuesWriter {
418404
419405 /* type specific dictionary content */
420- private Double2IntMap doubleDictionaryContent = new Double2IntLinkedOpenHashMap ();
406+ private Double2IntMap doubleDictionaryContent = new Double2IntOpenHashMap ();
407+ private DoubleArrayList dictionaryValues = new DoubleArrayList ();
421408
422409 public PlainDoubleDictionaryValuesWriter (
423410 int maxDictionaryByteSize ,
@@ -432,8 +419,9 @@ public PlainDoubleDictionaryValuesWriter(
432419 public void writeDouble (double v ) {
433420 int id = doubleDictionaryContent .get (v );
434421 if (id == -1 ) {
435- id = doubleDictionaryContent .size ();
422+ id = dictionaryValues .size ();
436423 doubleDictionaryContent .put (v , id );
424+ dictionaryValues .add (v );
437425 dictionaryByteSize += 8 ;
438426 }
439427 encodedValues .add (id );
@@ -445,10 +433,9 @@ public DictionaryPage toDictPageAndClose() {
445433 // return a dictionary only if we actually used it
446434 PlainValuesWriter dictionaryEncoder =
447435 new PlainValuesWriter (lastUsedDictionaryByteSize , maxDictionaryByteSize , allocator );
448- DoubleIterator doubleIterator = doubleDictionaryContent .keySet ().iterator ();
449436 // write only the part of the dict that we used
450437 for (int i = 0 ; i < lastUsedDictionarySize ; i ++) {
451- dictionaryEncoder .writeDouble (doubleIterator . nextDouble ( ));
438+ dictionaryEncoder .writeDouble (dictionaryValues . getDouble ( i ));
452439 }
453440 return dictPage (dictionaryEncoder );
454441 }
@@ -463,32 +450,25 @@ public int getDictionarySize() {
463450 @ Override
464451 protected void clearDictionaryContent () {
465452 doubleDictionaryContent .clear ();
453+ dictionaryValues .clear ();
466454 }
467455
468456 @ Override
469457 public void fallBackDictionaryEncodedData (ValuesWriter writer ) {
470- // build reverse dictionary
471- double [] reverseDictionary = new double [getDictionarySize ()];
472- ObjectIterator <Double2IntMap .Entry > entryIterator =
473- doubleDictionaryContent .double2IntEntrySet ().iterator ();
474- while (entryIterator .hasNext ()) {
475- Double2IntMap .Entry entry = entryIterator .next ();
476- reverseDictionary [entry .getIntValue ()] = entry .getDoubleKey ();
477- }
478-
479458 // fall back to plain encoding
480459 IntIterator iterator = encodedValues .iterator ();
481460 while (iterator .hasNext ()) {
482461 int id = iterator .next ();
483- writer .writeDouble (reverseDictionary [ id ] );
462+ writer .writeDouble (dictionaryValues . getDouble ( id ) );
484463 }
485464 }
486465 }
487466
488467 public static class PlainIntegerDictionaryValuesWriter extends DictionaryValuesWriter {
489468
490469 /* type specific dictionary content */
491- private Int2IntMap intDictionaryContent = new Int2IntLinkedOpenHashMap ();
470+ private Int2IntMap intDictionaryContent = new Int2IntOpenHashMap ();
471+ private IntArrayList dictionaryValues = new IntArrayList ();
492472
493473 public PlainIntegerDictionaryValuesWriter (
494474 int maxDictionaryByteSize ,
@@ -503,8 +483,9 @@ public PlainIntegerDictionaryValuesWriter(
503483 public void writeInteger (int v ) {
504484 int id = intDictionaryContent .get (v );
505485 if (id == -1 ) {
506- id = intDictionaryContent .size ();
486+ id = dictionaryValues .size ();
507487 intDictionaryContent .put (v , id );
488+ dictionaryValues .add (v );
508489 dictionaryByteSize += 4 ;
509490 }
510491 encodedValues .add (id );
@@ -516,11 +497,9 @@ public DictionaryPage toDictPageAndClose() {
516497 // return a dictionary only if we actually used it
517498 PlainValuesWriter dictionaryEncoder =
518499 new PlainValuesWriter (lastUsedDictionaryByteSize , maxDictionaryByteSize , allocator );
519- it .unimi .dsi .fastutil .ints .IntIterator intIterator =
520- intDictionaryContent .keySet ().iterator ();
521500 // write only the part of the dict that we used
522501 for (int i = 0 ; i < lastUsedDictionarySize ; i ++) {
523- dictionaryEncoder .writeInteger (intIterator . nextInt ( ));
502+ dictionaryEncoder .writeInteger (dictionaryValues . getInt ( i ));
524503 }
525504 return dictPage (dictionaryEncoder );
526505 }
@@ -535,32 +514,25 @@ public int getDictionarySize() {
535514 @ Override
536515 protected void clearDictionaryContent () {
537516 intDictionaryContent .clear ();
517+ dictionaryValues .clear ();
538518 }
539519
540520 @ Override
541521 public void fallBackDictionaryEncodedData (ValuesWriter writer ) {
542- // build reverse dictionary
543- int [] reverseDictionary = new int [getDictionarySize ()];
544- ObjectIterator <Int2IntMap .Entry > entryIterator =
545- intDictionaryContent .int2IntEntrySet ().iterator ();
546- while (entryIterator .hasNext ()) {
547- Int2IntMap .Entry entry = entryIterator .next ();
548- reverseDictionary [entry .getIntValue ()] = entry .getIntKey ();
549- }
550-
551522 // fall back to plain encoding
552523 IntIterator iterator = encodedValues .iterator ();
553524 while (iterator .hasNext ()) {
554525 int id = iterator .next ();
555- writer .writeInteger (reverseDictionary [ id ] );
526+ writer .writeInteger (dictionaryValues . getInt ( id ) );
556527 }
557528 }
558529 }
559530
560531 public static class PlainFloatDictionaryValuesWriter extends DictionaryValuesWriter {
561532
562533 /* type specific dictionary content */
563- private Float2IntMap floatDictionaryContent = new Float2IntLinkedOpenHashMap ();
534+ private Float2IntMap floatDictionaryContent = new Float2IntOpenHashMap ();
535+ private FloatArrayList dictionaryValues = new FloatArrayList ();
564536
565537 public PlainFloatDictionaryValuesWriter (
566538 int maxDictionaryByteSize ,
@@ -575,8 +547,9 @@ public PlainFloatDictionaryValuesWriter(
575547 public void writeFloat (float v ) {
576548 int id = floatDictionaryContent .get (v );
577549 if (id == -1 ) {
578- id = floatDictionaryContent .size ();
550+ id = dictionaryValues .size ();
579551 floatDictionaryContent .put (v , id );
552+ dictionaryValues .add (v );
580553 dictionaryByteSize += 4 ;
581554 }
582555 encodedValues .add (id );
@@ -588,10 +561,9 @@ public DictionaryPage toDictPageAndClose() {
588561 // return a dictionary only if we actually used it
589562 PlainValuesWriter dictionaryEncoder =
590563 new PlainValuesWriter (lastUsedDictionaryByteSize , maxDictionaryByteSize , allocator );
591- FloatIterator floatIterator = floatDictionaryContent .keySet ().iterator ();
592564 // write only the part of the dict that we used
593565 for (int i = 0 ; i < lastUsedDictionarySize ; i ++) {
594- dictionaryEncoder .writeFloat (floatIterator . nextFloat ( ));
566+ dictionaryEncoder .writeFloat (dictionaryValues . getFloat ( i ));
595567 }
596568 return dictPage (dictionaryEncoder );
597569 }
@@ -606,24 +578,16 @@ public int getDictionarySize() {
606578 @ Override
607579 protected void clearDictionaryContent () {
608580 floatDictionaryContent .clear ();
581+ dictionaryValues .clear ();
609582 }
610583
611584 @ Override
612585 public void fallBackDictionaryEncodedData (ValuesWriter writer ) {
613- // build reverse dictionary
614- float [] reverseDictionary = new float [getDictionarySize ()];
615- ObjectIterator <Float2IntMap .Entry > entryIterator =
616- floatDictionaryContent .float2IntEntrySet ().iterator ();
617- while (entryIterator .hasNext ()) {
618- Float2IntMap .Entry entry = entryIterator .next ();
619- reverseDictionary [entry .getIntValue ()] = entry .getFloatKey ();
620- }
621-
622586 // fall back to plain encoding
623587 IntIterator iterator = encodedValues .iterator ();
624588 while (iterator .hasNext ()) {
625589 int id = iterator .next ();
626- writer .writeFloat (reverseDictionary [ id ] );
590+ writer .writeFloat (dictionaryValues . getFloat ( id ) );
627591 }
628592 }
629593 }
0 commit comments