Skip to content

Commit 0ecd799

Browse files
authored
Allow reading dictionary encoded boolean (#3370)
* Allow reading dictionary-encoded boolean. I've observed some Parquet files in the wild that contain dictionary-encoded boolean values, which is also wild. I don't think we want to allow producing this, but I think it would be good to allow reading it. We don't judge. * Thanks Gang
1 parent 46218f2 commit 0ecd799

File tree

3 files changed

+115
-0
lines changed

3 files changed

+115
-0
lines changed

parquet-column/src/main/java/org/apache/parquet/column/Encoding.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
4141
import org.apache.parquet.column.values.dictionary.DictionaryValuesReader;
4242
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary;
43+
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
4344
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainDoubleDictionary;
4445
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainFloatDictionary;
4546
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainIntegerDictionary;
@@ -102,6 +103,8 @@ public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dic
102103
return new PlainIntegerDictionary(dictionaryPage);
103104
case FLOAT:
104105
return new PlainFloatDictionary(dictionaryPage);
106+
case BOOLEAN:
107+
return new PlainBooleanDictionary(dictionaryPage);
105108
default:
106109
throw new ParquetDecodingException(
107110
"Dictionary encoding not supported for type: " + descriptor.getType());

parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/PlainValuesDictionary.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.parquet.bytes.ByteBufferInputStream;
2929
import org.apache.parquet.column.Dictionary;
3030
import org.apache.parquet.column.page.DictionaryPage;
31+
import org.apache.parquet.column.values.plain.BooleanPlainValuesReader;
3132
import org.apache.parquet.column.values.plain.PlainValuesReader.DoublePlainValuesReader;
3233
import org.apache.parquet.column.values.plain.PlainValuesReader.FloatPlainValuesReader;
3334
import org.apache.parquet.column.values.plain.PlainValuesReader.IntegerPlainValuesReader;
@@ -300,4 +301,46 @@ public int getMaxId() {
300301
return floatDictionaryContent.length - 1;
301302
}
302303
}
304+
305+
/**
306+
* a simple implementation of dictionary for plain encoded boolean values
307+
*/
308+
public static class PlainBooleanDictionary extends PlainValuesDictionary {
309+
310+
private final boolean[] boolDictionaryContent;
311+
312+
/**
313+
* @param dictionaryPage a dictionary page of encoded boolean values
314+
* @throws IOException if there is an exception while decoding the dictionary page
315+
*/
316+
public PlainBooleanDictionary(DictionaryPage dictionaryPage) throws IOException {
317+
super(dictionaryPage);
318+
ByteBufferInputStream in = dictionaryPage.getBytes().toInputStream();
319+
boolDictionaryContent = new boolean[dictionaryPage.getDictionarySize()];
320+
BooleanPlainValuesReader boolReader = new BooleanPlainValuesReader();
321+
boolReader.initFromPage(dictionaryPage.getDictionarySize(), in);
322+
for (int i = 0; i < boolDictionaryContent.length; i++) {
323+
boolDictionaryContent[i] = boolReader.readBoolean();
324+
}
325+
}
326+
327+
@Override
328+
public boolean decodeToBoolean(int id) {
329+
return boolDictionaryContent[id];
330+
}
331+
332+
@Override
333+
public String toString() {
334+
StringBuilder sb = new StringBuilder("PlainBooleanDictionary {\n");
335+
for (int i = 0; i < boolDictionaryContent.length; i++) {
336+
sb.append(i).append(" => ").append(boolDictionaryContent[i]).append("\n");
337+
}
338+
return sb.append("}").toString();
339+
}
340+
341+
@Override
342+
public int getMaxId() {
343+
return boolDictionaryContent.length - 1;
344+
}
345+
}
303346
}

parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
2626
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
2727
import static org.junit.Assert.assertEquals;
28+
import static org.junit.Assert.assertFalse;
29+
import static org.junit.Assert.assertTrue;
2830

2931
import java.io.IOException;
3032
import java.nio.ByteBuffer;
@@ -44,6 +46,7 @@
4446
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter;
4547
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
4648
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter;
49+
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBooleanDictionary;
4750
import org.apache.parquet.column.values.fallback.FallbackValuesWriter;
4851
import org.apache.parquet.column.values.plain.BinaryPlainValuesReader;
4952
import org.apache.parquet.column.values.plain.PlainValuesReader;
@@ -678,6 +681,72 @@ public void testZeroValues() throws IOException {
678681
}
679682
}
680683

684+
@Test
685+
public void testBooleanDictionary() throws IOException {
686+
// Create a dictionary page with boolean values (false, true)
687+
// Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02
688+
BytesInput bytes = BytesInput.from(new byte[] {0x02});
689+
DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
690+
691+
PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage);
692+
693+
// Verify dictionary decoding
694+
assertFalse(dictionary.decodeToBoolean(0));
695+
assertTrue(dictionary.decodeToBoolean(1));
696+
assertEquals(1, dictionary.getMaxId());
697+
}
698+
699+
@Test
700+
public void testBooleanDictionarySingleValue() throws IOException {
701+
// Test dictionary with only true value
702+
// Bit-packed: bit 0 = true (1) => byte = 0b00000001 = 0x01
703+
BytesInput bytesTrue = BytesInput.from(new byte[] {0x01});
704+
DictionaryPage dictionaryPageTrue = new DictionaryPage(bytesTrue, 1, PLAIN);
705+
706+
PlainBooleanDictionary dictionaryTrue = new PlainBooleanDictionary(dictionaryPageTrue);
707+
708+
assertTrue(dictionaryTrue.decodeToBoolean(0));
709+
assertEquals(0, dictionaryTrue.getMaxId());
710+
711+
// Test dictionary with only false value
712+
// Bit-packed: bit 0 = false (0) => byte = 0b00000000 = 0x00
713+
BytesInput bytesFalse = BytesInput.from(new byte[] {0x00});
714+
DictionaryPage dictionaryPageFalse = new DictionaryPage(bytesFalse, 1, PLAIN);
715+
716+
PlainBooleanDictionary dictionaryFalse = new PlainBooleanDictionary(dictionaryPageFalse);
717+
718+
assertFalse(dictionaryFalse.decodeToBoolean(0));
719+
assertEquals(0, dictionaryFalse.getMaxId());
720+
}
721+
722+
@Test
723+
public void testBooleanDictionaryToString() throws IOException {
724+
// Bit-packed: bit 0 = false (0), bit 1 = true (1) => byte = 0b00000010 = 0x02
725+
BytesInput bytes = BytesInput.from(new byte[] {0x02});
726+
DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN);
727+
728+
PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage);
729+
730+
String str = dictionary.toString();
731+
Assert.assertTrue(str.contains("PlainBooleanDictionary"));
732+
Assert.assertTrue(str.contains("0 => false"));
733+
Assert.assertTrue(str.contains("1 => true"));
734+
}
735+
736+
@Test
737+
public void testBooleanDictionaryWithDictionaryEncoding() throws IOException {
738+
// Test with PLAIN_DICTIONARY encoding (both PLAIN and PLAIN_DICTIONARY should work)
739+
// Bit-packed: bit 0 = true (1), bit 1 = false (0) => byte = 0b00000001 = 0x01
740+
BytesInput bytes = BytesInput.from(new byte[] {0x01});
741+
DictionaryPage dictionaryPage = new DictionaryPage(bytes, 2, PLAIN_DICTIONARY);
742+
743+
PlainBooleanDictionary dictionary = new PlainBooleanDictionary(dictionaryPage);
744+
745+
assertEquals(true, dictionary.decodeToBoolean(0));
746+
assertEquals(false, dictionary.decodeToBoolean(1));
747+
assertEquals(1, dictionary.getMaxId());
748+
}
749+
681750
private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type) throws IOException {
682751
final DictionaryPage dictionaryPage = cw.toDictPageAndClose().copy();
683752
final ColumnDescriptor descriptor = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0);

0 commit comments

Comments
 (0)