|
58 | 58 | import org.apache.parquet.ParquetReadOptions; |
59 | 59 | import org.apache.parquet.bytes.HeapByteBufferAllocator; |
60 | 60 | import org.apache.parquet.bytes.TrackingByteBufferAllocator; |
| 61 | +import org.apache.parquet.column.ColumnDescriptor; |
61 | 62 | import org.apache.parquet.column.Encoding; |
62 | 63 | import org.apache.parquet.column.ParquetProperties; |
63 | 64 | import org.apache.parquet.column.ParquetProperties.WriterVersion; |
| 65 | +import org.apache.parquet.column.page.DataPage; |
| 66 | +import org.apache.parquet.column.page.DataPageV2; |
| 67 | +import org.apache.parquet.column.page.PageReadStore; |
| 68 | +import org.apache.parquet.column.page.PageReader; |
64 | 69 | import org.apache.parquet.column.values.bloomfilter.BloomFilter; |
65 | 70 | import org.apache.parquet.crypto.AesCipher; |
66 | 71 | import org.apache.parquet.crypto.ColumnEncryptionProperties; |
@@ -858,4 +863,68 @@ public void testNoFlushAfterException() throws Exception { |
858 | 863 | FileSystem fs = file.getFileSystem(conf); |
859 | 864 | assertTrue(!fs.exists(file) || fs.getFileStatus(file).getLen() == 0); |
860 | 865 | } |
| 866 | + |
| 867 | + @Test |
| 868 | + public void testV2PageNullCountWithStatisticsDisabled() throws Exception { |
| 869 | + // Regression test: when using PARQUET_2_0 with statistics disabled on a nullable column, |
| 870 | + // DataPageHeaderV2.num_nulls must still contain the correct null count (not -1). |
| 871 | + MessageType schema = Types.buildMessage() |
| 872 | + .required(INT32) |
| 873 | + .named("id") |
| 874 | + .optional(BINARY) |
| 875 | + .as(stringType()) |
| 876 | + .named("value") |
| 877 | + .named("test_schema"); |
| 878 | + |
| 879 | + File file = temp.newFile(); |
| 880 | + file.delete(); |
| 881 | + Path path = new Path(file.getAbsolutePath()); |
| 882 | + |
| 883 | + int totalRecords = 10; |
| 884 | + int expectedNulls = 4; // records where i % 3 == 0: i=0,3,6,9 |
| 885 | + |
| 886 | + // Write with PARQUET_2_0 and statistics disabled on the nullable "value" column |
| 887 | + try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path) |
| 888 | + .withType(schema) |
| 889 | + .withWriterVersion(PARQUET_2_0) |
| 890 | + .withStatisticsEnabled("value", false) |
| 891 | + .withPageSize(1024 * 1024) // large page to keep all records in one page |
| 892 | + .build()) { |
| 893 | + SimpleGroupFactory factory = new SimpleGroupFactory(schema); |
| 894 | + for (int i = 0; i < totalRecords; i++) { |
| 895 | + Group group = factory.newGroup().append("id", i); |
| 896 | + if (i % 3 != 0) { |
| 897 | + group.append("value", "hello-" + i); |
| 898 | + } |
| 899 | + writer.write(group); |
| 900 | + } |
| 901 | + } |
| 902 | + |
| 903 | + // Read back the page-level metadata and verify num_nulls |
| 904 | + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { |
| 905 | + MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema(); |
| 906 | + |
| 907 | + // Find the "value" column descriptor |
| 908 | + ColumnDescriptor valueColumn = fileSchema.getColumns().stream() |
| 909 | + .filter(c -> c.getPath()[0].equals("value")) |
| 910 | + .findFirst() |
| 911 | + .orElseThrow(() -> new AssertionError("Column 'value' not found")); |
| 912 | + |
| 913 | + PageReadStore rowGroup = reader.readNextRowGroup(); |
| 914 | + PageReader pageReader = rowGroup.getPageReader(valueColumn); |
| 915 | + DataPage page = pageReader.readPage(); |
| 916 | + |
| 917 | + // Verify it's a V2 page (because we used PARQUET_2_0) |
| 918 | + assertTrue( |
| 919 | + "PARQUET_2_0 writer should produce DataPageV2 pages, got: " |
| 920 | + + page.getClass().getSimpleName(), |
| 921 | + page instanceof DataPageV2); |
| 922 | + |
| 923 | + DataPageV2 pageV2 = (DataPageV2) page; |
| 924 | + assertEquals( |
| 925 | + "DataPageV2.num_nulls should be the actual null count even when statistics are disabled", |
| 926 | + expectedNulls, |
| 927 | + pageV2.getNullCount()); |
| 928 | + } |
| 929 | + } |
861 | 930 | } |
0 commit comments