Skip to content

Commit d9d390b

Browse files
committed
Fix DataPageHeaderV2.num_nulls=-1 when column statistics are disabled
1 parent 8931c1c commit d9d390b

5 files changed

Lines changed: 88 additions & 1 deletion

File tree

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnValueCollector.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ class ColumnValueCollector {
4545
private SizeStatistics.Builder sizeStatisticsBuilder;
4646
private GeospatialStatistics.Builder geospatialStatisticsBuilder;
4747

48+
// track the required `num_nulls` field in DataPageHeaderV2
49+
// https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
50+
private int nullCount;
51+
4852
ColumnValueCollector(ColumnDescriptor path, BloomFilterWriter bloomFilterWriter, ParquetProperties props) {
4953
this.path = path;
5054
this.statisticsEnabled = props.getStatisticsEnabled(path);
@@ -54,6 +58,7 @@ class ColumnValueCollector {
5458
}
5559

5660
void resetPageStatistics() {
61+
this.nullCount = 0;
5762
this.statistics = statisticsEnabled
5863
? Statistics.createStats(path.getPrimitiveType())
5964
: Statistics.noopStats(path.getPrimitiveType());
@@ -68,6 +73,7 @@ void resetPageStatistics() {
6873
}
6974

7075
void writeNull(int repetitionLevel, int definitionLevel) {
76+
++nullCount;
7177
statistics.incrementNumNulls();
7278
sizeStatisticsBuilder.add(repetitionLevel, definitionLevel);
7379
}
@@ -198,6 +204,14 @@ void finalizeColumnChunk() {
198204
}
199205
}
200206

207+
/**
208+
* Returns the number of null values written in the current page.
209+
* This counter is to supply the required field `num_nulls` in DataPageHeaderV2.
210+
*/
211+
int getNullCount() {
212+
return nullCount;
213+
}
214+
201215
Statistics<?> getStatistics() {
202216
return statistics;
203217
}

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ void writePage() {
379379
writePage(
380380
pageRowCount,
381381
valueCount,
382+
collector.getNullCount(),
382383
collector.getStatistics(),
383384
collector.getSizeStatistics(),
384385
collector.getGeospatialStatistics(),
@@ -403,6 +404,7 @@ void writePage() {
403404
abstract void writePage(
404405
int rowCount,
405406
int valueCount,
407+
int nullCount,
406408
Statistics<?> statistics,
407409
SizeStatistics sizeStatistics,
408410
GeospatialStatistics geospatialStatistics,

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) {
6161
void writePage(
6262
int rowCount,
6363
int valueCount,
64+
int nullCount,
6465
Statistics<?> statistics,
6566
SizeStatistics sizeStatistics,
6667
GeospatialStatistics geospatialStatistics,

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ ValuesWriter createDLWriter(ParquetProperties props, ColumnDescriptor path) {
8787
void writePage(
8888
int rowCount,
8989
int valueCount,
90+
int nullCount,
9091
Statistics<?> statistics,
9192
SizeStatistics sizeStatistics,
9293
GeospatialStatistics geospatialStatistics,
@@ -100,7 +101,7 @@ void writePage(
100101
Encoding encoding = values.getEncoding();
101102
pageWriter.writePageV2(
102103
rowCount,
103-
Math.toIntExact(statistics.getNumNulls()),
104+
nullCount,
104105
valueCount,
105106
repetitionLevels.getBytes(),
106107
definitionLevels.getBytes(),

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,14 @@
5858
import org.apache.parquet.ParquetReadOptions;
5959
import org.apache.parquet.bytes.HeapByteBufferAllocator;
6060
import org.apache.parquet.bytes.TrackingByteBufferAllocator;
61+
import org.apache.parquet.column.ColumnDescriptor;
6162
import org.apache.parquet.column.Encoding;
6263
import org.apache.parquet.column.ParquetProperties;
6364
import org.apache.parquet.column.ParquetProperties.WriterVersion;
65+
import org.apache.parquet.column.page.DataPage;
66+
import org.apache.parquet.column.page.DataPageV2;
67+
import org.apache.parquet.column.page.PageReadStore;
68+
import org.apache.parquet.column.page.PageReader;
6469
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
6570
import org.apache.parquet.crypto.AesCipher;
6671
import org.apache.parquet.crypto.ColumnEncryptionProperties;
@@ -858,4 +863,68 @@ public void testNoFlushAfterException() throws Exception {
858863
FileSystem fs = file.getFileSystem(conf);
859864
assertTrue(!fs.exists(file) || fs.getFileStatus(file).getLen() == 0);
860865
}
866+
867+
@Test
868+
public void testV2PageNullCountWithStatisticsDisabled() throws Exception {
869+
// Regression test: when using PARQUET_2_0 with statistics disabled on a nullable column,
870+
// DataPageHeaderV2.num_nulls must still contain the correct null count (not -1).
871+
MessageType schema = Types.buildMessage()
872+
.required(INT32)
873+
.named("id")
874+
.optional(BINARY)
875+
.as(stringType())
876+
.named("value")
877+
.named("test_schema");
878+
879+
File file = temp.newFile();
880+
temp.delete();
881+
Path path = new Path(file.getAbsolutePath());
882+
883+
int totalRecords = 10;
884+
int expectedNulls = 4; // records where i % 3 == 0: i=0,3,6,9
885+
886+
// Write with PARQUET_2_0 and statistics disabled on the nullable "value" column
887+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
888+
.withType(schema)
889+
.withWriterVersion(PARQUET_2_0)
890+
.withStatisticsEnabled("value", false)
891+
.withPageSize(1024 * 1024) // large page to keep all records in one page
892+
.build()) {
893+
SimpleGroupFactory factory = new SimpleGroupFactory(schema);
894+
for (int i = 0; i < totalRecords; i++) {
895+
Group group = factory.newGroup().append("id", i);
896+
if (i % 3 != 0) {
897+
group.append("value", "hello-" + i);
898+
}
899+
writer.write(group);
900+
}
901+
}
902+
903+
// Read back the page-level metadata and verify num_nulls
904+
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
905+
MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
906+
907+
// Find the "value" column descriptor
908+
ColumnDescriptor valueColumn = fileSchema.getColumns().stream()
909+
.filter(c -> c.getPath()[0].equals("value"))
910+
.findFirst()
911+
.orElseThrow(() -> new AssertionError("Column 'value' not found"));
912+
913+
PageReadStore rowGroup = reader.readNextRowGroup();
914+
PageReader pageReader = rowGroup.getPageReader(valueColumn);
915+
DataPage page = pageReader.readPage();
916+
917+
// Verify it's a V2 page (because we used PARQUET_2_0)
918+
assertTrue(
919+
"PARQUET_2_0 writer should produce DataPageV2 pages, got: "
920+
+ page.getClass().getSimpleName(),
921+
page instanceof DataPageV2);
922+
923+
DataPageV2 pageV2 = (DataPageV2) page;
924+
assertEquals(
925+
"DataPageV2.num_nulls should be the actual null count even when statistics are disabled",
926+
expectedNulls,
927+
pageV2.getNullCount());
928+
}
929+
}
861930
}

0 commit comments

Comments
 (0)