Skip to content

Commit e03b101

Browse files
committed
Fix DataPageHeaderV2.num_nulls=-1 when column statistics are disabled
1 parent b8f3330 commit e03b101

3 files changed

Lines changed: 75 additions & 1 deletion

File tree

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterBase.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ abstract class ColumnWriterBase implements ColumnWriter {
5151
private ValuesWriter definitionLevelColumn;
5252
private ValuesWriter dataColumn;
5353
private int valueCount;
54+
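// Number of nulls written to the current page, counted independently of column statistics so the required field DataPageHeaderV2.num_nulls is correct even when statistics are disabled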
55+
// https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
56+
protected int nullCount;
5457

5558
private long rowsWrittenSoFar = 0;
5659
private int pageRowCount;
@@ -115,6 +118,7 @@ public void writeNull(int repetitionLevel, int definitionLevel) {
115118
definitionLevel(definitionLevel);
116119
collector.writeNull(repetitionLevel, definitionLevel);
117120
++valueCount;
121+
++nullCount;
118122
} catch (Throwable e) {
119123
statusManager.abort();
120124
throw e;
@@ -392,6 +396,7 @@ void writePage() {
392396
definitionLevelColumn.reset();
393397
dataColumn.reset();
394398
valueCount = 0;
399+
nullCount = 0;
395400
collector.resetPageStatistics();
396401
pageRowCount = 0;
397402
} catch (Throwable t) {

parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ void writePage(
100100
Encoding encoding = values.getEncoding();
101101
pageWriter.writePageV2(
102102
rowCount,
103-
Math.toIntExact(statistics.getNumNulls()),
103+
nullCount,
104104
valueCount,
105105
repetitionLevels.getBytes(),
106106
definitionLevels.getBytes(),

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,14 @@
5858
import org.apache.parquet.ParquetReadOptions;
5959
import org.apache.parquet.bytes.HeapByteBufferAllocator;
6060
import org.apache.parquet.bytes.TrackingByteBufferAllocator;
61+
import org.apache.parquet.column.ColumnDescriptor;
6162
import org.apache.parquet.column.Encoding;
6263
import org.apache.parquet.column.ParquetProperties;
6364
import org.apache.parquet.column.ParquetProperties.WriterVersion;
65+
import org.apache.parquet.column.page.DataPage;
66+
import org.apache.parquet.column.page.DataPageV2;
67+
import org.apache.parquet.column.page.PageReadStore;
68+
import org.apache.parquet.column.page.PageReader;
6469
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
6570
import org.apache.parquet.crypto.AesCipher;
6671
import org.apache.parquet.crypto.ColumnEncryptionProperties;
@@ -858,4 +863,68 @@ public void testNoFlushAfterException() throws Exception {
858863
FileSystem fs = file.getFileSystem(conf);
859864
assertTrue(!fs.exists(file) || fs.getFileStatus(file).getLen() == 0);
860865
}
866+
867+
@Test
868+
public void testV2PageNullCountWithStatisticsDisabled() throws Exception {
869+
// Regression test: when using PARQUET_2_0 with statistics disabled on a nullable column,
870+
// DataPageHeaderV2.num_nulls must still contain the correct null count (not -1).
871+
MessageType schema = Types.buildMessage()
872+
.required(INT32)
873+
.named("id")
874+
.optional(BINARY)
875+
.as(stringType())
876+
.named("value")
877+
.named("test_schema");
878+
879+
File file = temp.newFile();
880+
file.delete();
881+
Path path = new Path(file.getAbsolutePath());
882+
883+
int totalRecords = 10;
884+
int expectedNulls = 4; // records where i % 3 == 0: i=0,3,6,9
885+
886+
// Write with PARQUET_2_0 and statistics disabled on the nullable "value" column
887+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
888+
.withType(schema)
889+
.withWriterVersion(PARQUET_2_0)
890+
.withStatisticsEnabled("value", false)
891+
.withPageSize(1024 * 1024) // large page to keep all records in one page
892+
.build()) {
893+
SimpleGroupFactory factory = new SimpleGroupFactory(schema);
894+
for (int i = 0; i < totalRecords; i++) {
895+
Group group = factory.newGroup().append("id", i);
896+
if (i % 3 != 0) {
897+
group.append("value", "hello-" + i);
898+
}
899+
writer.write(group);
900+
}
901+
}
902+
903+
// Read back the page-level metadata and verify num_nulls
904+
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
905+
MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
906+
907+
// Find the "value" column descriptor
908+
ColumnDescriptor valueColumn = fileSchema.getColumns().stream()
909+
.filter(c -> c.getPath()[0].equals("value"))
910+
.findFirst()
911+
.orElseThrow(() -> new AssertionError("Column 'value' not found"));
912+
913+
PageReadStore rowGroup = reader.readNextRowGroup();
914+
PageReader pageReader = rowGroup.getPageReader(valueColumn);
915+
DataPage page = pageReader.readPage();
916+
917+
// Verify it's a V2 page (because we used PARQUET_2_0)
918+
assertTrue(
919+
"PARQUET_2_0 writer should produce DataPageV2 pages, got: "
920+
+ page.getClass().getSimpleName(),
921+
page instanceof DataPageV2);
922+
923+
DataPageV2 pageV2 = (DataPageV2) page;
924+
assertEquals(
925+
"DataPageV2.num_nulls should be the actual null count even when statistics are disabled",
926+
expectedNulls,
927+
pageV2.getNullCount());
928+
}
929+
}
861930
}

0 commit comments

Comments
 (0)