Skip to content

Commit 83b6461

Browse files
committed
parquet-hadoop: Statistics.toParquetStatistics: always set null_count
1 parent 7be05b4 commit 83b6461

2 files changed

Lines changed: 5 additions & 4 deletions

File tree

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -800,12 +800,12 @@ public static Statistics toParquetStatistics(org.apache.parquet.column.statistic
800800
public static Statistics toParquetStatistics(
801801
org.apache.parquet.column.statistics.Statistics stats, int truncateLength) {
802802
Statistics formatStats = new Statistics();
803+
formatStats.setNull_count(stats.getNumNulls());
803804
// Don't write stats larger than the max size rather than truncating. The
804805
// rationale is that some engines may use the minimum value in the page as
805806
// the true minimum for aggregations and there is no way to mark that a
806807
// value has been truncated and is a lower bound and not in the page.
807808
if (!stats.isEmpty() && withinLimit(stats, truncateLength)) {
808-
formatStats.setNull_count(stats.getNumNulls());
809809
if (stats.hasNonNullValue()) {
810810
byte[] min;
811811
byte[] max;

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -807,7 +807,7 @@ private void testBinaryStats(StatsHelper helper) {
807807
}
808808
Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count());
809809

810-
// convert to empty stats because the values are too large
810+
// min/max are not written because the values are too large, but null count is always written
811811
stats.setMinMaxFromBytes(max, max);
812812

813813
formatStats = helper.toParquetStatistics(stats);
@@ -816,15 +816,16 @@ private void testBinaryStats(StatsHelper helper) {
816816
Assert.assertFalse("Max should not be set", formatStats.isSetMax());
817817
Assert.assertFalse("Min_value should not be set", formatStats.isSetMin_value());
818818
Assert.assertFalse("Max_value should not be set", formatStats.isSetMax_value());
819-
Assert.assertFalse("Num nulls should not be set", formatStats.isSetNull_count());
819+
Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count());
820820

821821
Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatisticsInternal(
822822
Version.FULL_VERSION,
823823
formatStats,
824824
new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, ""),
825825
ParquetMetadataConverter.SortOrder.SIGNED);
826826

827-
Assert.assertTrue(roundTripStats.isEmpty());
827+
Assert.assertFalse("Round-trip stats should not be empty (null count is set)", roundTripStats.isEmpty());
828+
Assert.assertEquals("Round-trip null count should match", 3004, roundTripStats.getNumNulls());
828829
}
829830

830831
@Test

0 commit comments

Comments
 (0)