Skip to content

Commit 0a47bb2

Browse files
authored
Merge pull request #2353 from gCass/fix/parquet_correct_null_metadata
Correction of handling of null counts metadata in parquet files for PhpParquetEngine
2 parents dccb425 + 2e5907b commit 0a47bb2

7 files changed

Lines changed: 137 additions & 0 deletions

File tree

src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,16 +73,24 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
7373

7474
$maxDefinitionLevel = $this->column->maxDefinitionsLevel();
7575

76+
$nullsInBatch = 0;
77+
7678
foreach ($defLevels as $definitionLevel) {
7779
if ($definitionLevel < $maxDefinitionLevel) {
7880
$this->nullCount++;
81+
$nullsInBatch++;
7982
} else {
8083
$this->nonNullValuesCount++;
8184
}
8285
}
8386

8487
$this->valueStorage->addValues($this->column, $columnValues->values());
8588
$this->pageStatistics->addBatch($columnValues->values());
89+
90+
if ($nullsInBatch > 0) {
91+
$this->pageStatistics->addNulls($nullsInBatch);
92+
}
93+
8694
$this->rowsCount += $columnValues->rowsCount();
8795
}
8896

src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,16 +71,24 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
7171

7272
$maxDefinitionLevel = $this->column->maxDefinitionsLevel();
7373

74+
$nullsInBatch = 0;
75+
7476
foreach ($defLevels as $definitionLevel) {
7577
if ($definitionLevel < $maxDefinitionLevel) {
7678
$this->nullCount++;
79+
$nullsInBatch++;
7780
} else {
7881
$this->nonNullValuesCount++;
7982
}
8083
}
8184

8285
$this->valueStorage->addValues($this->column, $columnValues->values());
8386
$this->pageStatistics->addBatch($columnValues->values());
87+
88+
if ($nullsInBatch > 0) {
89+
$this->pageStatistics->addNulls($nullsInBatch);
90+
}
91+
8492
$this->rowsCount += $columnValues->rowsCount();
8593
}
8694

src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,14 +70,22 @@ public function addColumn(WriteFlatColumnValues $columnValues) : void
7070

7171
$maxDefinitionLevel = $this->column->maxDefinitionsLevel();
7272

73+
$nullsInBatch = 0;
74+
7375
foreach ($defLevels as $definitionLevel) {
7476
if ($definitionLevel < $maxDefinitionLevel) {
7577
$this->nullCount++;
78+
$nullsInBatch++;
7679
}
7780
}
7881

7982
array_push($this->pageValues, ...$columnValues->values());
8083
$this->pageStatistics->addBatch($columnValues->values());
84+
85+
if ($nullsInBatch > 0) {
86+
$this->pageStatistics->addNulls($nullsInBatch);
87+
}
88+
8189
$this->rowsCount += $columnValues->rowsCount();
8290
}
8391

src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,16 @@ public function addBatch(array $values) : void
105105
}
106106
}
107107

108+
public function addNulls(int $count) : void
109+
{
110+
if ($count < 0) {
111+
throw new InvalidArgumentException('Null count cannot be negative.');
112+
}
113+
114+
$this->nullCount += $count;
115+
$this->valuesCount += $count;
116+
}
117+
108118
public function max() : mixed
109119
{
110120
return $this->max;

src/lib/parquet/tests/Flow/Parquet/Tests/Integration/IO/WriterTest.php

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,38 @@ public function test_writing_column_statistics(ParquetEngine $engine) : void
141141
\unlink($path);
142142
}
143143

144+
#[DataProvider('engine_provider')]
145+
public function test_writing_column_statistics_with_null_values(ParquetEngine $engine) : void
146+
{
147+
$schema = Schema::with(
148+
FlatColumn::string('all_null'),
149+
FlatColumn::string('all_string'),
150+
FlatColumn::string('mixed'),
151+
);
152+
153+
$rows = [
154+
['all_null' => null, 'all_string' => 'a', 'mixed' => 'x'],
155+
['all_null' => null, 'all_string' => 'b', 'mixed' => null],
156+
['all_null' => null, 'all_string' => 'c', 'mixed' => 'z'],
157+
];
158+
159+
$path = __DIR__ . '/var/test-writer-parquet-null-stats-' . generate_random_string() . '.parquet';
160+
161+
(new Writer(engine: $engine))->write($path, $schema, $rows);
162+
163+
$chunks = [];
164+
165+
foreach ((new Reader(engine: $engine))->read($path)->metadata()->columnChunks() as $chunk) {
166+
$chunks[$chunk->flatPath()] = $chunk;
167+
}
168+
169+
static::assertSame(3, $chunks['all_null']->statistics()->nullCount());
170+
static::assertSame(0, $chunks['all_string']->statistics()->nullCount());
171+
static::assertSame(1, $chunks['mixed']->statistics()->nullCount());
172+
173+
\unlink($path);
174+
}
175+
144176
public function test_writing_data_page_v2_statistics() : void
145177
{
146178
$options = Options::default()->set(Option::WRITER_VERSION, 2);

src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilderTest.php

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -637,6 +637,40 @@ public function test_statistics_are_generated() : void
637637
self::assertNotNull($statistics);
638638
}
639639

640+
public function test_statistics_track_null_count_for_all_null_chunk() : void
641+
{
642+
$column = FlatColumn::string('all_null');
643+
$options = new Options();
644+
$compression = Compressions::UNCOMPRESSED;
645+
$builder = new PlainFlatColumnChunkBuilder($column, $options, $compression);
646+
647+
$builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [0, 0, 0], []));
648+
649+
$statistics = $builder->flush(0)[0]->columnChunk->statistics();
650+
651+
self::assertNotNull($statistics);
652+
self::assertSame(3, $statistics->nullCount());
653+
self::assertNull($statistics->min($column));
654+
self::assertNull($statistics->max($column));
655+
}
656+
657+
public function test_statistics_track_null_count_for_mixed_chunk() : void
658+
{
659+
$column = FlatColumn::string('mixed');
660+
$options = new Options();
661+
$compression = Compressions::UNCOMPRESSED;
662+
$builder = new PlainFlatColumnChunkBuilder($column, $options, $compression);
663+
664+
$builder->addColumn(new WriteFlatColumnValues($column, [0, 0, 0], [1, 0, 1], ['x', 'z']));
665+
666+
$statistics = $builder->flush(0)[0]->columnChunk->statistics();
667+
668+
self::assertNotNull($statistics);
669+
self::assertSame(1, $statistics->nullCount());
670+
self::assertSame('x', $statistics->min($column));
671+
self::assertSame('z', $statistics->max($column));
672+
}
673+
640674
public function test_uncompressed_size_accumulates_multiple_pages() : void
641675
{
642676
$column = new FlatColumn('test_col', PhysicalType::INT32);

src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,43 @@ public function test_add_null_value() : void
130130
self::assertNull($statistics->max());
131131
}
132132

133+
public function test_add_nulls_increments_null_and_values_count() : void
134+
{
135+
$column = FlatColumn::string('test_column');
136+
$statistics = new StatisticsCounter($column);
137+
138+
$statistics->add('hello');
139+
$statistics->addNulls(3);
140+
141+
self::assertSame(3, $statistics->nullCount());
142+
self::assertSame(4, $statistics->valuesCount());
143+
self::assertSame(1, $statistics->notNullCount());
144+
self::assertSame('hello', $statistics->min());
145+
self::assertSame('hello', $statistics->max());
146+
}
147+
148+
public function test_add_nulls_with_negative_throws() : void
149+
{
150+
$column = FlatColumn::string('test_column');
151+
$statistics = new StatisticsCounter($column);
152+
153+
$this->expectException(InvalidArgumentException::class);
154+
$this->expectExceptionMessage('Null count cannot be negative.');
155+
156+
$statistics->addNulls(-1);
157+
}
158+
159+
public function test_add_nulls_with_zero_is_noop() : void
160+
{
161+
$column = FlatColumn::string('test_column');
162+
$statistics = new StatisticsCounter($column);
163+
164+
$statistics->addNulls(0);
165+
166+
self::assertSame(0, $statistics->nullCount());
167+
self::assertSame(0, $statistics->valuesCount());
168+
}
169+
133170
public function test_add_object_value() : void
134171
{
135172
$column = FlatColumn::string('test_column');

0 commit comments

Comments
 (0)