Skip to content

Commit b65cc2e

Browse files
authored
Adjusted parquet default values for page/row group size (#1774)
1 parent bf700fc commit b65cc2e

5 files changed

Lines changed: 22 additions & 7 deletions

File tree

src/lib/parquet/src/Flow/Parquet/Option.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ enum Option
113113
* PageBuilder is not going to make it precisely equal to this value, but it will try to make it as close as possible.
114114
* This should be considered as a threshold rather than a strict value.
115115
*
116-
* Default value is 8Kb
116+
* Default value is 128KiB
117117
*
118118
* https://parquet.apache.org/docs/file-format/configurations/#data-page--size
119119
*/
@@ -140,7 +140,7 @@ enum Option
140140
* RowGroupBuilder is going to use this value to determine for how long it should keep adding rows to the buffer
141141
* before flushing it on disk.
142142
*
143-
* Default value is 8Mb
143+
* Default value is 32MiB
144144
*
145145
* In order to be more aligned with Apache Spark and Hadoop, this value should be set between 128MiB and 512MiB.
146146
* https://parquet.apache.org/docs/file-format/configurations/#row-group-size

src/lib/parquet/src/Flow/Parquet/Options.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ public function __construct()
2121
Option::ROUND_NANOSECONDS->name => false,
2222
Option::INT_96_AS_DATETIME->name => true,
2323
Option::PAGE_MAXIMUM_ROWS_COUNT->name => 1000,
24-
Option::PAGE_SIZE_BYTES->name => SizeUnits::KiB_SIZE * 8,
24+
Option::PAGE_SIZE_BYTES->name => SizeUnits::KiB_SIZE * 128,
2525
Option::PAGE_SIZE_CHECK_INTERVAL->name => 100,
26-
Option::ROW_GROUP_SIZE_BYTES->name => SizeUnits::MiB_SIZE * 4,
26+
Option::ROW_GROUP_SIZE_BYTES->name => SizeUnits::MiB_SIZE * 32,
2727
Option::ROW_GROUP_SIZE_CHECK_INTERVAL->name => 1000,
2828
Option::DICTIONARY_PAGE_SIZE->name => SizeUnits::MiB_SIZE,
2929
Option::DICTIONARY_PAGE_MIN_CARDINALITY_RATION->name => 0.4,

src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/DeltaBinaryPackedColumnChunkBuilder.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ public function isFull() : bool
159159

160160
public function uncompressedSize() : int
161161
{
162-
return $this->pages->uncompressedSize();
162+
return $this->pages->uncompressedSize() + $this->currentPageUncompressedSize();
163163
}
164164

165165
private function buildDataPage(Codec $codec, Compressions $compression) : PageContainer
@@ -257,4 +257,9 @@ private function buildDataPageV2(Codec $codec, Compressions $compression) : Page
257257
$pageHeader
258258
);
259259
}
260+
261+
private function currentPageUncompressedSize() : int
262+
{
263+
return $this->valueStorage->size() + (count($this->repetitionLevels) * 4) + (count($this->definitionLevels) * 4);
264+
}
260265
}

src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/PlainFlatColumnChunkBuilder.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ public function isFull() : bool
156156

157157
public function uncompressedSize() : int
158158
{
159-
return $this->pages->uncompressedSize();
159+
return $this->pages->uncompressedSize() + $this->currentPageUncompressedSize();
160160
}
161161

162162
private function buildDataPage(Codec $codec, Compressions $compression) : PageContainer
@@ -253,4 +253,9 @@ private function buildDataPageV2(Codec $codec, Compressions $compression) : Page
253253
$pageHeader
254254
);
255255
}
256+
257+
private function currentPageUncompressedSize() : int
258+
{
259+
return $this->valueStorage->size() + (count($this->repetitionLevels) * 4) + (count($this->definitionLevels) * 4);
260+
}
256261
}

src/lib/parquet/src/Flow/Parquet/Writer/ColumnChunkBuilder/RLEDictionaryChunkBuilder.php

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ public function isFull() : bool
170170

171171
public function uncompressedSize() : int
172172
{
173-
return $this->pages->uncompressedSize();
173+
return $this->pages->uncompressedSize() + $this->currentPageUncompressedSize();
174174
}
175175

176176
private function buildDataPage(Codec $codec, Compressions $compression) : PageContainer
@@ -311,4 +311,9 @@ private function buildDictionaryPage(Codec $codec, Compressions $compression) :
311311
$pageHeader
312312
);
313313
}
314+
315+
private function currentPageUncompressedSize() : int
316+
{
317+
return (count($this->pageValues) * 4) + (count($this->repetitionLevels) * 4) + (count($this->definitionLevels) * 4);
318+
}
314319
}

0 commit comments

Comments
 (0)