
Commit 504bd6e

Fix batch_size resolution in to_parquet and improve to_parquet documentation
1 parent 4c3231d

1 file changed: 20 additions & 8 deletions

src/datasets/iterable_dataset.py
@@ -4360,8 +4360,20 @@ def to_parquet(
                 Either a path to a file (e.g. `file.parquet`), a remote URI (e.g. `hf://datasets/username/my_dataset_name/data.parquet`),
                 or a BinaryIO, where the dataset will be saved to in the specified format.
             batch_size (`int`, *optional*):
-                Size of the batch to load in memory and write at once.
-                Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+                Size of the Arrow table batches loaded into memory and written at once.
+                Each batch is written as a single Parquet row group (unless overridden
+                by `row_group_size` in `parquet_writer_kwargs`).
+                Defaults to a value inferred from the dataset features, falling back
+                to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+
+                <Tip warning={true}>
+
+                If the dataset has no features set, the entire dataset is materialized
+                into memory as a single Arrow table before writing, regardless of
+                `batch_size`. Call [`IterableDataset.cast`] or set features beforehand
+                to enable streaming writes.
+
+                </Tip>
             storage_options (`dict`, *optional*):
                 Key/value pairs to be passed on to the file-system backend, if any.
@@ -4386,21 +4398,21 @@ def to_parquet(
             ```
         """
-        from .io.parquet import ParquetDatasetWriter
+        from .arrow_writer import get_arrow_writer_batch_size_from_features
 
+        batch_size = (
+            batch_size or get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE
+        )
         dataset: Union[Dataset, IterableDataset, None] = None
         if self.features is None:
             # Without features we can't construct a schema upfront — fall back to materializing
-            from .arrow_writer import get_arrow_writer_batch_size_from_features
-
-            batch_size = (
-                batch_size or get_arrow_writer_batch_size_from_features(self.features) or config.DEFAULT_MAX_BATCH_SIZE
-            )
             table = pa.concat_tables(list(self.with_format("arrow").iter(batch_size=batch_size)))
             dataset = Dataset(table, fingerprint="unset")
         else:
             dataset = self
 
+        from .io.parquet import ParquetDatasetWriter
+
         return ParquetDatasetWriter(
             dataset, path_or_buf, batch_size=batch_size, storage_options=storage_options, **parquet_writer_kwargs
         ).write()
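The substance of the second hunk is that the `batch_size` resolution is hoisted out of the `self.features is None` branch, so the same fallback chain (explicit argument, then a value inferred from the features, then the global default) now also applies to the `batch_size` forwarded to `ParquetDatasetWriter` on the streaming path, not only on the materializing one. A minimal standalone sketch of that chain; the helper body and the default value below are illustrative stand-ins, not the library's actual implementation:

```python
from typing import Optional

DEFAULT_MAX_BATCH_SIZE = 1000  # stand-in for datasets.config.DEFAULT_MAX_BATCH_SIZE


def infer_batch_size_from_features(features: Optional[dict]) -> Optional[int]:
    # Hypothetical stand-in for get_arrow_writer_batch_size_from_features:
    # returns None when nothing can be inferred (e.g. when features is None).
    if features is None:
        return None
    # Illustrative heuristic only: shrink batches when a binary column is present.
    return 100 if "binary" in features.values() else None


def resolve_batch_size(batch_size: Optional[int], features: Optional[dict]) -> int:
    # Mirrors the committed fallback chain: `or` falls through on None.
    return batch_size or infer_batch_size_from_features(features) or DEFAULT_MAX_BATCH_SIZE


assert resolve_batch_size(256, None) == 256                      # explicit argument wins
assert resolve_batch_size(None, {"img": "binary"}) == 100        # inferred from features
assert resolve_batch_size(None, None) == DEFAULT_MAX_BATCH_SIZE  # global fallback
```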
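The new `<Tip>` in the first hunk is actionable: when the `IterableDataset` has features set, `to_parquet` can write batch by batch instead of concatenating the whole dataset into a single Arrow table first. A hedged usage sketch, assuming `IterableDataset.from_generator` and this commit's `to_parquet`; the generator, file name, and sizes are arbitrary:

```python
from datasets import Features, IterableDataset, Value


def gen():
    for i in range(10_000):
        yield {"id": i, "text": f"row {i}"}


features = Features({"id": Value("int64"), "text": Value("string")})

# Declaring features up front gives to_parquet a schema, so it can stream:
# each batch becomes one Parquet row group instead of the whole dataset
# being materialized in memory before the write starts.
ds = IterableDataset.from_generator(gen, features=features)
ds.to_parquet("data.parquet", batch_size=1000)
```

Without the `features=` argument, the same call would fall into the `self.features is None` branch above and build the full table with `pa.concat_tables` before writing.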
