|
20 | 20 | import tempfile |
21 | 21 | import uuid |
22 | 22 | from datetime import date |
| 23 | +from pathlib import Path |
23 | 24 | from typing import Any, List, Optional |
24 | 25 | from unittest.mock import MagicMock, patch |
25 | 26 | from uuid import uuid4 |
|
67 | 68 | _determine_partitions, |
68 | 69 | _primitive_to_physical, |
69 | 70 | _read_deletes, |
| 71 | + _resolve_row_group_size, |
70 | 72 | _to_requested_schema, |
71 | 73 | bin_pack_arrow_table, |
72 | 74 | compute_statistics_plan, |
73 | 75 | data_file_statistics_from_parquet_metadata, |
74 | 76 | expression_to_pyarrow, |
75 | 77 | parquet_path_to_id_mapping, |
76 | 78 | schema_to_pyarrow, |
| 79 | + write_file, |
77 | 80 | ) |
78 | 81 | from pyiceberg.manifest import DataFile, DataFileContent, FileFormat |
79 | 82 | from pyiceberg.partitioning import PartitionField, PartitionSpec |
@@ -2319,3 +2322,86 @@ def test_pyarrow_io_multi_fs() -> None: |
2319 | 2322 |
|
2320 | 2323 | # Same PyArrowFileIO instance resolves local file input to LocalFileSystem |
2321 | 2324 | assert isinstance(pyarrow_file_io.new_input("file:///path/to/file")._filesystem, LocalFileSystem) |
| 2325 | + |
| 2326 | + |
@pytest.mark.parametrize(
    "arrow_table,row_group_limit,row_group_size_bytes,expected",
    [
        # Two int64 columns => 16 bytes/row, so a 1024-byte budget allows
        # 64 rows/group, which is tighter than the 10_000-row limit.
        pytest.param(pa.table({"a": list(range(1000)), "b": list(range(1000))}), 10_000, 1024, 64),
        # The byte budget is huge, so the 10-row limit is the binding one.
        pytest.param(pa.table({"a": list(range(1000))}), 10, 10**9, 10),
        # A byte limit of 0 disables it; the row limit is used as-is.
        pytest.param(pa.table({"a": list(range(1000))}), 500, 0, 500),
        # A table with no rows also yields the row limit.
        pytest.param(pa.table({"a": pa.array([], type=pa.int64())}), 500, 1024, 500),
    ],
)
def test__resolve_row_group_size(arrow_table: pa.Table, row_group_limit: int, row_group_size_bytes: int, expected: int) -> None:
    """Check the rows-per-group chosen for each table / row limit / byte limit combination.

    When a byte limit is set, the expected value is the smaller of the row
    limit and the number of rows that fit in the byte budget.
    """
    resolved = _resolve_row_group_size(arrow_table, row_group_limit, row_group_size_bytes)
    assert resolved == expected
| 2344 | + |
| 2345 | + |
def test_write_file_byte_limit_produces_more_row_groups_than_row_limit_alone(tmp_path: Path) -> None:
    """Writing the same table with a 1024-byte row-group budget yields more than one row group.

    The identical data written with no table properties lands in a single row
    group, which is the baseline the constrained write is compared against.
    """
    from pyiceberg.table import WriteTask

    arrow_data = pa.table({"a": list(range(10_000)), "b": list(range(10_000))})
    table_schema = Schema(
        NestedField(1, "a", LongType(), required=False),
        NestedField(2, "b", LongType(), required=False),
    )

    def _row_groups_written(properties: dict[str, str], subdir: str) -> int:
        """Write ``arrow_data`` under ``tmp_path/subdir`` and return the file's row-group count."""
        metadata = TableMetadataV2(
            location=f"file://{tmp_path}/{subdir}",
            last_column_id=2,
            format_version=2,
            schemas=[table_schema],
            partition_specs=[PartitionSpec()],
            properties=properties,
        )
        write_task = WriteTask(
            write_uuid=uuid.uuid4(),
            task_id=0,
            record_batches=arrow_data.to_batches(),
            schema=table_schema,
        )
        written = list(write_file(io=PyArrowFileIO(), table_metadata=metadata, tasks=iter([write_task])))
        # The data file path is a file:// URI; strip the scheme to get a local path.
        parquet_path = Path(written[0].file_path.removeprefix("file://"))
        return pq.ParquetFile(parquet_path).num_row_groups

    default_groups = _row_groups_written({}, "default")
    constrained_groups = _row_groups_written({TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES: "1024"}, "constrained")

    assert default_groups == 1
    assert constrained_groups > 1
| 2380 | + |
| 2381 | + |
def test_write_file_byte_limit_respects_row_limit_upper_bound(tmp_path: Path) -> None:
    """A 1000-row limit still splits 10_000 rows into 10 groups when the byte target is 10**12."""
    from pyiceberg.table import WriteTask

    arrow_data = pa.table({"a": list(range(10_000))})
    table_schema = Schema(NestedField(1, "a", LongType(), required=False))

    # The byte target is far larger than the data, so only the row limit can bind.
    write_properties = {
        TableProperties.PARQUET_ROW_GROUP_LIMIT: "1000",
        TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES: str(10**12),
    }
    metadata = TableMetadataV2(
        location=f"file://{tmp_path}",
        last_column_id=1,
        format_version=2,
        schemas=[table_schema],
        partition_specs=[PartitionSpec()],
        properties=write_properties,
    )
    write_task = WriteTask(
        write_uuid=uuid.uuid4(),
        task_id=0,
        record_batches=arrow_data.to_batches(),
        schema=table_schema,
    )

    written = list(write_file(io=PyArrowFileIO(), table_metadata=metadata, tasks=iter([write_task])))
    # The data file path is a file:// URI; strip the scheme to get a local path.
    local_path = written[0].file_path.removeprefix("file://")
    parquet_file = pq.ParquetFile(local_path)

    assert parquet_file.num_row_groups == 10
0 commit comments