Skip to content

Commit e93f019

Browse files
Close test coverage gaps and split mega-test
Coverage gaps surfaced by branch coverage of pyiceberg/partitioning.py: - v1 keep-field branch (preserves field_id when current and new specs share a (source, transform) key) had no helper-level test. - v1 _unique_void_name multi-suffix loop (when name and name_<field_id> are both taken) had no test. - v2 "prefer highest field_id" tiebreak (same key across multiple historical specs) had no test. Also split test_complete_replace_transaction into three focused tests behind a shared _run_complete_replace helper: applies-new-spec-and-sort, merges-properties, and rtas-preserves-old-snapshot. The umbrella test asserted a dozen unrelated invariants in one body; splitting gives actionable failure messages. Tightens test_replace_table_transaction_wire_payload to hardcode the expected action list rather than testing-of-tests via len(set(...)).
1 parent d793777 commit e93f019

3 files changed

Lines changed: 112 additions & 13 deletions

File tree

tests/catalog/test_catalog_behaviors.py

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow
4242
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec
4343
from pyiceberg.schema import Schema
44-
from pyiceberg.table import TableProperties
44+
from pyiceberg.table import Table, TableProperties
4545
from pyiceberg.table.snapshots import Operation
4646
from pyiceberg.table.sorting import NullOrder, SortDirection, SortField, SortOrder
4747
from pyiceberg.table.update import AddSchemaUpdate, SetCurrentSchemaUpdate
@@ -450,10 +450,17 @@ def test_replace_transaction(catalog: Catalog, test_table_identifier: Identifier
450450
assert replaced.scan(snapshot_id=old_snapshot_id).to_arrow().equals(_simple_data())
451451

452452

453-
def test_complete_replace_transaction(catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path) -> None:
454-
_create_simple_table(catalog, test_table_identifier, properties={"keep": "yes", "override": "old"})
455-
catalog.load_table(test_table_identifier).append(_simple_data())
456-
original = catalog.load_table(test_table_identifier)
453+
def _run_complete_replace(
454+
catalog: Catalog, identifier: Identifier, tmp_path: Path
455+
) -> tuple[Table, Table, SortOrder, pa.Table, int]:
456+
"""Set up a table, run a full-six-args RTAS replace, and return the handles needed for assertions.
457+
458+
Returns:
459+
(original, replaced, new_sort, original_data, old_snapshot_id)
460+
"""
461+
_create_simple_table(catalog, identifier, properties={"keep": "yes", "override": "old"})
462+
catalog.load_table(identifier).append(_simple_data())
463+
original = catalog.load_table(identifier)
457464
old_snapshot_id = original.current_snapshot().snapshot_id # type: ignore[union-attr]
458465
original_data = original.scan().to_arrow()
459466

@@ -471,7 +478,7 @@ def test_complete_replace_transaction(catalog: Catalog, test_table_identifier: I
471478
)
472479

473480
with catalog.replace_table_transaction(
474-
test_table_identifier,
481+
identifier,
475482
schema=new_schema,
476483
partition_spec=new_spec,
477484
sort_order=new_sort,
@@ -480,12 +487,16 @@ def test_complete_replace_transaction(catalog: Catalog, test_table_identifier: I
480487
) as txn:
481488
txn.append(new_data)
482489

483-
replaced = catalog.load_table(test_table_identifier)
490+
return original, catalog.load_table(identifier), new_sort, original_data, old_snapshot_id
491+
484492

493+
def test_complete_replace_transaction_applies_new_schema_spec_and_sort(
494+
catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path
495+
) -> None:
496+
original, replaced, new_sort, _, _ = _run_complete_replace(catalog, test_table_identifier, tmp_path)
485497
# Identity invariants.
486498
assert replaced.metadata.table_uuid == original.metadata.table_uuid
487-
assert replaced.metadata.location == new_location
488-
499+
assert replaced.metadata.location == f"file://{tmp_path}/replaced"
489500
# New schema / spec / sort applied; old entries retained in history.
490501
assert {f.name for f in replaced.schema().fields} == {"id", "data", "extra"}
491502
assert sorted(s.schema_id for s in replaced.metadata.schemas) == [0, 1]
@@ -495,14 +506,23 @@ def test_complete_replace_transaction(catalog: Catalog, test_table_identifier: I
495506
assert replaced.sort_order().fields == new_sort.fields
496507
assert {s.order_id for s in replaced.metadata.sort_orders} == {0, replaced.metadata.default_sort_order_id}
497508

498-
# Property merge: kept, overridden, added — and `format-version` does not leak.
509+
510+
def test_complete_replace_transaction_merges_properties(
511+
catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path
512+
) -> None:
513+
_, replaced, _, _, _ = _run_complete_replace(catalog, test_table_identifier, tmp_path)
514+
# `keep` is preserved, `override` is updated, `added` is new, and `format-version` does not leak.
499515
assert replaced.properties["keep"] == "yes"
500516
assert replaced.properties["override"] == "new"
501517
assert replaced.properties["added"] == "v"
502518
assert "format-version" not in replaced.properties
503519

504-
# RTAS atomicity: new snapshot exists, has no parent (fresh start), old snapshot is still
505-
# in the snapshot list, and time-travel reads return the original rows.
520+
521+
def test_complete_replace_transaction_rtas_preserves_old_snapshot(
522+
catalog: Catalog, test_table_identifier: Identifier, tmp_path: Path
523+
) -> None:
524+
_, replaced, _, original_data, old_snapshot_id = _run_complete_replace(catalog, test_table_identifier, tmp_path)
525+
# New snapshot exists, has no parent (fresh start), old snapshot is still in the snapshot list.
506526
new_snapshot = replaced.current_snapshot()
507527
assert new_snapshot is not None
508528
assert new_snapshot.snapshot_id != old_snapshot_id

tests/catalog/test_rest.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2953,7 +2953,13 @@ def test_replace_table_transaction_wire_payload(
29532953
]
29542954

29552955
actions = [u["action"] for u in request["updates"]]
2956-
assert len(actions) == len(set(actions)), f"duplicate actions in request: {actions}"
2956+
assert sorted(actions) == [
2957+
"add-schema",
2958+
"remove-snapshot-ref",
2959+
"set-current-schema",
2960+
"set-default-sort-order",
2961+
"set-default-spec",
2962+
]
29572963
updates_by_action = {u["action"]: u for u in request["updates"]}
29582964

29592965
assert updates_by_action["remove-snapshot-ref"] == {"action": "remove-snapshot-ref", "ref-name": "main"}

tests/table/test_partitioning.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,3 +405,76 @@ def test_assign_fresh_partition_spec_ids_for_replace_v1_renames_void_on_name_col
405405
void_field = next(f for f in fresh_spec.fields if isinstance(f.transform, VoidTransform))
406406
assert void_field.name != "data", "void name must not collide with active partition name"
407407
assert void_field.name == "data_1000"
408+
409+
410+
def test_assign_fresh_partition_spec_ids_for_replace_v1_keeps_field_preserves_id() -> None:
411+
"""v1 carry-forward: when a current-spec field is also in the new spec, its field_id is preserved."""
412+
schema = Schema(
413+
NestedField(field_id=1, name="id", field_type=IntegerType(), required=False),
414+
NestedField(field_id=2, name="data", field_type=StringType(), required=False),
415+
)
416+
current_spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0)
417+
# New spec keeps the same (source, transform) on "id" — should reuse field_id=1000, no void emitted.
418+
new_spec = PartitionSpec(PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id"))
419+
fresh_spec, new_last_pid = assign_fresh_partition_spec_ids_for_replace(
420+
new_spec,
421+
schema,
422+
schema,
423+
existing_specs=[current_spec],
424+
last_partition_id=1000,
425+
format_version=1,
426+
current_spec=current_spec,
427+
)
428+
assert [f.field_id for f in fresh_spec.fields] == [1000]
429+
assert fresh_spec.fields[0].name == "id"
430+
assert isinstance(fresh_spec.fields[0].transform, IdentityTransform)
431+
assert not any(isinstance(f.transform, VoidTransform) for f in fresh_spec.fields)
432+
assert new_last_pid == 1000
433+
434+
435+
def test_assign_fresh_partition_spec_ids_for_replace_v1_void_name_uses_multi_suffix_loop() -> None:
436+
"""When `name` and `name_<field_id>` are both already used, append `_2`, `_3`, ... until unique."""
437+
schema = Schema(
438+
NestedField(field_id=1, name="foo", field_type=IntegerType(), required=False),
439+
NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False),
440+
NestedField(field_id=3, name="baz", field_type=IntegerType(), required=False),
441+
NestedField(field_id=4, name="qux", field_type=IntegerType(), required=False),
442+
)
443+
# Current v1 spec partitions source=1 by bucket(4) at field_id=1000, named "p" — note
444+
# the partition NAME ("p") differs from the source COLUMN NAME ("foo") which is allowed
445+
# for non-identity transforms.
446+
current_spec = PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=BucketTransform(4), name="p"), spec_id=0)
447+
# New spec has two partition fields named "p" and "p_1000" — colliding with both the
448+
# void's preferred name and its first fallback. Both are on different sources, so they
449+
# do not match the current (source=1, bucket[4]) key and the current field becomes void.
450+
new_spec = PartitionSpec(
451+
PartitionField(source_id=2, field_id=997, transform=BucketTransform(4), name="p"),
452+
PartitionField(source_id=3, field_id=998, transform=BucketTransform(4), name="p_1000"),
453+
)
454+
fresh_spec, _ = assign_fresh_partition_spec_ids_for_replace(
455+
new_spec,
456+
schema,
457+
schema,
458+
existing_specs=[current_spec],
459+
last_partition_id=1000,
460+
format_version=1,
461+
current_spec=current_spec,
462+
)
463+
void_field = next(f for f in fresh_spec.fields if isinstance(f.transform, VoidTransform))
464+
assert void_field.name == "p_1000_2"
465+
466+
467+
def test_assign_fresh_partition_spec_ids_for_replace_v2_prefers_highest_field_id_for_repeated_key() -> None:
468+
"""v2: when the same (source_id, transform) appears across multiple specs, the highest field_id wins."""
469+
# Two historical specs both partition by (source_id=1, IdentityTransform), with different field_ids.
470+
existing_specs = [
471+
PartitionSpec(PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="id"), spec_id=0),
472+
PartitionSpec(PartitionField(source_id=1, field_id=1002, transform=IdentityTransform(), name="id_v2"), spec_id=1),
473+
]
474+
# New spec uses the same (source, transform) — should reuse the highest historical field_id (1002).
475+
new_spec = PartitionSpec(PartitionField(source_id=1, field_id=999, transform=IdentityTransform(), name="id"))
476+
fresh_spec, new_last_pid = assign_fresh_partition_spec_ids_for_replace(
477+
new_spec, _REPLACE_SCHEMA_FOR_PARTITION, _REPLACE_SCHEMA_FOR_PARTITION, existing_specs, last_partition_id=1002
478+
)
479+
assert fresh_spec.fields[0].field_id == 1002
480+
assert new_last_pid == 1002

0 commit comments

Comments
 (0)