Test RTAS: replace_table_transaction with atomic write

smaheshwar-pltr · smaheshwar-pltr · commit 86752569ab86 · 2026-05-18T18:41:50.000+01:00
Adds two new behavior tests and one REST integration test:
- test_replace_table_transaction_with_write_atomic_rtas (memory + sql):
  replace + fast_append in one transaction lands schema swap and new
  data atomically. New snapshot is current, old snapshot preserved in
  history.
- test_replace_table_followed_by_separate_append (memory + sql):
  replace_table clears the current snapshot; a subsequent append
  restores main ref with new data only.
- test_replace_table_transaction_rtas_against_rest_server: same RTAS
  flow exercised end-to-end against the REST docker stack.

The bare replace_table() is the DDL-only form (clears current snapshot,
preserves history). RTAS via replace_table_transaction is the primary
use case for atomic schema-and-data swaps.
diff --git a/tests/catalog/test_catalog_behaviors.py b/tests/catalog/test_catalog_behaviors.py
@@ -536,6 +536,74 @@ def test_replace_table_transaction_can_stage_additional_changes(
     assert replaced.properties.get("staged") == "yes"
 
 
+def test_replace_table_transaction_with_write_atomic_rtas(
+    catalog: Catalog, test_table_identifier: Identifier
+) -> None:
+    """RTAS (Replace Table As Select): replace the table and write new data in one transaction.
+
+    Verifies the primary use case for `replace_table_transaction`: the new schema and the new
+    data land atomically, the new snapshot becomes the current snapshot (main ref is restored
+    on commit because the transaction emits a fast-append), and the old snapshot is preserved
+    in history."""
+    _create_simple_table(catalog, test_table_identifier)
+    original_table = catalog.load_table(test_table_identifier)
+    old_data = pa.Table.from_pydict(
+        {"id": [1], "data": ["old"]},
+        schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]),
+    )
+    original_table.append(old_data)
+    old_snapshot_id = catalog.load_table(test_table_identifier).current_snapshot().snapshot_id  # type: ignore[union-attr]
+
+    new_schema = Schema(
+        NestedField(field_id=1, name="id", field_type=LongType(), required=False),
+        NestedField(field_id=2, name="name", field_type=StringType(), required=False),
+    )
+    new_data = pa.Table.from_pydict(
+        {"id": [10, 20], "name": ["alice", "bob"]},
+        schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]),
+    )
+    with catalog.replace_table_transaction(test_table_identifier, schema=new_schema) as txn:
+        with txn.update_snapshot().fast_append() as snap:
+            for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=new_data, io=txn._table.io):
+                snap.append_data_file(data_file)
+
+    replaced = catalog.load_table(test_table_identifier)
+    # Atomically: new schema is current, new snapshot is current, old snapshot is in history.
+    assert replaced.current_snapshot() is not None
+    assert replaced.current_snapshot().snapshot_id != old_snapshot_id  # type: ignore[union-attr]
+    assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots)
+    assert {f.name for f in replaced.schema().fields} == {"id", "name"}
+    # Scans see only the rows written by the replace transaction; the pre-replace row is no
+    # longer visible from the current snapshot, though its snapshot remains in the metadata.
+    assert replaced.scan().to_arrow().num_rows == 2
+
+
+def test_replace_table_followed_by_separate_append(
+    catalog: Catalog, test_table_identifier: Identifier
+) -> None:
+    """`replace_table` clears the current snapshot; a subsequent `append` makes a new one current."""
+    _, schema = _create_simple_table(catalog, test_table_identifier)
+    catalog.load_table(test_table_identifier).append(
+        pa.Table.from_pydict(
+            {"id": [1], "data": ["x"]},
+            schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]),
+        )
+    )
+
+    replaced = catalog.replace_table(test_table_identifier, schema=schema)
+    assert replaced.current_snapshot() is None
+
+    replaced.append(
+        pa.Table.from_pydict(
+            {"id": [42], "data": ["after-replace"]},
+            schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]),
+        )
+    )
+    after = catalog.load_table(test_table_identifier)
+    assert after.current_snapshot() is not None
+    assert after.scan().to_arrow().num_rows == 1  # Only the post-replace row is visible.
+
+
 # Rename table tests
 
 
diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py
@@ -117,6 +117,51 @@ def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None:
     catalog.drop_table(identifier)
 
 
+@pytest.mark.integration
+@pytest.mark.parametrize("catalog", [lf("session_catalog")])
+def test_replace_table_transaction_rtas_against_rest_server(catalog: Catalog) -> None:
+    """RTAS (Replace Table As Select) against a real REST server: the schema swap and the
+    new-data write must land atomically — the new snapshot is current on commit."""
+    identifier = f"default.test_replace_rtas_{catalog.name}"
+    if not catalog.namespace_exists("default"):
+        catalog.create_namespace("default")
+    if catalog.table_exists(identifier):
+        catalog.drop_table(identifier)
+
+    original_schema = Schema(
+        NestedField(field_id=1, name="id", field_type=LongType(), required=False),
+        NestedField(field_id=2, name="data", field_type=StringType(), required=False),
+    )
+    original = catalog.create_table(identifier, schema=original_schema)
+    original.append(
+        pa.Table.from_pydict(
+            {"id": [1], "data": ["old"]},
+            schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]),
+        )
+    )
+    old_snapshot_id = catalog.load_table(identifier).current_snapshot().snapshot_id  # type: ignore[union-attr]
+
+    new_schema = Schema(
+        NestedField(field_id=1, name="id", field_type=LongType(), required=False),
+        NestedField(field_id=2, name="name", field_type=StringType(), required=False),
+    )
+    new_data = pa.Table.from_pydict(
+        {"id": [10, 20], "name": ["alice", "bob"]},
+        schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string())]),
+    )
+    with catalog.replace_table_transaction(identifier, schema=new_schema) as txn:
+        with txn.update_snapshot().fast_append() as snap:
+            for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=new_data, io=txn._table.io):
+                snap.append_data_file(data_file)
+
+    replaced = catalog.load_table(identifier)
+    assert replaced.current_snapshot() is not None
+    assert replaced.current_snapshot().snapshot_id != old_snapshot_id  # type: ignore[union-attr]
+    assert any(s.snapshot_id == old_snapshot_id for s in replaced.metadata.snapshots)
+    assert replaced.scan().to_arrow().num_rows == 2
+    catalog.drop_table(identifier)
+
+
 @pytest.mark.integration
 @pytest.mark.parametrize("catalog", [lf("session_catalog")])
 def test_load_view(catalog: RestCatalog, table_schema_nested: Schema, database_name: str, view_name: str) -> None: