Use manual Schema for plain replace_table, df.schema for RTAS

smaheshwar-pltr · smaheshwar-pltr · commit 1efa6fe9be09 · 2026-05-18T20:01:55.000+01:00
Per reviewer feedback: examples and tests for plain replace_table
(no data write) should construct an explicit Schema, since that is the
natural user-facing API for DDL-only redefinition. RTAS
(replace-table-as-select) flows keep df.schema, since the data and
schema are coupled there.
diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
@@ -190,10 +190,18 @@ with catalog.create_table_transaction(identifier="docs_example.bids", schema=sch
 Atomically replace an existing table's schema, partition spec, sort order, location, and properties. The table UUID and history (snapshots, schemas, specs, sort orders, metadata log) are preserved; the current snapshot is cleared (the `main` branch ref is removed). `replace_table` redefines the table in this way; `replace_table_transaction` lets you write new data alongside this change to permit RTAS (replace-table-as-select) workflows.
 
 ```python
-catalog.replace_table(identifier="docs_example.bids", schema=df.schema)
+from pyiceberg.schema import Schema
+from pyiceberg.types import NestedField, LongType, StringType, BooleanType
+
+new_schema = Schema(
+    NestedField(field_id=1, name="datetime", field_type=LongType(), required=False),
+    NestedField(field_id=2, name="symbol", field_type=StringType(), required=False),
+    NestedField(field_id=3, name="active", field_type=BooleanType(), required=False),
+)
+catalog.replace_table(identifier="docs_example.bids", schema=new_schema)
 ```
 
-Where `df` is a PyArrow table (or `Schema`) carrying the new column set. Field IDs from columns whose names appear in the previous schema are reused, so existing data files remain readable when the new schema is a compatible superset. New columns get fresh IDs above `last-column-id`.
+Field IDs are reused for columns whose names also appear in the previous schema, so existing data files remain readable when the new schema is a compatible superset. Columns with new names get fresh IDs above the table's `last-column-id`.
 
 Properties passed to `replace_table` are **merged** with the existing table properties (your values override; existing keys you don't pass are preserved). To remove a property as part of the replace, use `replace_table_transaction` and remove it explicitly within the transaction.
 
@@ -208,7 +216,7 @@ with catalog.replace_table_transaction(identifier="docs_example.bids", schema=df
 To upgrade the table's format version as part of the replace, pass `format-version` in `properties`:
 
 ```python
-catalog.replace_table(identifier="docs_example.bids", schema=df.schema, properties={"format-version": "2"})
+catalog.replace_table(identifier="docs_example.bids", schema=new_schema, properties={"format-version": "2"})
 ```
 
 ## Register a table
diff --git a/tests/integration/test_rest_catalog.py b/tests/integration/test_rest_catalog.py
@@ -26,6 +26,7 @@
 from pyiceberg.catalog.rest import RestCatalog
 from pyiceberg.exceptions import NoSuchViewError
 from pyiceberg.schema import Schema
+from pyiceberg.types import BooleanType, LongType, NestedField, StringType
 from pyiceberg.view.metadata import SQLViewRepresentation, ViewVersion
 
 TEST_NAMESPACE_IDENTIFIER = "TEST NS"
@@ -85,20 +86,26 @@ def test_replace_table_end_to_end_against_rest_server(catalog: Catalog) -> None:
     if catalog.table_exists(identifier):
         catalog.drop_table(identifier)
 
-    pa_table = pa.Table.from_pydict(
-        {"id": [1, 2, 3], "data": ["a", "b", "c"]},
-        schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]),
+    original_schema = Schema(
+        NestedField(field_id=1, name="id", field_type=LongType(), required=False),
+        NestedField(field_id=2, name="data", field_type=StringType(), required=False),
+    )
+    original = catalog.create_table(identifier, schema=original_schema)
+    original.append(
+        pa.Table.from_pydict(
+            {"id": [1, 2, 3], "data": ["a", "b", "c"]},
+            schema=pa.schema([pa.field("id", pa.int64()), pa.field("data", pa.large_string())]),
+        )
     )
-    original = catalog.create_table(identifier, schema=pa_table.schema)
-    original.append(pa_table)
     original.refresh()
     original_snapshot_id = original.current_snapshot().snapshot_id  # type: ignore[union-attr]
 
-    new_data = pa.Table.from_pydict(
-        {"id": [10], "name": ["alice"], "active": [True]},
-        schema=pa.schema([pa.field("id", pa.int64()), pa.field("name", pa.large_string()), pa.field("active", pa.bool_())]),
+    new_schema = Schema(
+        NestedField(field_id=1, name="id", field_type=LongType(), required=False),
+        NestedField(field_id=2, name="name", field_type=StringType(), required=False),
+        NestedField(field_id=3, name="active", field_type=BooleanType(), required=False),
     )
-    replaced = catalog.replace_table(identifier, schema=new_data.schema)
+    replaced = catalog.replace_table(identifier, schema=new_schema)
 
     assert replaced.metadata.table_uuid == original.metadata.table_uuid
     assert replaced.current_snapshot() is None