fix(upsert): run join-column type rejection before schema compat check

claude · claude · commit 9928f50d80e3 · 2026-05-21T00:30:43.000Z
A source column of type pa.null() was being rejected by
_check_pyarrow_schema_compatible (format-version=2 forbids the null type)
before the join-column validation could surface the intended "Null-type
column ... cannot be used as a join key" error. Running the join-column type
checks first lets the upsert-specific rejection fire, giving users the more
actionable message.

The dataframe-level checks now skip join columns that are absent from the
source. The pre-existing _check_pyarrow_schema_compatible call, which now
runs after the join-column loop, therefore still owns the "PyArrow table
contains more columns" error in test_key_cols_misaligned.
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
@@ -784,15 +784,8 @@ def upsert(
 
         from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, schema_to_pyarrow
 
-        downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
-        _check_pyarrow_schema_compatible(
-            self.table_metadata.schema(),
-            provided_schema=df.schema,
-            downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
-            format_version=self.table_metadata.format_version,
-        )
-
         table_arrow_schema = schema_to_pyarrow(self.table_metadata.schema(), include_field_ids=False)
+        df_column_names = set(df.schema.names)
 
         for col in join_cols:
             table_field = table_arrow_schema.field(col)
@@ -809,8 +802,10 @@ def upsert(
                     "Only primitive types are supported."
                 )
 
-            # Dataframe-level rejections: These implementation-specific formats (e.g.,
-            # dictionary encoding) are not yet supported by the PyArrow join engine.
+            # Dataframe-level rejections: only validate when the column is present in the
+            # source; missing columns are surfaced by _check_pyarrow_schema_compatible below.
+            if col not in df_column_names:
+                continue
             arr = df.column(col)
             if pa.types.is_dictionary(arr.type):
                 raise NotImplementedError(
@@ -823,6 +818,14 @@ def upsert(
                     f"Extension type '{arr.type}' for column '{col}' is not currently supported as a join key in upsert."
                 )
 
+        downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
+        _check_pyarrow_schema_compatible(
+            self.table_metadata.schema(),
+            provided_schema=df.schema,
+            downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
+            format_version=self.table_metadata.format_version,
+        )
+
         # Validate uniqueness after type checks to avoid comparing/hashing unsupported types.
         if upsert_util.has_duplicate_rows(df, join_cols):
             raise ValueError("Duplicate rows found in source dataset based on the key columns. No upsert executed")