Skip to content

Commit 9928f50

Browse files
committed
fix(upsert): run join-column type rejection before schema compat check
A pa.null() source column was being rejected by _check_pyarrow_schema_compatible (format-version=2 forbids null) before the join-column validation could surface the intended "Null-type column ... cannot be used as a join key" error. Reordering the checks lets the upsert-specific rejection fire first, giving users the actionable message. Dataframe-level checks now skip columns that are absent from the source so the pre-existing _check_pyarrow_schema_compatible path still owns the "PyArrow table contains more columns" error in test_key_cols_misaligned.
1 parent fe60c97 commit 9928f50

1 file changed

Lines changed: 13 additions & 10 deletions

File tree

pyiceberg/table/__init__.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -784,15 +784,8 @@ def upsert(
784784

785785
from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, schema_to_pyarrow
786786

787-
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
788-
_check_pyarrow_schema_compatible(
789-
self.table_metadata.schema(),
790-
provided_schema=df.schema,
791-
downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
792-
format_version=self.table_metadata.format_version,
793-
)
794-
795787
table_arrow_schema = schema_to_pyarrow(self.table_metadata.schema(), include_field_ids=False)
788+
df_column_names = set(df.schema.names)
796789

797790
for col in join_cols:
798791
table_field = table_arrow_schema.field(col)
@@ -809,8 +802,10 @@ def upsert(
809802
"Only primitive types are supported."
810803
)
811804

812-
# Dataframe-level rejections: These implementation-specific formats (e.g.,
813-
# dictionary encoding) are not yet supported by the PyArrow join engine.
805+
# Dataframe-level rejections: only validate when the column is present in the
806+
# source; missing columns are surfaced by _check_pyarrow_schema_compatible below.
807+
if col not in df_column_names:
808+
continue
814809
arr = df.column(col)
815810
if pa.types.is_dictionary(arr.type):
816811
raise NotImplementedError(
@@ -823,6 +818,14 @@ def upsert(
823818
f"Extension type '{arr.type}' for column '{col}' is not currently supported as a join key in upsert."
824819
)
825820

821+
downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False
822+
_check_pyarrow_schema_compatible(
823+
self.table_metadata.schema(),
824+
provided_schema=df.schema,
825+
downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
826+
format_version=self.table_metadata.format_version,
827+
)
828+
826829
# Validate uniqueness after type checks to avoid comparing/hashing unsupported types.
827830
if upsert_util.has_duplicate_rows(df, join_cols):
828831
raise ValueError("Duplicate rows found in source dataset based on the key columns. No upsert executed")

0 commit comments

Comments
 (0)