Skip to content

Commit 4817b00

Browse files
committed
Allow upserting into an empty table
1 parent 4e9c66d commit 4817b00

File tree

3 files changed

+48
-2
lines changed

3 files changed

+48
-2
lines changed

pyiceberg/table/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,9 @@ def overwrite(
579579
self.table_metadata.schema(), provided_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
580580
)
581581

582-
self.delete(delete_filter=overwrite_filter, case_sensitive=case_sensitive, snapshot_properties=snapshot_properties)
582+
if overwrite_filter != AlwaysFalse():
583+
# Only delete when the filter is != AlwaysFalse
584+
self.delete(delete_filter=overwrite_filter, case_sensitive=case_sensitive, snapshot_properties=snapshot_properties)
583585

584586
with self._append_snapshot_producer(snapshot_properties) as append_files:
585587
# skip writing data files if the dataframe is empty

pyiceberg/table/upsert_util.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,11 @@ def create_match_filter(df: pyarrow_table, join_cols: list[str]) -> BooleanExpre
3636
unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
3737

3838
if len(join_cols) == 1:
39-
return In(join_cols[0], unique_keys[0].to_pylist())
39+
keys = unique_keys[0].to_pylist()
40+
if len(keys) > 0:
41+
return In(join_cols[0], keys)
42+
else:
43+
return AlwaysFalse()
4044
else:
4145
filters: List[BooleanExpression] = [
4246
cast(BooleanExpression, And(*[EqualTo(col, row[col]) for col in join_cols])) for row in unique_keys.to_pylist()
@@ -67,6 +71,10 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
6771

6872
non_key_cols = list(all_columns - join_cols_set)
6973

74+
if len(target_table) == 0:
75+
# When the target table is empty, there is nothing to update :)
76+
return source_table.schema.empty_table()
77+
7078
match_expr = functools.reduce(operator.and_, [pc.field(col).isin(target_table.column(col).to_pylist()) for col in join_cols])
7179

7280
matching_source_rows = source_table.filter(match_expr)

tests/table/test_upsert.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,42 @@ def test_upsert_with_identifier_fields(catalog: Catalog) -> None:
371371
assert upd.rows_inserted == 1
372372

373373

374+
def test_upsert_into_empty_table(catalog: Catalog) -> None:
375+
identifier = "default.test_upsert_into_empty_table"
376+
_drop_table(catalog, identifier)
377+
378+
schema = Schema(
379+
NestedField(1, "city", StringType(), required=True),
380+
NestedField(2, "inhabitants", IntegerType(), required=True),
381+
# Mark City as the identifier field, also known as the primary-key
382+
identifier_field_ids=[1],
383+
)
384+
385+
tbl = catalog.create_table(identifier, schema=schema)
386+
387+
arrow_schema = pa.schema(
388+
[
389+
pa.field("city", pa.string(), nullable=False),
390+
pa.field("inhabitants", pa.int32(), nullable=False),
391+
]
392+
)
393+
394+
# Write some data
395+
df = pa.Table.from_pylist(
396+
[
397+
{"city": "Amsterdam", "inhabitants": 921402},
398+
{"city": "San Francisco", "inhabitants": 808988},
399+
{"city": "Drachten", "inhabitants": 45019},
400+
{"city": "Paris", "inhabitants": 2103000},
401+
],
402+
schema=arrow_schema,
403+
)
404+
upd = tbl.upsert(df)
405+
406+
assert upd.rows_updated == 0
407+
assert upd.rows_inserted == 4
408+
409+
374410
def test_create_match_filter_single_condition() -> None:
375411
"""
376412
Test create_match_filter with a composite key where the source yields exactly one unique key.

0 commit comments

Comments
 (0)