Skip to content

Commit f62f67e

Browse files
authored
Raise explicit error when join columns cannot be found (#1698)
1 parent 948486e commit f62f67e

File tree

2 files changed

+43
-10
lines changed

2 files changed

+43
-10
lines changed

pyiceberg/table/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,9 @@ def upsert(
11591159
else:
11601160
raise ValueError(f"Field-ID could not be found: {join_cols}")
11611161

1162+
if len(join_cols) == 0:
1163+
raise ValueError("Join columns could not be found, please set identifier-field-ids or pass in explicitly.")
1164+
11621165
if not when_matched_update_all and not when_not_matched_insert_all:
11631166
raise ValueError("no upsert options selected...exiting")
11641167

tests/table/test_upsert.py

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pyiceberg.exceptions import NoSuchTableError
2626
from pyiceberg.expressions import And, EqualTo, Reference
2727
from pyiceberg.expressions.literals import LongLiteral
28+
from pyiceberg.io.pyarrow import schema_to_pyarrow
2829
from pyiceberg.schema import Schema
2930
from pyiceberg.table import UpsertResult
3031
from pyiceberg.table.upsert_util import create_match_filter
@@ -328,7 +329,7 @@ def test_upsert_with_identifier_fields(catalog: Catalog) -> None:
328329

329330
schema = Schema(
330331
NestedField(1, "city", StringType(), required=True),
331-
NestedField(2, "inhabitants", IntegerType(), required=True),
332+
NestedField(2, "population", IntegerType(), required=True),
332333
# Mark City as the identifier field, also known as the primary-key
333334
identifier_field_ids=[1],
334335
)
@@ -338,30 +339,30 @@ def test_upsert_with_identifier_fields(catalog: Catalog) -> None:
338339
arrow_schema = pa.schema(
339340
[
340341
pa.field("city", pa.string(), nullable=False),
341-
pa.field("inhabitants", pa.int32(), nullable=False),
342+
pa.field("population", pa.int32(), nullable=False),
342343
]
343344
)
344345

345346
# Write some data
346347
df = pa.Table.from_pylist(
347348
[
348-
{"city": "Amsterdam", "inhabitants": 921402},
349-
{"city": "San Francisco", "inhabitants": 808988},
350-
{"city": "Drachten", "inhabitants": 45019},
351-
{"city": "Paris", "inhabitants": 2103000},
349+
{"city": "Amsterdam", "population": 921402},
350+
{"city": "San Francisco", "population": 808988},
351+
{"city": "Drachten", "population": 45019},
352+
{"city": "Paris", "population": 2103000},
352353
],
353354
schema=arrow_schema,
354355
)
355356
tbl.append(df)
356357

357358
df = pa.Table.from_pylist(
358359
[
359-
# Will be updated, the inhabitants has been updated
360-
{"city": "Drachten", "inhabitants": 45505},
360+
# Will be updated, the population has been updated
361+
{"city": "Drachten", "population": 45505},
361362
# New row, will be inserted
362-
{"city": "Berlin", "inhabitants": 3432000},
363+
{"city": "Berlin", "population": 3432000},
363364
# Ignored, already exists in the table
364-
{"city": "Paris", "inhabitants": 2103000},
365+
{"city": "Paris", "population": 2103000},
365366
],
366367
schema=arrow_schema,
367368
)
@@ -388,3 +389,32 @@ def test_create_match_filter_single_condition() -> None:
388389
EqualTo(term=Reference(name="order_id"), literal=LongLiteral(101)),
389390
EqualTo(term=Reference(name="order_line_id"), literal=LongLiteral(1)),
390391
)
392+
393+
394+
def test_upsert_without_identifier_fields(catalog: Catalog) -> None:
395+
identifier = "default.test_upsert_without_identifier_fields"
396+
_drop_table(catalog, identifier)
397+
398+
schema = Schema(
399+
NestedField(1, "city", StringType(), required=True),
400+
NestedField(2, "population", IntegerType(), required=True),
401+
# No identifier field :o
402+
identifier_field_ids=[],
403+
)
404+
405+
tbl = catalog.create_table(identifier, schema=schema)
406+
# Write some data
407+
df = pa.Table.from_pylist(
408+
[
409+
{"city": "Amsterdam", "population": 921402},
410+
{"city": "San Francisco", "population": 808988},
411+
{"city": "Drachten", "population": 45019},
412+
{"city": "Paris", "population": 2103000},
413+
],
414+
schema=schema_to_pyarrow(schema),
415+
)
416+
417+
with pytest.raises(
418+
ValueError, match="Join columns could not be found, please set identifier-field-ids or pass in explicitly."
419+
):
420+
tbl.upsert(df)

0 commit comments

Comments
 (0)