|
16 | 16 | # under the License. |
17 | 17 | from pathlib import PosixPath |
18 | 18 |
|
| 19 | +import pyarrow as pa |
19 | 20 | import pytest |
20 | 21 | from datafusion import SessionContext |
21 | 22 | from pyarrow import Table as pa_table |
22 | 23 |
|
23 | 24 | from pyiceberg.catalog import Catalog |
24 | 25 | from pyiceberg.exceptions import NoSuchTableError |
| 26 | +from pyiceberg.schema import Schema |
25 | 27 | from pyiceberg.table import UpsertResult |
| 28 | +from pyiceberg.types import IntegerType, NestedField, StringType |
26 | 29 | from tests.catalog.test_base import InMemoryCatalog, Table |
27 | 30 |
|
28 | 31 |
|
@@ -314,3 +317,52 @@ def test_key_cols_misaligned(catalog: Catalog) -> None: |
314 | 317 |
|
315 | 318 | with pytest.raises(Exception, match=r"""Field ".*" does not exist in schema"""): |
316 | 319 | table.upsert(df=df_src, join_cols=["order_id"]) |
| 320 | + |
| 321 | + |
def test_upsert_with_identifier_fields(catalog: Catalog) -> None:
    """Upsert into a table whose schema declares an identifier field.

    ``upsert`` is called without ``join_cols``; the schema marks ``city``
    (field id 1) as its identifier field. Four rows are appended first, then a
    three-row batch is upserted and the reported counts are checked: exactly
    one row updated and exactly one row inserted.
    """
    table_name = "default.test_upsert_with_identifier_fields"
    _drop_table(catalog, table_name)

    city_field = NestedField(1, "city", StringType(), required=True)
    inhabitants_field = NestedField(2, "inhabitants", IntegerType(), required=True)
    # Field id 1 (city) is the identifier field, also known as the primary key.
    iceberg_schema = Schema(city_field, inhabitants_field, identifier_field_ids=[1])

    table = catalog.create_table(table_name, schema=iceberg_schema)

    # Arrow-side schema for the batches written below; both columns non-nullable.
    arrow_schema = pa.schema(
        [
            pa.field("city", pa.string(), nullable=False),
            pa.field("inhabitants", pa.int32(), nullable=False),
        ]
    )

    # Seed the table with its initial contents.
    seed_cities = (
        ("Amsterdam", 921402),
        ("San Francisco", 808988),
        ("Drachten", 45019),
        ("Paris", 2103000),
    )
    seed_batch = pa.Table.from_pylist(
        [{"city": name, "inhabitants": count} for name, count in seed_cities],
        schema=arrow_schema,
    )
    table.append(seed_batch)

    incoming_cities = (
        # Existing city with a changed inhabitants value: will be updated.
        ("Drachten", 45505),
        # City not yet in the table: will be inserted.
        ("Berlin", 3432000),
        # Identical to a row already in the table: ignored.
        ("Paris", 2103000),
    )
    incoming_batch = pa.Table.from_pylist(
        [{"city": name, "inhabitants": count} for name, count in incoming_cities],
        schema=arrow_schema,
    )
    result = table.upsert(incoming_batch)

    assert result.rows_updated == 1
    assert result.rows_inserted == 1
0 commit comments