|
| 1 | +# Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +# or more contributor license agreements. See the NOTICE file |
| 3 | +# distributed with this work for additional information |
| 4 | +# regarding copyright ownership. The ASF licenses this file |
| 5 | +# to you under the Apache License, Version 2.0 (the |
| 6 | +# "License"); you may not use this file except in compliance |
| 7 | +# with the License. You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, |
| 12 | +# software distributed under the License is distributed on an |
| 13 | +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +# KIND, either express or implied. See the License for the |
| 15 | +# specific language governing permissions and limitations |
| 16 | +# under the License. |
| 17 | +"""Reads of Spark-written, Parquet-encrypted Iceberg tables via PyIceberg. |
| 18 | +
|
| 19 | +The encrypted table (`hive.default.test_encrypted`) is provisioned by `dev/provision.py` |
| 20 | +using Spark with `encryption.kms-impl=org.apache.iceberg.encryption.UnitestKMS`. UnitestKMS |
| 21 | +ships hardcoded master keys (keyA=b"0123456789012345", keyB=b"1123456789012345"); we mirror |
| 22 | +those bytes here through PyIceberg's InMemoryKms so unwrapping succeeds. |
| 23 | +
|
| 24 | +Decryption of the data files requires PyArrow's `parquet.encryption.create_decryption_properties` |
| 25 | +API, which is available in PyArrow >= 25 (currently shipped only via the nightly wheels). See |
| 26 | +the Makefile target `install-pyarrow-nightly`. |
| 27 | +""" |
| 28 | + |
| 29 | +from __future__ import annotations |
| 30 | + |
| 31 | +import pytest |
| 32 | + |
| 33 | +from pyiceberg.catalog import load_catalog |
| 34 | + |
| 35 | +# UnitestKMS master keys, hex-encoded so they can be set as catalog properties and parsed by |
| 36 | +# InMemoryKms.initialize (`encryption.kms.key.<id>=<hex>`). |
| 37 | +_KEY_A_HEX = b"0123456789012345".hex() |
| 38 | +_KEY_B_HEX = b"1123456789012345".hex() |
| 39 | + |
| 40 | + |
| 41 | +@pytest.fixture(scope="module") |
| 42 | +def hive_catalog_with_kms(): # type: ignore[no-untyped-def] |
| 43 | + return load_catalog( |
| 44 | + "local", |
| 45 | + **{ |
| 46 | + "type": "hive", |
| 47 | + "uri": "thrift://localhost:9083", |
| 48 | + "s3.endpoint": "http://localhost:9000", |
| 49 | + "s3.access-key-id": "admin", |
| 50 | + "s3.secret-access-key": "password", |
| 51 | + "py-kms-impl": "pyiceberg.encryption.kms.InMemoryKms", |
| 52 | + "encryption.kms.key.keyA": _KEY_A_HEX, |
| 53 | + "encryption.kms.key.keyB": _KEY_B_HEX, |
| 54 | + }, |
| 55 | + ) |
| 56 | + |
| 57 | + |
| 58 | +@pytest.mark.integration |
| 59 | +def test_encrypted_table_metadata(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def] |
| 60 | + tbl = hive_catalog_with_kms.load_table("default.test_encrypted") |
| 61 | + |
| 62 | + assert tbl.metadata.format_version == 3 |
| 63 | + assert tbl.metadata.properties.get("encryption.key-id") == "keyA" |
| 64 | + assert tbl.metadata.encryption_keys, "expected encryption keys on table metadata" |
| 65 | + |
| 66 | + snapshot = tbl.current_snapshot() |
| 67 | + assert snapshot is not None |
| 68 | + assert snapshot.key_id is not None, "expected key_id on current snapshot" |
| 69 | + |
| 70 | + |
| 71 | +@pytest.mark.integration |
| 72 | +def test_encrypted_table_to_arrow(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def] |
| 73 | + tbl = hive_catalog_with_kms.load_table("default.test_encrypted") |
| 74 | + |
| 75 | + result = tbl.scan().to_arrow().sort_by("id") |
| 76 | + |
| 77 | + assert result.num_rows == 3 |
| 78 | + assert result.column("id").to_pylist() == [1, 2, 3] |
| 79 | + assert result.column("data").to_pylist() == ["alice", "bob", "charlie"] |
| 80 | + assert result.column("value").to_pylist() == [1.0, 2.0, 3.0] |
| 81 | + |
| 82 | + |
| 83 | +@pytest.mark.integration |
| 84 | +def test_encrypted_table_to_pandas(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def] |
| 85 | + tbl = hive_catalog_with_kms.load_table("default.test_encrypted") |
| 86 | + |
| 87 | + df = tbl.scan().to_pandas().sort_values("id").reset_index(drop=True) |
| 88 | + |
| 89 | + assert list(df["id"]) == [1, 2, 3] |
| 90 | + assert list(df["data"]) == ["alice", "bob", "charlie"] |
| 91 | + assert list(df["value"]) == [1.0, 2.0, 3.0] |
| 92 | + |
| 93 | + |
| 94 | +@pytest.mark.integration |
| 95 | +def test_encrypted_table_to_duckdb(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def] |
| 96 | + tbl = hive_catalog_with_kms.load_table("default.test_encrypted") |
| 97 | + |
| 98 | + con = tbl.scan().to_duckdb("encrypted") |
| 99 | + rows = con.execute("SELECT id, data, value FROM encrypted ORDER BY id").fetchall() |
| 100 | + |
| 101 | + assert rows == [(1, "alice", 1.0), (2, "bob", 2.0), (3, "charlie", 3.0)] |
| 102 | + |
| 103 | + |
| 104 | +@pytest.mark.integration |
| 105 | +def test_encrypted_table_to_polars(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def] |
| 106 | + tbl = hive_catalog_with_kms.load_table("default.test_encrypted") |
| 107 | + |
| 108 | + df = tbl.scan().to_polars().sort("id") |
| 109 | + |
| 110 | + assert df["id"].to_list() == [1, 2, 3] |
| 111 | + assert df["data"].to_list() == ["alice", "bob", "charlie"] |
| 112 | + assert df["value"].to_list() == [1.0, 2.0, 3.0] |
0 commit comments