Skip to content

Commit 8427e54

Browse files
infra: hive encryption integration test
1 parent 89abb2a commit 8427e54

5 files changed

Lines changed: 148 additions & 11 deletions

File tree

Makefile

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# under the License.
1717
.PHONY: help install install-uv check-license lint \
1818
test test-integration test-integration-setup test-integration-exec test-integration-cleanup test-integration-rebuild \
19-
test-s3 test-adls test-gcs test-coverage coverage-report test test-notebook\
19+
test-s3 test-adls test-gcs test-coverage coverage-report install-pyarrow-nightly \
2020
docs-serve docs-build notebook notebook-infra \
2121
clean
2222

@@ -38,10 +38,12 @@ else
3838
PYTHON_ARG =
3939
endif
4040

41+
# --no-sync so that overlays applied after `make install` (e.g. install-pyarrow-nightly for
42+
# the encryption integration test) aren't reverted when `uv run` re-syncs the environment to the lockfile.
4143
ifeq ($(COVERAGE),1)
42-
TEST_RUNNER = uv run $(PYTHON_ARG) python -m coverage run --parallel-mode --source=pyiceberg -m
44+
TEST_RUNNER = uv run --no-sync $(PYTHON_ARG) python -m coverage run --parallel-mode --source=pyiceberg -m
4345
else
44-
TEST_RUNNER = uv run $(PYTHON_ARG) python -m
46+
TEST_RUNNER = uv run --no-sync $(PYTHON_ARG) python -m
4547
endif
4648

4749
ifeq ($(KEEP_COMPOSE),1)
@@ -108,12 +110,19 @@ test: ## Run all unit tests (excluding integration)
108110

109111
test-integration: test-integration-setup test-integration-exec test-integration-cleanup ## Run integration tests
110112

111-
test-integration-setup: install ## Start Docker services for integration tests
113+
test-integration-setup: install install-pyarrow-nightly ## Start Docker services for integration tests
112114
docker compose -f dev/docker-compose-integration.yml kill
113115
docker compose -f dev/docker-compose-integration.yml rm -f
114116
docker compose -f dev/docker-compose-integration.yml up -d --build --wait
115117
# --no-sync: the install-pyarrow-nightly prerequisite has already overlaid the env; a plain `uv run` would re-sync to the lockfile and revert it.
uv run --no-sync $(PYTHON_ARG) python dev/provision.py
116118

119+
# Decrypting Parquet Modular Encryption (PME) data (tests/integration/test_encryption.py) needs the
120+
# pyarrow.parquet.encryption.create_decryption_properties API from apache/arrow#49667. That
121+
# lands in pyarrow 25, which hasn't been released — pull the nightly until it is.
122+
install-pyarrow-nightly: ## Overlay nightly pyarrow on top of the installed env (for PME)
123+
uv pip install $(PYTHON_ARG) --prerelease=allow --upgrade --force-reinstall \
124+
-i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pyarrow
125+
117126
test-integration-exec: ## Run integration tests (excluding provision)
118127
$(TEST_RUNNER) pytest tests/ -m integration $(PYTEST_ARGS)
119128

@@ -150,9 +159,6 @@ coverage-report: ## Combine and report coverage
150159
uv run $(PYTHON_ARG) coverage html
151160
uv run $(PYTHON_ARG) coverage xml
152161

153-
test-notebook: ## Run notebook tests (pyiceberg_example and spark_integration_example) via papermill
154-
$(TEST_RUNNER) pytest tests/notebooks/test_pyiceberg_example.py tests/notebooks/test_spark_integration_example.py -m notebook $(PYTEST_ARGS)
155-
156162
# ================
157163
# Documentation
158164
# ================

dev/provision.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,3 +395,13 @@
395395
)
396396
spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
397397
spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")
398+
399+
# Encrypted Iceberg table written via Spark, read back via PyIceberg in tests/integration/test_encryption.py.
400+
# Only the Hive catalog is configured with a Java-side KMS (encryption.kms-impl=UnitestKMS); the REST catalog
401+
# image does not ship UnitestKMS so we limit this fixture to Hive.
402+
spark.sql("""
403+
CREATE OR REPLACE TABLE hive.default.test_encrypted (id bigint, data string, value float)
404+
USING iceberg
405+
TBLPROPERTIES ('encryption.key-id'='keyA', 'format-version'='3')
406+
""")
407+
spark.sql("INSERT INTO hive.default.test_encrypted VALUES (1, 'alice', 1.0), (2, 'bob', 2.0), (3, 'charlie', 3.0)")

dev/spark/Dockerfile

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ ARG BASE_IMAGE_SPARK_VERSION=4.0.1
1818
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
1919

2020
# Dependency versions - keep these compatible
21-
# Changing these will invalidate the JAR download cache layer
22-
ARG ICEBERG_VERSION=1.10.1
21+
# Changing these will invalidate the JAR download cache layer.
22+
# Iceberg 1.11.0 carries the Hive encryption integration (apache/iceberg#13066) — the prior
23+
# 1.10.x release predates that work and silently ignores encryption.kms-impl / encryption.key-id.
24+
ARG ICEBERG_VERSION=1.11.0
2325
ARG ICEBERG_SPARK_RUNTIME_VERSION=4.0_2.13
2426
ARG HADOOP_VERSION=3.4.1
2527
ARG AWS_SDK_VERSION=2.24.6
@@ -36,13 +38,15 @@ RUN apt-get update -qq && \
3638
mkdir -p /home/iceberg/spark-events && \
3739
chown -R spark:spark /home/iceberg
3840

39-
# Download JARs with retry logic (most cacheable - only changes when versions change)
40-
# This is the slowest step, so we do it before copying config files
41+
# Download JARs with retry logic (most cacheable - only changes when versions change).
42+
# iceberg-core-${ICEBERG_VERSION}-tests.jar ships org.apache.iceberg.encryption.UnitestKMS, a
43+
# fixed-master-key KMS used by the encryption integration test on the Spark write path.
4144
RUN set -e && \
4245
cd "${SPARK_HOME}/jars" && \
4346
for jar_path in \
4447
"org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
4548
"org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
49+
"org/apache/iceberg/iceberg-core/${ICEBERG_VERSION}/iceberg-core-${ICEBERG_VERSION}-tests.jar" \
4650
"org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
4751
"software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar"; \
4852
do \

dev/spark/spark-defaults.conf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ spark.sql.catalog.hive.io-impl org.apache.iceberg.aws.s3.S3FileIO
3434
spark.sql.catalog.hive.warehouse s3://warehouse/hive/
3535
spark.sql.catalog.hive.s3.endpoint http://minio:9000
3636

37+
# Test-only KMS so Spark can write encrypted Iceberg tables for the encryption integration test.
38+
# UnitestKMS comes from iceberg-core-<version>-tests.jar and uses fixed master keys ("keyA",
39+
# "keyB") that match the InMemoryKms config used on the PyIceberg side.
40+
spark.sql.catalog.hive.encryption.kms-impl org.apache.iceberg.encryption.UnitestKMS
41+
3742
# Configure Spark's default session catalog (spark_catalog) to use Iceberg backed by the Hive Metastore
3843
spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
3944
spark.sql.catalog.spark_catalog.type hive
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Reads of Spark-written, Parquet-encrypted Iceberg tables via PyIceberg.
18+
19+
The encrypted table (`hive.default.test_encrypted`) is provisioned by `dev/provision.py`
20+
using Spark with `encryption.kms-impl=org.apache.iceberg.encryption.UnitestKMS`. UnitestKMS
21+
ships hardcoded master keys (keyA=b"0123456789012345", keyB=b"1123456789012345"); we mirror
22+
those bytes here through PyIceberg's InMemoryKms so unwrapping succeeds.
23+
24+
Decryption of the data files requires PyArrow's `parquet.encryption.create_decryption_properties`
25+
API, which is available in PyArrow >= 25 (currently shipped only via the nightly wheels). See
26+
the Makefile target `install-pyarrow-nightly`.
27+
"""
28+
29+
from __future__ import annotations
30+
31+
import pytest
32+
33+
from pyiceberg.catalog import load_catalog
34+
35+
# UnitestKMS master keys, hex-encoded so they can be set as catalog properties and parsed by
36+
# InMemoryKms.initialize (`encryption.kms.key.<id>=<hex>`).
37+
_KEY_A_HEX = b"0123456789012345".hex()
38+
_KEY_B_HEX = b"1123456789012345".hex()
39+
40+
41+
@pytest.fixture(scope="module")
42+
def hive_catalog_with_kms(): # type: ignore[no-untyped-def]
43+
return load_catalog(
44+
"local",
45+
**{
46+
"type": "hive",
47+
"uri": "thrift://localhost:9083",
48+
"s3.endpoint": "http://localhost:9000",
49+
"s3.access-key-id": "admin",
50+
"s3.secret-access-key": "password",
51+
"py-kms-impl": "pyiceberg.encryption.kms.InMemoryKms",
52+
"encryption.kms.key.keyA": _KEY_A_HEX,
53+
"encryption.kms.key.keyB": _KEY_B_HEX,
54+
},
55+
)
56+
57+
58+
@pytest.mark.integration
59+
def test_encrypted_table_metadata(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def]
60+
tbl = hive_catalog_with_kms.load_table("default.test_encrypted")
61+
62+
assert tbl.metadata.format_version == 3
63+
assert tbl.metadata.properties.get("encryption.key-id") == "keyA"
64+
assert tbl.metadata.encryption_keys, "expected encryption keys on table metadata"
65+
66+
snapshot = tbl.current_snapshot()
67+
assert snapshot is not None
68+
assert snapshot.key_id is not None, "expected key_id on current snapshot"
69+
70+
71+
@pytest.mark.integration
72+
def test_encrypted_table_to_arrow(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def]
73+
tbl = hive_catalog_with_kms.load_table("default.test_encrypted")
74+
75+
result = tbl.scan().to_arrow().sort_by("id")
76+
77+
assert result.num_rows == 3
78+
assert result.column("id").to_pylist() == [1, 2, 3]
79+
assert result.column("data").to_pylist() == ["alice", "bob", "charlie"]
80+
assert result.column("value").to_pylist() == [1.0, 2.0, 3.0]
81+
82+
83+
@pytest.mark.integration
84+
def test_encrypted_table_to_pandas(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def]
85+
tbl = hive_catalog_with_kms.load_table("default.test_encrypted")
86+
87+
df = tbl.scan().to_pandas().sort_values("id").reset_index(drop=True)
88+
89+
assert list(df["id"]) == [1, 2, 3]
90+
assert list(df["data"]) == ["alice", "bob", "charlie"]
91+
assert list(df["value"]) == [1.0, 2.0, 3.0]
92+
93+
94+
@pytest.mark.integration
95+
def test_encrypted_table_to_duckdb(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def]
96+
tbl = hive_catalog_with_kms.load_table("default.test_encrypted")
97+
98+
con = tbl.scan().to_duckdb("encrypted")
99+
rows = con.execute("SELECT id, data, value FROM encrypted ORDER BY id").fetchall()
100+
101+
assert rows == [(1, "alice", 1.0), (2, "bob", 2.0), (3, "charlie", 3.0)]
102+
103+
104+
@pytest.mark.integration
105+
def test_encrypted_table_to_polars(hive_catalog_with_kms) -> None: # type: ignore[no-untyped-def]
106+
tbl = hive_catalog_with_kms.load_table("default.test_encrypted")
107+
108+
df = tbl.scan().to_polars().sort("id")
109+
110+
assert df["id"].to_list() == [1, 2, 3]
111+
assert df["data"].to_list() == ["alice", "bob", "charlie"]
112+
assert df["value"].to_list() == [1.0, 2.0, 3.0]

0 commit comments

Comments
 (0)