Skip to content

Commit b581214

Browse files
Author: Sreesh Maheshwar (committed)
Commit message: Add partition test
1 parent: 8523b3a — commit: b581214

File tree

2 files changed

+57
-16
lines changed

2 files changed

+57
-16
lines changed

tests/integration/test_writes/test_partitioned_writes.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,43 @@ def test_query_filter_v1_v2_append_null(
280280
assert df.where(f"{col} is null").count() == 2, f"Expected 2 null rows for {col}"
281281

282282

283+
@pytest.mark.integration
284+
@pytest.mark.parametrize(
285+
"part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"]
286+
)
287+
@pytest.mark.parametrize("format_version", [1, 2])
288+
def test_object_storage_excludes_partition(
289+
session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str, format_version: int
290+
) -> None:
291+
nested_field = TABLE_SCHEMA.find_field(part_col)
292+
partition_spec = PartitionSpec(
293+
PartitionField(source_id=nested_field.field_id, field_id=1001, transform=IdentityTransform(), name=part_col)
294+
)
295+
296+
tbl = _create_table(
297+
session_catalog=session_catalog,
298+
identifier=f"default.arrow_table_v{format_version}_with_null_partitioned_on_col_{part_col}",
299+
properties={"format-version": str(format_version), "write.object-storage.enabled": True},
300+
data=[arrow_table_with_null],
301+
partition_spec=partition_spec,
302+
)
303+
304+
original_paths = tbl.inspect.data_files().to_pydict()["file_path"]
305+
assert len(original_paths) == 3
306+
307+
# Update props to exclude partitioned paths and append data
308+
with tbl.transaction() as tx:
309+
tx.set_properties({"write.object-storage.partitioned-paths": False})
310+
tbl.append(arrow_table_with_null)
311+
312+
added_paths = set(tbl.inspect.data_files().to_pydict()["file_path"]) - set(original_paths)
313+
assert len(added_paths) == 3
314+
315+
# All paths before the props update should contain the partition, while all paths after should not
316+
assert all(f"{part_col}=" in path for path in original_paths)
317+
assert all(f"{part_col}=" not in path for path in added_paths)
318+
319+
283320
@pytest.mark.integration
284321
@pytest.mark.parametrize(
285322
"spec",

tests/integration/test_writes/test_writes.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,6 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
273273
# Since we don't rewrite, this should produce a new manifest with an ADDED entry
274274
tbl.append(arrow_table_with_null)
275275

276-
277276
rows = spark.sql(
278277
f"""
279278
SELECT added_data_files_count, existing_data_files_count, deleted_data_files_count
@@ -285,27 +284,32 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w
285284
assert [row.existing_data_files_count for row in rows] == [0, 0, 0, 0, 0]
286285
assert [row.deleted_data_files_count for row in rows] == [0, 1, 0, 0, 0]
287286

288-
287+
289288
@pytest.mark.integration
290289
@pytest.mark.parametrize("format_version", [1, 2])
291-
def test_object_storage_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None:
292-
# TODO: What to do about "tbl.add_files()"?
293-
identifier = "default.object_stored_table"
294-
295-
tbl = _create_table(session_catalog, identifier, {"format-version": format_version, "write.object-storage.enabled": True}, [])
290+
def test_object_storage_data_files(
291+
spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
292+
) -> None:
293+
tbl = _create_table(
294+
session_catalog=session_catalog,
295+
identifier="default.object_stored",
296+
properties={"format-version": format_version, "write.object-storage.enabled": True},
297+
data=[arrow_table_with_null],
298+
)
296299
tbl.append(arrow_table_with_null)
297300

298-
paths = tbl.inspect.entries().to_pydict()["data_file"]
299-
assert len(paths) == 1
300-
location = paths[0]["file_path"]
301+
paths = tbl.inspect.data_files().to_pydict()["file_path"]
302+
assert len(paths) == 2
301303

302-
parts = location.split("/")
303-
assert len(parts) == 11
304+
for location in paths:
305+
assert location.startswith("s3://warehouse/default/object_stored/data/")
306+
parts = location.split("/")
307+
assert len(parts) == 11
304308

305-
assert location.startswith("s3://warehouse/default/object_stored_table/data/")
306-
for i in range(6, 10):
307-
assert len(parts[i]) == (8 if i == 9 else 4)
308-
assert all(c in "01" for c in parts[i])
309+
# Entropy binary directories should have been injected
310+
for i in range(6, 10):
311+
assert parts[i]
312+
assert all(c in "01" for c in parts[i])
309313

310314

311315
@pytest.mark.integration

0 commit comments

Comments (0)