Skip to content

Commit d3e0c0f

Browse files
author
Sreesh Maheshwar
committed
Add unit tests
1 parent ce5f0d5 commit d3e0c0f

1 file changed

Lines changed: 135 additions & 0 deletions

File tree

tests/table/test_locations.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
from typing import Optional
18+
19+
import pytest
20+
21+
from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
22+
from pyiceberg.schema import Schema
23+
from pyiceberg.table import (
24+
LocationProvider,
25+
load_location_provider,
26+
)
27+
from pyiceberg.transforms import IdentityTransform
28+
from pyiceberg.typedef import EMPTY_DICT
29+
from pyiceberg.types import NestedField, StringType
30+
31+
# Shared fixtures: a schema with one optional string column ...
TABLE_SCHEMA = Schema(
    NestedField(field_id=2, name="field", field_type=StringType(), required=False),
)

# ... identity-partitioned on that column. Both the partition field name and the
# partition value below contain a "#" so the tests can observe which parts of the
# partition path get URL-encoded.
PARTITION_FIELD = PartitionField(
    source_id=2,
    field_id=1002,
    transform=IdentityTransform(),
    name="part#field",
)
PARTITION_SPEC = PartitionSpec(PARTITION_FIELD)
PARTITION_KEY = PartitionKey(
    raw_partition_field_values=[PartitionFieldValue(PARTITION_FIELD, "example#val")],
    partition_spec=PARTITION_SPEC,
    schema=TABLE_SCHEMA,
)
39+
40+
41+
class CustomLocationProvider(LocationProvider):
    """Stub provider used to check that a user-specified implementation gets loaded."""

    def new_data_location(self, data_file_name: str, partition_key: Optional[PartitionKey] = None) -> str:
        """Return ``data_file_name`` under a fixed prefix; ``partition_key`` is ignored."""
        location = f"custom_location_provider/{data_file_name}"
        return location
44+
45+
46+
def test_default_location_provider() -> None:
    """Check that, with no properties set, data files are placed under ``<table_location>/data``."""
    default_provider = load_location_provider(table_location="table_location", table_properties=EMPTY_DICT)

    data_location = default_provider.new_data_location("my_file")

    assert data_location == "table_location/data/my_file"
50+
51+
52+
def test_custom_location_provider() -> None:
    """Check that a provider named by its fully qualified class path is loaded and used."""
    provider_cls = CustomLocationProvider
    impl_path = f"{provider_cls.__module__}.{provider_cls.__name__}"
    properties = {"write.location-provider.impl": impl_path}

    provider = load_location_provider(table_location="table_location", table_properties=properties)

    assert provider.new_data_location("my_file") == "custom_location_provider/my_file"
59+
60+
61+
def test_custom_location_provider_single_path() -> None:
    """Check that an impl value that is not a dotted path is rejected with a ValueError."""
    properties = {"write.location-provider.impl": "not_found"}
    expected_message = r"write\.location-provider\.impl should be full path"

    with pytest.raises(ValueError, match=expected_message):
        load_location_provider(table_location="table_location", table_properties=properties)
64+
65+
66+
def test_custom_location_provider_not_found() -> None:
    """Check that a dotted impl path that cannot be loaded raises a ValueError."""
    properties = {"write.location-provider.impl": "module.not_found"}
    expected_message = r"Could not initialize LocationProvider"

    with pytest.raises(ValueError, match=expected_message):
        load_location_provider(table_location="table_location", table_properties=properties)
71+
72+
73+
def test_object_storage_injects_entropy() -> None:
    """Check the shape of the entropy directories added when object storage is enabled."""
    properties = {"write.object-storage.enabled": "true"}
    provider = load_location_provider(table_location="table_location", table_properties=properties)

    segments = provider.new_data_location("test.parquet").split("/")

    # Expected layout: <table_location>/data/<4 entropy directories>/<file name>
    assert len(segments) == 7
    table_root, data_dir, *entropy_dirs, file_name = segments
    assert table_root == "table_location"
    assert data_dir == "data"
    assert file_name == "test.parquet"

    # Entropy directories should be 4 binary names of lengths 4, 4, 4, 8.
    for entropy_dir, expected_length in zip(entropy_dirs, (4, 4, 4, 8)):
        assert len(entropy_dir) == expected_length
        assert set(entropy_dir) <= {"0", "1"}
89+
90+
91+
@pytest.mark.parametrize("object_storage", [True, False])
def test_partition_value_in_path(object_storage: bool) -> None:
    """Check how the partition directory is rendered, with and without object storage."""
    properties = {"write.object-storage.enabled": str(object_storage)}
    provider = load_location_provider(table_location="table_location", table_properties=properties)

    path_segments = provider.new_data_location("test.parquet", PARTITION_KEY).split("/")
    # The partition directory is the direct parent of the data file.
    partition_segment = path_segments[-2]

    # Field name is not encoded but partition value is - this differs from the Java implementation
    # https://github.com/apache/iceberg/blob/cdf748e8e5537f13d861aa4c617a51f3e11dc97c/core/src/test/java/org/apache/iceberg/TestLocationProvider.java#L304
    assert partition_segment == "part#field=example%23val"
106+
107+
108+
def test_object_storage_exclude_partition_in_path() -> None:
    """Check the data path when object storage is on and partitioned paths are turned off."""
    properties = {
        "write.object-storage.enabled": "true",
        "write.object-storage.partitioned-paths": "false",
    }
    provider = load_location_provider(table_location="table_location", table_properties=properties)

    location = provider.new_data_location("test.parquet", PARTITION_KEY)

    # No partition values are included in the path, and the last part of the entropy
    # is separated from the file name with "-" rather than "/".
    assert location == "table_location/data/0110/1010/0011/11101000-test.parquet"
121+
122+
123+
@pytest.mark.parametrize(
    ("data_file_name", "expected_hash"),
    [
        ("a", "0101/0110/1001/10110010"),
        ("b", "1110/0111/1110/00000011"),
        ("c", "0010/1101/0110/01011111"),
        ("d", "1001/0001/0100/01110011"),
    ],
)
def test_hash_injection(data_file_name: str, expected_hash: str) -> None:
    """Pin the exact entropy directories produced for a handful of known file names."""
    properties = {"write.object-storage.enabled": "true"}
    provider = load_location_provider(table_location="table_location", table_properties=properties)

    expected_location = f"table_location/data/{expected_hash}/{data_file_name}"

    assert provider.new_data_location(data_file_name) == expected_location

0 commit comments

Comments
 (0)