Skip to content

Commit ed75062

Browse files
committed
Consolidate unit tests
1 parent 1dc1c01 commit ed75062

2 files changed

Lines changed: 56 additions & 174 deletions

File tree

pyiceberg/utils/schema_conversion.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -531,17 +531,17 @@ def field(self, field: NestedField, field_result: AvroType) -> AvroType:
531531
if isinstance(field_result, dict) and field_result.get("type") == "record":
532532
field_result["name"] = f"r{field.field_id}"
533533

534-
orig_field_name = field.name
535-
field_name = make_compatible_name(orig_field_name)
534+
original_name = field.name
535+
sanitized_name = make_compatible_name(original_name)
536536

537537
result = {
538-
"name": field_name,
538+
"name": sanitized_name,
539539
FIELD_ID_PROP: field.field_id,
540540
"type": field_result if field.required else ["null", field_result],
541541
}
542542

543-
if orig_field_name != field_name:
544-
result[ICEBERG_FIELD_NAME_PROP] = orig_field_name
543+
if original_name != sanitized_name:
544+
result[ICEBERG_FIELD_NAME_PROP] = original_name
545545

546546
if field.write_default is not None:
547547
result["default"] = field.write_default

tests/test_avro_sanitization.py

Lines changed: 51 additions & 169 deletions
Original file line number | Diff line number | Diff line change
@@ -46,16 +46,26 @@ def field_starting_with_digit(self) -> str:
4646
return self._data[2]
4747

4848

49-
def test_avro_field_name_sanitization() -> None:
50-
"""Test that field names are sanitized according to Java implementation."""
49+
def test_comprehensive_field_name_sanitization() -> None:
50+
"""Test comprehensive field name sanitization including edge cases and Java compatibility."""
5151

52-
# Test cases from Java TestSchemaConversions.java
5352
test_cases = [
53+
# Java compatibility test cases
5454
("9x", "_9x"),
5555
("x_", "x_"),
5656
("a.b", "a_x2Eb"),
5757
("☃", "_x2603"),
5858
("a#b", "a_x23b"),
59+
("123", "_123"),
60+
("_", "_"),
61+
("a", "a"),
62+
("a1", "a1"),
63+
("1a", "_1a"),
64+
("a☃b", "a_x2603b"),
65+
("name#with#hash", "name_x23with_x23hash"),
66+
("123number", "_123number"),
67+
("😎", "_x1F60E"),
68+
("😎_with_text", "_x1F60E_with_text"),
5969
]
6070

6171
for original_name, expected_sanitized in test_cases:
@@ -72,53 +82,22 @@ def test_avro_field_name_sanitization() -> None:
7282
assert ICEBERG_FIELD_NAME_PROP not in avro_dict["fields"][0]
7383

7484

75-
def test_complex_schema_sanitization() -> None:
76-
"""Test sanitization with nested schemas."""
77-
schema = Schema(
78-
NestedField(field_id=1, name="valid_field", field_type=StringType(), required=True),
79-
NestedField(field_id=2, name="invalid.field", field_type=IntegerType(), required=True),
80-
)
81-
82-
avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema)
83-
avro_dict: Dict[str, Any] = avro_schema
84-
85-
assert avro_dict["fields"][0]["name"] == "valid_field"
86-
assert ICEBERG_FIELD_NAME_PROP not in avro_dict["fields"][0]
87-
88-
assert avro_dict["fields"][1]["name"] == "invalid_x2Efield"
89-
assert avro_dict["fields"][1][ICEBERG_FIELD_NAME_PROP] == "invalid.field"
90-
91-
92-
def test_edge_cases() -> None:
93-
"""Test edge cases for sanitization."""
94-
edge_cases = [
95-
("123", "_123"),
96-
("_", "_"),
97-
("a", "a"),
98-
("a1", "a1"),
99-
("1a", "_1a"),
100-
]
101-
102-
for original_name, expected_sanitized in edge_cases:
103-
schema = Schema(NestedField(field_id=1, name=original_name, field_type=StringType(), required=True))
104-
105-
avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema)
106-
avro_dict: Dict[str, Any] = avro_schema
107-
assert avro_dict["fields"][0]["name"] == expected_sanitized
108-
109-
110-
def test_avro_compatibility() -> None:
111-
"""Test that Avro files with sanitized names can be read by other tools."""
85+
def test_comprehensive_avro_compatibility() -> None:
86+
"""Test comprehensive Avro compatibility including complex schemas and file structure."""
11287

88+
# Create schema with various field name types
11389
schema = Schema(
11490
NestedField(field_id=1, name="valid_field", field_type=StringType(), required=True),
11591
NestedField(field_id=2, name="invalid.field", field_type=IntegerType(), required=True),
11692
NestedField(field_id=3, name="9x", field_type=StringType(), required=True),
93+
NestedField(field_id=4, name="name#with#hash", field_type=StringType(), required=True),
94+
NestedField(field_id=5, name="☃", field_type=IntegerType(), required=True),
95+
NestedField(field_id=6, name="😎", field_type=IntegerType(), required=True),
11796
)
11897

11998
test_records = [
120-
AvroTestRecord("hello", 42, "test"),
121-
AvroTestRecord("goodbye", 99, "example"),
99+
AvroTestRecord("hello", 42, "test", "hash_value", 100, 200),
100+
AvroTestRecord("goodbye", 99, "example", "another_hash", 200, 300),
122101
]
123102

124103
with tempfile.NamedTemporaryFile(suffix=".avro", delete=False) as tmp_file:
@@ -134,6 +113,16 @@ def test_avro_compatibility() -> None:
134113
output_file.write_block(test_records)
135114

136115
with open(tmp_avro_file, "rb") as fo:
116+
# Test Avro file structure
117+
magic = fo.read(4)
118+
assert magic == b"Obj\x01" # Avro magic bytes
119+
120+
import struct
121+
122+
metadata_length = struct.unpack(">I", fo.read(4))[0]
123+
assert metadata_length > 0
124+
125+
fo.seek(0)
137126
avro_reader = reader(fo)
138127

139128
avro_schema: AvroType = avro_reader.writer_schema
@@ -145,10 +134,14 @@ def test_avro_compatibility() -> None:
145134
"valid_field",
146135
"invalid_x2Efield",
147136
"_9x",
137+
"name_x23with_x23hash",
138+
"_x2603",
139+
"_x1F60E",
148140
]
149141

150142
assert field_names == expected_field_names
151143

144+
# Verify iceberg-field-name properties
152145
for field in avro_dict["fields"]:
153146
field_dict: Dict[str, Any] = field
154147
if field_dict["name"] == "invalid_x2Efield":
@@ -157,22 +150,37 @@ def test_avro_compatibility() -> None:
157150
elif field_dict["name"] == "_9x":
158151
assert "iceberg-field-name" in field_dict
159152
assert field_dict["iceberg-field-name"] == "9x"
153+
elif field_dict["name"] == "name_x23with_x23hash":
154+
assert "iceberg-field-name" in field_dict
155+
assert field_dict["iceberg-field-name"] == "name#with#hash"
156+
elif field_dict["name"] == "_x2603":
157+
assert "iceberg-field-name" in field_dict
158+
assert field_dict["iceberg-field-name"] == "☃"
159+
elif field_dict["name"] == "_x1F60E":
160+
assert "iceberg-field-name" in field_dict
161+
assert field_dict["iceberg-field-name"] == "😎"
160162
else:
161163
assert "iceberg-field-name" not in field_dict
162164

163165
records = list(avro_reader)
164-
165166
assert len(records) == 2
166167

168+
# Verify data integrity
167169
first_record = records[0]
168170
assert first_record["valid_field"] == "hello"
169171
assert first_record["invalid_x2Efield"] == 42
170172
assert first_record["_9x"] == "test"
173+
assert first_record["name_x23with_x23hash"] == "hash_value"
174+
assert first_record["_x2603"] == 100
175+
assert first_record["_x1F60E"] == 200
171176

172177
second_record = records[1]
173178
assert second_record["valid_field"] == "goodbye"
174179
assert second_record["invalid_x2Efield"] == 99
175180
assert second_record["_9x"] == "example"
181+
assert second_record["name_x23with_x23hash"] == "another_hash"
182+
assert second_record["_x2603"] == 200
183+
assert second_record["_x1F60E"] == 300
176184

177185
assert avro_reader.metadata.get("test") == "metadata"
178186

@@ -183,132 +191,6 @@ def test_avro_compatibility() -> None:
183191
os.unlink(tmp_avro_file)
184192

185193

186-
def test_avro_schema_conversion_sanitization() -> None:
187-
"""Test that schema conversion properly sanitizes field names."""
188-
189-
# Create schema with various invalid field names
190-
schema = Schema(
191-
NestedField(field_id=1, name="valid_name", field_type=StringType(), required=True),
192-
NestedField(field_id=2, name="invalid.name", field_type=IntegerType(), required=True),
193-
NestedField(field_id=3, name="name#with#hash", field_type=StringType(), required=True),
194-
NestedField(field_id=4, name="☃", field_type=IntegerType(), required=True), # Unicode character
195-
NestedField(field_id=5, name="123number", field_type=StringType(), required=True),
196-
)
197-
198-
avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema, schema_name="test_schema")
199-
avro_dict: Dict[str, Any] = avro_schema
200-
201-
field_names = [field["name"] for field in avro_dict["fields"]]
202-
expected_field_names = [
203-
"valid_name", # Valid name, unchanged
204-
"invalid_x2Ename", # Dot becomes _x2E
205-
"name_x23with_x23hash", # Hash becomes _x23
206-
"_x2603", # Unicode snowman becomes _x2603
207-
"_123number", # Starts with digit, gets leading underscore
208-
]
209-
210-
assert field_names == expected_field_names
211-
212-
for field in avro_dict["fields"]:
213-
field_dict: Dict[str, Any] = field
214-
if field_dict["name"] == "invalid_x2Ename":
215-
assert field_dict["iceberg-field-name"] == "invalid.name"
216-
elif field_dict["name"] == "name_x23with_x23hash":
217-
assert field_dict["iceberg-field-name"] == "name#with#hash"
218-
elif field_dict["name"] == "_x2603":
219-
assert field_dict["iceberg-field-name"] == "☃"
220-
elif field_dict["name"] == "_123number":
221-
assert field_dict["iceberg-field-name"] == "123number"
222-
else:
223-
assert "iceberg-field-name" not in field_dict
224-
225-
226-
def test_avro_file_structure_verification() -> None:
227-
"""Test that the Avro file structure is correct and can be parsed."""
228-
229-
schema = Schema(
230-
NestedField(field_id=1, name="test.field", field_type=StringType(), required=True),
231-
)
232-
233-
test_records = [AvroTestRecord("hello")]
234-
235-
with tempfile.NamedTemporaryFile(suffix=".avro", delete=False) as tmp_file:
236-
tmp_avro_file = tmp_file.name
237-
238-
try:
239-
with avro.AvroOutputFile[AvroTestRecord](
240-
output_file=PyArrowFileIO().new_output(tmp_avro_file),
241-
file_schema=schema,
242-
schema_name="simple_test",
243-
) as output_file:
244-
output_file.write_block(test_records)
245-
246-
with open(tmp_avro_file, "rb") as fo:
247-
# Read magic bytes (first 4 bytes should be Avro magic)
248-
magic = fo.read(4)
249-
assert magic == b"Obj\x01" # Avro magic bytes
250-
251-
import struct
252-
253-
metadata_length = struct.unpack(">I", fo.read(4))[0]
254-
assert metadata_length > 0
255-
256-
from fastavro import reader
257-
258-
fo.seek(0)
259-
avro_reader = reader(fo)
260-
261-
avro_schema: AvroType = avro_reader.writer_schema
262-
avro_dict: Dict[str, Any] = avro_schema
263-
264-
assert len(avro_dict["fields"]) == 1
265-
field: Dict[str, Any] = avro_dict["fields"][0]
266-
assert field["name"] == "test_x2Efield"
267-
assert field["iceberg-field-name"] == "test.field"
268-
269-
records = list(avro_reader)
270-
assert len(records) == 1
271-
assert records[0]["test_x2Efield"] == "hello"
272-
273-
finally:
274-
import os
275-
276-
if os.path.exists(tmp_avro_file):
277-
os.unlink(tmp_avro_file)
278-
279-
280-
def test_edge_cases_sanitization() -> None:
281-
"""Test edge cases for field name sanitization."""
282-
283-
test_cases = [
284-
("123", "_123"), # All digits
285-
("_", "_"), # Just underscore
286-
("a", "a"), # Single letter
287-
("a1", "a1"), # Letter followed by digit
288-
("1a", "_1a"), # Digit followed by letter
289-
("a.b", "a_x2Eb"), # Letter, dot, letter
290-
("a#b", "a_x23b"), # Letter, hash, letter
291-
("☃", "_x2603"), # Unicode character
292-
("a☃b", "a_x2603b"), # Letter, unicode, letter
293-
]
294-
295-
for original_name, expected_sanitized in test_cases:
296-
schema = Schema(
297-
NestedField(field_id=1, name=original_name, field_type=StringType(), required=True),
298-
)
299-
300-
avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema, schema_name="edge_test")
301-
avro_dict: Dict[str, Any] = avro_schema
302-
303-
field: Dict[str, Any] = avro_dict["fields"][0]
304-
assert field["name"] == expected_sanitized
305-
306-
if original_name != expected_sanitized:
307-
assert field["iceberg-field-name"] == original_name
308-
else:
309-
assert "iceberg-field-name" not in field
310-
311-
312194
def test_emoji_field_name_sanitization() -> None:
313195
"""Test that emoji field names are properly sanitized according to Java implementation."""
314196

0 commit comments

Comments (0)