@@ -46,16 +46,26 @@ def field_starting_with_digit(self) -> str:
4646 return self ._data [2 ]
4747
4848
49- def test_avro_field_name_sanitization () -> None :
50- """Test that field names are sanitized according to Java implementation ."""
49+ def test_comprehensive_field_name_sanitization () -> None :
50+ """Test comprehensive field name sanitization including edge cases and Java compatibility ."""
5151
52- # Test cases from Java TestSchemaConversions.java
5352 test_cases = [
53+ # Java compatibility test cases
5454 ("9x" , "_9x" ),
5555 ("x_" , "x_" ),
5656 ("a.b" , "a_x2Eb" ),
5757 ("☃" , "_x2603" ),
5858 ("a#b" , "a_x23b" ),
59+ ("123" , "_123" ),
60+ ("_" , "_" ),
61+ ("a" , "a" ),
62+ ("a1" , "a1" ),
63+ ("1a" , "_1a" ),
64+ ("a☃b" , "a_x2603b" ),
65+ ("name#with#hash" , "name_x23with_x23hash" ),
66+ ("123number" , "_123number" ),
67+ ("😎" , "_x1F60E" ),
68+ ("😎_with_text" , "_x1F60E_with_text" ),
5969 ]
6070
6171 for original_name , expected_sanitized in test_cases :
@@ -72,53 +82,22 @@ def test_avro_field_name_sanitization() -> None:
7282 assert ICEBERG_FIELD_NAME_PROP not in avro_dict ["fields" ][0 ]
7383
7484
75- def test_complex_schema_sanitization () -> None :
76- """Test sanitization with nested schemas."""
77- schema = Schema (
78- NestedField (field_id = 1 , name = "valid_field" , field_type = StringType (), required = True ),
79- NestedField (field_id = 2 , name = "invalid.field" , field_type = IntegerType (), required = True ),
80- )
81-
82- avro_schema : AvroType = AvroSchemaConversion ().iceberg_to_avro (schema )
83- avro_dict : Dict [str , Any ] = avro_schema
84-
85- assert avro_dict ["fields" ][0 ]["name" ] == "valid_field"
86- assert ICEBERG_FIELD_NAME_PROP not in avro_dict ["fields" ][0 ]
87-
88- assert avro_dict ["fields" ][1 ]["name" ] == "invalid_x2Efield"
89- assert avro_dict ["fields" ][1 ][ICEBERG_FIELD_NAME_PROP ] == "invalid.field"
90-
91-
92- def test_edge_cases () -> None :
93- """Test edge cases for sanitization."""
94- edge_cases = [
95- ("123" , "_123" ),
96- ("_" , "_" ),
97- ("a" , "a" ),
98- ("a1" , "a1" ),
99- ("1a" , "_1a" ),
100- ]
101-
102- for original_name , expected_sanitized in edge_cases :
103- schema = Schema (NestedField (field_id = 1 , name = original_name , field_type = StringType (), required = True ))
104-
105- avro_schema : AvroType = AvroSchemaConversion ().iceberg_to_avro (schema )
106- avro_dict : Dict [str , Any ] = avro_schema
107- assert avro_dict ["fields" ][0 ]["name" ] == expected_sanitized
108-
109-
110- def test_avro_compatibility () -> None :
111- """Test that Avro files with sanitized names can be read by other tools."""
85+ def test_comprehensive_avro_compatibility () -> None :
86+ """Test comprehensive Avro compatibility including complex schemas and file structure."""
11287
88+ # Create schema with various field name types
11389 schema = Schema (
11490 NestedField (field_id = 1 , name = "valid_field" , field_type = StringType (), required = True ),
11591 NestedField (field_id = 2 , name = "invalid.field" , field_type = IntegerType (), required = True ),
11692 NestedField (field_id = 3 , name = "9x" , field_type = StringType (), required = True ),
93+ NestedField (field_id = 4 , name = "name#with#hash" , field_type = StringType (), required = True ),
94+ NestedField (field_id = 5 , name = "☃" , field_type = IntegerType (), required = True ),
95+ NestedField (field_id = 6 , name = "😎" , field_type = IntegerType (), required = True ),
11796 )
11897
11998 test_records = [
120- AvroTestRecord ("hello" , 42 , "test" ),
121- AvroTestRecord ("goodbye" , 99 , "example" ),
99+ AvroTestRecord ("hello" , 42 , "test" , "hash_value" , 100 , 200 ),
100+ AvroTestRecord ("goodbye" , 99 , "example" , "another_hash" , 200 , 300 ),
122101 ]
123102
124103 with tempfile .NamedTemporaryFile (suffix = ".avro" , delete = False ) as tmp_file :
@@ -134,6 +113,16 @@ def test_avro_compatibility() -> None:
134113 output_file .write_block (test_records )
135114
136115 with open (tmp_avro_file , "rb" ) as fo :
116+ # Test Avro file structure
117+ magic = fo .read (4 )
118+ assert magic == b"Obj\x01 " # Avro magic bytes
119+
120+ import struct
121+
122+ metadata_length = struct .unpack (">I" , fo .read (4 ))[0 ]
123+ assert metadata_length > 0
124+
125+ fo .seek (0 )
137126 avro_reader = reader (fo )
138127
139128 avro_schema : AvroType = avro_reader .writer_schema
@@ -145,10 +134,14 @@ def test_avro_compatibility() -> None:
145134 "valid_field" ,
146135 "invalid_x2Efield" ,
147136 "_9x" ,
137+ "name_x23with_x23hash" ,
138+ "_x2603" ,
139+ "_x1F60E" ,
148140 ]
149141
150142 assert field_names == expected_field_names
151143
144+ # Verify iceberg-field-name properties
152145 for field in avro_dict ["fields" ]:
153146 field_dict : Dict [str , Any ] = field
154147 if field_dict ["name" ] == "invalid_x2Efield" :
@@ -157,22 +150,37 @@ def test_avro_compatibility() -> None:
157150 elif field_dict ["name" ] == "_9x" :
158151 assert "iceberg-field-name" in field_dict
159152 assert field_dict ["iceberg-field-name" ] == "9x"
153+ elif field_dict ["name" ] == "name_x23with_x23hash" :
154+ assert "iceberg-field-name" in field_dict
155+ assert field_dict ["iceberg-field-name" ] == "name#with#hash"
156+ elif field_dict ["name" ] == "_x2603" :
157+ assert "iceberg-field-name" in field_dict
158+ assert field_dict ["iceberg-field-name" ] == "☃"
159+ elif field_dict ["name" ] == "_x1F60E" :
160+ assert "iceberg-field-name" in field_dict
161+ assert field_dict ["iceberg-field-name" ] == "😎"
160162 else :
161163 assert "iceberg-field-name" not in field_dict
162164
163165 records = list (avro_reader )
164-
165166 assert len (records ) == 2
166167
168+ # Verify data integrity
167169 first_record = records [0 ]
168170 assert first_record ["valid_field" ] == "hello"
169171 assert first_record ["invalid_x2Efield" ] == 42
170172 assert first_record ["_9x" ] == "test"
173+ assert first_record ["name_x23with_x23hash" ] == "hash_value"
174+ assert first_record ["_x2603" ] == 100
175+ assert first_record ["_x1F60E" ] == 200
171176
172177 second_record = records [1 ]
173178 assert second_record ["valid_field" ] == "goodbye"
174179 assert second_record ["invalid_x2Efield" ] == 99
175180 assert second_record ["_9x" ] == "example"
181+ assert second_record ["name_x23with_x23hash" ] == "another_hash"
182+ assert second_record ["_x2603" ] == 200
183+ assert second_record ["_x1F60E" ] == 300
176184
177185 assert avro_reader .metadata .get ("test" ) == "metadata"
178186
@@ -183,132 +191,6 @@ def test_avro_compatibility() -> None:
183191 os .unlink (tmp_avro_file )
184192
185193
186- def test_avro_schema_conversion_sanitization () -> None :
187- """Test that schema conversion properly sanitizes field names."""
188-
189- # Create schema with various invalid field names
190- schema = Schema (
191- NestedField (field_id = 1 , name = "valid_name" , field_type = StringType (), required = True ),
192- NestedField (field_id = 2 , name = "invalid.name" , field_type = IntegerType (), required = True ),
193- NestedField (field_id = 3 , name = "name#with#hash" , field_type = StringType (), required = True ),
194- NestedField (field_id = 4 , name = "☃" , field_type = IntegerType (), required = True ), # Unicode character
195- NestedField (field_id = 5 , name = "123number" , field_type = StringType (), required = True ),
196- )
197-
198- avro_schema : AvroType = AvroSchemaConversion ().iceberg_to_avro (schema , schema_name = "test_schema" )
199- avro_dict : Dict [str , Any ] = avro_schema
200-
201- field_names = [field ["name" ] for field in avro_dict ["fields" ]]
202- expected_field_names = [
203- "valid_name" , # Valid name, unchanged
204- "invalid_x2Ename" , # Dot becomes _x2E
205- "name_x23with_x23hash" , # Hash becomes _x23
206- "_x2603" , # Unicode snowman becomes _x2603
207- "_123number" , # Starts with digit, gets leading underscore
208- ]
209-
210- assert field_names == expected_field_names
211-
212- for field in avro_dict ["fields" ]:
213- field_dict : Dict [str , Any ] = field
214- if field_dict ["name" ] == "invalid_x2Ename" :
215- assert field_dict ["iceberg-field-name" ] == "invalid.name"
216- elif field_dict ["name" ] == "name_x23with_x23hash" :
217- assert field_dict ["iceberg-field-name" ] == "name#with#hash"
218- elif field_dict ["name" ] == "_x2603" :
219- assert field_dict ["iceberg-field-name" ] == "☃"
220- elif field_dict ["name" ] == "_123number" :
221- assert field_dict ["iceberg-field-name" ] == "123number"
222- else :
223- assert "iceberg-field-name" not in field_dict
224-
225-
226- def test_avro_file_structure_verification () -> None :
227- """Test that the Avro file structure is correct and can be parsed."""
228-
229- schema = Schema (
230- NestedField (field_id = 1 , name = "test.field" , field_type = StringType (), required = True ),
231- )
232-
233- test_records = [AvroTestRecord ("hello" )]
234-
235- with tempfile .NamedTemporaryFile (suffix = ".avro" , delete = False ) as tmp_file :
236- tmp_avro_file = tmp_file .name
237-
238- try :
239- with avro .AvroOutputFile [AvroTestRecord ](
240- output_file = PyArrowFileIO ().new_output (tmp_avro_file ),
241- file_schema = schema ,
242- schema_name = "simple_test" ,
243- ) as output_file :
244- output_file .write_block (test_records )
245-
246- with open (tmp_avro_file , "rb" ) as fo :
247- # Read magic bytes (first 4 bytes should be Avro magic)
248- magic = fo .read (4 )
249- assert magic == b"Obj\x01 " # Avro magic bytes
250-
251- import struct
252-
253- metadata_length = struct .unpack (">I" , fo .read (4 ))[0 ]
254- assert metadata_length > 0
255-
256- from fastavro import reader
257-
258- fo .seek (0 )
259- avro_reader = reader (fo )
260-
261- avro_schema : AvroType = avro_reader .writer_schema
262- avro_dict : Dict [str , Any ] = avro_schema
263-
264- assert len (avro_dict ["fields" ]) == 1
265- field : Dict [str , Any ] = avro_dict ["fields" ][0 ]
266- assert field ["name" ] == "test_x2Efield"
267- assert field ["iceberg-field-name" ] == "test.field"
268-
269- records = list (avro_reader )
270- assert len (records ) == 1
271- assert records [0 ]["test_x2Efield" ] == "hello"
272-
273- finally :
274- import os
275-
276- if os .path .exists (tmp_avro_file ):
277- os .unlink (tmp_avro_file )
278-
279-
280- def test_edge_cases_sanitization () -> None :
281- """Test edge cases for field name sanitization."""
282-
283- test_cases = [
284- ("123" , "_123" ), # All digits
285- ("_" , "_" ), # Just underscore
286- ("a" , "a" ), # Single letter
287- ("a1" , "a1" ), # Letter followed by digit
288- ("1a" , "_1a" ), # Digit followed by letter
289- ("a.b" , "a_x2Eb" ), # Letter, dot, letter
290- ("a#b" , "a_x23b" ), # Letter, hash, letter
291- ("☃" , "_x2603" ), # Unicode character
292- ("a☃b" , "a_x2603b" ), # Letter, unicode, letter
293- ]
294-
295- for original_name , expected_sanitized in test_cases :
296- schema = Schema (
297- NestedField (field_id = 1 , name = original_name , field_type = StringType (), required = True ),
298- )
299-
300- avro_schema : AvroType = AvroSchemaConversion ().iceberg_to_avro (schema , schema_name = "edge_test" )
301- avro_dict : Dict [str , Any ] = avro_schema
302-
303- field : Dict [str , Any ] = avro_dict ["fields" ][0 ]
304- assert field ["name" ] == expected_sanitized
305-
306- if original_name != expected_sanitized :
307- assert field ["iceberg-field-name" ] == original_name
308- else :
309- assert "iceberg-field-name" not in field
310-
311-
312194def test_emoji_field_name_sanitization () -> None :
313195 """Test that emoji field names are properly sanitized according to Java implementation."""
314196
0 commit comments