Consolidate unit tests

kris-gaudel · kris-gaudel · commit ed750629d579 · 2025-08-06T13:32:31.000-04:00
diff --git a/pyiceberg/utils/schema_conversion.py b/pyiceberg/utils/schema_conversion.py
@@ -531,17 +531,17 @@ def field(self, field: NestedField, field_result: AvroType) -> AvroType:
         if isinstance(field_result, dict) and field_result.get("type") == "record":
             field_result["name"] = f"r{field.field_id}"
 
-        orig_field_name = field.name
-        field_name = make_compatible_name(orig_field_name)
+        original_name = field.name
+        sanitized_name = make_compatible_name(original_name)
 
         result = {
-            "name": field_name,
+            "name": sanitized_name,
             FIELD_ID_PROP: field.field_id,
             "type": field_result if field.required else ["null", field_result],
         }
 
-        if orig_field_name != field_name:
-            result[ICEBERG_FIELD_NAME_PROP] = orig_field_name
+        if original_name != sanitized_name:
+            result[ICEBERG_FIELD_NAME_PROP] = original_name
 
         if field.write_default is not None:
             result["default"] = field.write_default
diff --git a/tests/test_avro_sanitization.py b/tests/test_avro_sanitization.py
@@ -46,16 +46,26 @@ def field_starting_with_digit(self) -> str:
         return self._data[2]
 
 
-def test_avro_field_name_sanitization() -> None:
-    """Test that field names are sanitized according to Java implementation."""
+def test_comprehensive_field_name_sanitization() -> None:
+    """Test comprehensive field name sanitization including edge cases and Java compatibility."""
 
-    # Test cases from Java TestSchemaConversions.java
     test_cases = [
+        # Java compatibility test cases first, followed by additional edge cases
         ("9x", "_9x"),
         ("x_", "x_"),
         ("a.b", "a_x2Eb"),
         ("☃", "_x2603"),
         ("a#b", "a_x23b"),
+        ("123", "_123"),
+        ("_", "_"),
+        ("a", "a"),
+        ("a1", "a1"),
+        ("1a", "_1a"),
+        ("a☃b", "a_x2603b"),
+        ("name#with#hash", "name_x23with_x23hash"),
+        ("123number", "_123number"),
+        ("😎", "_x1F60E"),
+        ("😎_with_text", "_x1F60E_with_text"),
     ]
 
     for original_name, expected_sanitized in test_cases:
@@ -72,53 +82,22 @@ def test_avro_field_name_sanitization() -> None:
             assert ICEBERG_FIELD_NAME_PROP not in avro_dict["fields"][0]
 
 
-def test_complex_schema_sanitization() -> None:
-    """Test sanitization with nested schemas."""
-    schema = Schema(
-        NestedField(field_id=1, name="valid_field", field_type=StringType(), required=True),
-        NestedField(field_id=2, name="invalid.field", field_type=IntegerType(), required=True),
-    )
-
-    avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema)
-    avro_dict: Dict[str, Any] = avro_schema
-
-    assert avro_dict["fields"][0]["name"] == "valid_field"
-    assert ICEBERG_FIELD_NAME_PROP not in avro_dict["fields"][0]
-
-    assert avro_dict["fields"][1]["name"] == "invalid_x2Efield"
-    assert avro_dict["fields"][1][ICEBERG_FIELD_NAME_PROP] == "invalid.field"
-
-
-def test_edge_cases() -> None:
-    """Test edge cases for sanitization."""
-    edge_cases = [
-        ("123", "_123"),
-        ("_", "_"),
-        ("a", "a"),
-        ("a1", "a1"),
-        ("1a", "_1a"),
-    ]
-
-    for original_name, expected_sanitized in edge_cases:
-        schema = Schema(NestedField(field_id=1, name=original_name, field_type=StringType(), required=True))
-
-        avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema)
-        avro_dict: Dict[str, Any] = avro_schema
-        assert avro_dict["fields"][0]["name"] == expected_sanitized
-
-
-def test_avro_compatibility() -> None:
-    """Test that Avro files with sanitized names can be read by other tools."""
+def test_comprehensive_avro_compatibility() -> None:
+    """Test comprehensive Avro compatibility including complex schemas and file structure."""
 
+    # Create schema with various field name types
     schema = Schema(
         NestedField(field_id=1, name="valid_field", field_type=StringType(), required=True),
         NestedField(field_id=2, name="invalid.field", field_type=IntegerType(), required=True),
         NestedField(field_id=3, name="9x", field_type=StringType(), required=True),
+        NestedField(field_id=4, name="name#with#hash", field_type=StringType(), required=True),
+        NestedField(field_id=5, name="☃", field_type=IntegerType(), required=True),
+        NestedField(field_id=6, name="😎", field_type=IntegerType(), required=True),
     )
 
     test_records = [
-        AvroTestRecord("hello", 42, "test"),
-        AvroTestRecord("goodbye", 99, "example"),
+        AvroTestRecord("hello", 42, "test", "hash_value", 100, 200),
+        AvroTestRecord("goodbye", 99, "example", "another_hash", 200, 300),
     ]
 
     with tempfile.NamedTemporaryFile(suffix=".avro", delete=False) as tmp_file:
@@ -134,6 +113,16 @@ def test_avro_compatibility() -> None:
             output_file.write_block(test_records)
 
         with open(tmp_avro_file, "rb") as fo:
+            # Test Avro file structure
+            magic = fo.read(4)
+            assert magic == b"Obj\x01"  # Avro magic bytes
+
+            import struct
+
+            metadata_length = struct.unpack(">I", fo.read(4))[0]
+            assert metadata_length > 0
+
+            fo.seek(0)
             avro_reader = reader(fo)
 
             avro_schema: AvroType = avro_reader.writer_schema
@@ -145,10 +134,14 @@ def test_avro_compatibility() -> None:
                 "valid_field",
                 "invalid_x2Efield",
                 "_9x",
+                "name_x23with_x23hash",
+                "_x2603",
+                "_x1F60E",
             ]
 
             assert field_names == expected_field_names
 
+            # Verify iceberg-field-name properties
             for field in avro_dict["fields"]:
                 field_dict: Dict[str, Any] = field
                 if field_dict["name"] == "invalid_x2Efield":
@@ -157,22 +150,37 @@ def test_avro_compatibility() -> None:
                 elif field_dict["name"] == "_9x":
                     assert "iceberg-field-name" in field_dict
                     assert field_dict["iceberg-field-name"] == "9x"
+                elif field_dict["name"] == "name_x23with_x23hash":
+                    assert "iceberg-field-name" in field_dict
+                    assert field_dict["iceberg-field-name"] == "name#with#hash"
+                elif field_dict["name"] == "_x2603":
+                    assert "iceberg-field-name" in field_dict
+                    assert field_dict["iceberg-field-name"] == "☃"
+                elif field_dict["name"] == "_x1F60E":
+                    assert "iceberg-field-name" in field_dict
+                    assert field_dict["iceberg-field-name"] == "😎"
                 else:
                     assert "iceberg-field-name" not in field_dict
 
             records = list(avro_reader)
-
             assert len(records) == 2
 
+            # Verify data integrity
             first_record = records[0]
             assert first_record["valid_field"] == "hello"
             assert first_record["invalid_x2Efield"] == 42
             assert first_record["_9x"] == "test"
+            assert first_record["name_x23with_x23hash"] == "hash_value"
+            assert first_record["_x2603"] == 100
+            assert first_record["_x1F60E"] == 200
 
             second_record = records[1]
             assert second_record["valid_field"] == "goodbye"
             assert second_record["invalid_x2Efield"] == 99
             assert second_record["_9x"] == "example"
+            assert second_record["name_x23with_x23hash"] == "another_hash"
+            assert second_record["_x2603"] == 200
+            assert second_record["_x1F60E"] == 300
 
             assert avro_reader.metadata.get("test") == "metadata"
 
@@ -183,132 +191,6 @@ def test_avro_compatibility() -> None:
             os.unlink(tmp_avro_file)
 
 
-def test_avro_schema_conversion_sanitization() -> None:
-    """Test that schema conversion properly sanitizes field names."""
-
-    # Create schema with various invalid field names
-    schema = Schema(
-        NestedField(field_id=1, name="valid_name", field_type=StringType(), required=True),
-        NestedField(field_id=2, name="invalid.name", field_type=IntegerType(), required=True),
-        NestedField(field_id=3, name="name#with#hash", field_type=StringType(), required=True),
-        NestedField(field_id=4, name="☃", field_type=IntegerType(), required=True),  # Unicode character
-        NestedField(field_id=5, name="123number", field_type=StringType(), required=True),
-    )
-
-    avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema, schema_name="test_schema")
-    avro_dict: Dict[str, Any] = avro_schema
-
-    field_names = [field["name"] for field in avro_dict["fields"]]
-    expected_field_names = [
-        "valid_name",  # Valid name, unchanged
-        "invalid_x2Ename",  # Dot becomes _x2E
-        "name_x23with_x23hash",  # Hash becomes _x23
-        "_x2603",  # Unicode snowman becomes _x2603
-        "_123number",  # Starts with digit, gets leading underscore
-    ]
-
-    assert field_names == expected_field_names
-
-    for field in avro_dict["fields"]:
-        field_dict: Dict[str, Any] = field
-        if field_dict["name"] == "invalid_x2Ename":
-            assert field_dict["iceberg-field-name"] == "invalid.name"
-        elif field_dict["name"] == "name_x23with_x23hash":
-            assert field_dict["iceberg-field-name"] == "name#with#hash"
-        elif field_dict["name"] == "_x2603":
-            assert field_dict["iceberg-field-name"] == "☃"
-        elif field_dict["name"] == "_123number":
-            assert field_dict["iceberg-field-name"] == "123number"
-        else:
-            assert "iceberg-field-name" not in field_dict
-
-
-def test_avro_file_structure_verification() -> None:
-    """Test that the Avro file structure is correct and can be parsed."""
-
-    schema = Schema(
-        NestedField(field_id=1, name="test.field", field_type=StringType(), required=True),
-    )
-
-    test_records = [AvroTestRecord("hello")]
-
-    with tempfile.NamedTemporaryFile(suffix=".avro", delete=False) as tmp_file:
-        tmp_avro_file = tmp_file.name
-
-    try:
-        with avro.AvroOutputFile[AvroTestRecord](
-            output_file=PyArrowFileIO().new_output(tmp_avro_file),
-            file_schema=schema,
-            schema_name="simple_test",
-        ) as output_file:
-            output_file.write_block(test_records)
-
-        with open(tmp_avro_file, "rb") as fo:
-            # Read magic bytes (first 4 bytes should be Avro magic)
-            magic = fo.read(4)
-            assert magic == b"Obj\x01"  # Avro magic bytes
-
-            import struct
-
-            metadata_length = struct.unpack(">I", fo.read(4))[0]
-            assert metadata_length > 0
-
-            from fastavro import reader
-
-            fo.seek(0)
-            avro_reader = reader(fo)
-
-            avro_schema: AvroType = avro_reader.writer_schema
-            avro_dict: Dict[str, Any] = avro_schema
-
-            assert len(avro_dict["fields"]) == 1
-            field: Dict[str, Any] = avro_dict["fields"][0]
-            assert field["name"] == "test_x2Efield"
-            assert field["iceberg-field-name"] == "test.field"
-
-            records = list(avro_reader)
-            assert len(records) == 1
-            assert records[0]["test_x2Efield"] == "hello"
-
-    finally:
-        import os
-
-        if os.path.exists(tmp_avro_file):
-            os.unlink(tmp_avro_file)
-
-
-def test_edge_cases_sanitization() -> None:
-    """Test edge cases for field name sanitization."""
-
-    test_cases = [
-        ("123", "_123"),  # All digits
-        ("_", "_"),  # Just underscore
-        ("a", "a"),  # Single letter
-        ("a1", "a1"),  # Letter followed by digit
-        ("1a", "_1a"),  # Digit followed by letter
-        ("a.b", "a_x2Eb"),  # Letter, dot, letter
-        ("a#b", "a_x23b"),  # Letter, hash, letter
-        ("☃", "_x2603"),  # Unicode character
-        ("a☃b", "a_x2603b"),  # Letter, unicode, letter
-    ]
-
-    for original_name, expected_sanitized in test_cases:
-        schema = Schema(
-            NestedField(field_id=1, name=original_name, field_type=StringType(), required=True),
-        )
-
-        avro_schema: AvroType = AvroSchemaConversion().iceberg_to_avro(schema, schema_name="edge_test")
-        avro_dict: Dict[str, Any] = avro_schema
-
-        field: Dict[str, Any] = avro_dict["fields"][0]
-        assert field["name"] == expected_sanitized
-
-        if original_name != expected_sanitized:
-            assert field["iceberg-field-name"] == original_name
-        else:
-            assert "iceberg-field-name" not in field
-
-
 def test_emoji_field_name_sanitization() -> None:
     """Test that emoji field names are properly sanitized according to Java implementation."""