@@ -70,27 +70,28 @@ TEST(SanitizeFieldNameTest, InvalidFieldNames) {
7070 // Field names starting with numbers should be prefixed with underscore
7171 EXPECT_EQ (SanitizeFieldName (" 123field" ), " _123field" );
7272 EXPECT_EQ (SanitizeFieldName (" 0value" ), " _0value" );
73-
73+
7474 // Field names with special characters should have them replaced with underscores
7575 EXPECT_EQ (SanitizeFieldName (" field-name" ), " field_name" );
7676 EXPECT_EQ (SanitizeFieldName (" field.name" ), " field_name" );
7777 EXPECT_EQ (SanitizeFieldName (" field name" ), " field_name" );
7878 EXPECT_EQ (SanitizeFieldName (" field@name" ), " field_name" );
7979 EXPECT_EQ (SanitizeFieldName (" field#name" ), " field_name" );
80-
80+
8181 // Complex field names with multiple issues
8282 EXPECT_EQ (SanitizeFieldName (" 1field-with.special@chars" ), " _1field_with_special_chars" );
8383 EXPECT_EQ (SanitizeFieldName (" user-email" ), " user_email" );
84- EXPECT_EQ (SanitizeFieldName (" 价格" ), " _______" ); // Non-ASCII characters become underscores
84+ EXPECT_EQ (SanitizeFieldName (" 价格" ),
85+ " _______" ); // Non-ASCII characters become underscores
8586}
8687
8788TEST (SanitizeFieldNameTest, EdgeCases) {
8889 // Empty field name
8990 EXPECT_EQ (SanitizeFieldName (" " ), " _empty" );
90-
91+
9192 // Field name with only special characters
9293 EXPECT_EQ (SanitizeFieldName (" @#$" ), " ____" );
93-
94+
9495 // Field name starting with special character
9596 EXPECT_EQ (SanitizeFieldName (" -field" ), " __field" );
9697 EXPECT_EQ (SanitizeFieldName (" .field" ), " __field" );
@@ -230,14 +231,15 @@ TEST(ToAvroNodeVisitorTest, StructType) {
230231
231232TEST (ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
232233 // Test struct with field names that require sanitization
233- StructType struct_type{{SchemaField{/* field_id=*/ 1 , " user-name" , iceberg::string (),
234- /* optional=*/ false },
235- SchemaField{/* field_id=*/ 2 , " email.address" , iceberg::string (),
236- /* optional=*/ true },
237- SchemaField{/* field_id=*/ 3 , " 123field" , iceberg::int32 (),
238- /* optional=*/ false },
239- SchemaField{/* field_id=*/ 4 , " field with spaces" , iceberg::boolean (),
240- /* optional=*/ true }}};
234+ StructType struct_type{
235+ {SchemaField{/* field_id=*/ 1 , " user-name" , iceberg::string (),
236+ /* optional=*/ false },
237+ SchemaField{/* field_id=*/ 2 , " email.address" , iceberg::string (),
238+ /* optional=*/ true },
239+ SchemaField{/* field_id=*/ 3 , " 123field" , iceberg::int32 (),
240+ /* optional=*/ false },
241+ SchemaField{/* field_id=*/ 4 , " field with spaces" , iceberg::boolean (),
242+ /* optional=*/ true }}};
241243
242244 ::avro::NodePtr node;
243245 EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
@@ -248,10 +250,12 @@ TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
248250 EXPECT_EQ (node->nameAt (0 ), " user_name" ); // "user-name" -> "user_name"
249251 EXPECT_EQ (node->nameAt (1 ), " email_address" ); // "email.address" -> "email_address"
250252 EXPECT_EQ (node->nameAt (2 ), " _123field" ); // "123field" -> "_123field"
251- EXPECT_EQ (node->nameAt (3 ), " field_with_spaces" ); // "field with spaces" -> "field_with_spaces"
253+ EXPECT_EQ (node->nameAt (3 ),
254+ " field_with_spaces" ); // "field with spaces" -> "field_with_spaces"
252255
253256 // Check that field IDs are correctly applied
254- // Each field has 2 custom attributes: iceberg-field-name (index 0,2,4,6) and field-id (index 1,3,5,7)
257+ // Each field has 2 custom attributes: iceberg-field-name (index 0,2,4,6) and field-id
258+ // (index 1,3,5,7)
255259 ASSERT_EQ (node->customAttributes (), 8 );
256260 ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 1 ));
257261 ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 3 , /* field_id=*/ 2 ));
@@ -1644,7 +1648,7 @@ TEST_F(NameMappingAvroSchemaTest, SanitizeFieldNamesNestedRecord) {
16441648 EXPECT_EQ (node->names (), 2 );
16451649
16461650 // Check top-level field names
1647- EXPECT_EQ (node->nameAt (0 ), " id" ); // Valid name, unchanged
1651+ EXPECT_EQ (node->nameAt (0 ), " id" ); // Valid name, unchanged
16481652 EXPECT_EQ (node->nameAt (1 ), " person_info" ); // Sanitized from "person-info"
16491653
16501654 // Check that top-level field IDs are applied
@@ -1661,11 +1665,12 @@ TEST_F(NameMappingAvroSchemaTest, SanitizeFieldNamesNestedRecord) {
16611665 EXPECT_EQ (person_node->names (), 3 );
16621666
16631667 // Check that nested field names are sanitized
1664- EXPECT_EQ (person_node->nameAt (0 ), " first_name" ); // "first-name" -> "first_name"
1665- EXPECT_EQ (person_node->nameAt (1 ), " last_name" ); // "last.name" -> "last_name"
1666- EXPECT_EQ (person_node->nameAt (2 ), " _123age" ); // "123age" -> "_123age"
1668+ EXPECT_EQ (person_node->nameAt (0 ), " first_name" ); // "first-name" -> "first_name"
1669+ EXPECT_EQ (person_node->nameAt (1 ), " last_name" ); // "last.name" -> "last_name"
1670+ EXPECT_EQ (person_node->nameAt (2 ), " _123age" ); // "123age" -> "_123age"
16671671
1668- // Check that nested field IDs are applied (all 3 fields are sanitized, so 6 attributes total)
1672+ // Check that nested field IDs are applied (all 3 fields are sanitized, so 6 attributes
1673+ // total)
16691674 ASSERT_EQ (person_node->customAttributes (), 6 );
16701675 ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (person_node, /* index=*/ 0 , /* field_id=*/ 10 ));
16711676 ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (person_node, /* index=*/ 2 , /* field_id=*/ 11 ));
@@ -1679,12 +1684,13 @@ TEST_F(NameMappingAvroSchemaTest, SanitizeFieldNamesNestedRecord) {
16791684
16801685TEST (CustomAttributesTest, PreserveOriginalFieldNames) {
16811686 // Test that custom attributes properly preserve original field names
1682- StructType struct_type{{SchemaField{/* field_id=*/ 1 , " normal_field" , iceberg::string (),
1683- /* optional=*/ false },
1684- SchemaField{/* field_id=*/ 2 , " field-with-dashes" , iceberg::int32 (),
1685- /* optional=*/ true },
1686- SchemaField{/* field_id=*/ 3 , " field.with.dots" , iceberg::boolean (),
1687- /* optional=*/ false }}};
1687+ StructType struct_type{
1688+ {SchemaField{/* field_id=*/ 1 , " normal_field" , iceberg::string (),
1689+ /* optional=*/ false },
1690+ SchemaField{/* field_id=*/ 2 , " field-with-dashes" , iceberg::int32 (),
1691+ /* optional=*/ true },
1692+ SchemaField{/* field_id=*/ 3 , " field.with.dots" , iceberg::boolean (),
1693+ /* optional=*/ false }}};
16881694
16891695 ::avro::NodePtr node;
16901696 EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
@@ -1715,7 +1721,7 @@ TEST(CustomAttributesTest, PreserveOriginalFieldNames) {
17151721
17161722TEST (CustomAttributesTest, MultipleCustomAttributesCoexist) {
17171723 // Test that iceberg-field-name attributes coexist with other custom attributes
1718- StructType struct_type{{SchemaField{/* field_id=*/ 100 , " field@with#special$chars" ,
1724+ StructType struct_type{{SchemaField{/* field_id=*/ 100 , " field@with#special$chars" ,
17191725 iceberg::timestamp_tz (), /* optional=*/ false }}};
17201726
17211727 ::avro::NodePtr node;
@@ -1729,7 +1735,8 @@ TEST(CustomAttributesTest, MultipleCustomAttributesCoexist) {
17291735 ASSERT_EQ (node->customAttributes (), 2 );
17301736 const auto & field_name_attrs = node->customAttributesAt (0 );
17311737 EXPECT_TRUE (field_name_attrs.getAttribute (" iceberg-field-name" ).has_value ());
1732- EXPECT_EQ (field_name_attrs.getAttribute (" iceberg-field-name" ).value (), " field@with#special$chars" );
1738+ EXPECT_EQ (field_name_attrs.getAttribute (" iceberg-field-name" ).value (),
1739+ " field@with#special$chars" );
17331740 const auto & field_id_attrs = node->customAttributesAt (1 );
17341741 EXPECT_TRUE (field_id_attrs.getAttribute (" field-id" ).has_value ());
17351742 EXPECT_EQ (field_id_attrs.getAttribute (" field-id" ).value (), " 100" );
@@ -1747,7 +1754,8 @@ TEST(CustomAttributesTest, NestedStructuresPreserveOriginalNames) {
17471754 // Test that nested structures properly preserve original field names at all levels
17481755 auto inner_struct = std::make_shared<StructType>(std::vector<SchemaField>{
17491756 SchemaField{/* field_id=*/ 10 , " inner-field" , iceberg::string (), /* optional=*/ false },
1750- SchemaField{/* field_id=*/ 11 , " another.field" , iceberg::int32 (), /* optional=*/ true }});
1757+ SchemaField{/* field_id=*/ 11 , " another.field" , iceberg::int32 (),
1758+ /* optional=*/ true }});
17511759
17521760 StructType root_struct{{SchemaField{/* field_id=*/ 1 , " normal_field" , iceberg::boolean (),
17531761 /* optional=*/ false },
@@ -1764,11 +1772,11 @@ TEST(CustomAttributesTest, NestedStructuresPreserveOriginalNames) {
17641772
17651773 // Check root level attributes
17661774 ASSERT_EQ (root_node->customAttributes (), 3 );
1767-
1775+
17681776 // Normal field should not have iceberg-field-name
17691777 const auto & root_attrs0 = root_node->customAttributesAt (0 );
17701778 EXPECT_FALSE (root_attrs0.getAttribute (" iceberg-field-name" ).has_value ());
1771-
1779+
17721780 // Nested struct field should have iceberg-field-name preserved
17731781 const auto & root_attrs1_name = root_node->customAttributesAt (1 );
17741782 EXPECT_TRUE (root_attrs1_name.getAttribute (" iceberg-field-name" ).has_value ());
@@ -1778,23 +1786,24 @@ TEST(CustomAttributesTest, NestedStructuresPreserveOriginalNames) {
17781786 auto nested_union_node = root_node->leafAt (1 );
17791787 ASSERT_EQ (nested_union_node->type (), ::avro::AVRO_UNION );
17801788 ASSERT_EQ (nested_union_node->leaves (), 2 );
1781-
1789+
17821790 auto nested_struct_node = nested_union_node->leafAt (1 );
17831791 ASSERT_EQ (nested_struct_node->type (), ::avro::AVRO_RECORD );
17841792 ASSERT_EQ (nested_struct_node->names (), 2 );
1785- EXPECT_EQ (nested_struct_node->nameAt (0 ), " inner_field" ); // Sanitized
1786- EXPECT_EQ (nested_struct_node->nameAt (1 ), " another_field" ); // Sanitized
1793+ EXPECT_EQ (nested_struct_node->nameAt (0 ), " inner_field" ); // Sanitized
1794+ EXPECT_EQ (nested_struct_node->nameAt (1 ), " another_field" ); // Sanitized
17871795
17881796 // Check nested field attributes
17891797 ASSERT_EQ (nested_struct_node->customAttributes (), 4 );
1790-
1798+
17911799 const auto & nested_attrs0_name = nested_struct_node->customAttributesAt (0 );
17921800 EXPECT_TRUE (nested_attrs0_name.getAttribute (" iceberg-field-name" ).has_value ());
17931801 EXPECT_EQ (nested_attrs0_name.getAttribute (" iceberg-field-name" ).value (), " inner-field" );
1794-
1802+
17951803 const auto & nested_attrs1_name = nested_struct_node->customAttributesAt (2 );
17961804 EXPECT_TRUE (nested_attrs1_name.getAttribute (" iceberg-field-name" ).has_value ());
1797- EXPECT_EQ (nested_attrs1_name.getAttribute (" iceberg-field-name" ).value (), " another.field" );
1805+ EXPECT_EQ (nested_attrs1_name.getAttribute (" iceberg-field-name" ).value (),
1806+ " another.field" );
17981807}
17991808
18001809TEST (EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
@@ -1807,8 +1816,10 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18071816 SchemaField::MakeOptional (
18081817 /* field_id=*/ 5 , " nested-record" ,
18091818 std::make_shared<StructType>(std::vector<SchemaField>{
1810- SchemaField::MakeRequired (/* field_id=*/ 10 , " inner.field" , iceberg::string ()),
1811- SchemaField::MakeOptional (/* field_id=*/ 11 , " another-field" , iceberg::float64 ()),
1819+ SchemaField::MakeRequired (/* field_id=*/ 10 , " inner.field" ,
1820+ iceberg::string ()),
1821+ SchemaField::MakeOptional (/* field_id=*/ 11 , " another-field" ,
1822+ iceberg::float64 ()),
18121823 })),
18131824 });
18141825
@@ -1819,18 +1830,22 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18191830
18201831 // Verify that field names are sanitized in the Avro schema
18211832 ASSERT_EQ (avro_node->names (), 5 );
1822- EXPECT_EQ (avro_node->nameAt (0 ), " user_id" ); // "user-id" -> "user_id"
1823- EXPECT_EQ (avro_node->nameAt (1 ), " email_address" ); // "email.address" -> "email_address"
1824- EXPECT_EQ (avro_node->nameAt (2 ), " _123numeric_start" ); // "123numeric_start" -> "_123numeric_start"
1825- EXPECT_EQ (avro_node->nameAt (3 ), " field_with_spaces" ); // "field with spaces" -> "field_with_spaces"
1826- EXPECT_EQ (avro_node->nameAt (4 ), " nested_record" ); // "nested-record" -> "nested_record"
1833+ EXPECT_EQ (avro_node->nameAt (0 ), " user_id" ); // "user-id" -> "user_id"
1834+ EXPECT_EQ (avro_node->nameAt (1 ), " email_address" ); // "email.address" -> "email_address"
1835+ EXPECT_EQ (avro_node->nameAt (2 ),
1836+ " _123numeric_start" ); // "123numeric_start" -> "_123numeric_start"
1837+ EXPECT_EQ (avro_node->nameAt (3 ),
1838+ " field_with_spaces" ); // "field with spaces" -> "field_with_spaces"
1839+ EXPECT_EQ (avro_node->nameAt (4 ), " nested_record" ); // "nested-record" -> "nested_record"
18271840
18281841 // Verify that original field names are preserved in custom attributes
18291842 ASSERT_EQ (avro_node->customAttributes (), 10 );
18301843 ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (avro_node, /* index=*/ 0 , " user-id" ));
18311844 ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (avro_node, /* index=*/ 2 , " email.address" ));
1832- ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (avro_node, /* index=*/ 4 , " 123numeric_start" ));
1833- ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (avro_node, /* index=*/ 6 , " field with spaces" ));
1845+ ASSERT_NO_FATAL_FAILURE (
1846+ CheckIcebergFieldName (avro_node, /* index=*/ 4 , " 123numeric_start" ));
1847+ ASSERT_NO_FATAL_FAILURE (
1848+ CheckIcebergFieldName (avro_node, /* index=*/ 6 , " field with spaces" ));
18341849 ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (avro_node, /* index=*/ 8 , " nested-record" ));
18351850
18361851 // Verify nested structure
@@ -1839,21 +1854,25 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18391854 auto nested_struct_node = nested_union_node->leafAt (1 );
18401855 ASSERT_EQ (nested_struct_node->type (), ::avro::AVRO_RECORD );
18411856 ASSERT_EQ (nested_struct_node->names (), 2 );
1842- EXPECT_EQ (nested_struct_node->nameAt (0 ), " inner_field" ); // "inner.field" -> "inner_field"
1843- EXPECT_EQ (nested_struct_node->nameAt (1 ), " another_field" ); // "another-field" -> "another_field"
1857+ EXPECT_EQ (nested_struct_node->nameAt (0 ),
1858+ " inner_field" ); // "inner.field" -> "inner_field"
1859+ EXPECT_EQ (nested_struct_node->nameAt (1 ),
1860+ " another_field" ); // "another-field" -> "another_field"
18441861
18451862 // Verify nested field name preservation
18461863 ASSERT_EQ (nested_struct_node->customAttributes (), 4 );
1847- ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (nested_struct_node, /* index=*/ 0 , " inner.field" ));
1848- ASSERT_NO_FATAL_FAILURE (CheckIcebergFieldName (nested_struct_node, /* index=*/ 2 , " another-field" ));
1864+ ASSERT_NO_FATAL_FAILURE (
1865+ CheckIcebergFieldName (nested_struct_node, /* index=*/ 0 , " inner.field" ));
1866+ ASSERT_NO_FATAL_FAILURE (
1867+ CheckIcebergFieldName (nested_struct_node, /* index=*/ 2 , " another-field" ));
18491868
18501869 // Step 2: Project back to Iceberg schema (simulate reading the Avro data)
18511870 auto projection_result = Project (iceberg_schema, avro_node, /* prune_source=*/ false );
18521871 ASSERT_THAT (projection_result, IsOk ());
18531872
18541873 const auto & projection = *projection_result;
18551874 ASSERT_EQ (projection.fields .size (), 5 );
1856-
1875+
18571876 // Verify that all fields can be correctly projected back
18581877 for (size_t i = 0 ; i < projection.fields .size (); ++i) {
18591878 ASSERT_EQ (projection.fields [i].kind , FieldProjection::Kind::kProjected );
@@ -1871,21 +1890,19 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18711890TEST (EndToEndFieldSanitizationTest, ValidateRoundTripConsistency) {
18721891 // Test that multiple round trips maintain consistency
18731892 std::vector<std::string> problematic_field_names = {
1874- " user-name" , " email.address" , " 123field" , " field with spaces" ,
1875- " field@special" , " field#hash" , " field$dollar" , " field%percent" ,
1876- " field&ersand" , " field(parenthesis)" , " field[bracket]" ,
1877- " field{brace} " , " field |pipe" , " field\\ backslash" , " field/slash" ,
1878- " field:colon " , " field ;semicolon" , " field'quote" , " field\" doublequote"
1893+ " user-name" , " email.address" , " 123field" , " field with spaces" ,
1894+ " field@special" , " field#hash" , " field$dollar" , " field%percent" ,
1895+ " field&ersand" , " field(parenthesis)" , " field[bracket]" , " field{brace} " ,
1896+ " field|pipe" , " field\\ backslash" , " field/slash" , " field:colon " ,
1897+ " field;semicolon" , " field'quote" , " field\" doublequote"
18791898 };
18801899
18811900 for (size_t i = 0 ; i < problematic_field_names.size (); ++i) {
18821901 const auto & original_name = problematic_field_names[i];
1883-
1902+
18841903 // Create a simple schema with the problematic field name
1885- Schema iceberg_schema ({
1886- SchemaField::MakeRequired (/* field_id=*/ static_cast <int32_t >(i + 1 ),
1887- original_name, iceberg::string ())
1888- });
1904+ Schema iceberg_schema ({SchemaField::MakeRequired (
1905+ /* field_id=*/ static_cast <int32_t >(i + 1 ), original_name, iceberg::string ())});
18891906
18901907 // Convert to Avro
18911908 ::avro::NodePtr avro_node;
@@ -1894,14 +1911,14 @@ TEST(EndToEndFieldSanitizationTest, ValidateRoundTripConsistency) {
18941911 // Verify the field name was processed
18951912 ASSERT_EQ (avro_node->names (), 1 );
18961913 std::string sanitized_name = avro_node->nameAt (0 );
1897-
1914+
18981915 // Verify the original name is preserved if sanitization occurred
18991916 if (sanitized_name != original_name) {
19001917 ASSERT_EQ (avro_node->customAttributes (), 2 );
19011918 const auto & name_attrs = avro_node->customAttributesAt (0 );
19021919 EXPECT_TRUE (name_attrs.getAttribute (" iceberg-field-name" ).has_value ());
19031920 EXPECT_EQ (name_attrs.getAttribute (" iceberg-field-name" ).value (), original_name);
1904-
1921+
19051922 const auto & id_attrs = avro_node->customAttributesAt (1 );
19061923 EXPECT_TRUE (id_attrs.getAttribute (" field-id" ).has_value ());
19071924 EXPECT_EQ (id_attrs.getAttribute (" field-id" ).value (), std::to_string (i + 1 ));
0 commit comments