3232
3333namespace iceberg ::avro {
3434
// Forward declaration of the function under test so the tests in this file
// can call it directly.
bool validAvroName(const std::string& name);
37+
3538namespace {
3639
3740void CheckCustomLogicalType (const ::avro::NodePtr& node, const std::string& type_name) {
@@ -47,8 +50,81 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
4750 ASSERT_EQ (attrs.getAttribute (key), std::make_optional (std::to_string (field_id)));
4851}
4952
53+ // Helper function to check if a custom attribute exists for a field name preservation
54+ void CheckIcebergFieldName (const ::avro::NodePtr& node, size_t index,
55+ const std::string& original_name) {
56+ ASSERT_LT (index, node->customAttributes ());
57+ const auto & attrs = node->customAttributesAt (index);
58+ ASSERT_EQ (attrs.getAttribute (" iceberg-field-name" ), std::make_optional (original_name));
59+ }
60+
5061} // namespace
5162
63+ TEST (ValidAvroNameTest, ValidNames) {
64+ // Valid field names should return true
65+ EXPECT_TRUE (validAvroName (" valid_field" ));
66+ EXPECT_TRUE (validAvroName (" field123" ));
67+ EXPECT_TRUE (validAvroName (" _private" ));
68+ EXPECT_TRUE (validAvroName (" CamelCase" ));
69+ EXPECT_TRUE (validAvroName (" field_with_underscores" ));
70+ }
71+
72+ TEST (ValidAvroNameTest, InvalidNames) {
73+ // Names starting with numbers should return false
74+ EXPECT_FALSE (validAvroName (" 123field" ));
75+ EXPECT_FALSE (validAvroName (" 0value" ));
76+
77+ // Names with special characters should return false
78+ EXPECT_FALSE (validAvroName (" field-name" ));
79+ EXPECT_FALSE (validAvroName (" field.name" ));
80+ EXPECT_FALSE (validAvroName (" field name" ));
81+ EXPECT_FALSE (validAvroName (" field@name" ));
82+ EXPECT_FALSE (validAvroName (" field#name" ));
83+ }
84+
85+ TEST (ValidAvroNameTest, EmptyName) {
86+ // Empty name should throw an exception
87+ EXPECT_THROW (validAvroName (" " ), std::runtime_error);
88+ }
89+
90+ TEST (SanitizeFieldNameTest, ValidFieldNames) {
91+ // Valid field names should remain unchanged
92+ EXPECT_EQ (SanitizeFieldName (" valid_field" ), " valid_field" );
93+ EXPECT_EQ (SanitizeFieldName (" field123" ), " field123" );
94+ EXPECT_EQ (SanitizeFieldName (" _private" ), " _private" );
95+ EXPECT_EQ (SanitizeFieldName (" CamelCase" ), " CamelCase" );
96+ EXPECT_EQ (SanitizeFieldName (" field_with_underscores" ), " field_with_underscores" );
97+ }
98+
99+ TEST (SanitizeFieldNameTest, InvalidFieldNames) {
100+ // Field names starting with numbers should be prefixed with underscore
101+ EXPECT_EQ (SanitizeFieldName (" 123field" ), " _123field" );
102+ EXPECT_EQ (SanitizeFieldName (" 0value" ), " _0value" );
103+
104+ // Field names with special characters should be encoded with hex values
105+ EXPECT_EQ (SanitizeFieldName (" field-name" ), " field_x2Dname" );
106+ EXPECT_EQ (SanitizeFieldName (" field.name" ), " field_x2Ename" );
107+ EXPECT_EQ (SanitizeFieldName (" field name" ), " field_x20name" );
108+ EXPECT_EQ (SanitizeFieldName (" field@name" ), " field_x40name" );
109+ EXPECT_EQ (SanitizeFieldName (" field#name" ), " field_x23name" );
110+
111+ // Complex field names with multiple issues
112+ EXPECT_EQ (SanitizeFieldName (" 1field-with.special@chars" ), " _1field_x2Dwith_x2Especial_x40chars" );
113+ EXPECT_EQ (SanitizeFieldName (" user-email" ), " user_x2Demail" );
114+ }
115+
116+ TEST (SanitizeFieldNameTest, EdgeCases) {
117+ // Empty field name
118+ EXPECT_EQ (SanitizeFieldName (" " ), " _x0" );
119+
120+ // Field name with only special characters
121+ EXPECT_EQ (SanitizeFieldName (" @#$" ), " _x40_x23_x24" );
122+
123+ // Field name starting with special character
124+ EXPECT_EQ (SanitizeFieldName (" -field" ), " _x2Dfield" );
125+ EXPECT_EQ (SanitizeFieldName (" .field" ), " _x2Efield" );
126+ }
127+
52128TEST (ToAvroNodeVisitorTest, BooleanType) {
53129 ::avro::NodePtr node;
54130 EXPECT_THAT (ToAvroNodeVisitor{}.Visit (BooleanType{}, &node), IsOk ());
@@ -181,6 +257,67 @@ TEST(ToAvroNodeVisitorTest, StructType) {
181257 EXPECT_EQ (node->leafAt (1 )->leafAt (1 )->type (), ::avro::AVRO_INT );
182258}
183259
260+ TEST (ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
261+ // Test struct with field names that require sanitization
262+ StructType struct_type{
263+ {SchemaField{/* field_id=*/ 1 , " user-name" , iceberg::string (),
264+ /* optional=*/ false },
265+ SchemaField{/* field_id=*/ 2 , " email.address" , iceberg::string (),
266+ /* optional=*/ true },
267+ SchemaField{/* field_id=*/ 3 , " 123field" , iceberg::int32 (),
268+ /* optional=*/ false },
269+ SchemaField{/* field_id=*/ 4 , " field with spaces" , iceberg::boolean (),
270+ /* optional=*/ true }}};
271+
272+ ::avro::NodePtr node;
273+ EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
274+ EXPECT_EQ (node->type (), ::avro::AVRO_RECORD );
275+
276+ // Check that field names are sanitized
277+ ASSERT_EQ (node->names (), 4 );
278+ EXPECT_EQ (node->nameAt (0 ), " user_x2Dname" ); // "user-name" -> "user_x2Dname"
279+ EXPECT_EQ (node->nameAt (1 ), " email_x2Eaddress" ); // "email.address" -> "email_x2Eaddress"
280+ EXPECT_EQ (node->nameAt (2 ), " _123field" ); // "123field" -> "_123field"
281+ EXPECT_EQ (node->nameAt (3 ),
282+ " field_x20with_x20spaces" ); // "field with spaces" -> "field_x20with_x20spaces"
283+
284+ // Check that field IDs are correctly applied
285+ // Each field has 1 custom attribute: field-id
286+ ASSERT_EQ (node->customAttributes (), 4 );
287+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 0 , /* field_id=*/ 1 ));
288+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 2 ));
289+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 2 , /* field_id=*/ 3 ));
290+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 3 , /* field_id=*/ 4 ));
291+ }
292+
293+ TEST (ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
294+ // Test struct with field names that don't require sanitization
295+ StructType struct_type{{SchemaField{/* field_id=*/ 1 , " valid_field" , iceberg::string (),
296+ /* optional=*/ false },
297+ SchemaField{/* field_id=*/ 2 , " AnotherField" , iceberg::int32 (),
298+ /* optional=*/ true }}};
299+
300+ ::avro::NodePtr node;
301+ EXPECT_THAT (ToAvroNodeVisitor{}.Visit (struct_type, &node), IsOk ());
302+ EXPECT_EQ (node->type (), ::avro::AVRO_RECORD );
303+
304+ // Check that field names remain unchanged
305+ ASSERT_EQ (node->names (), 2 );
306+ EXPECT_EQ (node->nameAt (0 ), " valid_field" );
307+ EXPECT_EQ (node->nameAt (1 ), " AnotherField" );
308+
309+ // Check that field IDs are correctly applied
310+ ASSERT_EQ (node->customAttributes (), 2 );
311+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 0 , /* field_id=*/ 1 ));
312+ ASSERT_NO_FATAL_FAILURE (CheckFieldIdAt (node, /* index=*/ 1 , /* field_id=*/ 2 ));
313+
314+ // For valid field names, there should be no iceberg-field-name attributes
315+ const auto & attrs0 = node->customAttributesAt (0 );
316+ const auto & attrs1 = node->customAttributesAt (1 );
317+ EXPECT_FALSE (attrs0.getAttribute (" iceberg-field-name" ).has_value ());
318+ EXPECT_FALSE (attrs1.getAttribute (" iceberg-field-name" ).has_value ());
319+ }
320+
184321TEST (ToAvroNodeVisitorTest, ListType) {
185322 ListType list_type{SchemaField{/* field_id=*/ 5 , " element" , iceberg::string (),
186323 /* optional=*/ true }};
@@ -1436,5 +1573,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
14361573 auto result = MakeAvroNodeWithFieldIds (avro_schema.root (), *name_mapping);
14371574 ASSERT_THAT (result, IsError (ErrorKind::kInvalidSchema ));
14381575}
1439-
14401576} // namespace iceberg::avro
0 commit comments