@@ -63,6 +63,33 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363 return attributes;
6464}
6565
// Rewrites `field_name` so that it satisfies the Avro name grammar
// ([A-Za-z_][A-Za-z0-9_]*).
//
// - An empty name becomes the placeholder "_empty".
// - A first character that is not an ASCII letter or underscore is replaced
//   by '_'.
// - Every later character that is not an ASCII letter, digit or underscore is
//   replaced by '_'.
//
// The result has the same length as the input (except for the empty case), so
// each byte of a multi-byte UTF-8 sequence maps to its own '_'.
//
// Classification is done with explicit ASCII range checks instead of
// std::isalpha/std::isalnum: those are undefined for negative `char` values
// (any non-ASCII byte on platforms where `char` is signed) and are
// locale-dependent, while the Avro grammar is strictly ASCII.
std::string SanitizeFieldName(std::string_view field_name) {
  if (field_name.empty()) {
    return "_empty";
  }

  const auto is_letter_or_underscore = [](char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
  };
  const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

  std::string result;
  result.reserve(field_name.size());

  for (size_t i = 0; i < field_name.size(); ++i) {
    const char c = field_name[i];
    // Digits are only legal after the first position.
    const bool valid = is_letter_or_underscore(c) || (i > 0 && is_digit(c));
    result.push_back(valid ? c : '_');
  }
  return result;
}
92+
6693} // namespace
6794
6895std::string ToString (const ::avro::NodePtr& node) {
@@ -188,8 +215,16 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
188215 ::avro::NodePtr field_node;
189216 ICEBERG_RETURN_UNEXPECTED (Visit (sub_field, &field_node));
190217
191- // TODO(gangwu): sanitize field name
192- (*node)->addName (std::string (sub_field.name ()));
218+ // Sanitize field name to ensure it follows Avro field name requirements
219+ std::string sanitized_name = SanitizeFieldName (sub_field.name ());
220+ // Store original name as a custom attribute if it was modified
221+ if (sanitized_name != sub_field.name ()) {
222+ // Add custom attribute to preserve the original field name
223+ ::avro::CustomAttributes attrs;
224+ attrs.addAttribute (std::string (kIcebergFieldNameProp ), std::string (sub_field.name ()));
225+ (*node)->addCustomAttributesForField (attrs);
226+ }
227+ (*node)->addName (sanitized_name);
193228 (*node)->addLeaf (field_node);
194229 (*node)->addCustomAttributesForField (GetAttributesWithFieldId (sub_field.field_id ()));
195230 }
@@ -839,7 +874,15 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig
839874 // Recursively apply field IDs to nested fields
840875 ICEBERG_ASSIGN_OR_RAISE (auto new_nested_node,
841876 MakeAvroNodeWithFieldIds (field_node, *nested_field));
842- new_record_node->addName (field_name);
877+ std::string sanitized_name = SanitizeFieldName (field_name);
878+ // Store original name as a custom attribute if it was modified
879+ if (sanitized_name != field_name) {
880+ // Add custom attribute to preserve the original field name
881+ ::avro::CustomAttributes attrs;
882+ attrs.addAttribute (std::string (kIcebergFieldNameProp ), field_name);
883+ new_record_node->addCustomAttributesForField (attrs);
884+ }
885+ new_record_node->addName (sanitized_name);
843886 new_record_node->addLeaf (new_nested_node);
844887 }
845888
0 commit comments