@@ -63,25 +63,26 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363 return attributes;
6464}
6565
66+ } // namespace
67+
6668std::string SanitizeFieldName (std::string_view field_name) {
6769 if (field_name.empty ()) {
6870 return " _empty" ;
6971 }
7072
7173 std::string result;
72- result.reserve (field_name.size ());
74+ result.reserve (field_name.size () + 1 ); // +1 for potential leading underscore
7375
7476 // First character must be a letter or underscore
75- if (!std::isalpha (field_name[0 ]) && field_name[0 ] != ' _' ) {
77+ char first_char = field_name[0 ];
78+ if (!std::isalpha (static_cast <unsigned char >(first_char)) && first_char != ' _' ) {
7679 result.push_back (' _' );
77- } else {
78- result.push_back (field_name[0 ]);
7980 }
8081
81- // Rest of characters must be letters, digits, or underscores
82- for (size_t i = 1 ; i < field_name.size (); ++i) {
82+ // Process all characters
83+ for (size_t i = 0 ; i < field_name.size (); ++i) {
8384 char c = field_name[i];
84- if (std::isalnum (c ) || c == ' _' ) {
85+ if (std::isalnum (static_cast < unsigned char >(c) ) || c == ' _' ) {
8586 result.push_back (c);
8687 } else {
8788 result.push_back (' _' );
@@ -90,8 +91,6 @@ std::string SanitizeFieldName(std::string_view field_name) {
9091 return result;
9192}
9293
93- } // namespace
94-
9594std::string ToString (const ::avro::NodePtr& node) {
9695 std::stringstream ss;
9796 ss << *node;
@@ -217,14 +216,16 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
217216
218217 // Sanitize field name to ensure it follows Avro field name requirements
219218 std::string sanitized_name = SanitizeFieldName (sub_field.name ());
219+
220220 // Store original name as a custom attribute if it was modified
221221 if (sanitized_name != sub_field.name ()) {
222222 // Add custom attribute to preserve the original field name
223- ::avro::CustomAttributes attrs ;
224- attrs .addAttribute (std::string (kIcebergFieldNameProp ),
225- std::string (sub_field.name ()));
226- (*node)->addCustomAttributesForField (attrs );
223+ ::avro::CustomAttributes name_attrs ;
224+ name_attrs .addAttribute (std::string (kIcebergFieldNameProp ),
225+ std::string (sub_field.name ()));
226+ (*node)->addCustomAttributesForField (name_attrs );
227227 }
228+
228229 (*node)->addName (sanitized_name);
229230 (*node)->addLeaf (field_node);
230231 (*node)->addCustomAttributesForField (GetAttributesWithFieldId (sub_field.field_id ()));
@@ -475,7 +476,50 @@ Result<int32_t> GetValueId(const ::avro::NodePtr& node) {
475476
476477Result<int32_t > GetFieldId (const ::avro::NodePtr& node, size_t field_idx) {
477478 static const std::string kFieldIdKey {kFieldIdProp };
478- return GetId (node, kFieldIdKey , field_idx);
479+
480+ // When field names are sanitized, we add custom attributes in this order for each field:
481+ // 1. If the field name was sanitized: iceberg-field-name attribute
482+ // 2. Always: field-id attribute
483+ // So for field i, we need to find the correct attribute index containing field-id
484+
485+ size_t attribute_search_start = 0 ;
486+ for (size_t field = 0 ; field <= field_idx; ++field) {
487+ if (field == field_idx) {
488+ // For the target field, search for field-id in the remaining attributes
489+ for (size_t attr_idx = attribute_search_start; attr_idx < node->customAttributes (); ++attr_idx) {
490+ auto id_str = node->customAttributesAt (attr_idx).getAttribute (kFieldIdKey );
491+ if (id_str.has_value ()) {
492+ try {
493+ return std::stoi (id_str.value ());
494+ } catch (const std::exception& e) {
495+ return InvalidSchema (" Invalid {}: {}" , kFieldIdKey , id_str.value ());
496+ }
497+ }
498+ // If this attribute doesn't have field-id, move to next
499+ auto name_attr = node->customAttributesAt (attr_idx).getAttribute (std::string (kIcebergFieldNameProp ));
500+ if (name_attr.has_value ()) {
501+ // This is a name attribute, the next one should be field-id
502+ continue ;
503+ }
504+ }
505+ break ;
506+ } else {
507+ // For previous fields, count how many attributes they used
508+ // Check if this field has a name attribute (means the field name was sanitized)
509+ if (attribute_search_start < node->customAttributes ()) {
510+ auto name_attr = node->customAttributesAt (attribute_search_start).getAttribute (std::string (kIcebergFieldNameProp ));
511+ if (name_attr.has_value ()) {
512+ // This field has both name and id attributes
513+ attribute_search_start += 2 ;
514+ } else {
515+ // This field only has id attribute
516+ attribute_search_start += 1 ;
517+ }
518+ }
519+ }
520+ }
521+
522+ return InvalidSchema (" Missing avro attribute: {}" , kFieldIdKey );
479523}
480524
481525Status ValidateAvroSchemaEvolution (const Type& expected_type,
0 commit comments