@@ -63,25 +63,26 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363 return attributes;
6464}
6565
66+ } // namespace
67+
6668std::string SanitizeFieldName (std::string_view field_name) {
6769 if (field_name.empty ()) {
6870 return " _empty" ;
6971 }
7072
7173 std::string result;
72- result.reserve (field_name.size ());
74+ result.reserve (field_name.size () + 1 ); // +1 for potential leading underscore
7375
7476 // First character must be a letter or underscore
75- if (!std::isalpha (field_name[0 ]) && field_name[0 ] != ' _' ) {
77+ char first_char = field_name[0 ];
78+ if (!std::isalpha (static_cast <unsigned char >(first_char)) && first_char != ' _' ) {
7679 result.push_back (' _' );
77- } else {
78- result.push_back (field_name[0 ]);
7980 }
8081
81- // Rest of characters must be letters, digits, or underscores
82- for (size_t i = 1 ; i < field_name.size (); ++i) {
82+ // Process all characters
83+ for (size_t i = 0 ; i < field_name.size (); ++i) {
8384 char c = field_name[i];
84- if (std::isalnum (c ) || c == ' _' ) {
85+ if (std::isalnum (static_cast < unsigned char >(c) ) || c == ' _' ) {
8586 result.push_back (c);
8687 } else {
8788 result.push_back (' _' );
@@ -90,8 +91,6 @@ std::string SanitizeFieldName(std::string_view field_name) {
9091 return result;
9192}
9293
93- } // namespace
94-
9594std::string ToString (const ::avro::NodePtr& node) {
9695 std::stringstream ss;
9796 ss << *node;
@@ -217,14 +216,16 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
217216
218217 // Sanitize field name to ensure it follows Avro field name requirements
219218 std::string sanitized_name = SanitizeFieldName (sub_field.name ());
219+
220220 // Store original name as a custom attribute if it was modified
221221 if (sanitized_name != sub_field.name ()) {
222222 // Add custom attribute to preserve the original field name
223- ::avro::CustomAttributes attrs ;
224- attrs .addAttribute (std::string (kIcebergFieldNameProp ),
225- std::string (sub_field.name ()));
226- (*node)->addCustomAttributesForField (attrs );
223+ ::avro::CustomAttributes name_attrs ;
224+ name_attrs .addAttribute (std::string (kIcebergFieldNameProp ),
225+ std::string (sub_field.name ()));
226+ (*node)->addCustomAttributesForField (name_attrs );
227227 }
228+
228229 (*node)->addName (sanitized_name);
229230 (*node)->addLeaf (field_node);
230231 (*node)->addCustomAttributesForField (GetAttributesWithFieldId (sub_field.field_id ()));
@@ -475,7 +476,54 @@ Result<int32_t> GetValueId(const ::avro::NodePtr& node) {
475476
476477Result<int32_t > GetFieldId (const ::avro::NodePtr& node, size_t field_idx) {
477478 static const std::string kFieldIdKey {kFieldIdProp };
478- return GetId (node, kFieldIdKey , field_idx);
479+
480+ // When field names are sanitized, we add custom attributes in this order for each
481+ // field:
482+ // 1. If the field name was sanitized: iceberg-field-name attribute
483+ // 2. Always: field-id attribute
484+ // So for field i, we need to find the correct attribute index containing field-id
485+
486+ size_t attribute_search_start = 0 ;
487+ for (size_t field = 0 ; field <= field_idx; ++field) {
488+ if (field == field_idx) {
489+ // For the target field, search for field-id in the remaining attributes
490+ for (size_t attr_idx = attribute_search_start; attr_idx < node->customAttributes ();
491+ ++attr_idx) {
492+ auto id_str = node->customAttributesAt (attr_idx).getAttribute (kFieldIdKey );
493+ if (id_str.has_value ()) {
494+ try {
495+ return std::stoi (id_str.value ());
496+ } catch (const std::exception& e) {
497+ return InvalidSchema (" Invalid {}: {}" , kFieldIdKey , id_str.value ());
498+ }
499+ }
500+ // If this attribute doesn't have field-id, move to next
501+ auto name_attr = node->customAttributesAt (attr_idx).getAttribute (
502+ std::string (kIcebergFieldNameProp ));
503+ if (name_attr.has_value ()) {
504+ // This is a name attribute, the next one should be field-id
505+ continue ;
506+ }
507+ }
508+ break ;
509+ } else {
510+ // For previous fields, count how many attributes they used
511+ // Check if this field has a name attribute (means the field name was sanitized)
512+ if (attribute_search_start < node->customAttributes ()) {
513+ auto name_attr = node->customAttributesAt (attribute_search_start)
514+ .getAttribute (std::string (kIcebergFieldNameProp ));
515+ if (name_attr.has_value ()) {
516+ // This field has both name and id attributes
517+ attribute_search_start += 2 ;
518+ } else {
519+ // This field only has id attribute
520+ attribute_search_start += 1 ;
521+ }
522+ }
523+ }
524+ }
525+
526+ return InvalidSchema (" Missing avro attribute: {}" , kFieldIdKey );
479527}
480528
481529Status ValidateAvroSchemaEvolution (const Type& expected_type,
0 commit comments