Skip to content

Commit d588cea

Browse files
Add test and fix GetFieldId
1 parent 80a0822 commit d588cea

3 files changed

Lines changed: 549 additions & 14 deletions

File tree

src/iceberg/avro/avro_schema_util.cc

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,26 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363
return attributes;
6464
}
6565

66+
} // namespace
67+
6668
std::string SanitizeFieldName(std::string_view field_name) {
6769
if (field_name.empty()) {
6870
return "_empty";
6971
}
7072

7173
std::string result;
72-
result.reserve(field_name.size());
74+
result.reserve(field_name.size() + 1); // +1 for potential leading underscore
7375

7476
// First character must be a letter or underscore
75-
if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
77+
char first_char = field_name[0];
78+
if (!std::isalpha(static_cast<unsigned char>(first_char)) && first_char != '_') {
7679
result.push_back('_');
77-
} else {
78-
result.push_back(field_name[0]);
7980
}
8081

81-
// Rest of characters must be letters, digits, or underscores
82-
for (size_t i = 1; i < field_name.size(); ++i) {
82+
// Process all characters
83+
for (size_t i = 0; i < field_name.size(); ++i) {
8384
char c = field_name[i];
84-
if (std::isalnum(c) || c == '_') {
85+
if (std::isalnum(static_cast<unsigned char>(c)) || c == '_') {
8586
result.push_back(c);
8687
} else {
8788
result.push_back('_');
@@ -90,8 +91,6 @@ std::string SanitizeFieldName(std::string_view field_name) {
9091
return result;
9192
}
9293

93-
} // namespace
94-
9594
std::string ToString(const ::avro::NodePtr& node) {
9695
std::stringstream ss;
9796
ss << *node;
@@ -217,14 +216,16 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
217216

218217
// Sanitize field name to ensure it follows Avro field name requirements
219218
std::string sanitized_name = SanitizeFieldName(sub_field.name());
219+
220220
// Store original name as a custom attribute if it was modified
221221
if (sanitized_name != sub_field.name()) {
222222
// Add custom attribute to preserve the original field name
223-
::avro::CustomAttributes attrs;
224-
attrs.addAttribute(std::string(kIcebergFieldNameProp),
225-
std::string(sub_field.name()));
226-
(*node)->addCustomAttributesForField(attrs);
223+
::avro::CustomAttributes name_attrs;
224+
name_attrs.addAttribute(std::string(kIcebergFieldNameProp),
225+
std::string(sub_field.name()));
226+
(*node)->addCustomAttributesForField(name_attrs);
227227
}
228+
228229
(*node)->addName(sanitized_name);
229230
(*node)->addLeaf(field_node);
230231
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
@@ -475,7 +476,50 @@ Result<int32_t> GetValueId(const ::avro::NodePtr& node) {
475476

476477
Result<int32_t> GetFieldId(const ::avro::NodePtr& node, size_t field_idx) {
477478
static const std::string kFieldIdKey{kFieldIdProp};
478-
return GetId(node, kFieldIdKey, field_idx);
479+
480+
// When field names are sanitized, we add custom attributes in this order for each field:
481+
// 1. If the field name was sanitized: iceberg-field-name attribute
482+
// 2. Always: field-id attribute
483+
// So for field i, we need to find the correct attribute index containing field-id
484+
485+
size_t attribute_search_start = 0;
486+
for (size_t field = 0; field <= field_idx; ++field) {
487+
if (field == field_idx) {
488+
// For the target field, search for field-id in the remaining attributes
489+
for (size_t attr_idx = attribute_search_start; attr_idx < node->customAttributes(); ++attr_idx) {
490+
auto id_str = node->customAttributesAt(attr_idx).getAttribute(kFieldIdKey);
491+
if (id_str.has_value()) {
492+
try {
493+
return std::stoi(id_str.value());
494+
} catch (const std::exception& e) {
495+
return InvalidSchema("Invalid {}: {}", kFieldIdKey, id_str.value());
496+
}
497+
}
498+
// If this attribute doesn't have field-id, move to next
499+
auto name_attr = node->customAttributesAt(attr_idx).getAttribute(std::string(kIcebergFieldNameProp));
500+
if (name_attr.has_value()) {
501+
// This is a name attribute, the next one should be field-id
502+
continue;
503+
}
504+
}
505+
break;
506+
} else {
507+
// For previous fields, count how many attributes they used
508+
// Check if this field has a name attribute (means the field name was sanitized)
509+
if (attribute_search_start < node->customAttributes()) {
510+
auto name_attr = node->customAttributesAt(attribute_search_start).getAttribute(std::string(kIcebergFieldNameProp));
511+
if (name_attr.has_value()) {
512+
// This field has both name and id attributes
513+
attribute_search_start += 2;
514+
} else {
515+
// This field only has id attribute
516+
attribute_search_start += 1;
517+
}
518+
}
519+
}
520+
}
521+
522+
return InvalidSchema("Missing avro attribute: {}", kFieldIdKey);
479523
}
480524

481525
Status ValidateAvroSchemaEvolution(const Type& expected_type,

src/iceberg/avro/avro_schema_util_internal.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,9 @@ Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original
163163
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
164164
const NameMapping& mapping);
165165

166+
/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
167+
/// \param field_name The original field name to sanitize.
168+
/// \return A sanitized field name that follows Avro naming conventions.
169+
std::string SanitizeFieldName(std::string_view field_name);
170+
166171
} // namespace iceberg::avro

0 commit comments

Comments
 (0)