Skip to content

Commit 4d37488

Browse files
Add test and fix GetFieldId
1 parent 80a0822 commit 4d37488

File tree

3 files changed

+569
-14
lines changed

3 files changed

+569
-14
lines changed

src/iceberg/avro/avro_schema_util.cc

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -63,25 +63,26 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363
return attributes;
6464
}
6565

66+
} // namespace
67+
6668
std::string SanitizeFieldName(std::string_view field_name) {
6769
if (field_name.empty()) {
6870
return "_empty";
6971
}
7072

7173
std::string result;
72-
result.reserve(field_name.size());
74+
result.reserve(field_name.size() + 1); // +1 for potential leading underscore
7375

7476
// First character must be a letter or underscore
75-
if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
77+
char first_char = field_name[0];
78+
if (!std::isalpha(static_cast<unsigned char>(first_char)) && first_char != '_') {
7679
result.push_back('_');
77-
} else {
78-
result.push_back(field_name[0]);
7980
}
8081

81-
// Rest of characters must be letters, digits, or underscores
82-
for (size_t i = 1; i < field_name.size(); ++i) {
82+
// Process all characters
83+
for (size_t i = 0; i < field_name.size(); ++i) {
8384
char c = field_name[i];
84-
if (std::isalnum(c) || c == '_') {
85+
if (std::isalnum(static_cast<unsigned char>(c)) || c == '_') {
8586
result.push_back(c);
8687
} else {
8788
result.push_back('_');
@@ -90,8 +91,6 @@ std::string SanitizeFieldName(std::string_view field_name) {
9091
return result;
9192
}
9293

93-
} // namespace
94-
9594
std::string ToString(const ::avro::NodePtr& node) {
9695
std::stringstream ss;
9796
ss << *node;
@@ -217,14 +216,16 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
217216

218217
// Sanitize field name to ensure it follows Avro field name requirements
219218
std::string sanitized_name = SanitizeFieldName(sub_field.name());
219+
220220
// Store original name as a custom attribute if it was modified
221221
if (sanitized_name != sub_field.name()) {
222222
// Add custom attribute to preserve the original field name
223-
::avro::CustomAttributes attrs;
224-
attrs.addAttribute(std::string(kIcebergFieldNameProp),
225-
std::string(sub_field.name()));
226-
(*node)->addCustomAttributesForField(attrs);
223+
::avro::CustomAttributes name_attrs;
224+
name_attrs.addAttribute(std::string(kIcebergFieldNameProp),
225+
std::string(sub_field.name()));
226+
(*node)->addCustomAttributesForField(name_attrs);
227227
}
228+
228229
(*node)->addName(sanitized_name);
229230
(*node)->addLeaf(field_node);
230231
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
@@ -475,7 +476,54 @@ Result<int32_t> GetValueId(const ::avro::NodePtr& node) {
475476

476477
Result<int32_t> GetFieldId(const ::avro::NodePtr& node, size_t field_idx) {
477478
static const std::string kFieldIdKey{kFieldIdProp};
478-
return GetId(node, kFieldIdKey, field_idx);
479+
480+
// When field names are sanitized, we add custom attributes in this order for each
481+
// field:
482+
// 1. If the field name was sanitized: iceberg-field-name attribute
483+
// 2. Always: field-id attribute
484+
// So for field i, we need to find the correct attribute index containing field-id
485+
486+
size_t attribute_search_start = 0;
487+
for (size_t field = 0; field <= field_idx; ++field) {
488+
if (field == field_idx) {
489+
// For the target field, search for field-id in the remaining attributes
490+
for (size_t attr_idx = attribute_search_start; attr_idx < node->customAttributes();
491+
++attr_idx) {
492+
auto id_str = node->customAttributesAt(attr_idx).getAttribute(kFieldIdKey);
493+
if (id_str.has_value()) {
494+
try {
495+
return std::stoi(id_str.value());
496+
} catch (const std::exception& e) {
497+
return InvalidSchema("Invalid {}: {}", kFieldIdKey, id_str.value());
498+
}
499+
}
500+
// If this attribute doesn't have field-id, move to next
501+
auto name_attr = node->customAttributesAt(attr_idx).getAttribute(
502+
std::string(kIcebergFieldNameProp));
503+
if (name_attr.has_value()) {
504+
// This is a name attribute, the next one should be field-id
505+
continue;
506+
}
507+
}
508+
break;
509+
} else {
510+
// For previous fields, count how many attributes they used
511+
// Check if this field has a name attribute (means the field name was sanitized)
512+
if (attribute_search_start < node->customAttributes()) {
513+
auto name_attr = node->customAttributesAt(attribute_search_start)
514+
.getAttribute(std::string(kIcebergFieldNameProp));
515+
if (name_attr.has_value()) {
516+
// This field has both name and id attributes
517+
attribute_search_start += 2;
518+
} else {
519+
// This field only has id attribute
520+
attribute_search_start += 1;
521+
}
522+
}
523+
}
524+
}
525+
526+
return InvalidSchema("Missing avro attribute: {}", kFieldIdKey);
479527
}
480528

481529
Status ValidateAvroSchemaEvolution(const Type& expected_type,

src/iceberg/avro/avro_schema_util_internal.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,9 @@ Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original
163163
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
164164
const NameMapping& mapping);
165165

166+
/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
167+
/// \param field_name The original field name to sanitize.
168+
/// \return A sanitized field name that follows Avro naming conventions.
169+
std::string SanitizeFieldName(std::string_view field_name);
170+
166171
} // namespace iceberg::avro

0 commit comments

Comments
 (0)