Skip to content

Commit 54c87b6

Browse files
feat: avro schema add sanitize field name
1 parent 82a1cd6 commit 54c87b6

1 file changed

Lines changed: 46 additions & 3 deletions

File tree

src/iceberg/avro/avro_schema_util.cc

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,33 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6363
return attributes;
6464
}
6565

66+
/// \brief Rewrite a field name so that it only uses characters Avro permits in names.
///
/// Avro names must match `[A-Za-z_][A-Za-z0-9_]*`. This function maps any input onto
/// that grammar:
///   - an empty name becomes "_empty";
///   - a first character that is not an ASCII letter or '_' is replaced by '_';
///   - every later character that is not an ASCII letter, ASCII digit or '_' is
///     replaced by '_'.
///
/// The replacement is byte-wise, so a multi-byte UTF-8 code point yields one '_' per
/// byte. The mapping is lossy: distinct inputs can produce the same output.
///
/// \param field_name The original field name; may be empty and may contain any bytes.
/// \return A non-empty string of the same byte length as `field_name` (or "_empty").
std::string SanitizeFieldName(std::string_view field_name) {
  if (field_name.empty()) {
    return "_empty";
  }

  // Classify with explicit ASCII ranges rather than std::isalpha/std::isalnum:
  // passing a negative `char` (any byte >= 0x80 where char is signed) to the <cctype>
  // functions is undefined behaviour, and their result depends on the current C
  // locale, which could let non-ASCII bytes through into the Avro name.
  const auto is_valid_name_char = [](char c, bool allow_digit) {
    const bool is_letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool is_digit = c >= '0' && c <= '9';
    return is_letter || c == '_' || (allow_digit && is_digit);
  };

  std::string result;
  result.reserve(field_name.size());

  // The leading character may not be a digit; all following characters may.
  bool allow_digit = false;
  for (const char c : field_name) {
    result.push_back(is_valid_name_char(c, allow_digit) ? c : '_');
    allow_digit = true;
  }
  return result;
}
92+
6693
} // namespace
6794

6895
std::string ToString(const ::avro::NodePtr& node) {
@@ -188,8 +215,16 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
188215
::avro::NodePtr field_node;
189216
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
190217

191-
// TODO(gangwu): sanitize field name
192-
(*node)->addName(std::string(sub_field.name()));
218+
// Sanitize field name to ensure it follows Avro field name requirements
219+
std::string sanitized_name = SanitizeFieldName(sub_field.name());
220+
// Store original name as a custom attribute if it was modified
221+
if (sanitized_name != sub_field.name()) {
222+
// Add custom attribute to preserve the original field name
223+
::avro::CustomAttributes attrs;
224+
attrs.addAttribute(std::string(kIcebergFieldNameProp), std::string(sub_field.name()));
225+
(*node)->addCustomAttributesForField(attrs);
226+
}
227+
(*node)->addName(sanitized_name);
193228
(*node)->addLeaf(field_node);
194229
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
195230
}
@@ -839,7 +874,15 @@ Result<::avro::NodePtr> CreateRecordNodeWithFieldIds(const ::avro::NodePtr& orig
839874
// Recursively apply field IDs to nested fields
840875
ICEBERG_ASSIGN_OR_RAISE(auto new_nested_node,
841876
MakeAvroNodeWithFieldIds(field_node, *nested_field));
842-
new_record_node->addName(field_name);
877+
std::string sanitized_name = SanitizeFieldName(field_name);
878+
// Store original name as a custom attribute if it was modified
879+
if (sanitized_name != field_name) {
880+
// Add custom attribute to preserve the original field name
881+
::avro::CustomAttributes attrs;
882+
attrs.addAttribute(std::string(kIcebergFieldNameProp), field_name);
883+
new_record_node->addCustomAttributesForField(attrs);
884+
}
885+
new_record_node->addName(sanitized_name);
843886
new_record_node->addLeaf(new_nested_node);
844887
}
845888

0 commit comments

Comments
 (0)