Skip to content

Commit 96fbf22

Browse files
feat: avro schema add sanitize field name
1 parent 82a1cd6 commit 96fbf22

3 files changed

Lines changed: 213 additions & 3 deletions

File tree

src/iceberg/avro/avro_schema_util.cc

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ constexpr std::string_view kIcebergFieldNameProp = "iceberg-field-name";
4848
constexpr std::string_view kFieldIdProp = "field-id";
4949
constexpr std::string_view kKeyIdProp = "key-id";
5050
constexpr std::string_view kValueIdProp = "value-id";
51+
5152
constexpr std::string_view kElementIdProp = "element-id";
5253
constexpr std::string_view kAdjustToUtcProp = "adjust-to-utc";
5354

@@ -65,6 +66,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {
6566

6667
} // namespace
6768

69+
/// \brief Check whether a name matches the Avro name grammar [A-Za-z_][A-Za-z0-9_]*.
///
/// Characters are classified with explicit ASCII range checks instead of
/// std::isalpha / std::isalnum. Those functions depend on the current C locale
/// and have undefined behavior when given a negative char value, which is what
/// any non-ASCII byte becomes on platforms where char is signed.
///
/// \param name The candidate name.
/// \return true if the first character is an ASCII letter or '_' and every
/// following character is an ASCII letter, ASCII digit or '_'; false otherwise.
/// \throws std::runtime_error if name is empty.
bool validAvroName(const std::string& name) {
  if (name.empty()) {
    throw std::runtime_error("Empty name");
  }

  const auto is_letter_or_underscore = [](char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
  };
  const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

  // The first character has a stricter rule: digits are not allowed there.
  if (!is_letter_or_underscore(name.front())) {
    return false;
  }

  for (size_t i = 1; i < name.size(); ++i) {
    const char c = name[i];
    if (!is_letter_or_underscore(c) && !is_digit(c)) {
      return false;
    }
  }
  return true;
}
87+
88+
/// \brief Encode a single character that is not allowed at its position in an Avro name.
///
/// \param c The character to encode.
/// \return "_" followed by the character itself when c is an ASCII digit;
/// otherwise "_x" followed by the uppercase hexadecimal value of the byte,
/// written without zero padding (e.g. '-' -> "_x2D", '\n' -> "_xA").
std::string SanitizeChar(char c) {
  // ASCII range check instead of std::isdigit, which is undefined for negative char values.
  if (c >= '0' && c <= '9') {
    return std::string("_") + c;
  }

  // Convert through unsigned char so that bytes >= 0x80 are rendered as their
  // 8-bit value ("_xC3") rather than a sign-extended 32-bit one ("_xFFFFFFC3").
  constexpr char kHexDigits[] = "0123456789ABCDEF";
  const auto byte = static_cast<unsigned char>(c);

  std::string encoded = "_x";
  if (byte >= 0x10) {
    // Emit the high nibble only when it is non-zero: no zero padding.
    encoded.push_back(kHexDigits[byte >> 4]);
  }
  encoded.push_back(kHexDigits[byte & 0x0F]);
  return encoded;
}
96+
97+
std::string SanitizeFieldName(std::string_view field_name) {
98+
std::string result;
99+
result.reserve(field_name.size());
100+
101+
if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
102+
result.append(SanitizeChar(field_name[0]));
103+
} else {
104+
result.push_back(field_name[0]);
105+
}
106+
107+
for (size_t i = 1; i < field_name.size(); ++i) {
108+
char c = field_name[i];
109+
if (std::isalnum(c) || c == '_') {
110+
result.push_back(c);
111+
} else {
112+
result.append(SanitizeChar(c));
113+
}
114+
}
115+
116+
return result;
117+
}
118+
68119
std::string ToString(const ::avro::NodePtr& node) {
69120
std::stringstream ss;
70121
ss << *node;
@@ -188,8 +239,9 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
188239
::avro::NodePtr field_node;
189240
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));
190241

191-
// TODO(gangwu): sanitize field name
192-
(*node)->addName(std::string(sub_field.name()));
242+
bool isValidFieldName = validAvroName(std::string(sub_field.name()));
243+
std::string fieldName = isValidFieldName ? std::string(sub_field.name()) : SanitizeFieldName(sub_field.name());
244+
(*node)->addName(fieldName);
193245
(*node)->addLeaf(field_node);
194246
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
195247
}

src/iceberg/avro/avro_schema_util_internal.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,26 @@ Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original
163163
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
164164
const NameMapping& mapping);
165165

166+
/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
167+
///
168+
/// Converts names that are not valid Avro names to valid Avro names.
169+
/// Conversion rules:
170+
/// 1. If the first character is not a letter or underscore, it is specially handled:
171+
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
172+
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal representation
173+
/// of the character (e.g., '$' -> '_x24')
174+
/// 2. For characters other than the first:
175+
/// - If it's a letter, digit, or underscore, it remains unchanged
176+
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal representation
/// Each char of the input is examined on its own, and the hexadecimal value is written
/// without zero padding.
177+
///
178+
/// Examples:
179+
/// - "123field" -> "_123field"
180+
/// - "user-name" -> "user_x2Dname"
181+
/// - "$price" -> "_x24price"
182+
/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
183+
///
184+
/// \param field_name The original field name to sanitize.
185+
/// \return A sanitized field name that follows Avro naming conventions.
186+
std::string SanitizeFieldName(std::string_view field_name);
187+
166188
} // namespace iceberg::avro

test/avro_schema_test.cc

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232

3333
namespace iceberg::avro {
3434

35+
// Forward declaration of functions to test
36+
bool validAvroName(const std::string& name);
37+
3538
namespace {
3639

3740
void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) {
@@ -47,8 +50,81 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
4750
ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id)));
4851
}
4952

53+
// Helper that asserts the custom attributes stored at `index` on `node` carry an
// "iceberg-field-name" attribute equal to `original_name`. It first asserts that
// `index` is below node->customAttributes().
// NOTE(review): no call to this helper is visible in this diff - confirm it is used.
54+
void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
55+
const std::string& original_name) {
56+
ASSERT_LT(index, node->customAttributes());
57+
const auto& attrs = node->customAttributesAt(index);
58+
ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name));
59+
}
60+
5061
} // namespace
5162

63+
// validAvroName accepts names built only from letters, digits and underscores
// whose first character is a letter or an underscore.
TEST(ValidAvroNameTest, ValidNames) {
64+
// Valid field names should return true
65+
EXPECT_TRUE(validAvroName("valid_field"));
66+
EXPECT_TRUE(validAvroName("field123"));
67+
EXPECT_TRUE(validAvroName("_private"));
68+
EXPECT_TRUE(validAvroName("CamelCase"));
69+
EXPECT_TRUE(validAvroName("field_with_underscores"));
70+
}
71+
72+
// validAvroName rejects names that start with a digit or that contain a
// character other than a letter, digit or underscore.
TEST(ValidAvroNameTest, InvalidNames) {
73+
// Names starting with numbers should return false
74+
EXPECT_FALSE(validAvroName("123field"));
75+
EXPECT_FALSE(validAvroName("0value"));
76+
77+
// Names with special characters should return false
78+
EXPECT_FALSE(validAvroName("field-name"));
79+
EXPECT_FALSE(validAvroName("field.name"));
80+
EXPECT_FALSE(validAvroName("field name"));
81+
EXPECT_FALSE(validAvroName("field@name"));
82+
EXPECT_FALSE(validAvroName("field#name"));
83+
}
84+
85+
// validAvroName does not return false for an empty name; it reports the
// condition by throwing std::runtime_error.
TEST(ValidAvroNameTest, EmptyName) {
86+
// Empty name should throw an exception
87+
EXPECT_THROW(validAvroName(""), std::runtime_error);
88+
}
89+
90+
// SanitizeFieldName returns its input unchanged when the name is already valid.
TEST(SanitizeFieldNameTest, ValidFieldNames) {
91+
// Valid field names should remain unchanged
92+
EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
93+
EXPECT_EQ(SanitizeFieldName("field123"), "field123");
94+
EXPECT_EQ(SanitizeFieldName("_private"), "_private");
95+
EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
96+
EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores");
97+
}
98+
99+
// SanitizeFieldName rewrites invalid names: a leading digit gains a '_' prefix and
// every other disallowed character becomes "_x" plus its uppercase hex code.
TEST(SanitizeFieldNameTest, InvalidFieldNames) {
100+
// Field names starting with numbers should be prefixed with underscore
101+
EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
102+
EXPECT_EQ(SanitizeFieldName("0value"), "_0value");
103+
104+
// Field names with special characters should be encoded with hex values
105+
EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
106+
EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
107+
EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
108+
EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
109+
EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");
110+
111+
// Complex field names with multiple issues
112+
EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"), "_1field_x2Dwith_x2Especial_x40chars");
113+
EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
114+
}
115+
116+
// Boundary inputs for SanitizeFieldName: the empty name, a name with no valid
// character at all, and names whose first character must be hex-encoded.
TEST(SanitizeFieldNameTest, EdgeCases) {
117+
// Empty field name: expected to yield the encoding of a NUL character
118+
EXPECT_EQ(SanitizeFieldName(""), "_x0");
119+
120+
// Field name with only special characters
121+
EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");
122+
123+
// Field name starting with special character
124+
EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
125+
EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
126+
}
127+
52128
TEST(ToAvroNodeVisitorTest, BooleanType) {
53129
::avro::NodePtr node;
54130
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
@@ -181,6 +257,67 @@ TEST(ToAvroNodeVisitorTest, StructType) {
181257
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
182258
}
183259

260+
// Converts a four-field struct whose names contain '-', '.', a leading digit and
// spaces, then checks that the resulting Avro record exposes the sanitized names
// and still carries one field-id attribute per field.
TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
261+
// Test struct with field names that require sanitization
262+
StructType struct_type{
263+
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
264+
/*optional=*/false},
265+
SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
266+
/*optional=*/true},
267+
SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
268+
/*optional=*/false},
269+
SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
270+
/*optional=*/true}}};
271+
272+
::avro::NodePtr node;
273+
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
274+
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
275+
276+
// Check that field names are sanitized
277+
ASSERT_EQ(node->names(), 4);
278+
EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
279+
EXPECT_EQ(node->nameAt(1), "email_x2Eaddress"); // "email.address" -> "email_x2Eaddress"
280+
EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field"
281+
EXPECT_EQ(node->nameAt(3),
282+
"field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces"
283+
284+
// Check that field IDs are correctly applied
285+
// Each field has 1 custom attribute: field-id
286+
ASSERT_EQ(node->customAttributes(), 4);
287+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
288+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
289+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
290+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
291+
}
292+
293+
// Converts a struct whose field names are already valid and checks that the names
// pass through unchanged, that field ids are attached, and that neither field
// carries an "iceberg-field-name" attribute.
TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
294+
// Test struct with field names that don't require sanitization
295+
StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(),
296+
/*optional=*/false},
297+
SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(),
298+
/*optional=*/true}}};
299+
300+
::avro::NodePtr node;
301+
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
302+
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);
303+
304+
// Check that field names remain unchanged
305+
ASSERT_EQ(node->names(), 2);
306+
EXPECT_EQ(node->nameAt(0), "valid_field");
307+
EXPECT_EQ(node->nameAt(1), "AnotherField");
308+
309+
// Check that field IDs are correctly applied
310+
ASSERT_EQ(node->customAttributes(), 2);
311+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
312+
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
313+
314+
// For valid field names, there should be no iceberg-field-name attributes
315+
const auto& attrs0 = node->customAttributesAt(0);
316+
const auto& attrs1 = node->customAttributesAt(1);
317+
EXPECT_FALSE(attrs0.getAttribute("iceberg-field-name").has_value());
318+
EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
319+
}
320+
184321
TEST(ToAvroNodeVisitorTest, ListType) {
185322
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
186323
/*optional=*/true}};
@@ -1436,5 +1573,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
14361573
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
14371574
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
14381575
}
1439-
14401576
} // namespace iceberg::avro

0 commit comments

Comments
 (0)