Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 54 additions & 2 deletions src/iceberg/avro/avro_schema_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,56 @@ ::avro::CustomAttributes GetAttributesWithFieldId(int32_t field_id) {

} // namespace

bool validAvroName(const std::string& name) {
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
if (name.empty()) {
throw std::runtime_error("Empty name");
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
}

char first = name[0];
if (!(std::isalpha(first) || first == '_')) {
return false;
}

for (size_t i = 1; i < name.length(); i++) {
char character = name[i];
if (!(std::isalnum(character) || character == '_')) {
return false;
}
}
return true;
}

std::string SanitizeChar(char c) {
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
if (std::isdigit(c)) {
return std::string("_") + c;
}
std::stringstream ss;
ss << "_x" << std::uppercase << std::hex << static_cast<int>(c);
return ss.str();
}

std::string SanitizeFieldName(std::string_view field_name) {
std::string result;
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
result.reserve(field_name.size());

if (!std::isalpha(field_name[0]) && field_name[0] != '_') {
Comment thread
MisterRaindrop marked this conversation as resolved.
result.append(SanitizeChar(field_name[0]));
} else {
result.push_back(field_name[0]);
}

for (size_t i = 1; i < field_name.size(); ++i) {
char c = field_name[i];
if (std::isalnum(c) || c == '_') {
result.push_back(c);
} else {
result.append(SanitizeChar(c));
}
}

return result;
}

std::string ToString(const ::avro::NodePtr& node) {
std::stringstream ss;
ss << *node;
Expand Down Expand Up @@ -181,8 +231,10 @@ Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) {
::avro::NodePtr field_node;
ICEBERG_RETURN_UNEXPECTED(Visit(sub_field, &field_node));

// TODO(gangwu): sanitize field name
(*node)->addName(std::string(sub_field.name()));
bool isValidFieldName = validAvroName(std::string(sub_field.name()));
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
std::string fieldName = isValidFieldName ? std::string(sub_field.name())
: SanitizeFieldName(sub_field.name());
(*node)->addName(fieldName);
(*node)->addLeaf(field_node);
(*node)->addCustomAttributesForField(GetAttributesWithFieldId(sub_field.field_id()));
}
Expand Down
24 changes: 24 additions & 0 deletions src/iceberg/avro/avro_schema_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,28 @@ bool HasMapLogicalType(const ::avro::NodePtr& node);
Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(const ::avro::NodePtr& original_node,
const NameMapping& mapping);

/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
Comment thread
MisterRaindrop marked this conversation as resolved.
///
/// Converts names that are not valid Avro names to valid Avro names.
/// Conversion rules:
/// 1. If the first character is not a letter or underscore, it is specially handled:
/// - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
/// representation
/// of the character (e.g., '$' -> '_x24')
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
/// 2. For characters other than the first:
/// - If it's a letter, digit, or underscore, it remains unchanged
/// - Other characters: Converted to '_x' followed by the uppercase hexadecimal
/// representation
///
/// Examples:
/// - "123field" -> "_123field"
/// - "user-name" -> "user_x2Dname"
/// - "$price" -> "_x24price"
/// - "valid_name_123" -> "valid_name_123" (no conversion needed)
///
/// \param field_name The original field name to sanitize.
/// \return A sanitized field name that follows Avro naming conventions.
std::string SanitizeFieldName(std::string_view field_name);

} // namespace iceberg::avro
141 changes: 140 additions & 1 deletion test/avro_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@

namespace iceberg::avro {

// Forward declaration of functions to test
bool validAvroName(const std::string& name);
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated

namespace {

void CheckCustomLogicalType(const ::avro::NodePtr& node, const std::string& type_name) {
Expand All @@ -47,8 +50,82 @@ void CheckFieldIdAt(const ::avro::NodePtr& node, size_t index, int32_t field_id,
ASSERT_EQ(attrs.getAttribute(key), std::make_optional(std::to_string(field_id)));
}

// Helper function to check if a custom attribute exists for a field name preservation
void CheckIcebergFieldName(const ::avro::NodePtr& node, size_t index,
const std::string& original_name) {
ASSERT_LT(index, node->customAttributes());
const auto& attrs = node->customAttributesAt(index);
ASSERT_EQ(attrs.getAttribute("iceberg-field-name"), std::make_optional(original_name));
}

} // namespace

TEST(ValidAvroNameTest, ValidNames) {
// Valid field names should return true
EXPECT_TRUE(validAvroName("valid_field"));
EXPECT_TRUE(validAvroName("field123"));
EXPECT_TRUE(validAvroName("_private"));
EXPECT_TRUE(validAvroName("CamelCase"));
EXPECT_TRUE(validAvroName("field_with_underscores"));
}

TEST(ValidAvroNameTest, InvalidNames) {
// Names starting with numbers should return false
EXPECT_FALSE(validAvroName("123field"));
EXPECT_FALSE(validAvroName("0value"));

// Names with special characters should return false
EXPECT_FALSE(validAvroName("field-name"));
EXPECT_FALSE(validAvroName("field.name"));
EXPECT_FALSE(validAvroName("field name"));
EXPECT_FALSE(validAvroName("field@name"));
EXPECT_FALSE(validAvroName("field#name"));
}

TEST(ValidAvroNameTest, EmptyName) {
// Empty name should throw an exception
EXPECT_THROW(validAvroName(""), std::runtime_error);
}

TEST(SanitizeFieldNameTest, ValidFieldNames) {
// Valid field names should remain unchanged
EXPECT_EQ(SanitizeFieldName("valid_field"), "valid_field");
EXPECT_EQ(SanitizeFieldName("field123"), "field123");
EXPECT_EQ(SanitizeFieldName("_private"), "_private");
EXPECT_EQ(SanitizeFieldName("CamelCase"), "CamelCase");
EXPECT_EQ(SanitizeFieldName("field_with_underscores"), "field_with_underscores");
}

TEST(SanitizeFieldNameTest, InvalidFieldNames) {
// Field names starting with numbers should be prefixed with underscore
EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
EXPECT_EQ(SanitizeFieldName("0value"), "_0value");

// Field names with special characters should be encoded with hex values
EXPECT_EQ(SanitizeFieldName("field-name"), "field_x2Dname");
EXPECT_EQ(SanitizeFieldName("field.name"), "field_x2Ename");
EXPECT_EQ(SanitizeFieldName("field name"), "field_x20name");
EXPECT_EQ(SanitizeFieldName("field@name"), "field_x40name");
EXPECT_EQ(SanitizeFieldName("field#name"), "field_x23name");

// Complex field names with multiple issues
EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"),
"_1field_x2Dwith_x2Especial_x40chars");
EXPECT_EQ(SanitizeFieldName("user-email"), "user_x2Demail");
}

TEST(SanitizeFieldNameTest, EdgeCases) {
// Empty field name
EXPECT_EQ(SanitizeFieldName(""), "_x0");

// Field name with only special characters
EXPECT_EQ(SanitizeFieldName("@#$"), "_x40_x23_x24");

// Field name starting with special character
EXPECT_EQ(SanitizeFieldName("-field"), "_x2Dfield");
EXPECT_EQ(SanitizeFieldName(".field"), "_x2Efield");
}

TEST(ToAvroNodeVisitorTest, BooleanType) {
::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(BooleanType{}, &node), IsOk());
Expand Down Expand Up @@ -181,6 +258,69 @@ TEST(ToAvroNodeVisitorTest, StructType) {
EXPECT_EQ(node->leafAt(1)->leafAt(1)->type(), ::avro::AVRO_INT);
}

TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
// Test struct with field names that require sanitization
StructType struct_type{
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
/*optional=*/false},
SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
/*optional=*/true},
SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
/*optional=*/false},
SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
/*optional=*/true}}};

::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

// Check that field names are sanitized
ASSERT_EQ(node->names(), 4);
EXPECT_EQ(node->nameAt(0), "user_x2Dname"); // "user-name" -> "user_x2Dname"
EXPECT_EQ(node->nameAt(1),
"email_x2Eaddress"); // "email.address" -> "email_x2Eaddress"
EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field"
EXPECT_EQ(
node->nameAt(3),
"field_x20with_x20spaces"); // "field with spaces" -> "field_x20with_x20spaces"

// Check that field IDs are correctly applied
// Each field has 1 custom attribute: field-id
ASSERT_EQ(node->customAttributes(), 4);
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/2, /*field_id=*/3));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/4));
}

TEST(ToAvroNodeVisitorTest, StructTypeWithValidFieldNames) {
Comment thread
MisterRaindrop marked this conversation as resolved.
Outdated
// Test struct with field names that don't require sanitization
StructType struct_type{{SchemaField{/*field_id=*/1, "valid_field", iceberg::string(),
/*optional=*/false},
SchemaField{/*field_id=*/2, "AnotherField", iceberg::int32(),
/*optional=*/true}}};

::avro::NodePtr node;
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
EXPECT_EQ(node->type(), ::avro::AVRO_RECORD);

// Check that field names remain unchanged
ASSERT_EQ(node->names(), 2);
EXPECT_EQ(node->nameAt(0), "valid_field");
EXPECT_EQ(node->nameAt(1), "AnotherField");

// Check that field IDs are correctly applied
ASSERT_EQ(node->customAttributes(), 2);
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/0, /*field_id=*/1));
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/2));

// For valid field names, there should be no iceberg-field-name attributes
const auto& attrs0 = node->customAttributesAt(0);
const auto& attrs1 = node->customAttributesAt(1);
EXPECT_FALSE(attrs0.getAttribute("iceberg-field-name").has_value());
EXPECT_FALSE(attrs1.getAttribute("iceberg-field-name").has_value());
}

TEST(ToAvroNodeVisitorTest, ListType) {
ListType list_type{SchemaField{/*field_id=*/5, "element", iceberg::string(),
/*optional=*/true}};
Expand Down Expand Up @@ -1436,5 +1576,4 @@ TEST_F(NameMappingAvroSchemaTest, MissingFieldIdError) {
auto result = MakeAvroNodeWithFieldIds(avro_schema.root(), *name_mapping);
ASSERT_THAT(result, IsError(ErrorKind::kInvalidSchema));
}

Comment thread
MisterRaindrop marked this conversation as resolved.
} // namespace iceberg::avro
Loading