Skip to content

Commit 6af132b

Browse files
format code
1 parent d588cea commit 6af132b

2 files changed

Lines changed: 88 additions & 67 deletions

File tree

src/iceberg/avro/avro_schema_util.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,8 @@ Result<int32_t> GetValueId(const ::avro::NodePtr& node) {
477477
Result<int32_t> GetFieldId(const ::avro::NodePtr& node, size_t field_idx) {
478478
static const std::string kFieldIdKey{kFieldIdProp};
479479

480-
// When field names are sanitized, we add custom attributes in this order for each field:
480+
// When field names are sanitized, we add custom attributes in this order for each
481+
// field:
481482
// 1. If the field name was sanitized: iceberg-field-name attribute
482483
// 2. Always: field-id attribute
483484
// So for field i, we need to find the correct attribute index containing field-id
@@ -486,7 +487,8 @@ Result<int32_t> GetFieldId(const ::avro::NodePtr& node, size_t field_idx) {
486487
for (size_t field = 0; field <= field_idx; ++field) {
487488
if (field == field_idx) {
488489
// For the target field, search for field-id in the remaining attributes
489-
for (size_t attr_idx = attribute_search_start; attr_idx < node->customAttributes(); ++attr_idx) {
490+
for (size_t attr_idx = attribute_search_start; attr_idx < node->customAttributes();
491+
++attr_idx) {
490492
auto id_str = node->customAttributesAt(attr_idx).getAttribute(kFieldIdKey);
491493
if (id_str.has_value()) {
492494
try {
@@ -496,7 +498,8 @@ Result<int32_t> GetFieldId(const ::avro::NodePtr& node, size_t field_idx) {
496498
}
497499
}
498500
// If this attribute doesn't have field-id, move to next
499-
auto name_attr = node->customAttributesAt(attr_idx).getAttribute(std::string(kIcebergFieldNameProp));
501+
auto name_attr = node->customAttributesAt(attr_idx).getAttribute(
502+
std::string(kIcebergFieldNameProp));
500503
if (name_attr.has_value()) {
501504
// This is a name attribute, the next one should be field-id
502505
continue;
@@ -507,7 +510,8 @@ Result<int32_t> GetFieldId(const ::avro::NodePtr& node, size_t field_idx) {
507510
// For previous fields, count how many attributes they used
508511
// Check if this field has a name attribute (means the field name was sanitized)
509512
if (attribute_search_start < node->customAttributes()) {
510-
auto name_attr = node->customAttributesAt(attribute_search_start).getAttribute(std::string(kIcebergFieldNameProp));
513+
auto name_attr = node->customAttributesAt(attribute_search_start)
514+
.getAttribute(std::string(kIcebergFieldNameProp));
511515
if (name_attr.has_value()) {
512516
// This field has both name and id attributes
513517
attribute_search_start += 2;

test/avro_schema_test.cc

Lines changed: 80 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -70,27 +70,28 @@ TEST(SanitizeFieldNameTest, InvalidFieldNames) {
7070
// Field names starting with numbers should be prefixed with underscore
7171
EXPECT_EQ(SanitizeFieldName("123field"), "_123field");
7272
EXPECT_EQ(SanitizeFieldName("0value"), "_0value");
73-
73+
7474
// Field names with special characters should have them replaced with underscores
7575
EXPECT_EQ(SanitizeFieldName("field-name"), "field_name");
7676
EXPECT_EQ(SanitizeFieldName("field.name"), "field_name");
7777
EXPECT_EQ(SanitizeFieldName("field name"), "field_name");
7878
EXPECT_EQ(SanitizeFieldName("field@name"), "field_name");
7979
EXPECT_EQ(SanitizeFieldName("field#name"), "field_name");
80-
80+
8181
// Complex field names with multiple issues
8282
EXPECT_EQ(SanitizeFieldName("1field-with.special@chars"), "_1field_with_special_chars");
8383
EXPECT_EQ(SanitizeFieldName("user-email"), "user_email");
84-
EXPECT_EQ(SanitizeFieldName("价格"), "_______"); // Non-ASCII characters become underscores
84+
EXPECT_EQ(SanitizeFieldName("价格"),
85+
"_______"); // Non-ASCII characters become underscores
8586
}
8687

8788
TEST(SanitizeFieldNameTest, EdgeCases) {
8889
// Empty field name
8990
EXPECT_EQ(SanitizeFieldName(""), "_empty");
90-
91+
9192
// Field name with only special characters
9293
EXPECT_EQ(SanitizeFieldName("@#$"), "____");
93-
94+
9495
// Field name starting with special character
9596
EXPECT_EQ(SanitizeFieldName("-field"), "__field");
9697
EXPECT_EQ(SanitizeFieldName(".field"), "__field");
@@ -230,14 +231,15 @@ TEST(ToAvroNodeVisitorTest, StructType) {
230231

231232
TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
232233
// Test struct with field names that require sanitization
233-
StructType struct_type{{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
234-
/*optional=*/false},
235-
SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
236-
/*optional=*/true},
237-
SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
238-
/*optional=*/false},
239-
SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
240-
/*optional=*/true}}};
234+
StructType struct_type{
235+
{SchemaField{/*field_id=*/1, "user-name", iceberg::string(),
236+
/*optional=*/false},
237+
SchemaField{/*field_id=*/2, "email.address", iceberg::string(),
238+
/*optional=*/true},
239+
SchemaField{/*field_id=*/3, "123field", iceberg::int32(),
240+
/*optional=*/false},
241+
SchemaField{/*field_id=*/4, "field with spaces", iceberg::boolean(),
242+
/*optional=*/true}}};
241243

242244
::avro::NodePtr node;
243245
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
@@ -248,10 +250,12 @@ TEST(ToAvroNodeVisitorTest, StructTypeWithSanitizedFieldNames) {
248250
EXPECT_EQ(node->nameAt(0), "user_name"); // "user-name" -> "user_name"
249251
EXPECT_EQ(node->nameAt(1), "email_address"); // "email.address" -> "email_address"
250252
EXPECT_EQ(node->nameAt(2), "_123field"); // "123field" -> "_123field"
251-
EXPECT_EQ(node->nameAt(3), "field_with_spaces"); // "field with spaces" -> "field_with_spaces"
253+
EXPECT_EQ(node->nameAt(3),
254+
"field_with_spaces"); // "field with spaces" -> "field_with_spaces"
252255

253256
// Check that field IDs are correctly applied
254-
// Each field has 2 custom attributes: iceberg-field-name (index 0,2,4,6) and field-id (index 1,3,5,7)
257+
// Each field has 2 custom attributes: iceberg-field-name (index 0,2,4,6) and field-id
258+
// (index 1,3,5,7)
255259
ASSERT_EQ(node->customAttributes(), 8);
256260
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/1, /*field_id=*/1));
257261
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(node, /*index=*/3, /*field_id=*/2));
@@ -1644,7 +1648,7 @@ TEST_F(NameMappingAvroSchemaTest, SanitizeFieldNamesNestedRecord) {
16441648
EXPECT_EQ(node->names(), 2);
16451649

16461650
// Check top-level field names
1647-
EXPECT_EQ(node->nameAt(0), "id"); // Valid name, unchanged
1651+
EXPECT_EQ(node->nameAt(0), "id"); // Valid name, unchanged
16481652
EXPECT_EQ(node->nameAt(1), "person_info"); // Sanitized from "person-info"
16491653

16501654
// Check that top-level field IDs are applied
@@ -1661,11 +1665,12 @@ TEST_F(NameMappingAvroSchemaTest, SanitizeFieldNamesNestedRecord) {
16611665
EXPECT_EQ(person_node->names(), 3);
16621666

16631667
// Check that nested field names are sanitized
1664-
EXPECT_EQ(person_node->nameAt(0), "first_name"); // "first-name" -> "first_name"
1665-
EXPECT_EQ(person_node->nameAt(1), "last_name"); // "last.name" -> "last_name"
1666-
EXPECT_EQ(person_node->nameAt(2), "_123age"); // "123age" -> "_123age"
1668+
EXPECT_EQ(person_node->nameAt(0), "first_name"); // "first-name" -> "first_name"
1669+
EXPECT_EQ(person_node->nameAt(1), "last_name"); // "last.name" -> "last_name"
1670+
EXPECT_EQ(person_node->nameAt(2), "_123age"); // "123age" -> "_123age"
16671671

1668-
// Check that nested field IDs are applied (all 3 fields are sanitized, so 6 attributes total)
1672+
// Check that nested field IDs are applied (all 3 fields are sanitized, so 6 attributes
1673+
// total)
16691674
ASSERT_EQ(person_node->customAttributes(), 6);
16701675
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(person_node, /*index=*/0, /*field_id=*/10));
16711676
ASSERT_NO_FATAL_FAILURE(CheckFieldIdAt(person_node, /*index=*/2, /*field_id=*/11));
@@ -1679,12 +1684,13 @@ TEST_F(NameMappingAvroSchemaTest, SanitizeFieldNamesNestedRecord) {
16791684

16801685
TEST(CustomAttributesTest, PreserveOriginalFieldNames) {
16811686
// Test that custom attributes properly preserve original field names
1682-
StructType struct_type{{SchemaField{/*field_id=*/1, "normal_field", iceberg::string(),
1683-
/*optional=*/false},
1684-
SchemaField{/*field_id=*/2, "field-with-dashes", iceberg::int32(),
1685-
/*optional=*/true},
1686-
SchemaField{/*field_id=*/3, "field.with.dots", iceberg::boolean(),
1687-
/*optional=*/false}}};
1687+
StructType struct_type{
1688+
{SchemaField{/*field_id=*/1, "normal_field", iceberg::string(),
1689+
/*optional=*/false},
1690+
SchemaField{/*field_id=*/2, "field-with-dashes", iceberg::int32(),
1691+
/*optional=*/true},
1692+
SchemaField{/*field_id=*/3, "field.with.dots", iceberg::boolean(),
1693+
/*optional=*/false}}};
16881694

16891695
::avro::NodePtr node;
16901696
EXPECT_THAT(ToAvroNodeVisitor{}.Visit(struct_type, &node), IsOk());
@@ -1715,7 +1721,7 @@ TEST(CustomAttributesTest, PreserveOriginalFieldNames) {
17151721

17161722
TEST(CustomAttributesTest, MultipleCustomAttributesCoexist) {
17171723
// Test that iceberg-field-name attributes coexist with other custom attributes
1718-
StructType struct_type{{SchemaField{/*field_id=*/100, "field@with#special$chars",
1724+
StructType struct_type{{SchemaField{/*field_id=*/100, "field@with#special$chars",
17191725
iceberg::timestamp_tz(), /*optional=*/false}}};
17201726

17211727
::avro::NodePtr node;
@@ -1729,7 +1735,8 @@ TEST(CustomAttributesTest, MultipleCustomAttributesCoexist) {
17291735
ASSERT_EQ(node->customAttributes(), 2);
17301736
const auto& field_name_attrs = node->customAttributesAt(0);
17311737
EXPECT_TRUE(field_name_attrs.getAttribute("iceberg-field-name").has_value());
1732-
EXPECT_EQ(field_name_attrs.getAttribute("iceberg-field-name").value(), "field@with#special$chars");
1738+
EXPECT_EQ(field_name_attrs.getAttribute("iceberg-field-name").value(),
1739+
"field@with#special$chars");
17331740
const auto& field_id_attrs = node->customAttributesAt(1);
17341741
EXPECT_TRUE(field_id_attrs.getAttribute("field-id").has_value());
17351742
EXPECT_EQ(field_id_attrs.getAttribute("field-id").value(), "100");
@@ -1747,7 +1754,8 @@ TEST(CustomAttributesTest, NestedStructuresPreserveOriginalNames) {
17471754
// Test that nested structures properly preserve original field names at all levels
17481755
auto inner_struct = std::make_shared<StructType>(std::vector<SchemaField>{
17491756
SchemaField{/*field_id=*/10, "inner-field", iceberg::string(), /*optional=*/false},
1750-
SchemaField{/*field_id=*/11, "another.field", iceberg::int32(), /*optional=*/true}});
1757+
SchemaField{/*field_id=*/11, "another.field", iceberg::int32(),
1758+
/*optional=*/true}});
17511759

17521760
StructType root_struct{{SchemaField{/*field_id=*/1, "normal_field", iceberg::boolean(),
17531761
/*optional=*/false},
@@ -1764,11 +1772,11 @@ TEST(CustomAttributesTest, NestedStructuresPreserveOriginalNames) {
17641772

17651773
// Check root level attributes
17661774
ASSERT_EQ(root_node->customAttributes(), 3);
1767-
1775+
17681776
// Normal field should not have iceberg-field-name
17691777
const auto& root_attrs0 = root_node->customAttributesAt(0);
17701778
EXPECT_FALSE(root_attrs0.getAttribute("iceberg-field-name").has_value());
1771-
1779+
17721780
// Nested struct field should have iceberg-field-name preserved
17731781
const auto& root_attrs1_name = root_node->customAttributesAt(1);
17741782
EXPECT_TRUE(root_attrs1_name.getAttribute("iceberg-field-name").has_value());
@@ -1778,23 +1786,24 @@ TEST(CustomAttributesTest, NestedStructuresPreserveOriginalNames) {
17781786
auto nested_union_node = root_node->leafAt(1);
17791787
ASSERT_EQ(nested_union_node->type(), ::avro::AVRO_UNION);
17801788
ASSERT_EQ(nested_union_node->leaves(), 2);
1781-
1789+
17821790
auto nested_struct_node = nested_union_node->leafAt(1);
17831791
ASSERT_EQ(nested_struct_node->type(), ::avro::AVRO_RECORD);
17841792
ASSERT_EQ(nested_struct_node->names(), 2);
1785-
EXPECT_EQ(nested_struct_node->nameAt(0), "inner_field"); // Sanitized
1786-
EXPECT_EQ(nested_struct_node->nameAt(1), "another_field"); // Sanitized
1793+
EXPECT_EQ(nested_struct_node->nameAt(0), "inner_field"); // Sanitized
1794+
EXPECT_EQ(nested_struct_node->nameAt(1), "another_field"); // Sanitized
17871795

17881796
// Check nested field attributes
17891797
ASSERT_EQ(nested_struct_node->customAttributes(), 4);
1790-
1798+
17911799
const auto& nested_attrs0_name = nested_struct_node->customAttributesAt(0);
17921800
EXPECT_TRUE(nested_attrs0_name.getAttribute("iceberg-field-name").has_value());
17931801
EXPECT_EQ(nested_attrs0_name.getAttribute("iceberg-field-name").value(), "inner-field");
1794-
1802+
17951803
const auto& nested_attrs1_name = nested_struct_node->customAttributesAt(2);
17961804
EXPECT_TRUE(nested_attrs1_name.getAttribute("iceberg-field-name").has_value());
1797-
EXPECT_EQ(nested_attrs1_name.getAttribute("iceberg-field-name").value(), "another.field");
1805+
EXPECT_EQ(nested_attrs1_name.getAttribute("iceberg-field-name").value(),
1806+
"another.field");
17981807
}
17991808

18001809
TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
@@ -1807,8 +1816,10 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18071816
SchemaField::MakeOptional(
18081817
/*field_id=*/5, "nested-record",
18091818
std::make_shared<StructType>(std::vector<SchemaField>{
1810-
SchemaField::MakeRequired(/*field_id=*/10, "inner.field", iceberg::string()),
1811-
SchemaField::MakeOptional(/*field_id=*/11, "another-field", iceberg::float64()),
1819+
SchemaField::MakeRequired(/*field_id=*/10, "inner.field",
1820+
iceberg::string()),
1821+
SchemaField::MakeOptional(/*field_id=*/11, "another-field",
1822+
iceberg::float64()),
18121823
})),
18131824
});
18141825

@@ -1819,18 +1830,22 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18191830

18201831
// Verify that field names are sanitized in the Avro schema
18211832
ASSERT_EQ(avro_node->names(), 5);
1822-
EXPECT_EQ(avro_node->nameAt(0), "user_id"); // "user-id" -> "user_id"
1823-
EXPECT_EQ(avro_node->nameAt(1), "email_address"); // "email.address" -> "email_address"
1824-
EXPECT_EQ(avro_node->nameAt(2), "_123numeric_start"); // "123numeric_start" -> "_123numeric_start"
1825-
EXPECT_EQ(avro_node->nameAt(3), "field_with_spaces"); // "field with spaces" -> "field_with_spaces"
1826-
EXPECT_EQ(avro_node->nameAt(4), "nested_record"); // "nested-record" -> "nested_record"
1833+
EXPECT_EQ(avro_node->nameAt(0), "user_id"); // "user-id" -> "user_id"
1834+
EXPECT_EQ(avro_node->nameAt(1), "email_address"); // "email.address" -> "email_address"
1835+
EXPECT_EQ(avro_node->nameAt(2),
1836+
"_123numeric_start"); // "123numeric_start" -> "_123numeric_start"
1837+
EXPECT_EQ(avro_node->nameAt(3),
1838+
"field_with_spaces"); // "field with spaces" -> "field_with_spaces"
1839+
EXPECT_EQ(avro_node->nameAt(4), "nested_record"); // "nested-record" -> "nested_record"
18271840

18281841
// Verify that original field names are preserved in custom attributes
18291842
ASSERT_EQ(avro_node->customAttributes(), 10);
18301843
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(avro_node, /*index=*/0, "user-id"));
18311844
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(avro_node, /*index=*/2, "email.address"));
1832-
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(avro_node, /*index=*/4, "123numeric_start"));
1833-
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(avro_node, /*index=*/6, "field with spaces"));
1845+
ASSERT_NO_FATAL_FAILURE(
1846+
CheckIcebergFieldName(avro_node, /*index=*/4, "123numeric_start"));
1847+
ASSERT_NO_FATAL_FAILURE(
1848+
CheckIcebergFieldName(avro_node, /*index=*/6, "field with spaces"));
18341849
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(avro_node, /*index=*/8, "nested-record"));
18351850

18361851
// Verify nested structure
@@ -1839,21 +1854,25 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18391854
auto nested_struct_node = nested_union_node->leafAt(1);
18401855
ASSERT_EQ(nested_struct_node->type(), ::avro::AVRO_RECORD);
18411856
ASSERT_EQ(nested_struct_node->names(), 2);
1842-
EXPECT_EQ(nested_struct_node->nameAt(0), "inner_field"); // "inner.field" -> "inner_field"
1843-
EXPECT_EQ(nested_struct_node->nameAt(1), "another_field"); // "another-field" -> "another_field"
1857+
EXPECT_EQ(nested_struct_node->nameAt(0),
1858+
"inner_field"); // "inner.field" -> "inner_field"
1859+
EXPECT_EQ(nested_struct_node->nameAt(1),
1860+
"another_field"); // "another-field" -> "another_field"
18441861

18451862
// Verify nested field name preservation
18461863
ASSERT_EQ(nested_struct_node->customAttributes(), 4);
1847-
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(nested_struct_node, /*index=*/0, "inner.field"));
1848-
ASSERT_NO_FATAL_FAILURE(CheckIcebergFieldName(nested_struct_node, /*index=*/2, "another-field"));
1864+
ASSERT_NO_FATAL_FAILURE(
1865+
CheckIcebergFieldName(nested_struct_node, /*index=*/0, "inner.field"));
1866+
ASSERT_NO_FATAL_FAILURE(
1867+
CheckIcebergFieldName(nested_struct_node, /*index=*/2, "another-field"));
18491868

18501869
// Step 2: Project back to Iceberg schema (simulate reading the Avro data)
18511870
auto projection_result = Project(iceberg_schema, avro_node, /*prune_source=*/false);
18521871
ASSERT_THAT(projection_result, IsOk());
18531872

18541873
const auto& projection = *projection_result;
18551874
ASSERT_EQ(projection.fields.size(), 5);
1856-
1875+
18571876
// Verify that all fields can be correctly projected back
18581877
for (size_t i = 0; i < projection.fields.size(); ++i) {
18591878
ASSERT_EQ(projection.fields[i].kind, FieldProjection::Kind::kProjected);
@@ -1871,21 +1890,19 @@ TEST(EndToEndFieldSanitizationTest, IcebergToAvroAndBack) {
18711890
TEST(EndToEndFieldSanitizationTest, ValidateRoundTripConsistency) {
18721891
// Test that multiple round trips maintain consistency
18731892
std::vector<std::string> problematic_field_names = {
1874-
"user-name", "email.address", "123field", "field with spaces",
1875-
"field@special", "field#hash", "field$dollar", "field%percent",
1876-
"field&ampersand", "field(parenthesis)", "field[bracket]",
1877-
"field{brace}", "field|pipe", "field\\backslash", "field/slash",
1878-
"field:colon", "field;semicolon", "field'quote", "field\"doublequote"
1893+
"user-name", "email.address", "123field", "field with spaces",
1894+
"field@special", "field#hash", "field$dollar", "field%percent",
1895+
"field&ampersand", "field(parenthesis)", "field[bracket]", "field{brace}",
1896+
"field|pipe", "field\\backslash", "field/slash", "field:colon",
1897+
"field;semicolon", "field'quote", "field\"doublequote"
18791898
};
18801899

18811900
for (size_t i = 0; i < problematic_field_names.size(); ++i) {
18821901
const auto& original_name = problematic_field_names[i];
1883-
1902+
18841903
// Create a simple schema with the problematic field name
1885-
Schema iceberg_schema({
1886-
SchemaField::MakeRequired(/*field_id=*/static_cast<int32_t>(i + 1),
1887-
original_name, iceberg::string())
1888-
});
1904+
Schema iceberg_schema({SchemaField::MakeRequired(
1905+
/*field_id=*/static_cast<int32_t>(i + 1), original_name, iceberg::string())});
18891906

18901907
// Convert to Avro
18911908
::avro::NodePtr avro_node;
@@ -1894,14 +1911,14 @@ TEST(EndToEndFieldSanitizationTest, ValidateRoundTripConsistency) {
18941911
// Verify the field name was processed
18951912
ASSERT_EQ(avro_node->names(), 1);
18961913
std::string sanitized_name = avro_node->nameAt(0);
1897-
1914+
18981915
// Verify the original name is preserved if sanitization occurred
18991916
if (sanitized_name != original_name) {
19001917
ASSERT_EQ(avro_node->customAttributes(), 2);
19011918
const auto& name_attrs = avro_node->customAttributesAt(0);
19021919
EXPECT_TRUE(name_attrs.getAttribute("iceberg-field-name").has_value());
19031920
EXPECT_EQ(name_attrs.getAttribute("iceberg-field-name").value(), original_name);
1904-
1921+
19051922
const auto& id_attrs = avro_node->customAttributesAt(1);
19061923
EXPECT_TRUE(id_attrs.getAttribute("field-id").has_value());
19071924
EXPECT_EQ(id_attrs.getAttribute("field-id").value(), std::to_string(i + 1));

0 commit comments

Comments
 (0)