Skip to content

Commit be99a3d

Browse files
authored
Preserve single-field unannotated repeated groups in parquet schema sanitization (rapidsai#22567)
Closes rapidsai#22541. Parquet's backward-compatibility rules say that when a `REPEATED` field is a group, the element type is the group itself, regardless of its field count. `sanitize_schema()` already canonicalized multi-child repeated groups into `list<struct<...>>`, but a `num_children > 1` guard skipped single-child groups and let them collapse to `list<primitive>`. This widens the guard to `num_children >= 1` (LIST/MAP-parent and VARIANT exclusions unchanged) and reorders the rewrite so `schema`'s `children_idx` references remain valid while the synthetic struct node is inserted. Authors: - Allen Xu (https://github.com/wjxiz1992) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: rapidsai#22567
1 parent 5967cc2 commit be99a3d

2 files changed

Lines changed: 150 additions & 29 deletions

File tree

cpp/src/io/parquet/reader_impl_helpers.cpp

Lines changed: 33 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -280,49 +280,53 @@ void metadata::sanitize_schema()
280280
parent_schema.logical_type->type == LogicalType::VARIANT;
281281
auto const parent_type = parent_schema.converted_type;
282282
if (not is_parent_variant && schema_elem.repetition_type == FieldRepetitionType::REPEATED &&
283-
schema_elem.num_children > 1 && parent_type != ConvertedType::LIST &&
283+
schema_elem.num_children >= 1 && parent_type != ConvertedType::LIST &&
284284
parent_type != ConvertedType::MAP) {
285-
// This is a list of structs, so we need to mark this as a list, but also
286-
// add a struct child and move this element's children to the struct
287-
schema_elem.converted_type = ConvertedType::LIST;
288-
schema_elem.logical_type = LogicalType::LIST;
289-
schema_elem.repetition_type = FieldRepetitionType::OPTIONAL;
290-
auto const struct_node_idx = static_cast<size_type>(schema.size());
285+
// Parquet backward-compatibility rule: when the repeated field is a group,
286+
// the element type is the group itself, regardless of its field count. Rewrite
287+
// as `list<struct<...>>` for any number of children >= 1.
288+
// Spec:
289+
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
290+
auto const struct_node_idx = static_cast<size_type>(schema.size());
291291

292292
SchemaElement struct_elem;
293-
struct_elem.name = "struct_node";
294-
struct_elem.repetition_type = FieldRepetitionType::REQUIRED;
295-
struct_elem.num_children = schema_elem.num_children;
296-
struct_elem.type = Type::UNDEFINED;
297-
struct_elem.converted_type = std::nullopt;
298-
299-
// swap children
300-
struct_elem.children_idx = std::move(schema_elem.children_idx);
301-
schema_elem.children_idx = {struct_node_idx};
302-
schema_elem.num_children = 1;
303-
293+
struct_elem.name = "struct_node";
294+
struct_elem.repetition_type = FieldRepetitionType::REQUIRED;
295+
struct_elem.num_children = schema_elem.num_children;
296+
struct_elem.type = Type::UNDEFINED;
297+
struct_elem.converted_type = std::nullopt;
298+
struct_elem.parent_idx = schema_idx;
304299
struct_elem.max_definition_level = schema_elem.max_definition_level;
305300
struct_elem.max_repetition_level = schema_elem.max_repetition_level;
301+
struct_elem.children_idx = std::move(schema_elem.children_idx);
302+
for (auto const child_idx : struct_elem.children_idx) {
303+
schema[child_idx].parent_idx = struct_node_idx;
304+
}
305+
306+
schema_elem.converted_type = ConvertedType::LIST;
307+
schema_elem.logical_type = LogicalType::LIST;
308+
schema_elem.repetition_type = FieldRepetitionType::OPTIONAL;
306309
schema_elem.max_definition_level--;
307310
schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level;
308311

309-
// change parent index on new node and on children
310-
struct_elem.parent_idx = schema_idx;
311-
for (auto& child_idx : struct_elem.children_idx) {
312-
schema[child_idx].parent_idx = struct_node_idx;
313-
}
314-
// add our struct
315-
schema.push_back(struct_elem);
312+
// push_back may reallocate; do not use `schema_elem` past this point.
313+
schema.push_back(std::move(struct_elem));
314+
schema[schema_idx].children_idx = {struct_node_idx};
315+
schema[schema_idx].num_children = 1;
316316
}
317317
}
318318

319319
// convert ConvertedType to LogicalType for older files
320-
if (schema_elem.converted_type.has_value() and not schema_elem.logical_type.has_value()) {
321-
schema_elem.logical_type = converted_to_logical_type(schema_elem);
320+
if (schema[schema_idx].converted_type.has_value() and
321+
not schema[schema_idx].logical_type.has_value()) {
322+
schema[schema_idx].logical_type = converted_to_logical_type(schema[schema_idx]);
322323
}
323324

324-
for (auto& child_idx : schema_elem.children_idx) {
325-
process(child_idx);
325+
// Recursive `process` may append to `schema`; index into it each iteration
326+
// instead of holding a reference.
327+
auto const num_children = schema[schema_idx].num_children;
328+
for (size_type i = 0; i < num_children; ++i) {
329+
process(schema[schema_idx].children_idx[i]);
326330
}
327331
};
328332

cpp/tests/io/parquet_reader_test.cpp

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2302,6 +2302,123 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations)
23022302
CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected);
23032303
}
23042304

2305+
// Regression test for https://github.com/rapidsai/cudf/issues/22541.
2306+
// Schema (single-field inner repeated group must decode as list<struct<someId>>):
2307+
// required group root {
2308+
// optional int32 primitive;
2309+
// repeated group myComplex {
2310+
// optional int32 id;
2311+
// repeated group repeatedMessage {
2312+
// optional int32 someId;
2313+
// }
2314+
// }
2315+
// }
2316+
// Regenerate `nested_array_bytes` from the parquet-mr source file with:
2317+
// xxd -i nested-array-struct.parquet > /tmp/blob.cpp
2318+
// (source: spark/sql/core/src/test/resources/test-data/nested-array-struct.parquet)
2319+
TEST_F(ParquetReaderTest, RepeatedNoAnnotationsSingleFieldNested)
2320+
{
2321+
constexpr std::array<unsigned char, 775> nested_array_bytes{
2322+
0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x24, 0x15, 0x24, 0x2c, 0x15, 0x06, 0x15, 0x00, 0x15,
2323+
0x06, 0x15, 0x08, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x07, 0x02, 0x00, 0x00, 0x00, 0x05,
2324+
0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x32, 0x15, 0x32, 0x2c, 0x15, 0x06,
2325+
0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00,
2326+
0x00, 0x00, 0x03, 0x2a, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00,
2327+
0x00, 0x15, 0x00, 0x15, 0x34, 0x15, 0x34, 0x2c, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06,
2328+
0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x3f, 0x00,
2329+
0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c,
2330+
0x48, 0x29, 0x54, 0x65, 0x73, 0x74, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x41,
2331+
0x72, 0x72, 0x61, 0x79, 0x57, 0x69, 0x74, 0x68, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x47, 0x72,
2332+
0x6f, 0x75, 0x70, 0x41, 0x6e, 0x64, 0x41, 0x72, 0x72, 0x61, 0x79, 0x15, 0x04, 0x00, 0x15, 0x02,
2333+
0x25, 0x02, 0x18, 0x09, 0x70, 0x72, 0x69, 0x6d, 0x69, 0x74, 0x69, 0x76, 0x65, 0x00, 0x35, 0x04,
2334+
0x18, 0x09, 0x6d, 0x79, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x15, 0x04, 0x00, 0x15, 0x02,
2335+
0x25, 0x02, 0x18, 0x02, 0x69, 0x64, 0x00, 0x35, 0x04, 0x18, 0x0f, 0x72, 0x65, 0x70, 0x65, 0x61,
2336+
0x74, 0x65, 0x64, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x15, 0x02, 0x00, 0x15, 0x02, 0x25,
2337+
0x02, 0x18, 0x06, 0x73, 0x6f, 0x6d, 0x65, 0x49, 0x64, 0x00, 0x16, 0x06, 0x19, 0x1c, 0x19, 0x3c,
2338+
0x26, 0x08, 0x1c, 0x15, 0x02, 0x19, 0x35, 0x08, 0x06, 0x00, 0x19, 0x18, 0x09, 0x70, 0x72, 0x69,
2339+
0x6d, 0x69, 0x74, 0x69, 0x76, 0x65, 0x15, 0x00, 0x16, 0x06, 0x16, 0x46, 0x16, 0x46, 0x26, 0x08,
2340+
0x00, 0x00, 0x26, 0x4e, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x06, 0x00, 0x19, 0x28, 0x09, 0x6d, 0x79,
2341+
0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x02, 0x69, 0x64, 0x15, 0x00, 0x16, 0x06, 0x16, 0x54,
2342+
0x16, 0x54, 0x26, 0x4e, 0x00, 0x00, 0x26, 0xa2, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x06, 0x00,
2343+
0x19, 0x38, 0x09, 0x6d, 0x79, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x0f, 0x72, 0x65, 0x70,
2344+
0x65, 0x61, 0x74, 0x65, 0x64, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x06, 0x73, 0x6f, 0x6d,
2345+
0x65, 0x49, 0x64, 0x15, 0x00, 0x16, 0x06, 0x16, 0x56, 0x16, 0x56, 0x26, 0xa2, 0x01, 0x00, 0x00,
2346+
0x16, 0xf0, 0x01, 0x16, 0x06, 0x00, 0x19, 0x2c, 0x18, 0x18, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65,
2347+
0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
2348+
0x6f, 0x72, 0x18, 0xf8, 0x01, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x41, 0x72, 0x72, 0x61,
2349+
0x79, 0x57, 0x69, 0x74, 0x68, 0x4e, 0x65, 0x73, 0x74, 0x65, 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70,
2350+
0x41, 0x6e, 0x64, 0x41, 0x72, 0x72, 0x61, 0x79, 0x22, 0x0a, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x20,
2351+
0x7b, 0x0a, 0x20, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, 0x72, 0x69, 0x6d, 0x69,
2352+
0x74, 0x69, 0x76, 0x65, 0x22, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20,
2353+
0x31, 0x0a, 0x20, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x4c, 0x41, 0x42, 0x45, 0x4c,
2354+
0x5f, 0x4f, 0x50, 0x54, 0x49, 0x4f, 0x4e, 0x41, 0x4c, 0x0a, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65,
2355+
0x3a, 0x20, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x33, 0x32, 0x0a, 0x7d, 0x0a, 0x66,
2356+
0x69, 0x65, 0x6c, 0x64, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22,
2357+
0x6d, 0x79, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x22, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6d,
2358+
0x62, 0x65, 0x72, 0x3a, 0x20, 0x32, 0x0a, 0x20, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20,
2359+
0x4c, 0x41, 0x42, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x45, 0x41, 0x54, 0x45, 0x44, 0x0a, 0x20,
2360+
0x20, 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x4d, 0x45, 0x53, 0x53,
2361+
0x41, 0x47, 0x45, 0x0a, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x3a,
2362+
0x20, 0x22, 0x2e, 0x54, 0x65, 0x73, 0x74, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e,
2363+
0x4d, 0x79, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x22, 0x0a, 0x7d, 0x0a, 0x00, 0x18, 0x13,
2364+
0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x63, 0x6c,
2365+
0x61, 0x73, 0x73, 0x18, 0x3c, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2e, 0x70, 0x72, 0x6f,
2366+
0x74, 0x6f, 0x2e, 0x74, 0x65, 0x73, 0x74, 0x2e, 0x54, 0x65, 0x73, 0x74, 0x50, 0x72, 0x6f, 0x74,
2367+
0x6f, 0x62, 0x75, 0x66, 0x24, 0x41, 0x72, 0x72, 0x61, 0x79, 0x57, 0x69, 0x74, 0x68, 0x4e, 0x65,
2368+
0x73, 0x74, 0x65, 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x41, 0x6e, 0x64, 0x41, 0x72, 0x72, 0x61,
2369+
0x79, 0x00, 0x18, 0x0a, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x6d, 0x72, 0x00, 0x83,
2370+
0x02, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31};
2371+
2372+
auto read_opts = cudf::io::parquet_reader_options::builder(
2373+
cudf::io::source_info{cudf::host_span<std::byte const>{
2374+
reinterpret_cast<std::byte const*>(nested_array_bytes.data()), nested_array_bytes.size()}});
2375+
auto result = cudf::io::read_parquet(read_opts);
2376+
2377+
// 3 rows: (2, [{1, [{3}]}]), (5, [{4, [{6}]}]), (8, [{7, [{9}]}])
2378+
auto const tv = result.tbl->view();
2379+
ASSERT_EQ(tv.num_columns(), 2);
2380+
EXPECT_EQ(tv.column(0).size(), 3);
2381+
2382+
// Column 0: primitive int32 = [2, 5, 8]
2383+
column_wrapper<int32_t> col0{2, 5, 8};
2384+
2385+
// Column 1: list<struct<id:int32, repeatedMessage:list<struct<someId:int32>>>>
2386+
auto const& outer_list = tv.column(1);
2387+
ASSERT_EQ(outer_list.type().id(), cudf::type_id::LIST);
2388+
auto const outer_child = outer_list.child(cudf::lists_column_view::child_column_index);
2389+
ASSERT_EQ(outer_child.type().id(), cudf::type_id::STRUCT);
2390+
ASSERT_EQ(outer_child.num_children(), 2);
2391+
EXPECT_EQ(outer_child.child(0).type().id(), cudf::type_id::INT32);
2392+
ASSERT_EQ(outer_child.child(1).type().id(), cudf::type_id::LIST);
2393+
auto const inner_child = outer_child.child(1).child(cudf::lists_column_view::child_column_index);
2394+
// Inner single-field group must be a STRUCT, not collapsed to a primitive (issue #22541).
2395+
ASSERT_EQ(inner_child.type().id(), cudf::type_id::STRUCT);
2396+
ASSERT_EQ(inner_child.num_children(), 1);
2397+
EXPECT_EQ(inner_child.child(0).type().id(), cudf::type_id::INT32);
2398+
2399+
column_wrapper<int32_t> inner_someid{3, 6, 9};
2400+
auto inner_struct = cudf::test::structs_column_wrapper{{inner_someid}};
2401+
auto inner_list_offsets =
2402+
cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 1, 2, 3}.release();
2403+
auto inner_list = cudf::make_lists_column(
2404+
3, std::move(inner_list_offsets), inner_struct.release(), 0, rmm::device_buffer{});
2405+
2406+
column_wrapper<int32_t> outer_id{1, 4, 7};
2407+
std::vector<std::unique_ptr<cudf::column>> outer_struct_children;
2408+
outer_struct_children.push_back(outer_id.release());
2409+
outer_struct_children.push_back(std::move(inner_list));
2410+
auto outer_struct_col = cudf::test::structs_column_wrapper{{std::move(outer_struct_children)}};
2411+
2412+
auto outer_list_offsets =
2413+
cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 1, 2, 3}.release();
2414+
auto outer_list_col = cudf::make_lists_column(
2415+
3, std::move(outer_list_offsets), outer_struct_col.release(), 0, rmm::device_buffer{});
2416+
2417+
// Testing for equivalence here because we only care about the outermost validity buffers.
2418+
table_view expected{{col0, *outer_list_col}};
2419+
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result.tbl->view(), expected);
2420+
}
2421+
23052422
TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls)
23062423
{
23072424
using cudf::io::column_encoding;

0 commit comments

Comments
 (0)