
Commit 3eb74b3

callmepandey and claude committed
feat: Enable LargeListArray support in Parquet reader schema validation
Update ValidateParquetSchemaEvolution to accept LARGE_LIST as compatible with Iceberg's ListType. This enables reading Parquet files containing LargeListArray (64-bit offsets) through the Iceberg reader. The data conversion layer already supports LargeListArray via the templated ProjectListArrayImpl<> added in apache#510. This change removes the schema validation blocker that was rejecting LARGE_LIST types. Closes apache#513 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent: deec370 · commit: 3eb74b3

2 files changed

Lines changed: 189 additions & 1 deletion
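
The commit message credits the conversion layer's templated ProjectListArrayImpl<> (apache#510) for already handling both offset widths, which is why only the validator needed to change. As a rough, hypothetical sketch of that templating pattern (the helper below is invented for illustration; the real ProjectListArrayImpl<> is not reproduced here and may differ):

// Hypothetical sketch of the templating pattern the commit message credits;
// not the actual ProjectListArrayImpl<> from apache#510.
#include <arrow/api.h>
#include <memory>

// Instantiated with ::arrow::ListArray (32-bit offsets) or
// ::arrow::LargeListArray (64-bit offsets).
template <typename ListArrayT>
std::shared_ptr<::arrow::Array> ProjectListValues(const ListArrayT& list_array) {
  // Both array classes expose values(); only the offset width differs,
  // so one templated body covers both Parquet-decoded representations.
  return list_array.values();
}

A call site would typically switch on type_id() to pick the instantiation, which is the dispatch the schema validator below now permits to occur.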


src/iceberg/parquet/parquet_schema_util.cc

Lines changed: 2 additions & 1 deletion
@@ -175,7 +175,8 @@ Status ValidateParquetSchemaEvolution(
       }
       break;
     case TypeId::kList:
-      if (arrow_type->id() == ::arrow::Type::LIST) {
+      if (arrow_type->id() == ::arrow::Type::LIST ||
+          arrow_type->id() == ::arrow::Type::LARGE_LIST) {
         return {};
       }
       break;
src/iceberg/test/parquet_test.cc

Lines changed: 187 additions & 0 deletions
@@ -462,4 +462,191 @@ TEST_F(ParquetReadWrite, SimpleTypeRoundTrip) {
   ASSERT_TRUE(out->Equals(*array));
 }
 
+// Test that the reader can handle LargeListArray (64-bit offsets) from parquet files
+class LargeListArrayReaderTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() { parquet::RegisterAll(); }
+
+  void SetUp() override {
+    file_io_ = arrow::ArrowFileSystemFileIO::MakeMockFileIO();
+    temp_parquet_file_ = "large_list_test.parquet";
+  }
+
+  // Write a parquet file with LargeListArray using Arrow's writer directly
+  void WriteLargeListParquetFile(const std::shared_ptr<::arrow::Schema>& arrow_schema,
+                                 const std::shared_ptr<::arrow::Array>& data) {
+    auto io = internal::checked_cast<arrow::ArrowFileSystemFileIO&>(*file_io_);
+    auto outfile = io.fs()->OpenOutputStream(temp_parquet_file_).ValueOrDie();
+
+    auto record_batch =
+        ::arrow::RecordBatch::FromStructArray(
+            std::static_pointer_cast<::arrow::StructArray>(data))
+            .ValueOrDie();
+    auto table = ::arrow::Table::FromRecordBatches({record_batch}).ValueOrDie();
+
+    ASSERT_TRUE(::parquet::arrow::WriteTable(*table, ::arrow::default_memory_pool(),
+                                             outfile, /*chunk_size=*/1024)
+                    .ok());
+    ASSERT_TRUE(outfile->Close().ok());
+  }
+
+  std::shared_ptr<FileIO> file_io_;
+  std::string temp_parquet_file_;
+};
+
+TEST_F(LargeListArrayReaderTest, ReadLargeListOfIntegers) {
+  const std::string kParquetFieldIdKey = "PARQUET:field_id";
+
+  // Create Arrow schema with LargeListArray (64-bit offsets)
+  // Need to add field id to the element field within the list
+  auto element_field =
+      ::arrow::field("element", ::arrow::int32(), /*nullable=*/true,
+                     ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"2"}));
+  auto arrow_schema = ::arrow::schema({::arrow::field(
+      "values", ::arrow::large_list(element_field), /*nullable=*/true,
+      ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"1"}))});
+
+  // Create data with LargeListArray
+  auto data =
+      ::arrow::json::ArrayFromJSONString(::arrow::struct_(arrow_schema->fields()),
+                                         R"([[[1, 2, 3]], [[4, 5]], [null], [[]]])")
+          .ValueOrDie();
+
+  WriteLargeListParquetFile(arrow_schema, data);
+
+  // Create Iceberg schema - uses ListType (no distinction between LIST and LARGE_LIST)
+  auto iceberg_schema = std::make_shared<Schema>(std::vector<SchemaField>{
+      SchemaField::MakeOptional(
+          1, "values",
+          list(SchemaField::MakeOptional(2, "element", int32()))),
+  });
+
+  // Read using Iceberg reader
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto reader,
+      ReaderFactoryRegistry::Open(
+          FileFormatType::kParquet,
+          {.path = temp_parquet_file_, .io = file_io_, .projection = iceberg_schema}));
+
+  // Get Arrow schema from reader
+  auto schema_result = reader->Schema();
+  ASSERT_THAT(schema_result, IsOk());
+  auto arrow_c_schema = std::move(schema_result.value());
+  auto import_schema_result = ::arrow::ImportType(&arrow_c_schema);
+  auto read_arrow_schema = import_schema_result.ValueOrDie();
+
+  // Get data from reader
+  auto read_result = reader->Next();
+  ASSERT_THAT(read_result, IsOk());
+  ASSERT_TRUE(read_result.value().has_value());
+  auto arrow_c_array = read_result.value().value();
+  auto read_array =
+      ::arrow::ImportArray(&arrow_c_array, read_arrow_schema).ValueOrDie();
+
+  // The output is ListArray (not LargeListArray) because Iceberg schema maps to
+  // Arrow LIST. The important thing is that we can read LargeListArray from Parquet.
+  auto struct_array = std::static_pointer_cast<::arrow::StructArray>(read_array);
+  ASSERT_EQ(struct_array->num_fields(), 1);
+  ASSERT_EQ(struct_array->field(0)->type_id(), ::arrow::Type::LIST);
+
+  // Verify data content - create expected with regular list type
+  auto expected_element_field =
+      ::arrow::field("element", ::arrow::int32(), /*nullable=*/true);
+  auto expected_schema = ::arrow::schema({::arrow::field(
+      "values", ::arrow::list(expected_element_field), /*nullable=*/true)});
+  auto expected =
+      ::arrow::json::ArrayFromJSONString(::arrow::struct_(expected_schema->fields()),
+                                         R"([[[1, 2, 3]], [[4, 5]], [null], [[]]])")
+          .ValueOrDie();
+  ASSERT_TRUE(read_array->Equals(*expected));
+}
+
+TEST_F(LargeListArrayReaderTest, ReadLargeListOfStructs) {
+  const std::string kParquetFieldIdKey = "PARQUET:field_id";
+
+  // Create Arrow schema with LargeListArray containing structs
+  auto struct_type = ::arrow::struct_({
+      ::arrow::field("id", ::arrow::int32(), /*nullable=*/false,
+                     ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"3"})),
+      ::arrow::field("name", ::arrow::utf8(), /*nullable=*/true,
+                     ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"4"})),
+  });
+
+  // Create the element field with field_id for the struct (element of the list)
+  auto element_field =
+      ::arrow::field("element", struct_type, /*nullable=*/true,
+                     ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"2"}));
+
+  auto arrow_schema = ::arrow::schema({::arrow::field(
+      "items", ::arrow::large_list(element_field), /*nullable=*/true,
+      ::arrow::KeyValueMetadata::Make({kParquetFieldIdKey}, {"1"}))});
+
+  // Create data with LargeListArray of structs
+  auto data = ::arrow::json::ArrayFromJSONString(
+                  ::arrow::struct_(arrow_schema->fields()),
+                  R"([[[{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]],
+                      [[{"id": 3, "name": null}]],
+                      [null],
+                      [[]]])")
+                  .ValueOrDie();
+
+  WriteLargeListParquetFile(arrow_schema, data);
+
+  // Create Iceberg schema
+  auto iceberg_schema = std::make_shared<Schema>(std::vector<SchemaField>{
+      SchemaField::MakeOptional(
+          1, "items",
+          list(SchemaField::MakeOptional(
+              2, "element",
+              struct_({
+                  SchemaField::MakeRequired(3, "id", int32()),
+                  SchemaField::MakeOptional(4, "name", string()),
+              })))),
+  });
+
+  // Read using Iceberg reader
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto reader,
+      ReaderFactoryRegistry::Open(
+          FileFormatType::kParquet,
+          {.path = temp_parquet_file_, .io = file_io_, .projection = iceberg_schema}));
+
+  // Get data from reader
+  auto schema_result = reader->Schema();
+  ASSERT_THAT(schema_result, IsOk());
+  auto arrow_c_schema = std::move(schema_result.value());
+  auto read_arrow_schema = ::arrow::ImportType(&arrow_c_schema).ValueOrDie();
+
+  auto read_result = reader->Next();
+  ASSERT_THAT(read_result, IsOk());
+  ASSERT_TRUE(read_result.value().has_value());
+  auto arrow_c_array = read_result.value().value();
+  auto read_array =
+      ::arrow::ImportArray(&arrow_c_array, read_arrow_schema).ValueOrDie();
+
+  // The output is ListArray (not LargeListArray) because Iceberg schema maps to
+  // Arrow LIST. The important thing is that we can read LargeListArray from Parquet.
+  auto struct_array = std::static_pointer_cast<::arrow::StructArray>(read_array);
+  ASSERT_EQ(struct_array->num_fields(), 1);
+  ASSERT_EQ(struct_array->field(0)->type_id(), ::arrow::Type::LIST);
+
+  // Verify data content - create expected with regular list type
+  auto expected_struct_type = ::arrow::struct_({
+      ::arrow::field("id", ::arrow::int32(), /*nullable=*/false),
+      ::arrow::field("name", ::arrow::utf8(), /*nullable=*/true),
+  });
+  auto expected_element_field =
+      ::arrow::field("element", expected_struct_type, /*nullable=*/true);
+  auto expected_schema = ::arrow::schema({::arrow::field(
+      "items", ::arrow::list(expected_element_field), /*nullable=*/true)});
+  auto expected = ::arrow::json::ArrayFromJSONString(
+                      ::arrow::struct_(expected_schema->fields()),
+                      R"([[[{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]],
+                          [[{"id": 3, "name": null}]],
+                          [null],
+                          [[]]])")
+                      .ValueOrDie();
+  ASSERT_TRUE(read_array->Equals(*expected));
+}
+
 } // namespace iceberg::parquet
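
Both tests assert that the reader's output comes back as 32-bit-offset LIST even though the file held LARGE_LIST data. A standalone sketch of that narrowing in plain Arrow C++ (illustrative only, assuming a recent Arrow build where the LARGE_LIST-to-LIST cast kernel is available; this is not how the Iceberg reader is implemented internally):

#include <arrow/api.h>
#include <arrow/compute/cast.h>
#include <iostream>
#include <memory>

::arrow::Status RunExample() {
  // Build [[1, 2, 3], [4, 5]] with 64-bit offsets, the shape a Parquet
  // decode producing LargeListArray would yield.
  auto pool = ::arrow::default_memory_pool();
  auto value_builder = std::make_shared<::arrow::Int32Builder>(pool);
  ::arrow::LargeListBuilder list_builder(pool, value_builder);
  ARROW_RETURN_NOT_OK(list_builder.Append());
  ARROW_RETURN_NOT_OK(value_builder->AppendValues({1, 2, 3}));
  ARROW_RETURN_NOT_OK(list_builder.Append());
  ARROW_RETURN_NOT_OK(value_builder->AppendValues({4, 5}));
  ARROW_ASSIGN_OR_RAISE(auto large, list_builder.Finish());
  std::cout << large->type()->ToString() << std::endl;  // large_list<item: int32>

  // Narrow to 32-bit offsets, matching the LIST type the Iceberg projection
  // reports; this is lossless whenever the offsets fit in int32.
  ARROW_ASSIGN_OR_RAISE(
      auto small, ::arrow::compute::Cast(*large, ::arrow::list(::arrow::int32())));
  std::cout << small->type()->ToString() << std::endl;  // list<item: int32>
  return ::arrow::Status::OK();
}

int main() { return RunExample().ok() ? 0 : 1; }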
