2020#include " iceberg/parquet/parquet_writer.h"
2121
2222#include < memory>
23+ #include < optional>
24+ #include < utility>
25+ #include < vector>
2326
2427#include < arrow/c/bridge.h>
2528#include < arrow/record_batch.h>
3134
3235#include " iceberg/arrow/arrow_io_internal.h"
3336#include " iceberg/arrow/arrow_status_internal.h"
37+ #include " iceberg/parquet/parquet_data_util_internal.h"
38+ #include " iceberg/schema.h"
3439#include " iceberg/schema_internal.h"
40+ #include " iceberg/schema_util.h"
41+ #include " iceberg/type.h"
3542#include " iceberg/util/macros.h"
3643
3744namespace iceberg ::parquet {
@@ -71,6 +78,64 @@ Result<std::optional<int32_t>> ParseCodecLevel(const WriterProperties& propertie
7178 return level;
7279}
7380
81+ std::optional<std::shared_ptr<Type>> PruneUnknownType (const std::shared_ptr<Type>& type);
82+
83+ std::optional<SchemaField> PruneUnknownField (const SchemaField& field) {
84+ auto pruned_type = PruneUnknownType (field.type ());
85+ if (!pruned_type.has_value ()) {
86+ return std::nullopt ;
87+ }
88+ return SchemaField (field.field_id (), field.name (), std::move (pruned_type.value ()),
89+ field.optional (), field.doc ());
90+ }
91+
92+ std::optional<std::shared_ptr<Type>> PruneUnknownType (const std::shared_ptr<Type>& type) {
93+ switch (type->type_id ()) {
94+ case TypeId::kUnknown :
95+ return std::nullopt ;
96+ case TypeId::kStruct : {
97+ const auto & struct_type = static_cast <const StructType&>(*type);
98+ std::vector<SchemaField> fields;
99+ for (const auto & field : struct_type.fields ()) {
100+ if (auto pruned_field = PruneUnknownField (field)) {
101+ fields.emplace_back (std::move (pruned_field.value ()));
102+ }
103+ }
104+ return std::make_shared<StructType>(std::move (fields));
105+ }
106+ case TypeId::kList : {
107+ const auto & list_type = static_cast <const ListType&>(*type);
108+ auto pruned_element = PruneUnknownField (list_type.element ());
109+ if (!pruned_element.has_value ()) {
110+ return std::nullopt ;
111+ }
112+ return std::make_shared<ListType>(std::move (pruned_element.value ()));
113+ }
114+ case TypeId::kMap : {
115+ const auto & map_type = static_cast <const MapType&>(*type);
116+ auto pruned_key = PruneUnknownField (map_type.key ());
117+ auto pruned_value = PruneUnknownField (map_type.value ());
118+ if (!pruned_key.has_value () || !pruned_value.has_value ()) {
119+ return std::nullopt ;
120+ }
121+ return std::make_shared<MapType>(std::move (pruned_key.value ()),
122+ std::move (pruned_value.value ()));
123+ }
124+ default :
125+ return type;
126+ }
127+ }
128+
129+ std::shared_ptr<Schema> PruneUnknownFields (const Schema& schema) {
130+ std::vector<SchemaField> fields;
131+ for (const auto & field : schema.fields ()) {
132+ if (auto pruned_field = PruneUnknownField (field)) {
133+ fields.emplace_back (std::move (pruned_field.value ()));
134+ }
135+ }
136+ return std::make_shared<Schema>(std::move (fields), schema.schema_id ());
137+ }
138+
74139} // namespace
75140
76141class ParquetWriter ::Impl {
@@ -87,8 +152,17 @@ class ParquetWriter::Impl {
87152 auto writer_properties = properties_builder.memory_pool (pool_)->build ();
88153 auto arrow_writer_properties = ::parquet::default_arrow_writer_properties ();
89154
155+ ArrowSchema input_c_schema;
156+ ICEBERG_RETURN_UNEXPECTED (ToArrowSchema (*options.schema , &input_c_schema));
157+ ICEBERG_ARROW_ASSIGN_OR_RETURN (input_arrow_schema_,
158+ ::arrow::ImportSchema (&input_c_schema));
159+
160+ write_schema_ = PruneUnknownFields (*options.schema );
161+ ICEBERG_ASSIGN_OR_RAISE (write_projection_, Project (*write_schema_, *options.schema ,
162+ /* prune_source=*/ false ));
163+
90164 ArrowSchema c_schema;
91- ICEBERG_RETURN_UNEXPECTED (ToArrowSchema (*options. schema , &c_schema));
165+ ICEBERG_RETURN_UNEXPECTED (ToArrowSchema (*write_schema_ , &c_schema));
92166 ICEBERG_ARROW_ASSIGN_OR_RETURN (arrow_schema_, ::arrow::ImportSchema (&c_schema));
93167
94168 std::shared_ptr<::parquet::SchemaDescriptor> schema_descriptor;
@@ -110,8 +184,12 @@ class ParquetWriter::Impl {
110184 }
111185
112186 Status Write (ArrowArray* array) {
113- ICEBERG_ARROW_ASSIGN_OR_RETURN (auto batch,
114- ::arrow::ImportRecordBatch (array, arrow_schema_));
187+ ICEBERG_ARROW_ASSIGN_OR_RETURN (
188+ auto input_batch, ::arrow::ImportRecordBatch (array, input_arrow_schema_));
189+ ICEBERG_ASSIGN_OR_RAISE (
190+ auto batch,
191+ ProjectRecordBatch (std::move (input_batch), arrow_schema_, *write_schema_,
192+ write_projection_, arrow::MetadataColumnContext{}, pool_));
115193
116194 ICEBERG_ARROW_RETURN_NOT_OK (writer_->WriteRecordBatch (*batch));
117195
@@ -155,6 +233,11 @@ class ParquetWriter::Impl {
155233 private:
156234 // TODO(gangwu): make memory pool configurable
157235 ::arrow::MemoryPool* pool_ = ::arrow::default_memory_pool();
236+ // The schema accepted from callers.
237+ std::shared_ptr<::arrow::Schema> input_arrow_schema_;
238+ // The Iceberg schema that has v3 unknown fields removed for physical writes.
239+ std::shared_ptr<Schema> write_schema_;
240+ SchemaProjection write_projection_;
  // Schema to write to the Parquet file.
159242 std::shared_ptr<::arrow::Schema> arrow_schema_;
160243 // The output stream to write Parquet file.
0 commit comments