diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index e669277f1d06..208b169e7dd9 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -17,7 +17,7 @@ //! [`VariantArrayBuilder`] implementation -use crate::VariantArray; +use crate::{VariantArray, shred_variant}; use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Fields}; use parquet_variant::{ @@ -96,9 +96,12 @@ pub struct VariantArrayBuilder { value_offsets: Vec, /// The fields of the final `StructArray` /// - /// TODO: 1) Add extension type metadata - /// TODO: 2) Add support for shredding + /// TODO: Add extension type metadata fields: Fields, + /// Optional shredding schema. When set, [`build`](Self::build) returns a + /// shredded `VariantArray` produced by [`shred_variant`](crate::shred_variant) + /// against this Arrow `DataType` (used as the `typed_value` layout). + shredding_schema: Option, } impl VariantArrayBuilder { @@ -114,9 +117,21 @@ impl VariantArrayBuilder { value_builder: ValueBuilder::new(), value_offsets: Vec::with_capacity(row_capacity), fields: Fields::from(vec![metadata_field, value_field]), + shredding_schema: None, } } + /// Configure this builder to produce a shredded [`VariantArray`]. + /// + /// Rows are still appended in the unshredded form via + /// [`append_variant`](Self::append_variant); the shredding pass runs once at + /// [`build`](Self::build) time by delegating to + /// [`shred_variant`](crate::shred_variant). + pub fn with_shredding(mut self, as_type: DataType) -> Self { + self.shredding_schema = Some(as_type); + self + } + /// Build the final builder pub fn build(self) -> VariantArray { let Self { @@ -126,6 +141,7 @@ impl VariantArrayBuilder { value_builder, value_offsets, fields, + shredding_schema, } = self; let metadata_buffer = metadata_builder.into_inner(); @@ -145,7 +161,11 @@ impl VariantArrayBuilder { ); // TODO add arrow extension type metadata - VariantArray::try_new(&inner).expect("valid VariantArray by construction") + let unshredded = VariantArray::try_new(&inner).expect("valid VariantArray by construction"); + match shredding_schema { + Some(as_type) => shred_variant(&unshredded, &as_type).expect("shred_variant failed"), + None => unshredded, + } } /// Appends a null row to the builder. @@ -659,4 +679,32 @@ mod test { assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap()); assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap()); } + + #[test] + fn with_shredding_round_trip_primitive_long() { + let mut b = VariantArrayBuilder::new(3).with_shredding(DataType::Int64); + b.append_variant(Variant::Int64(42)); + b.append_variant(Variant::Int64(100)); + b.append_null(); + let arr = b.build(); + assert!( + arr.typed_value_field().is_some(), + "shredded array must have typed_value" + ); + assert_eq!(arr.len(), 3); + } + + #[test] + fn with_shredding_produces_typed_value_column() { + let typed = DataType::Struct(vec![Field::new("brand", DataType::Utf8, true)].into()); + let mut b = VariantArrayBuilder::new(2).with_shredding(typed); + b.new_object().with_field("brand", "Apple").finish(); + b.append_null(); + let arr = b.build(); + assert!( + arr.typed_value_field().is_some(), + "shredded array must have typed_value" + ); + assert_eq!(arr.len(), 2); + } }