1717
1818//! [`VariantArrayBuilder`] implementation
1919
20- use crate :: VariantArray ;
20+ use crate :: { VariantArray , shred_variant } ;
2121use arrow:: array:: { ArrayRef , BinaryViewArray , BinaryViewBuilder , NullBufferBuilder , StructArray } ;
2222use arrow_schema:: { ArrowError , DataType , Field , Fields } ;
2323use parquet_variant:: {
@@ -37,50 +37,57 @@ use std::sync::Arc;
3737/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
3838/// the metadata and value fields.
3939///
40- /// # TODO
41- /// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
42- ///
43- /// ## Example:
40+ /// ## Example
4441/// ```
4542/// # use arrow::array::Array;
4643/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
4744/// # use parquet_variant_compute::VariantArrayBuilder;
4845/// # use parquet_variant::ShortString;
49- /// // Create a new VariantArrayBuilder with a capacity of 100 rows
5046/// let mut builder = VariantArrayBuilder::new(100);
51- /// // append variant values
5247/// builder.append_variant(Variant::from(42));
53- /// // append a null row (note not a Variant::Null)
5448/// builder.append_null();
55- /// // append an object to the builder using VariantBuilderExt methods directly
5649/// builder.new_object()
5750/// .with_field("foo", "bar")
5851/// .finish();
59- ///
60- /// // bulk insert a list of values
61- /// // `Option::None` is a null value
6252/// builder.extend([None, Some(Variant::from("norm"))]);
63- ///
64- /// // create the final VariantArray
6553/// let variant_array = builder.build();
6654/// assert_eq!(variant_array.len(), 5);
67- /// // // Access the values
68- /// // row 1 is not null and is an integer
6955/// assert!(!variant_array.is_null(0));
7056/// assert_eq!(variant_array.value(0), Variant::from(42i32));
71- /// // row 1 is null
7257/// assert!(variant_array.is_null(1));
73- /// // row 2 is not null and is an object
7458/// assert!(!variant_array.is_null(2));
7559/// let value = variant_array.value(2);
7660/// let obj = value.as_object().expect("expected object");
7761/// assert_eq!(obj.get("foo"), Some(Variant::from("bar")));
78- /// // row 3 is null
7962/// assert!(variant_array.is_null(3));
80- /// // row 4 is not null and is a short string
81- /// assert!(!variant_array.is_null(4));
82- /// let value = variant_array.value(4);
83- /// assert_eq!(value, Variant::ShortString(ShortString::try_new("norm").unwrap()));
63+ /// ```
64+ ///
65+ /// ## Shredded Example
66+ ///
67+ /// Use [`Self::build_shredded`] with [`ShreddedSchemaBuilder`] to produce a
68+ /// shredded [`VariantArray`] where known fields are extracted into typed columns.
69+ ///
70+ /// ```
71+ /// # use arrow::array::Array;
72+ /// # use arrow_schema::DataType;
73+ /// # use parquet_variant::{Variant, VariantBuilderExt};
74+ /// # use parquet_variant_compute::{ShreddedSchemaBuilder, VariantArrayBuilder};
75+ /// let schema = ShreddedSchemaBuilder::default()
76+ /// .with_path("brand", &DataType::Utf8).unwrap()
77+ /// .with_path("price", &DataType::Float64).unwrap()
78+ /// .build();
79+ ///
80+ /// let mut builder = VariantArrayBuilder::new(3);
81+ /// builder.new_object().with_field("brand", "Apple").with_field("price", 999.0f64).finish();
82+ /// builder.new_object().with_field("brand", "Samsung").finish();
83+ /// builder.append_null();
84+ ///
85+ /// let arr = builder.build_shredded(&schema).unwrap();
86+ /// assert_eq!(arr.len(), 3);
87+ /// assert!(arr.typed_value_field().is_some());
88+ /// assert!(!arr.is_null(0));
89+ /// assert!(!arr.is_null(1));
90+ /// assert!(arr.is_null(2));
8491/// ```
8592#[ derive( Debug ) ]
8693pub struct VariantArrayBuilder {
@@ -96,8 +103,7 @@ pub struct VariantArrayBuilder {
96103 value_offsets : Vec < usize > ,
97104 /// The fields of the final `StructArray`
98105 ///
99- /// TODO: 1) Add extension type metadata
100- /// TODO: 2) Add support for shredding
106+ /// TODO: Add extension type metadata
101107 fields : Fields ,
102108}
103109
@@ -117,7 +123,7 @@ impl VariantArrayBuilder {
117123 }
118124 }
119125
120- /// Build the final builder
126+ /// Build the final [`VariantArray`] (unshredded).
121127 pub fn build ( self ) -> VariantArray {
122128 let Self {
123129 mut nulls,
@@ -134,7 +140,6 @@ impl VariantArrayBuilder {
134140 let value_buffer = value_builder. into_inner ( ) ;
135141 let value_array = binary_view_array_from_buffers ( value_buffer, value_offsets) ;
136142
137- // The build the final struct array
138143 let inner = StructArray :: new (
139144 fields,
140145 vec ! [
@@ -148,6 +153,31 @@ impl VariantArrayBuilder {
148153 VariantArray :: try_new ( & inner) . expect ( "valid VariantArray by construction" )
149154 }
150155
156+ /// Build a shredded [`VariantArray`] using `as_type` as the shredding schema.
157+ ///
158+ /// Rows shredded to `as_type` are placed in the `typed_value` column; rows
159+ /// that do not match fall back to the binary `value` column. Use
160+ /// [`ShreddedSchemaBuilder`] to construct `as_type` for struct schemas.
161+ ///
162+ /// Returns `Err` if `as_type` is not a valid variant shredding type.
163+ ///
164+ /// # Example
165+ /// ```
166+ /// # use arrow_schema::DataType;
167+ /// # use parquet_variant::Variant;
168+ /// # use parquet_variant_compute::VariantArrayBuilder;
169+ /// let mut b = VariantArrayBuilder::new(3);
170+ /// b.append_variant(Variant::Int64(42));
171+ /// b.append_variant(Variant::from("not an int")); // falls back to value column
172+ /// b.append_null();
173+ /// let arr = b.build_shredded(&DataType::Int64).unwrap();
174+ /// assert!(arr.typed_value_field().is_some());
175+ /// assert_eq!(arr.len(), 3);
176+ /// ```
177+ pub fn build_shredded ( self , as_type : & DataType ) -> Result < VariantArray , ArrowError > {
178+ shred_variant ( & self . build ( ) , as_type)
179+ }
180+
151181 /// Appends a null row to the builder.
152182 pub fn append_null ( & mut self ) {
153183 self . nulls . append_null ( ) ;
@@ -471,6 +501,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar
471501#[ cfg( test) ]
472502mod test {
473503 use super :: * ;
504+ use crate :: ShreddedSchemaBuilder ;
474505 use arrow:: array:: Array ;
475506 use parquet_variant:: { ShortString , Variant } ;
476507
@@ -659,4 +690,190 @@ mod test {
659690 assert_eq ! ( array. value( 2 ) , array2. value( 2 ) . get_list_element( 0 ) . unwrap( ) ) ;
660691 assert_eq ! ( array. value( 2 ) , array2. value( 2 ) . get_list_element( 1 ) . unwrap( ) ) ;
661692 }
693+
694+ #[ test]
695+ fn build_shredded_primitive_int64 ( ) {
696+ let mut b = VariantArrayBuilder :: new ( 3 ) ;
697+ b. append_variant ( Variant :: Int64 ( 42 ) ) ;
698+ b. append_variant ( Variant :: Int64 ( 100 ) ) ;
699+ b. append_null ( ) ;
700+ let arr = b. build_shredded ( & DataType :: Int64 ) . unwrap ( ) ;
701+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
702+ assert_eq ! ( arr. len( ) , 3 ) ;
703+ assert ! ( !arr. is_null( 0 ) ) ;
704+ assert ! ( !arr. is_null( 1 ) ) ;
705+ assert ! ( arr. is_null( 2 ) ) ;
706+ }
707+
708+ #[ test]
709+ fn build_shredded_primitive_utf8 ( ) {
710+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
711+ b. append_variant ( Variant :: from ( "hello" ) ) ;
712+ b. append_null ( ) ;
713+ let arr = b. build_shredded ( & DataType :: Utf8 ) . unwrap ( ) ;
714+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
715+ assert_eq ! ( arr. len( ) , 2 ) ;
716+ assert ! ( !arr. is_null( 0 ) ) ;
717+ assert ! ( arr. is_null( 1 ) ) ;
718+ }
719+
720+ #[ test]
721+ fn build_shredded_primitive_float64 ( ) {
722+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
723+ b. append_variant ( Variant :: Float ( 3.14 ) ) ;
724+ b. append_null ( ) ;
725+ let arr = b. build_shredded ( & DataType :: Float64 ) . unwrap ( ) ;
726+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
727+ assert_eq ! ( arr. len( ) , 2 ) ;
728+ }
729+
730+ #[ test]
731+ fn build_shredded_primitive_bool ( ) {
732+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
733+ b. append_variant ( Variant :: BooleanTrue ) ;
734+ b. append_variant ( Variant :: BooleanFalse ) ;
735+ let arr = b. build_shredded ( & DataType :: Boolean ) . unwrap ( ) ;
736+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
737+ assert_eq ! ( arr. len( ) , 2 ) ;
738+ }
739+
740+ #[ test]
741+ fn build_shredded_type_mismatch_falls_back_to_value_column ( ) {
742+ // Row 0: matches Int64 -> typed_value non-null, value null
743+ // Row 1: string, does not match -> value non-null, typed_value null
744+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
745+ b. append_variant ( Variant :: Int64 ( 7 ) ) ;
746+ b. append_variant ( Variant :: from ( "not an int" ) ) ;
747+ let arr = b. build_shredded ( & DataType :: Int64 ) . unwrap ( ) ;
748+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
749+ assert_eq ! ( arr. len( ) , 2 ) ;
750+ assert ! ( !arr. is_null( 0 ) ) ;
751+ assert ! ( !arr. is_null( 1 ) ) ;
752+ }
753+
754+ #[ test]
755+ fn build_shredded_struct_single_field ( ) {
756+ let schema = DataType :: Struct ( vec ! [ Field :: new( "brand" , DataType :: Utf8 , true ) ] . into ( ) ) ;
757+ let mut b = VariantArrayBuilder :: new ( 3 ) ;
758+ b. new_object ( ) . with_field ( "brand" , "Apple" ) . finish ( ) ;
759+ b. new_object ( ) . with_field ( "brand" , "Samsung" ) . finish ( ) ;
760+ b. append_null ( ) ;
761+ let arr = b. build_shredded ( & schema) . unwrap ( ) ;
762+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
763+ assert_eq ! ( arr. len( ) , 3 ) ;
764+ assert ! ( !arr. is_null( 0 ) ) ;
765+ assert ! ( !arr. is_null( 1 ) ) ;
766+ assert ! ( arr. is_null( 2 ) ) ;
767+ }
768+
769+ #[ test]
770+ fn build_shredded_struct_multi_field ( ) {
771+ let schema = ShreddedSchemaBuilder :: default ( )
772+ . with_path ( "name" , & DataType :: Utf8 )
773+ . unwrap ( )
774+ . with_path ( "age" , & DataType :: Int32 )
775+ . unwrap ( )
776+ . build ( ) ;
777+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
778+ b. new_object ( )
779+ . with_field ( "name" , "Alice" )
780+ . with_field ( "age" , 30i32 )
781+ . finish ( ) ;
782+ b. new_object ( ) . with_field ( "name" , "Bob" ) . finish ( ) ;
783+ let arr = b. build_shredded ( & schema) . unwrap ( ) ;
784+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
785+ assert_eq ! ( arr. len( ) , 2 ) ;
786+ }
787+
788+ #[ test]
789+ fn build_shredded_nested_struct ( ) {
790+ let schema = ShreddedSchemaBuilder :: default ( )
791+ . with_path ( "address.city" , & DataType :: Utf8 )
792+ . unwrap ( )
793+ . with_path ( "address.zip" , & DataType :: Utf8 )
794+ . unwrap ( )
795+ . build ( ) ;
796+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
797+ {
798+ let mut obj = b. new_object ( ) ;
799+ obj. new_object ( "address" )
800+ . with_field ( "city" , "NYC" )
801+ . with_field ( "zip" , "10001" )
802+ . finish ( ) ;
803+ obj. finish ( ) ;
804+ }
805+ b. append_null ( ) ;
806+ let arr = b. build_shredded ( & schema) . unwrap ( ) ;
807+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
808+ assert_eq ! ( arr. len( ) , 2 ) ;
809+ assert ! ( !arr. is_null( 0 ) ) ;
810+ assert ! ( arr. is_null( 1 ) ) ;
811+ }
812+
813+ #[ test]
814+ fn build_shredded_list_of_int64 ( ) {
815+ use arrow_schema:: Field as ArrowField ;
816+ use std:: sync:: Arc ;
817+ let list_schema = DataType :: List ( Arc :: new ( ArrowField :: new ( "item" , DataType :: Int64 , true ) ) ) ;
818+ let mut b = VariantArrayBuilder :: new ( 2 ) ;
819+ b. new_list ( )
820+ . with_value ( Variant :: Int64 ( 1 ) )
821+ . with_value ( Variant :: Int64 ( 2 ) )
822+ . finish ( ) ;
823+ b. append_null ( ) ;
824+ let arr = b. build_shredded ( & list_schema) . unwrap ( ) ;
825+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
826+ assert_eq ! ( arr. len( ) , 2 ) ;
827+ assert ! ( !arr. is_null( 0 ) ) ;
828+ assert ! ( arr. is_null( 1 ) ) ;
829+ }
830+
831+ #[ test]
832+ fn build_shredded_extend_then_shred ( ) {
833+ let mut b = VariantArrayBuilder :: new ( 4 ) ;
834+ b. extend ( [
835+ Some ( Variant :: Int64 ( 1 ) ) ,
836+ None ,
837+ Some ( Variant :: Int64 ( 3 ) ) ,
838+ Some ( Variant :: from ( "oops" ) ) ,
839+ ] ) ;
840+ let arr = b. build_shredded ( & DataType :: Int64 ) . unwrap ( ) ;
841+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
842+ assert_eq ! ( arr. len( ) , 4 ) ;
843+ assert ! ( !arr. is_null( 0 ) ) ;
844+ assert ! ( arr. is_null( 1 ) ) ;
845+ assert ! ( !arr. is_null( 2 ) ) ;
846+ assert ! ( !arr. is_null( 3 ) ) ;
847+ }
848+
849+ #[ test]
850+ fn build_shredded_all_nulls ( ) {
851+ let mut b = VariantArrayBuilder :: new ( 3 ) ;
852+ b. append_null ( ) ;
853+ b. append_null ( ) ;
854+ b. append_null ( ) ;
855+ let arr = b. build_shredded ( & DataType :: Int64 ) . unwrap ( ) ;
856+ assert_eq ! ( arr. len( ) , 3 ) ;
857+ assert ! ( arr. is_null( 0 ) ) ;
858+ assert ! ( arr. is_null( 1 ) ) ;
859+ assert ! ( arr. is_null( 2 ) ) ;
860+ }
861+
862+ #[ test]
863+ fn build_shredded_invalid_type_returns_err ( ) {
864+ let mut b = VariantArrayBuilder :: new ( 1 ) ;
865+ b. append_variant ( Variant :: Int64 ( 1 ) ) ;
866+ let result = b. build_shredded ( & DataType :: FixedSizeBinary ( 17 ) ) ;
867+ assert ! ( result. is_err( ) ) ;
868+ }
869+
870+ #[ test]
871+ fn build_shredded_uuid_fixed_size_binary_16 ( ) {
872+ let uuid_bytes: Vec < u8 > = ( 0u8 ..16 ) . collect ( ) ;
873+ let mut b = VariantArrayBuilder :: new ( 1 ) ;
874+ b. append_variant ( Variant :: from ( uuid_bytes. as_slice ( ) ) ) ;
875+ let arr = b. build_shredded ( & DataType :: FixedSizeBinary ( 16 ) ) . unwrap ( ) ;
876+ assert ! ( arr. typed_value_field( ) . is_some( ) ) ;
877+ assert_eq ! ( arr. len( ) , 1 ) ;
878+ }
662879}
0 commit comments