Skip to content

Commit 52655f3

Browse files
committed
feat: add VariantArrayBuilder::build_shredded
1 parent 30185d6 commit 52655f3

1 file changed

Lines changed: 225 additions & 8 deletions

File tree

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 225 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! [`VariantArrayBuilder`] implementation
1919
20-
use crate::VariantArray;
20+
use crate::{VariantArray, shred_variant};
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
2222
use arrow_schema::{ArrowError, DataType, Field, Fields};
2323
use parquet_variant::{
@@ -37,9 +37,6 @@ use std::sync::Arc;
3737
/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
3838
/// the metadata and value fields.
3939
///
40-
/// # TODO
41-
/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
42-
///
4340
/// ## Example:
4441
/// ```
4542
/// # use arrow::array::Array;
@@ -82,6 +79,34 @@ use std::sync::Arc;
8279
/// let value = variant_array.value(4);
8380
/// assert_eq!(value, Variant::ShortString(ShortString::try_new("norm").unwrap()));
8481
/// ```
82+
///
83+
/// ## Shredded Example
84+
///
85+
/// Use [`Self::build_shredded`] with [`ShreddedSchemaBuilder`](crate::ShreddedSchemaBuilder) to produce a
86+
/// shredded [`VariantArray`] where known fields are extracted into typed columns.
87+
///
88+
/// ```
89+
/// # use arrow::array::Array;
90+
/// # use arrow_schema::DataType;
91+
/// # use parquet_variant::{Variant, VariantBuilderExt};
92+
/// # use parquet_variant_compute::{ShreddedSchemaBuilder, VariantArrayBuilder};
93+
/// let schema = ShreddedSchemaBuilder::default()
94+
/// .with_path("brand", &DataType::Utf8).unwrap()
95+
/// .with_path("price", &DataType::Float64).unwrap()
96+
/// .build();
97+
///
98+
/// let mut builder = VariantArrayBuilder::new(3);
99+
/// builder.new_object().with_field("brand", "Apple").with_field("price", 999.0f64).finish();
100+
/// builder.new_object().with_field("brand", "Samsung").finish();
101+
/// builder.append_null();
102+
///
103+
/// let arr = builder.build_shredded(&schema).unwrap();
104+
/// assert_eq!(arr.len(), 3);
105+
/// assert!(arr.typed_value_field().is_some());
106+
/// assert!(!arr.is_null(0));
107+
/// assert!(!arr.is_null(1));
108+
/// assert!(arr.is_null(2));
109+
/// ```
85110
#[derive(Debug)]
86111
pub struct VariantArrayBuilder {
87112
/// Nulls
@@ -96,8 +121,7 @@ pub struct VariantArrayBuilder {
96121
value_offsets: Vec<usize>,
97122
/// The fields of the final `StructArray`
98123
///
99-
/// TODO: 1) Add extension type metadata
100-
/// TODO: 2) Add support for shredding
124+
/// TODO: Add extension type metadata
101125
fields: Fields,
102126
}
103127

@@ -117,7 +141,7 @@ impl VariantArrayBuilder {
117141
}
118142
}
119143

120-
/// Build the final builder
144+
/// Build the final [`VariantArray`] (unshredded).
121145
pub fn build(self) -> VariantArray {
122146
let Self {
123147
mut nulls,
@@ -134,7 +158,6 @@ impl VariantArrayBuilder {
134158
let value_buffer = value_builder.into_inner();
135159
let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
136160

137-
// The build the final struct array
138161
let inner = StructArray::new(
139162
fields,
140163
vec![
@@ -148,6 +171,13 @@ impl VariantArrayBuilder {
148171
VariantArray::try_new(&inner).expect("valid VariantArray by construction")
149172
}
150173

174+
/// Build a shredded [`VariantArray`] using `as_type` as the shredding schema.
175+
/// Use [`ShreddedSchemaBuilder`] to construct `as_type` for struct schemas.
176+
/// Returns `Err` if `as_type` is not a valid variant shredding type.
177+
pub fn build_shredded(self, as_type: &DataType) -> Result<VariantArray, ArrowError> {
178+
shred_variant(&self.build(), as_type)
179+
}
180+
151181
/// Appends a null row to the builder.
152182
pub fn append_null(&mut self) {
153183
self.nulls.append_null();
@@ -471,6 +501,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar
471501
#[cfg(test)]
472502
mod test {
473503
use super::*;
504+
use crate::ShreddedSchemaBuilder;
474505
use arrow::array::Array;
475506
use parquet_variant::{ShortString, Variant};
476507

@@ -659,4 +690,190 @@ mod test {
659690
assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap());
660691
assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap());
661692
}
693+
694+
#[test]
fn build_shredded_primitive_int64() {
    // Two Int64 rows followed by one null row, shredded with an Int64 schema.
    let mut builder = VariantArrayBuilder::new(3);
    for raw in [42, 100] {
        builder.append_variant(Variant::Int64(raw));
    }
    builder.append_null();

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 3);

    // Only the trailing row was appended as null.
    let null_mask: Vec<bool> = (0..3).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, false, true]);
}
707+
708+
#[test]
fn build_shredded_primitive_utf8() {
    // One string row and one null row, shredded with a Utf8 schema.
    let mut builder = VariantArrayBuilder::new(2);
    builder.append_variant(Variant::from("hello"));
    builder.append_null();

    let shredded = builder.build_shredded(&DataType::Utf8).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);

    let null_mask: Vec<bool> = (0..2).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true]);
}
719+
720+
#[test]
fn build_shredded_primitive_float64() {
    // Append a genuine f64 so the input matches the Float64 schema under test.
    // The value is deliberately not a truncated well-known constant (such as 3.14),
    // which clippy's `approx_constant` lint rejects.
    let mut b = VariantArrayBuilder::new(2);
    b.append_variant(Variant::from(1.5f64));
    b.append_null();

    let arr = b.build_shredded(&DataType::Float64).unwrap();
    assert!(arr.typed_value_field().is_some());
    assert_eq!(arr.len(), 2);

    // Same top-level null pattern that the other primitive tests verify.
    assert!(!arr.is_null(0));
    assert!(arr.is_null(1));
}
729+
730+
#[test]
fn build_shredded_primitive_bool() {
    // One `true` row and one `false` row, shredded with a Boolean schema.
    let mut builder = VariantArrayBuilder::new(2);
    for flag in [Variant::BooleanTrue, Variant::BooleanFalse] {
        builder.append_variant(flag);
    }

    let shredded = builder.build_shredded(&DataType::Boolean).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);
}
739+
740+
#[test]
fn build_shredded_type_mismatch_falls_back_to_value_column() {
    // Row 0: matches Int64 -> expected to land in typed_value.
    // Row 1: a string, does not match Int64 -> expected to be null in typed_value
    //        (and therefore carried by the untyped value column instead).
    let mut b = VariantArrayBuilder::new(2);
    b.append_variant(Variant::Int64(7));
    b.append_variant(Variant::from("not an int"));

    let arr = b.build_shredded(&DataType::Int64).unwrap();
    assert_eq!(arr.len(), 2);

    // Neither row is a top-level null: both carry a value in one column or the other.
    assert!(!arr.is_null(0));
    assert!(!arr.is_null(1));

    // Pin the per-row placement described above on the typed_value side, so the test
    // actually checks the fallback rather than only the array's length.
    let typed_value = arr
        .typed_value_field()
        .expect("shredded array exposes a typed_value column");
    assert!(typed_value.is_valid(0));
    assert!(typed_value.is_null(1));
}
753+
754+
#[test]
fn build_shredded_struct_single_field() {
    // Hand-written struct schema with a single nullable Utf8 field named "brand".
    let brand_field = Field::new("brand", DataType::Utf8, true);
    let schema = DataType::Struct(vec![brand_field].into());

    let mut builder = VariantArrayBuilder::new(3);
    for brand in ["Apple", "Samsung"] {
        builder.new_object().with_field("brand", brand).finish();
    }
    builder.append_null();

    let shredded = builder.build_shredded(&schema).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 3);

    let null_mask: Vec<bool> = (0..3).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, false, true]);
}
768+
769+
#[test]
fn build_shredded_struct_multi_field() {
    // Schema with two shredded paths: `name` (Utf8) and `age` (Int32).
    let with_name = ShreddedSchemaBuilder::default()
        .with_path("name", &DataType::Utf8)
        .unwrap();
    let with_name_and_age = with_name.with_path("age", &DataType::Int32).unwrap();
    let schema = with_name_and_age.build();

    let mut builder = VariantArrayBuilder::new(2);
    // First object carries both fields.
    builder
        .new_object()
        .with_field("name", "Alice")
        .with_field("age", 30i32)
        .finish();
    // Second object carries only `name`.
    builder.new_object().with_field("name", "Bob").finish();

    let shredded = builder.build_shredded(&schema).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);
}
787+
788+
#[test]
fn build_shredded_nested_struct() {
    // Both shredded paths live under the nested `address` object.
    let schema = ShreddedSchemaBuilder::default()
        .with_path("address.city", &DataType::Utf8)
        .unwrap()
        .with_path("address.zip", &DataType::Utf8)
        .unwrap()
        .build();

    let mut builder = VariantArrayBuilder::new(2);
    {
        // Row 0: { "address": { "city": "NYC", "zip": "10001" } }
        let mut outer = builder.new_object();
        let address = outer
            .new_object("address")
            .with_field("city", "NYC")
            .with_field("zip", "10001");
        address.finish();
        outer.finish();
    }
    // Row 1: null.
    builder.append_null();

    let shredded = builder.build_shredded(&schema).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);

    let null_mask: Vec<bool> = (0..2).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true]);
}
812+
813+
#[test]
fn build_shredded_list_of_int64() {
    // List schema whose nullable element field is named "item" and typed Int64.
    let element = arrow_schema::Field::new("item", DataType::Int64, true);
    let list_schema = DataType::List(std::sync::Arc::new(element));

    let mut builder = VariantArrayBuilder::new(2);
    // Row 0: the list [1, 2].
    builder
        .new_list()
        .with_value(Variant::Int64(1))
        .with_value(Variant::Int64(2))
        .finish();
    // Row 1: null.
    builder.append_null();

    let shredded = builder.build_shredded(&list_schema).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);

    let null_mask: Vec<bool> = (0..2).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true]);
}
830+
831+
#[test]
fn build_shredded_extend_then_shred() {
    // Rows are supplied through `extend`: two Int64 values, one null, and one string
    // that does not match the Int64 schema.
    let rows = [
        Some(Variant::Int64(1)),
        None,
        Some(Variant::Int64(3)),
        Some(Variant::from("oops")),
    ];
    let mut builder = VariantArrayBuilder::new(4);
    builder.extend(rows);

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 4);

    // Only the `None` row is null; the mismatching string row is still non-null.
    let null_mask: Vec<bool> = (0..4).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true, false, false]);
}
848+
849+
#[test]
fn build_shredded_all_nulls() {
    // Every row is appended as null before shredding with an Int64 schema.
    let mut builder = VariantArrayBuilder::new(3);
    for _ in 0..3 {
        builder.append_null();
    }

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();
    assert_eq!(shredded.len(), 3);

    let null_mask: Vec<bool> = (0..3).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [true, true, true]);
}
861+
862+
#[test]
fn build_shredded_invalid_type_returns_err() {
    // FixedSizeBinary(17) is expected to be rejected as a shredding schema.
    let mut builder = VariantArrayBuilder::new(1);
    builder.append_variant(Variant::Int64(1));

    let unsupported = DataType::FixedSizeBinary(17);
    let outcome = builder.build_shredded(&unsupported);
    assert!(outcome.is_err());
}
869+
870+
#[test]
fn build_shredded_uuid_fixed_size_binary_16() {
    // Sixteen bytes, 0x00..=0x0f, shredded with a FixedSizeBinary(16) schema.
    //
    // NOTE(review): the row is built from a byte slice, which presumably produces a
    // binary variant rather than a UUID variant -- confirm whether this really
    // exercises the UUID shredding path. The assertions below only check that the
    // schema is accepted and that a typed_value column exists.
    let payload: Vec<u8> = (0u8..16).collect();

    let mut builder = VariantArrayBuilder::new(1);
    builder.append_variant(Variant::from(payload.as_slice()));

    let shredded = builder
        .build_shredded(&DataType::FixedSizeBinary(16))
        .unwrap();
    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 1);
}
662879
}

0 commit comments

Comments
 (0)