Skip to content

Commit e571952

Browse files
committed
feat: add VariantArrayBuilder::build_shredded
1 parent 30185d6 commit e571952

1 file changed

Lines changed: 244 additions & 27 deletions

File tree

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 244 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! [`VariantArrayBuilder`] implementation
1919
20-
use crate::VariantArray;
20+
use crate::{VariantArray, shred_variant};
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
2222
use arrow_schema::{ArrowError, DataType, Field, Fields};
2323
use parquet_variant::{
@@ -37,50 +37,57 @@ use std::sync::Arc;
3737
/// This builder always creates a `VariantArray` using [`BinaryViewArray`] for both
3838
/// the metadata and value fields.
3939
///
40-
/// # TODO
41-
/// 1. Support shredding: <https://github.com/apache/arrow-rs/issues/7895>
42-
///
43-
/// ## Example:
40+
/// ## Example
4441
/// ```
4542
/// # use arrow::array::Array;
4643
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
4744
/// # use parquet_variant_compute::VariantArrayBuilder;
4845
/// # use parquet_variant::ShortString;
49-
/// // Create a new VariantArrayBuilder with a capacity of 100 rows
5046
/// let mut builder = VariantArrayBuilder::new(100);
51-
/// // append variant values
5247
/// builder.append_variant(Variant::from(42));
53-
/// // append a null row (note not a Variant::Null)
5448
/// builder.append_null();
55-
/// // append an object to the builder using VariantBuilderExt methods directly
5649
/// builder.new_object()
5750
/// .with_field("foo", "bar")
5851
/// .finish();
59-
///
60-
/// // bulk insert a list of values
61-
/// // `Option::None` is a null value
6252
/// builder.extend([None, Some(Variant::from("norm"))]);
63-
///
64-
/// // create the final VariantArray
6553
/// let variant_array = builder.build();
6654
/// assert_eq!(variant_array.len(), 5);
67-
/// // // Access the values
68-
/// // row 1 is not null and is an integer
6955
/// assert!(!variant_array.is_null(0));
7056
/// assert_eq!(variant_array.value(0), Variant::from(42i32));
71-
/// // row 1 is null
7257
/// assert!(variant_array.is_null(1));
73-
/// // row 2 is not null and is an object
7458
/// assert!(!variant_array.is_null(2));
7559
/// let value = variant_array.value(2);
7660
/// let obj = value.as_object().expect("expected object");
7761
/// assert_eq!(obj.get("foo"), Some(Variant::from("bar")));
78-
/// // row 3 is null
7962
/// assert!(variant_array.is_null(3));
80-
/// // row 4 is not null and is a short string
81-
/// assert!(!variant_array.is_null(4));
82-
/// let value = variant_array.value(4);
83-
/// assert_eq!(value, Variant::ShortString(ShortString::try_new("norm").unwrap()));
63+
/// ```
64+
///
65+
/// ## Shredded Example
66+
///
67+
/// Use [`Self::build_shredded`] with [`crate::ShreddedSchemaBuilder`] to produce a
68+
/// shredded [`VariantArray`] where known fields are extracted into typed columns.
69+
///
70+
/// ```
71+
/// # use arrow::array::Array;
72+
/// # use arrow_schema::DataType;
73+
/// # use parquet_variant::{Variant, VariantBuilderExt};
74+
/// # use parquet_variant_compute::{ShreddedSchemaBuilder, VariantArrayBuilder};
75+
/// let schema = ShreddedSchemaBuilder::default()
76+
/// .with_path("brand", &DataType::Utf8).unwrap()
77+
/// .with_path("price", &DataType::Float64).unwrap()
78+
/// .build();
79+
///
80+
/// let mut builder = VariantArrayBuilder::new(3);
81+
/// builder.new_object().with_field("brand", "Apple").with_field("price", 999.0f64).finish();
82+
/// builder.new_object().with_field("brand", "Samsung").finish();
83+
/// builder.append_null();
84+
///
85+
/// let arr = builder.build_shredded(&schema).unwrap();
86+
/// assert_eq!(arr.len(), 3);
87+
/// assert!(arr.typed_value_field().is_some());
88+
/// assert!(!arr.is_null(0));
89+
/// assert!(!arr.is_null(1));
90+
/// assert!(arr.is_null(2));
8491
/// ```
8592
#[derive(Debug)]
8693
pub struct VariantArrayBuilder {
@@ -96,8 +103,7 @@ pub struct VariantArrayBuilder {
96103
value_offsets: Vec<usize>,
97104
/// The fields of the final `StructArray`
98105
///
99-
/// TODO: 1) Add extension type metadata
100-
/// TODO: 2) Add support for shredding
106+
/// TODO: Add extension type metadata
101107
fields: Fields,
102108
}
103109

@@ -117,7 +123,7 @@ impl VariantArrayBuilder {
117123
}
118124
}
119125

120-
/// Build the final builder
126+
/// Build the final [`VariantArray`] (unshredded).
121127
pub fn build(self) -> VariantArray {
122128
let Self {
123129
mut nulls,
@@ -134,7 +140,6 @@ impl VariantArrayBuilder {
134140
let value_buffer = value_builder.into_inner();
135141
let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
136142

137-
// The build the final struct array
138143
let inner = StructArray::new(
139144
fields,
140145
vec![
@@ -148,6 +153,31 @@ impl VariantArrayBuilder {
148153
VariantArray::try_new(&inner).expect("valid VariantArray by construction")
149154
}
150155

156+
/// Build a shredded [`VariantArray`] using `as_type` as the shredding schema.
157+
///
158+
/// Rows shredded to `as_type` are placed in the `typed_value` column; rows
159+
/// that do not match fall back to the binary `value` column. Use
160+
/// [`ShreddedSchemaBuilder`] to construct `as_type` for struct schemas.
161+
///
162+
/// Returns `Err` if `as_type` is not a valid variant shredding type.
163+
///
164+
/// # Example
165+
/// ```
166+
/// # use arrow_schema::DataType;
167+
/// # use parquet_variant::Variant;
168+
/// # use parquet_variant_compute::VariantArrayBuilder;
169+
/// let mut b = VariantArrayBuilder::new(3);
170+
/// b.append_variant(Variant::Int64(42));
171+
/// b.append_variant(Variant::from("not an int")); // falls back to value column
172+
/// b.append_null();
173+
/// let arr = b.build_shredded(&DataType::Int64).unwrap();
174+
/// assert!(arr.typed_value_field().is_some());
175+
/// assert_eq!(arr.len(), 3);
176+
/// ```
177+
pub fn build_shredded(self, as_type: &DataType) -> Result<VariantArray, ArrowError> {
178+
shred_variant(&self.build(), as_type)
179+
}
180+
151181
/// Appends a null row to the builder.
152182
pub fn append_null(&mut self) {
153183
self.nulls.append_null();
@@ -471,6 +501,7 @@ fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> Binar
471501
#[cfg(test)]
472502
mod test {
473503
use super::*;
504+
use crate::ShreddedSchemaBuilder;
474505
use arrow::array::Array;
475506
use parquet_variant::{ShortString, Variant};
476507

@@ -659,4 +690,190 @@ mod test {
659690
assert_eq!(array.value(2), array2.value(2).get_list_element(0).unwrap());
660691
assert_eq!(array.value(2), array2.value(2).get_list_element(1).unwrap());
661692
}
693+
694+
#[test]
fn build_shredded_primitive_int64() {
    // Two Int64 rows followed by a null row, shredded as Int64.
    let mut builder = VariantArrayBuilder::new(3);
    for v in [42, 100] {
        builder.append_variant(Variant::Int64(v));
    }
    builder.append_null();

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();

    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 3);
    // Top-level validity must mirror what was appended.
    let null_mask: Vec<bool> = (0..3).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, false, true]);
}
707+
708+
#[test]
fn build_shredded_primitive_utf8() {
    // One string row and one null row, shredded as Utf8.
    let mut builder = VariantArrayBuilder::new(2);
    builder.append_variant(Variant::from("hello"));
    builder.append_null();

    let shredded = builder.build_shredded(&DataType::Utf8).unwrap();

    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);
    let null_mask: Vec<bool> = (0..2).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true]);
}
719+
720+
#[test]
fn build_shredded_primitive_float64() {
    // `Variant::Double` carries an `f64`, which is what a `Float64` shredding
    // type targets (`Variant::Float` is the `f32` variant). The literal 1.5 is
    // used so clippy's `approx_constant` lint (triggered by 3.14) stays quiet.
    let mut b = VariantArrayBuilder::new(2);
    b.append_variant(Variant::Double(1.5));
    b.append_null();
    let arr = b.build_shredded(&DataType::Float64).unwrap();
    assert!(arr.typed_value_field().is_some());
    assert_eq!(arr.len(), 2);
    // Same top-level validity checks as the sibling primitive tests.
    assert!(!arr.is_null(0));
    assert!(arr.is_null(1));
}
729+
730+
#[test]
fn build_shredded_primitive_bool() {
    // Both boolean variants, no nulls, shredded as Boolean.
    let mut builder = VariantArrayBuilder::new(2);
    for flag in [Variant::BooleanTrue, Variant::BooleanFalse] {
        builder.append_variant(flag);
    }

    let shredded = builder.build_shredded(&DataType::Boolean).unwrap();

    assert_eq!(shredded.len(), 2);
    assert!(shredded.typed_value_field().is_some());
}
739+
740+
#[test]
fn build_shredded_type_mismatch_falls_back_to_value_column() {
    // Intended layout (only top-level validity is asserted below):
    //   row 0: Int64 matches the schema  -> typed_value set, value null
    //   row 1: string does not match     -> value set, typed_value null
    let mut builder = VariantArrayBuilder::new(2);
    builder.append_variant(Variant::Int64(7));
    builder.append_variant(Variant::from("not an int"));

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();

    assert_eq!(shredded.len(), 2);
    assert!(shredded.typed_value_field().is_some());
    // Neither row may become null just because it failed to shred.
    let null_mask: Vec<bool> = (0..2).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, false]);
}
753+
754+
#[test]
fn build_shredded_struct_single_field() {
    // Hand-built struct schema with a single nullable `brand: Utf8` field.
    let brand = Field::new("brand", DataType::Utf8, true);
    let schema = DataType::Struct(Fields::from(vec![brand]));

    let mut builder = VariantArrayBuilder::new(3);
    for name in ["Apple", "Samsung"] {
        builder.new_object().with_field("brand", name).finish();
    }
    builder.append_null();

    let shredded = builder.build_shredded(&schema).unwrap();

    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 3);
    let null_mask: Vec<bool> = (0..3).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, false, true]);
}
768+
769+
#[test]
fn build_shredded_struct_multi_field() {
    // Schema with two shredded paths: `name: Utf8` and `age: Int32`.
    let schema = ShreddedSchemaBuilder::default()
        .with_path("name", &DataType::Utf8)
        .unwrap()
        .with_path("age", &DataType::Int32)
        .unwrap()
        .build();

    let mut builder = VariantArrayBuilder::new(2);
    // First object carries both fields.
    builder
        .new_object()
        .with_field("name", "Alice")
        .with_field("age", 30i32)
        .finish();
    // Second object omits `age`.
    builder.new_object().with_field("name", "Bob").finish();

    let shredded = builder.build_shredded(&schema).unwrap();

    assert_eq!(shredded.len(), 2);
    assert!(shredded.typed_value_field().is_some());
}
787+
788+
#[test]
fn build_shredded_nested_struct() {
    // Dotted paths shred fields of the nested `address` object.
    let schema = ShreddedSchemaBuilder::default()
        .with_path("address.city", &DataType::Utf8)
        .unwrap()
        .with_path("address.zip", &DataType::Utf8)
        .unwrap()
        .build();

    let mut builder = VariantArrayBuilder::new(2);

    // Row 0: { "address": { "city": "NYC", "zip": "10001" } }
    let mut outer = builder.new_object();
    outer
        .new_object("address")
        .with_field("city", "NYC")
        .with_field("zip", "10001")
        .finish();
    // `finish` consumes `outer`, releasing its borrow of `builder`.
    outer.finish();

    // Row 1: null.
    builder.append_null();

    let shredded = builder.build_shredded(&schema).unwrap();

    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 2);
    let null_mask: Vec<bool> = (0..2).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true]);
}
812+
813+
#[test]
fn build_shredded_list_of_int64() {
    // `Field` and `Arc` are already in scope through `use super::*`, so no
    // function-local imports are needed (matches the struct schema test).
    let list_schema = DataType::List(Arc::new(Field::new("item", DataType::Int64, true)));

    let mut b = VariantArrayBuilder::new(2);
    // Row 0: [1, 2]
    b.new_list()
        .with_value(Variant::Int64(1))
        .with_value(Variant::Int64(2))
        .finish();
    // Row 1: null
    b.append_null();

    let arr = b.build_shredded(&list_schema).unwrap();
    assert!(arr.typed_value_field().is_some());
    assert_eq!(arr.len(), 2);
    assert!(!arr.is_null(0));
    assert!(arr.is_null(1));
}
830+
831+
#[test]
fn build_shredded_extend_then_shred() {
    // Bulk-insert via `extend`: two ints, a null (`None`), and a string that
    // does not match the Int64 shredding type.
    let rows = [
        Some(Variant::Int64(1)),
        None,
        Some(Variant::Int64(3)),
        Some(Variant::from("oops")),
    ];
    let mut builder = VariantArrayBuilder::new(4);
    builder.extend(rows);

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();

    assert!(shredded.typed_value_field().is_some());
    assert_eq!(shredded.len(), 4);
    // Only the `None` entry is a null row; the mismatched string stays non-null.
    let null_mask: Vec<bool> = (0..4).map(|row| shredded.is_null(row)).collect();
    assert_eq!(null_mask, [false, true, false, false]);
}
848+
849+
#[test]
fn build_shredded_all_nulls() {
    // An array made up solely of null rows must still shred successfully.
    let mut builder = VariantArrayBuilder::new(3);
    for _ in 0..3 {
        builder.append_null();
    }

    let shredded = builder.build_shredded(&DataType::Int64).unwrap();

    assert_eq!(shredded.len(), 3);
    for row in 0..3 {
        assert!(shredded.is_null(row));
    }
}
861+
862+
#[test]
fn build_shredded_invalid_type_returns_err() {
    // FixedSizeBinary(17) is expected to be rejected as a shredding type.
    let mut builder = VariantArrayBuilder::new(1);
    builder.append_variant(Variant::Int64(1));

    let unsupported = DataType::FixedSizeBinary(17);
    assert!(builder.build_shredded(&unsupported).is_err());
}
869+
870+
#[test]
fn build_shredded_uuid_fixed_size_binary_16() {
    // FixedSizeBinary(16) is accepted as a shredding type.
    // NOTE(review): the row is built from a byte slice, which presumably yields a
    // binary variant rather than a UUID variant — confirm whether this test is
    // meant to exercise the typed (UUID) path or only schema acceptance.
    let raw: Vec<u8> = (0u8..16).collect();
    let mut builder = VariantArrayBuilder::new(1);
    builder.append_variant(Variant::from(raw.as_slice()));

    let shredded = builder
        .build_shredded(&DataType::FixedSizeBinary(16))
        .unwrap();

    assert_eq!(shredded.len(), 1);
    assert!(shredded.typed_value_field().is_some());
}
662879
}

0 commit comments

Comments
 (0)