Skip to content

Commit f7d5575

Browse files
bert-beyondloopsBert Vermeirenblagininalamb
authored
Fix: compact view buffers in ScalarValue::compact for all container t… (#21934)
## Which issue does this PR close? - Closes #21928. ## Rationale for this change ScalarValue::compact copies array data for container types to release sliced-buffer overhead, but never called .gc() on nested StringViewArray / BinaryViewArray. A scalar extracted from a large batch would therefore still hold a reference to the entire original view backing buffer, negating the benefit of compaction for any list, struct, or map whose values are view-typed. ## What changes are included in this PR? - ScalarValue::compact now compacts nested view buffers for all container types (List, LargeList, FixedSizeList, ListView, LargeListView, Struct, Map), trimming StringViewArray / BinaryViewArray backing buffers to only the referenced bytes. - The internal compact_view_buffers helper recursively handles FixedSizeList, ListView, LargeListView, and Map. ## Are these changes tested? new unit tests are added in scalar::tests, one per container type. Each test verifies that after compact() the backing buffer is reduced to exactly the bytes of the referenced string, and that the scalar value is preserved. ## Are there any user-facing changes? No. The public compact / compacted API is unchanged; this PR only fixes the behaviour for view-typed nested arrays. Co-authored-by: Bert Vermeiren <bert.vermeiren@datadobi.com> Co-authored-by: Dmitrii Blaginin <dmitrii@blaginin.me> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 3d45e42 commit f7d5575

1 file changed

Lines changed: 300 additions & 20 deletions

File tree

  • datafusion/common/src/scalar

datafusion/common/src/scalar/mod.rs

Lines changed: 300 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4719,6 +4719,14 @@ impl ScalarValue {
47194719
/// This can be relevant when `self` is a list or contains a list as a nested value, as
47204720
/// a single list holds an Arc to its entire original array buffer.
47214721
pub fn compact(&mut self) {
4722+
// copy_array_data + compact_view_buffers + downcast back, all in one step.
4723+
macro_rules! compact_array {
4724+
($arr:expr, $from_type:ty, $($as_method:tt)+) => {
4725+
*Arc::make_mut($arr) = ScalarValue::compact_view_buffers(
4726+
Arc::new(<$from_type>::from(copy_array_data(&$arr.to_data()))) as ArrayRef,
4727+
).$($as_method)+.clone()
4728+
};
4729+
}
47224730
match self {
47234731
ScalarValue::Null
47244732
| ScalarValue::Boolean(_)
@@ -4762,33 +4770,20 @@ impl ScalarValue {
47624770
| ScalarValue::LargeBinary(_)
47634771
| ScalarValue::BinaryView(_) => (),
47644772
ScalarValue::FixedSizeList(arr) => {
4765-
let array = copy_array_data(&arr.to_data());
4766-
*Arc::make_mut(arr) = FixedSizeListArray::from(array);
4767-
}
4768-
ScalarValue::List(arr) => {
4769-
let array = copy_array_data(&arr.to_data());
4770-
*Arc::make_mut(arr) = ListArray::from(array);
4773+
compact_array!(arr, FixedSizeListArray, as_fixed_size_list())
47714774
}
4775+
ScalarValue::List(arr) => compact_array!(arr, ListArray, as_list::<i32>()),
47724776
ScalarValue::LargeList(arr) => {
4773-
let array = copy_array_data(&arr.to_data());
4774-
*Arc::make_mut(arr) = LargeListArray::from(array)
4777+
compact_array!(arr, LargeListArray, as_list::<i64>())
47754778
}
47764779
ScalarValue::ListView(arr) => {
4777-
let array = copy_array_data(&arr.to_data());
4778-
*Arc::make_mut(arr) = ListViewArray::from(array);
4780+
compact_array!(arr, ListViewArray, as_list_view::<i32>())
47794781
}
47804782
ScalarValue::LargeListView(arr) => {
4781-
let array = copy_array_data(&arr.to_data());
4782-
*Arc::make_mut(arr) = LargeListViewArray::from(array)
4783-
}
4784-
ScalarValue::Struct(arr) => {
4785-
let array = copy_array_data(&arr.to_data());
4786-
*Arc::make_mut(arr) = StructArray::from(array);
4787-
}
4788-
ScalarValue::Map(arr) => {
4789-
let array = copy_array_data(&arr.to_data());
4790-
*Arc::make_mut(arr) = MapArray::from(array);
4783+
compact_array!(arr, LargeListViewArray, as_list_view::<i64>())
47914784
}
4785+
ScalarValue::Struct(arr) => compact_array!(arr, StructArray, as_struct()),
4786+
ScalarValue::Map(arr) => compact_array!(arr, MapArray, as_map()),
47924787
ScalarValue::Union(val, _, _) => {
47934788
if let Some((_, value)) = val.as_mut() {
47944789
value.compact();
@@ -4809,6 +4804,95 @@ impl ScalarValue {
48094804
self
48104805
}
48114806

4807+
/// Recursively compacts the backing buffers of any [`StringViewArray`] or
4808+
/// [`BinaryViewArray`] nested within `array`.
4809+
///
4810+
/// View-typed arrays keep an `Arc` reference to their original backing
4811+
/// buffers, so a single scalar extracted from a large batch still retains
4812+
/// the entire buffer. Calling [`.gc()`][StringViewArray::gc] copies only
4813+
/// the bytes that are actually referenced by the surviving views, releasing
4814+
/// the rest.
4815+
///
4816+
/// Container types (`List`, `LargeList`, `FixedSizeList`, `ListView`,
4817+
/// `LargeListView`, `Struct`, `Map`) are handled by recursing into their
4818+
/// child / values arrays and reconstructing the parent with the compacted
4819+
/// children. All other types are returned unchanged.
4820+
fn compact_view_buffers(array: ArrayRef) -> ArrayRef {
4821+
// Macro for the i32/i64-offset list pair (List / LargeList).
4822+
macro_rules! gc_list {
4823+
($field:expr, $offset_type:ty, $array_type:ty) => {{
4824+
let list = array.as_list::<$offset_type>();
4825+
Arc::new(<$array_type>::new(
4826+
Arc::clone($field),
4827+
list.offsets().clone(),
4828+
ScalarValue::compact_view_buffers(Arc::clone(list.values())),
4829+
list.nulls().cloned(),
4830+
)) as ArrayRef
4831+
}};
4832+
}
4833+
// Macro for the i32/i64-offset list-view pair (ListView / LargeListView).
4834+
macro_rules! gc_list_view {
4835+
($field:expr, $offset_type:ty, $array_type:ty) => {{
4836+
let list = array.as_list_view::<$offset_type>();
4837+
Arc::new(<$array_type>::new(
4838+
Arc::clone($field),
4839+
list.offsets().clone(),
4840+
list.sizes().clone(),
4841+
ScalarValue::compact_view_buffers(Arc::clone(list.values())),
4842+
list.nulls().cloned(),
4843+
)) as ArrayRef
4844+
}};
4845+
}
4846+
4847+
match array.data_type() {
4848+
DataType::Utf8View => Arc::new(array.as_string_view().gc()),
4849+
DataType::BinaryView => Arc::new(array.as_binary_view().gc()),
4850+
DataType::Struct(_) => {
4851+
let s = array.as_struct();
4852+
let columns = s
4853+
.columns()
4854+
.iter()
4855+
.map(|c| ScalarValue::compact_view_buffers(Arc::clone(c)))
4856+
.collect();
4857+
Arc::new(StructArray::new(
4858+
s.fields().clone(),
4859+
columns,
4860+
s.nulls().cloned(),
4861+
))
4862+
}
4863+
DataType::List(field) => gc_list!(field, i32, ListArray),
4864+
DataType::LargeList(field) => gc_list!(field, i64, LargeListArray),
4865+
DataType::FixedSizeList(field, size) => {
4866+
let list = array.as_fixed_size_list();
4867+
Arc::new(FixedSizeListArray::new(
4868+
Arc::clone(field),
4869+
*size,
4870+
ScalarValue::compact_view_buffers(Arc::clone(list.values())),
4871+
list.nulls().cloned(),
4872+
))
4873+
}
4874+
DataType::ListView(field) => gc_list_view!(field, i32, ListViewArray),
4875+
DataType::LargeListView(field) => {
4876+
gc_list_view!(field, i64, LargeListViewArray)
4877+
}
4878+
DataType::Map(field, ordered) => {
4879+
let map = array.as_map();
4880+
let entries = ScalarValue::compact_view_buffers(Arc::new(
4881+
map.entries().clone(),
4882+
)
4883+
as ArrayRef);
4884+
Arc::new(MapArray::new(
4885+
Arc::clone(field),
4886+
map.offsets().clone(),
4887+
entries.as_struct().clone(),
4888+
map.nulls().cloned(),
4889+
*ordered,
4890+
))
4891+
}
4892+
_ => array,
4893+
}
4894+
}
4895+
48124896
/// Returns the minimum value for the given numeric `DataType`.
48134897
///
48144898
/// This function returns the smallest representable value for numeric
@@ -10708,4 +10792,200 @@ mod tests {
1070810792
]
1070910793
);
1071010794
}
10795+
10796+
// ── compact / compact_view_buffers ───────────────────────────────────────
10797+
10798+
/// Builds a `StringViewArray` with `n` strings that are all longer than
10799+
/// 12 bytes so they are stored in backing buffers rather than inline.
10800+
fn make_long_strings(n: usize) -> StringViewArray {
10801+
let mut b = StringViewBuilder::new();
10802+
for i in 0..n {
10803+
b.append_value(format!("long_string_value_pad_{i:04}"));
10804+
}
10805+
b.finish()
10806+
}
10807+
10808+
/// Total bytes across all backing buffers of a `StringViewArray`.
10809+
fn utf8view_buffer_bytes(a: &StringViewArray) -> usize {
10810+
a.data_buffers().iter().map(|b| b.len()).sum()
10811+
}
10812+
10813+
#[test]
10814+
fn test_compact_list_utf8view() {
10815+
const N: usize = 50;
10816+
let strings = make_long_strings(N);
10817+
let one_len = strings.value(0).len();
10818+
assert!(utf8view_buffer_bytes(&strings) >= N * one_len);
10819+
10820+
let single_row_list_array =
10821+
SingleRowListArrayBuilder::new(Arc::new(strings.slice(0, 1)) as ArrayRef)
10822+
.build_list_array();
10823+
let mut scalar = ScalarValue::List(Arc::new(single_row_list_array));
10824+
scalar.compact();
10825+
10826+
let ScalarValue::List(arr) = &scalar else {
10827+
panic!("expected List")
10828+
};
10829+
assert_eq!(
10830+
utf8view_buffer_bytes(arr.values().as_string_view()),
10831+
one_len
10832+
);
10833+
assert_eq!(arr.values().as_string_view().value(0), strings.value(0));
10834+
}
10835+
10836+
#[test]
10837+
fn test_compact_large_list_utf8view() {
10838+
const N: usize = 50;
10839+
let strings = make_long_strings(N);
10840+
let one_len = strings.value(0).len();
10841+
assert!(utf8view_buffer_bytes(&strings) >= N * one_len);
10842+
10843+
let single_row_list_array =
10844+
SingleRowListArrayBuilder::new(Arc::new(strings.slice(0, 1)) as ArrayRef)
10845+
.build_large_list_array();
10846+
let mut scalar = ScalarValue::LargeList(Arc::new(single_row_list_array));
10847+
scalar.compact();
10848+
10849+
let ScalarValue::LargeList(arr) = &scalar else {
10850+
panic!("expected LargeList")
10851+
};
10852+
assert_eq!(
10853+
utf8view_buffer_bytes(arr.values().as_string_view()),
10854+
one_len
10855+
);
10856+
assert_eq!(arr.values().as_string_view().value(0), strings.value(0));
10857+
}
10858+
10859+
#[test]
10860+
fn test_compact_fixed_size_list_utf8view() {
10861+
const N: usize = 50;
10862+
let strings = make_long_strings(N);
10863+
let one_len = strings.value(0).len();
10864+
assert!(utf8view_buffer_bytes(&strings) >= N * one_len);
10865+
10866+
let single_row_list_array =
10867+
SingleRowListArrayBuilder::new(Arc::new(strings.slice(0, 1)) as ArrayRef)
10868+
.build_fixed_size_list_array(1);
10869+
let mut scalar = ScalarValue::FixedSizeList(Arc::new(single_row_list_array));
10870+
scalar.compact();
10871+
10872+
let ScalarValue::FixedSizeList(arr) = &scalar else {
10873+
panic!("expected FixedSizeList")
10874+
};
10875+
assert_eq!(
10876+
utf8view_buffer_bytes(arr.values().as_string_view()),
10877+
one_len
10878+
);
10879+
assert_eq!(arr.values().as_string_view().value(0), strings.value(0));
10880+
}
10881+
10882+
#[test]
10883+
fn test_compact_list_view_utf8view() {
10884+
const N: usize = 50;
10885+
let strings = make_long_strings(N);
10886+
let one_len = strings.value(0).len();
10887+
assert!(utf8view_buffer_bytes(&strings) >= N * one_len);
10888+
10889+
let single_row_list_array =
10890+
SingleRowListArrayBuilder::new(Arc::new(strings.slice(0, 1)) as ArrayRef)
10891+
.build_list_view_array();
10892+
let mut scalar = ScalarValue::ListView(Arc::new(single_row_list_array));
10893+
scalar.compact();
10894+
10895+
let ScalarValue::ListView(arr) = &scalar else {
10896+
panic!("expected ListView")
10897+
};
10898+
assert_eq!(
10899+
utf8view_buffer_bytes(arr.values().as_string_view()),
10900+
one_len
10901+
);
10902+
assert_eq!(arr.values().as_string_view().value(0), strings.value(0));
10903+
}
10904+
10905+
#[test]
10906+
fn test_compact_large_list_view_utf8view() {
10907+
const N: usize = 50;
10908+
let strings = make_long_strings(N);
10909+
let one_len = strings.value(0).len();
10910+
assert!(utf8view_buffer_bytes(&strings) >= N * one_len);
10911+
10912+
let single_row_list_array =
10913+
SingleRowListArrayBuilder::new(Arc::new(strings.slice(0, 1)) as ArrayRef)
10914+
.build_large_list_view_array();
10915+
let mut scalar = ScalarValue::LargeListView(Arc::new(single_row_list_array));
10916+
scalar.compact();
10917+
10918+
let ScalarValue::LargeListView(arr) = &scalar else {
10919+
panic!("expected LargeListView")
10920+
};
10921+
assert_eq!(
10922+
utf8view_buffer_bytes(arr.values().as_string_view()),
10923+
one_len
10924+
);
10925+
assert_eq!(arr.values().as_string_view().value(0), strings.value(0));
10926+
}
10927+
10928+
#[test]
10929+
fn test_compact_struct_utf8view() {
10930+
const N: usize = 50;
10931+
let strings = make_long_strings(N);
10932+
let one_len = strings.value(0).len();
10933+
10934+
let field = Arc::new(Field::new("name", DataType::Utf8View, true));
10935+
let struct_arr = StructArray::new(
10936+
Fields::from(vec![Arc::clone(&field)]),
10937+
vec![Arc::new(strings.slice(0, 1)) as ArrayRef],
10938+
None,
10939+
);
10940+
10941+
let mut scalar = ScalarValue::Struct(Arc::new(struct_arr));
10942+
scalar.compact();
10943+
10944+
let ScalarValue::Struct(arr) = &scalar else {
10945+
panic!("expected Struct")
10946+
};
10947+
let col = arr.column(0).as_string_view();
10948+
assert_eq!(utf8view_buffer_bytes(col), one_len);
10949+
assert_eq!(col.value(0), strings.value(0));
10950+
}
10951+
10952+
#[test]
10953+
fn test_compact_map_utf8view() {
10954+
const N: usize = 50;
10955+
let strings = make_long_strings(N);
10956+
let one_len = strings.value(0).len();
10957+
10958+
let key_field = Arc::new(Field::new("key", DataType::Utf8View, false));
10959+
let val_field = Arc::new(Field::new("value", DataType::Int32, true));
10960+
let entries = StructArray::new(
10961+
Fields::from(vec![Arc::clone(&key_field), Arc::clone(&val_field)]),
10962+
vec![
10963+
Arc::new(strings.slice(0, 1)) as ArrayRef,
10964+
Arc::new(Int32Array::from(vec![1i32])) as ArrayRef,
10965+
],
10966+
None,
10967+
);
10968+
let entries_field = Arc::new(Field::new(
10969+
"entries",
10970+
DataType::Struct(Fields::from(vec![key_field, val_field])),
10971+
false,
10972+
));
10973+
let map = MapArray::new(
10974+
entries_field,
10975+
OffsetBuffer::new(vec![0i32, 1].into()),
10976+
entries,
10977+
None,
10978+
false,
10979+
);
10980+
10981+
let mut scalar = ScalarValue::Map(Arc::new(map));
10982+
scalar.compact();
10983+
10984+
let ScalarValue::Map(arr) = &scalar else {
10985+
panic!("expected Map")
10986+
};
10987+
let keys = arr.entries().column(0).as_string_view();
10988+
assert_eq!(utf8view_buffer_bytes(keys), one_len);
10989+
assert_eq!(keys.value(0), strings.value(0));
10990+
}
1071110991
}

0 commit comments

Comments
 (0)