Skip to content

Commit 15cec3b

Browse files
authored
Tune varbinview compaction for Arrow export (#8585)
## Summary Compact `VarBinViewArray` before exporting Arrow byte-view arrays so filtered arrays do not retain unselected backing buffers. This keeps Arrow `Utf8View` / `BinaryView` exports from holding onto large payload buffers that no selected row references. Signed-off-by: Nicholas Gates <nick@nickgates.com>
1 parent aeae579 commit 15cec3b

3 files changed

Lines changed: 45 additions & 23 deletions

File tree

vortex-array/src/arrays/varbinview/compact.rs

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ use crate::arrays::varbinview::Ref;
1818
use crate::builders::ArrayBuilder;
1919
use crate::builders::VarBinViewBuilder;
2020

21+
const DEFAULT_COMPACTION_THRESHOLD: f64 = 0.5;
22+
const MIN_RETAINED_BYTES_PER_ROW_TO_CHECK_COMPACTION: u64 = 128;
23+
2124
impl VarBinViewArray {
2225
/// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
2326
/// operation can be very expensive, in the worst case copying all existing string data into
@@ -33,8 +36,7 @@ impl VarBinViewArray {
3336
return Ok(self.clone());
3437
}
3538

36-
// Use selective compaction with threshold of 1.0 (compact any buffer with any waste)
37-
self.compact_with_threshold(1.0)
39+
self.compact_with_threshold(DEFAULT_COMPACTION_THRESHOLD)
3840
}
3941

4042
fn should_compact(&self) -> VortexResult<bool> {
@@ -50,12 +52,18 @@ impl VarBinViewArray {
5052
return Ok(true);
5153
}
5254

53-
let bytes_referenced: u64 = self.count_referenced_bytes()?;
5455
let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
56+
if buffer_total_bytes == 0 {
57+
return Ok(true);
58+
}
5559

56-
// If there is any wasted space, we want to repack.
57-
// This is very aggressive.
58-
Ok(bytes_referenced < buffer_total_bytes || buffer_total_bytes == 0)
60+
let len = u64::try_from(self.len()).unwrap_or(u64::MAX);
61+
if len > 0 && buffer_total_bytes / len <= MIN_RETAINED_BYTES_PER_ROW_TO_CHECK_COMPACTION {
62+
return Ok(false);
63+
}
64+
65+
let bytes_referenced: u64 = self.count_referenced_bytes()?;
66+
Ok((bytes_referenced as f64 / buffer_total_bytes as f64) < DEFAULT_COMPACTION_THRESHOLD)
5967
}
6068

6169
/// Iterates over all valid, non-inlined views, calling the provided
@@ -264,8 +272,7 @@ mod tests {
264272
.execute::<VarBinViewArray>(&mut array_session().create_execution_ctx())
265273
.unwrap();
266274

267-
// Optimize the taken array
268-
let optimized_array = taken_array.compact_buffers().unwrap();
275+
let optimized_array = taken_array.compact_with_threshold(1.0).unwrap();
269276

270277
// The optimized array should have exactly 1 buffer (consolidated)
271278
assert_eq!(optimized_array.data_buffers().len(), 1);

vortex-array/src/arrow/executor/byte.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,11 @@ mod tests {
8080
use arrow_array::cast::AsArray;
8181
use arrow_schema::DataType;
8282
use rstest::rstest;
83+
use vortex_error::VortexResult;
84+
use vortex_mask::Mask;
8385

8486
use crate::IntoArray;
87+
use crate::LEGACY_SESSION;
8588
use crate::VortexSessionExecute;
8689
use crate::array_session;
8790
use crate::arrow::ArrowArrayExecutor;
@@ -179,4 +182,27 @@ mod tests {
179182
assert!(arrow.is_null(1));
180183
assert!(!arrow.is_null(2));
181184
}
185+
186+
#[test]
187+
fn filtered_utf8_view_export_does_not_retain_unselected_buffers() -> VortexResult<()> {
188+
let unselected = "x".repeat(1 << 20);
189+
let array =
190+
VarBinViewArray::from_iter_str(["selected", unselected.as_str(), unselected.as_str()]);
191+
let filtered = array
192+
.into_array()
193+
.filter(Mask::from_iter([true, false, false]))?;
194+
195+
let arrow = filtered.execute_arrow(
196+
Some(&DataType::Utf8View),
197+
&mut LEGACY_SESSION.create_execution_ctx(),
198+
)?;
199+
200+
assert_eq!(arrow.as_string_view().value(0), "selected");
201+
assert!(
202+
arrow.get_array_memory_size() < unselected.len(),
203+
"filtered export retained unselected payload: {} bytes",
204+
arrow.get_array_memory_size()
205+
);
206+
Ok(())
207+
}
182208
}

vortex-array/src/arrow/executor/byte_view.rs

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ use vortex_error::VortexResult;
1212
use crate::ArrayRef;
1313
use crate::ExecutionCtx;
1414
use crate::arrays::VarBinViewArray;
15-
use crate::arrow::executor::validity::to_arrow_null_buffer;
1615
use crate::arrow::null_buffer::to_null_buffer;
1716
use crate::builtins::ArrayBuiltins;
1817
use crate::dtype::DType;
@@ -48,19 +47,8 @@ pub fn execute_varbinview_to_arrow<T: ByteViewType>(
4847
array: &VarBinViewArray,
4948
ctx: &mut ExecutionCtx,
5049
) -> VortexResult<ArrowArrayRef> {
51-
let views =
52-
ScalarBuffer::<u128>::from(array.views_handle().as_host().clone().into_arrow_buffer());
53-
let buffers: Vec<_> = array
54-
.data_buffers()
55-
.iter()
56-
.map(|buffer| buffer.as_host().clone().into_arrow_buffer())
57-
.collect();
58-
let nulls = to_arrow_null_buffer(array.validity()?, array.len(), ctx)?;
59-
60-
// SAFETY: our own VarBinView array is considered safe.
61-
Ok(Arc::new(unsafe {
62-
GenericByteViewArray::<T>::new_unchecked(views, buffers, nulls)
63-
}))
50+
let compacted = array.compact_buffers()?;
51+
canonical_varbinview_to_arrow::<T>(&compacted, ctx)
6452
}
6553

6654
pub(super) fn to_arrow_byte_view<T: ByteViewType>(
@@ -73,6 +61,7 @@ pub(super) fn to_arrow_byte_view<T: ByteViewType>(
7361
// flexible since there's no prescribed nullability in Arrow types.
7462
let array = array.cast(DType::from_arrow((&T::DATA_TYPE, Nullability::Nullable)))?;
7563

64+
let array = array.execute::<ArrayRef>(ctx)?;
7665
let varbinview = array.execute::<VarBinViewArray>(ctx)?;
77-
canonical_varbinview_to_arrow::<T>(&varbinview, ctx)
66+
execute_varbinview_to_arrow::<T>(&varbinview, ctx)
7867
}

0 commit comments

Comments
 (0)