Skip to content

Commit 8d3b902

Browse files
author
Matthew Katz
committed
address comments
Signed-off-by: Matthew Katz <katz@spiraldb.com>
1 parent cd7c97e commit 8d3b902

7 files changed

Lines changed: 84 additions & 78 deletions

File tree

vortex-array/public-api.lock

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4444,15 +4444,17 @@ pub vortex_array::arrays::listview::ListViewDataParts::sizes: vortex_array::Arra
44444444

44454445
pub vortex_array::arrays::listview::ListViewDataParts::validity: vortex_array::validity::Validity
44464446

4447+
pub const vortex_array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD: f32
4448+
44474449
pub trait vortex_array::arrays::listview::ListViewArrayExt: vortex_array::TypedArrayRef<vortex_array::arrays::ListView>
44484450

4449-
pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>
4451+
pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>
44504452

4451-
pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_mask::Mask>>
4453+
pub fn vortex_array::arrays::listview::ListViewArrayExt::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_mask::Mask>
44524454

44534455
pub fn vortex_array::arrays::listview::ListViewArrayExt::elements(&self) -> &vortex_array::ArrayRef
44544456

4455-
pub fn vortex_array::arrays::listview::ListViewArrayExt::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>
4457+
pub fn vortex_array::arrays::listview::ListViewArrayExt::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>
44564458

44574459
pub fn vortex_array::arrays::listview::ListViewArrayExt::list_elements_at(&self, usize) -> vortex_error::VortexResult<vortex_array::ArrayRef>
44584460

@@ -4472,13 +4474,13 @@ pub fn vortex_array::arrays::listview::ListViewArrayExt::verify_is_zero_copy_to_
44724474

44734475
impl<T: vortex_array::TypedArrayRef<vortex_array::arrays::ListView>> vortex_array::arrays::listview::ListViewArrayExt for T
44744476

4475-
pub fn T::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>
4477+
pub fn T::compute_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>
44764478

4477-
pub fn T::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_mask::Mask>>
4479+
pub fn T::compute_referenced_elements_mask(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_mask::Mask>
44784480

44794481
pub fn T::elements(&self) -> &vortex_array::ArrayRef
44804482

4481-
pub fn T::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<f32>>
4483+
pub fn T::estimate_density(&self, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<f32>
44824484

44834485
pub fn T::list_elements_at(&self, usize) -> vortex_error::VortexResult<vortex_array::ArrayRef>
44844486

@@ -23966,8 +23968,6 @@ impl vortex_array::Array<vortex_array::arrays::ListView>
2396623968

2396723969
pub fn vortex_array::Array<vortex_array::arrays::ListView>::rebuild(&self, vortex_array::arrays::listview::ListViewRebuildMode) -> vortex_error::VortexResult<vortex_array::arrays::ListViewArray>
2396823970

23969-
pub fn vortex_array::Array<vortex_array::arrays::ListView>::should_rebuild(&self, bool, &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<bool>
23970-
2397123971
impl vortex_array::Array<vortex_array::arrays::Masked>
2397223972

2397323973
pub fn vortex_array::Array<vortex_array::arrays::Masked>::try_new(vortex_array::ArrayRef, vortex_array::validity::Validity) -> vortex_error::VortexResult<Self>

vortex-array/src/arrays/listview/array.rs

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -407,16 +407,13 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {
407407
/// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`,
408408
/// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly.
409409
///
410-
/// Returns `Ok(None)` when `elements` is empty.
411-
#[allow(clippy::cognitive_complexity, clippy::unnecessary_fallible_conversions)]
412-
fn compute_referenced_elements_mask(
413-
&self,
414-
ctx: &mut ExecutionCtx,
415-
) -> VortexResult<Option<Mask>> {
410+
/// **Preconditions**
411+
///
412+
/// Assumes that `self.elements()` is non-empty.
413+
#[allow(clippy::cognitive_complexity)]
414+
fn compute_referenced_elements_mask(&self, ctx: &mut ExecutionCtx) -> VortexResult<Mask> {
415+
debug_assert!(!self.elements().is_empty());
416416
let len = self.elements().len();
417-
if len == 0 {
418-
return Ok(None);
419-
}
420417

421418
let offsets_primitive = self.offsets().clone().execute::<PrimitiveArray>(ctx)?;
422419
let sizes_primitive = self.sizes().clone().execute::<PrimitiveArray>(ctx)?;
@@ -430,49 +427,55 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {
430427
let sizes_slice = sizes_primitive.as_slice::<S>();
431428

432429
for i in 0..offset_len {
433-
let start =
434-
usize::try_from(offsets_slice[i]).vortex_expect("offset must fit in usize");
435-
let size =
436-
usize::try_from(sizes_slice[i]).vortex_expect("size must fit in usize");
430+
let start: usize = offsets_slice[i].as_();
431+
let size: usize = sizes_slice[i].as_();
437432
buf.fill_range(start, start + size, true);
438433
}
439434
})
440435
});
441436

442-
Ok(Some(Mask::from_buffer(buf.freeze())))
437+
Ok(Mask::from_buffer(buf.freeze()))
443438
}
444439

445440
/// Exact fraction of `elements` referenced by some view, in `[0.0, 1.0]`. Extremely costly.
446441
///
447-
/// Returns `Ok(None)` when `elements` is empty.
448-
fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<Option<f32>> {
449-
Ok(self
450-
.compute_referenced_elements_mask(ctx)?
451-
.map(|mask| match mask {
452-
Mask::AllTrue(_) => 1.0,
453-
Mask::AllFalse(_) => 0.0,
454-
Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32,
455-
}))
442+
/// Returns `Ok(1.0)` when `elements` is empty.
443+
fn compute_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<f32> {
444+
if self.elements().is_empty() {
445+
return Ok(1.0);
446+
}
447+
448+
if self.sizes().is_empty() {
449+
return Ok(0.0);
450+
}
451+
452+
let density = match self.compute_referenced_elements_mask(ctx)? {
453+
Mask::AllTrue(_) => 1.0,
454+
Mask::AllFalse(_) => 0.0,
455+
Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32,
456+
};
457+
458+
Ok(density)
456459
}
457460

458461
/// Upper-bound estimate of [`compute_density`](Self::compute_density) via
459462
/// `sum(sizes) / elements.len()`, clamped to `[0.0, 1.0]`.
460463
///
461464
/// Exact for non-overlapping views, but overcounts when multiple views share the same elements.
462465
///
463-
/// Returns `Ok(None)` when `elements` is empty.
464-
fn estimate_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<Option<f32>> {
466+
/// Returns `Ok(1.0)` when `elements` is empty.
467+
fn estimate_density(&self, ctx: &mut ExecutionCtx) -> VortexResult<f32> {
465468
let n_elts = self.elements().len();
466-
if n_elts == 0 {
467-
return Ok(None);
469+
if self.elements().is_empty() {
470+
return Ok(1.0);
468471
}
469472

470473
let sizes = self.sizes();
471474
if sizes.is_empty() {
472-
return Ok(Some(0.0));
475+
return Ok(0.0);
473476
}
474477

475-
// compute_stat short-circuits on a cached exact Sum and otherwise computes-and-caches.
478+
// compute_stat short-circuits on a cached exact Sum and otherwise computes
476479
let sizes_sum = sizes
477480
.statistics()
478481
.compute_stat(Stat::Sum, ctx)?
@@ -481,9 +484,11 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {
481484
.as_::<u64>()
482485
.ok_or_else(|| vortex_err!("could not cast sum of sizes to u64"))?;
483486

484-
let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0);
487+
let estimate = (sizes_sum as f32 / n_elts as f32).min(1.0);
488+
489+
debug_assert!(estimate >= 0.0);
485490

486-
Ok(Some(estimate))
491+
Ok(estimate)
487492
}
488493
}
489494
impl<T: TypedArrayRef<ListView>> ListViewArrayExt for T {}

vortex-array/src/arrays/listview/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pub use conversion::list_view_from_list;
1818
pub use conversion::recursive_list_from_list_view;
1919

2020
mod rebuild;
21+
pub use rebuild::DEFAULT_REBUILD_DENSITY_THRESHOLD;
2122
pub use rebuild::ListViewRebuildMode;
2223

2324
#[cfg(test)]

vortex-array/src/arrays/listview/rebuild.rs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ use vortex_buffer::BufferMut;
66
use vortex_error::VortexExpect;
77
use vortex_error::VortexResult;
88

9-
use crate::ExecutionCtx;
109
use crate::IntoArray;
1110
use crate::LEGACY_SESSION;
1211
#[expect(deprecated)]
@@ -31,7 +30,10 @@ use crate::scalar_fn::fns::operators::Operator;
3130
/// A `ListViewArray` can accumulate unreferenced bytes in its `elements` buffer after
3231
/// metadata-only operations like `take` and `filter`. When density (referenced fraction of `elements`)
3332
/// falls below this threshold, the benefits of a rebuild may outweigh its cost.
34-
const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
33+
///
34+
/// This is a somewhat arbitrary rule-of-thumb and may be suboptimal depending on different use cases and
35+
/// list element dtypes.
36+
pub const DEFAULT_REBUILD_DENSITY_THRESHOLD: f32 = 0.1;
3537

3638
/// Modes for rebuilding a [`ListViewArray`].
3739
pub enum ListViewRebuildMode {
@@ -384,16 +386,6 @@ impl ListViewArray {
384386
self.rebuild_zero_copy_to_list()
385387
}
386388
}
387-
388-
pub fn should_rebuild(&self, exact: bool, ctx: &mut ExecutionCtx) -> VortexResult<bool> {
389-
let density = if exact {
390-
self.compute_density(ctx)?
391-
} else {
392-
self.estimate_density(ctx)?
393-
};
394-
395-
Ok(density.unwrap_or(1.0) < REBUILD_DENSITY_THRESHOLD)
396-
}
397389
}
398390

399391
#[cfg(test)]

vortex-array/src/arrays/listview/tests/density.rs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ fn test_execution_ctx() -> ExecutionCtx {
3333
fn full_density_no_overlap() -> VortexResult<()> {
3434
let mut ctx = test_execution_ctx();
3535
let lv = create_basic_listview();
36-
let exact = lv.compute_density(&mut ctx)?.expect("non-empty elements");
37-
let est = lv.estimate_density(&mut ctx)?.expect("non-empty elements");
36+
let exact = lv.compute_density(&mut ctx)?;
37+
let est = lv.estimate_density(&mut ctx)?;
3838

3939
assert!((exact - 1.0).abs() < EPS);
4040
assert!((est - 1.0).abs() < EPS);
@@ -45,8 +45,8 @@ fn full_density_no_overlap() -> VortexResult<()> {
4545
fn sparse_no_overlap_matches_exact() -> VortexResult<()> {
4646
let mut ctx = test_execution_ctx();
4747
let lv = create_large_listview();
48-
let exact = lv.compute_density(&mut ctx)?.expect("non-empty");
49-
let est = lv.estimate_density(&mut ctx)?.expect("non-empty");
48+
let exact = lv.compute_density(&mut ctx)?;
49+
let est = lv.estimate_density(&mut ctx)?;
5050

5151
assert!((exact - 0.5).abs() < EPS);
5252
assert!((est - 0.5).abs() < EPS);
@@ -57,12 +57,8 @@ fn sparse_no_overlap_matches_exact() -> VortexResult<()> {
5757
fn all_empty_lists_is_zero_density() -> VortexResult<()> {
5858
let mut ctx = test_execution_ctx();
5959
let lv = create_empty_lists_listview();
60-
let exact = lv
61-
.compute_density(&mut ctx)?
62-
.expect("elements has length 1");
63-
let est = lv
64-
.estimate_density(&mut ctx)?
65-
.expect("elements has length 1");
60+
let exact = lv.compute_density(&mut ctx)?;
61+
let est = lv.estimate_density(&mut ctx)?;
6662

6763
assert_eq!(exact, 0.0);
6864
assert_eq!(est, 0.0);
@@ -73,8 +69,8 @@ fn all_empty_lists_is_zero_density() -> VortexResult<()> {
7369
fn overlap_full_coverage_clamps_estimate() -> VortexResult<()> {
7470
let mut ctx = test_execution_ctx();
7571
let lv = create_overlapping_listview();
76-
let exact = lv.compute_density(&mut ctx)?.expect("non-empty");
77-
let est = lv.estimate_density(&mut ctx)?.expect("non-empty");
72+
let exact = lv.compute_density(&mut ctx)?;
73+
let est = lv.estimate_density(&mut ctx)?;
7874

7975
assert!((exact - 1.0).abs() < EPS);
8076
assert!((est - 1.0).abs() < EPS);
@@ -86,21 +82,24 @@ fn overlap_differential_exact_lower_than_estimate() -> VortexResult<()> {
8682
let mut ctx = test_execution_ctx();
8783
let lv = create_sparse_overlapping_listview();
8884

89-
let exact = lv.compute_density(&mut ctx)?.expect("non-empty");
90-
let est = lv.estimate_density(&mut ctx)?.expect("non-empty");
85+
let exact = lv.compute_density(&mut ctx)?;
86+
let est = lv.estimate_density(&mut ctx)?;
9187

9288
assert!((exact - 0.25).abs() < EPS);
9389
assert!((est - 0.40).abs() < EPS);
9490
Ok(())
9591
}
9692

9793
#[test]
98-
fn empty_elements_returns_none() -> VortexResult<()> {
94+
fn empty_elements_returns_zero() -> VortexResult<()> {
9995
let mut ctx = test_execution_ctx();
10096
let lv = create_empty_elements_listview();
10197

102-
assert!(lv.compute_density(&mut ctx)?.is_none());
103-
assert!(lv.estimate_density(&mut ctx)?.is_none());
98+
let exact = lv.compute_density(&mut ctx)?;
99+
let est = lv.estimate_density(&mut ctx)?;
100+
101+
assert!(exact.abs() < EPS);
102+
assert!(est.abs() < EPS);
104103
Ok(())
105104
}
106105

@@ -114,7 +113,7 @@ fn estimate_uses_cached_sum_stat() -> VortexResult<()> {
114113
.statistics()
115114
.set(Stat::Sum, Precision::Exact(ScalarValue::from(5u64)));
116115

117-
let est = lv.estimate_density(&mut ctx)?.expect("non-empty");
116+
let est = lv.estimate_density(&mut ctx)?;
118117
assert!((est - 0.5).abs() < EPS);
119118
Ok(())
120119
}
@@ -123,9 +122,7 @@ fn estimate_uses_cached_sum_stat() -> VortexResult<()> {
123122
fn referenced_mask_set_bits_match_views() -> VortexResult<()> {
124123
let mut ctx = test_execution_ctx();
125124
let lv = create_sparse_overlapping_listview();
126-
let mask = lv
127-
.compute_referenced_elements_mask(&mut ctx)?
128-
.expect("non-empty elements");
125+
let mask = lv.compute_referenced_elements_mask(&mut ctx)?;
129126
let bits = match mask {
130127
Mask::Values(v) => v,
131128
_ => panic!("expected Values mask"),

vortex-array/src/arrow/executor/list_view.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ use crate::ArrayRef;
1313
use crate::ExecutionCtx;
1414
use crate::arrays::ListViewArray;
1515
use crate::arrays::PrimitiveArray;
16+
use crate::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD;
17+
use crate::arrays::listview::ListViewArrayExt;
1618
use crate::arrays::listview::ListViewDataParts;
1719
use crate::arrays::listview::ListViewRebuildMode;
1820
use crate::arrow::executor::validity::to_arrow_null_buffer;
@@ -29,8 +31,11 @@ pub(super) fn to_arrow_list_view<O: OffsetSizeTrait + IntegerPType>(
2931
) -> VortexResult<arrow_array::ArrayRef> {
3032
let array = array.execute::<ListViewArray>(ctx)?;
3133

32-
// If array is sparse, rebuild before handing it to Arrow
33-
let array = if array.should_rebuild(false, ctx)? {
34+
// If the array is sufficiently sparse, rebuild before handing it to Arrow. Otherwise downstream
35+
// consumers hold an elements buffer containing unreferenced data in memory indefinitely,
36+
// and any compute pass over that buffer wastes work on data nothing references.
37+
let density = array.estimate_density(ctx)?;
38+
let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD {
3439
array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
3540
} else {
3641
array

vortex-duckdb/src/exporter/list_view.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ use parking_lot::Mutex;
1010
use vortex::array::ExecutionCtx;
1111
use vortex::array::arrays::ListViewArray;
1212
use vortex::array::arrays::PrimitiveArray;
13+
use vortex::array::arrays::listview::DEFAULT_REBUILD_DENSITY_THRESHOLD;
14+
use vortex::array::arrays::listview::ListViewArrayExt;
1315
use vortex::array::arrays::listview::ListViewDataParts;
1416
use vortex::array::arrays::listview::ListViewRebuildMode;
1517
use vortex::array::match_each_integer_ptype;
@@ -50,21 +52,25 @@ pub(crate) fn new_exporter(
5052
cache: &ConversionCache,
5153
ctx: &mut ExecutionCtx,
5254
) -> VortexResult<Box<dyn ColumnExporter>> {
53-
let len = array.len();
54-
55-
let compact_array = if array.should_rebuild(false, ctx)? {
55+
// If the array is sufficiently sparse, rebuild. Otherwise the DuckDB vector will
56+
// hold an elements buffer containing unreferenced data in memory indefinitely,
57+
// and any compute pass over that buffer wastes work on data nothing references.
58+
let density = array.estimate_density(ctx)?;
59+
let array = if density < DEFAULT_REBUILD_DENSITY_THRESHOLD {
5660
array.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
5761
} else {
5862
array
5963
};
6064

65+
let len = array.len();
66+
6167
let ListViewDataParts {
6268
elements_dtype,
6369
elements,
6470
offsets,
6571
sizes,
6672
validity,
67-
} = compact_array.into_data_parts();
73+
} = array.into_data_parts();
6874
// Cache an `elements` vector up front so that future exports can reference it.
6975
let num_elements = elements.len();
7076

0 commit comments

Comments
 (0)