Skip to content

Commit 7e40e81

Browse files
Matt Katz
authored and committed
add density calculation and estimation
1 parent 2ab3724 commit 7e40e81

2 files changed

Lines changed: 103 additions & 0 deletions

File tree

vortex-array/src/arrays/listview/array.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,21 @@ use std::sync::Arc;
77

88
use num_traits::AsPrimitive;
99
use smallvec::smallvec;
10+
use vortex_buffer::BitBufferMut;
1011
use vortex_error::VortexExpect;
1112
use vortex_error::VortexResult;
1213
use vortex_error::vortex_bail;
1314
use vortex_error::vortex_ensure;
1415
use vortex_error::vortex_err;
16+
use vortex_mask::Mask;
1517

1618
use crate::ArrayRef;
1719
use crate::ArraySlots;
1820
use crate::LEGACY_SESSION;
1921
#[expect(deprecated)]
2022
use crate::ToCanonical as _;
2123
use crate::VortexSessionExecute;
24+
use crate::aggregate_fn::fns::sum::sum;
2225
use crate::array::Array;
2326
use crate::array::ArrayParts;
2427
use crate::array::TypedArrayRef;
@@ -30,6 +33,9 @@ use crate::arrays::PrimitiveArray;
3033
use crate::arrays::bool;
3134
use crate::dtype::DType;
3235
use crate::dtype::IntegerPType;
36+
use crate::expr::stats::Precision;
37+
use crate::expr::stats::Stat;
38+
use crate::expr::stats::StatsProvider;
3339
use crate::match_each_integer_ptype;
3440
use crate::validity::Validity;
3541

@@ -396,6 +402,92 @@ pub trait ListViewArrayExt: TypedArrayRef<ListView> {
396402
let sizes_primitive = self.sizes().to_primitive();
397403
validate_zctl(self.elements(), offsets_primitive, sizes_primitive).is_ok()
398404
}
405+
406+
/// Returns a [`Mask`] of length `elements.len()` where each bit is set iff that
407+
/// position in `elements` is referenced by at least one view.
408+
///
409+
/// Walks every `(offset, size)` pair, canonicalizes both `offsets` and `sizes`,
410+
/// and allocates a `BitBuffer` of length `elements.len()`, so it is extremely costly.
411+
///
412+
/// Returns `None` when `elements` is empty.
413+
fn compute_referenced_elements_mask(&self) -> Option<Mask> {
414+
let len = self.elements().len();
415+
if len == 0 {
416+
return None;
417+
}
418+
419+
let offsets_dtype = self.offsets().dtype();
420+
let sizes_dtype = self.sizes().dtype();
421+
422+
#[expect(deprecated)]
423+
let offsets_primitive = self.offsets().to_primitive();
424+
#[expect(deprecated)]
425+
let sizes_primitive = self.sizes().to_primitive();
426+
427+
let mut buf = BitBufferMut::new_unset(len);
428+
let offset_len = self.as_ref().len();
429+
430+
match_each_integer_ptype!(offsets_dtype.as_ptype(), |O| {
431+
match_each_integer_ptype!(sizes_dtype.as_ptype(), |S| {
432+
let offsets_slice = offsets_primitive.as_slice::<O>();
433+
let sizes_slice = sizes_primitive.as_slice::<S>();
434+
435+
(0..offset_len).for_each(|i| {
436+
let start = offsets_slice[i] as usize;
437+
let size = sizes_slice[i] as usize;
438+
buf.fill_range(start, start + size, true);
439+
});
440+
})
441+
});
442+
443+
Some(Mask::from_buffer(buf.freeze()))
444+
}
445+
446+
/// Exact fraction of `elements` referenced by some view, in `[0.0, 1.0]`. Extremely costly.
447+
///
448+
/// Returns `None` when `elements` is empty.
449+
fn compute_density(&self) -> Option<f32> {
450+
self.compute_referenced_elements_mask()
451+
.map(|mask| match mask {
452+
Mask::AllTrue(_) => 1.0,
453+
Mask::AllFalse(_) => 0.0,
454+
Mask::Values(values) => values.true_count() as f32 / self.elements().len() as f32,
455+
})
456+
}
457+
458+
/// Upper-bound estimate of [`compute_density`](Self::compute_density) via
459+
/// `sum(sizes) / elements.len()`, clamped to `[0.0, 1.0]`.
460+
///
461+
/// Exact for non-overlapping views, but overcounts when multiple views share the same elements.
462+
///
463+
/// Returns `Ok(None)` when `elements` is empty
464+
fn estimate_density(&self) -> VortexResult<Option<f32>> {
465+
let n_elts = self.elements().len();
466+
if n_elts == 0 {
467+
return Ok(None);
468+
}
469+
470+
let sizes = self.sizes();
471+
if sizes.is_empty() {
472+
return Ok(Some(0.0));
473+
}
474+
475+
// Try to fetch the cached sum stat, otherwise fall back to calculating it on the spot
476+
let sizes_sum = if let Some(Precision::Exact(scalar)) = sizes.statistics().get(Stat::Sum)
477+
&& let Some(sum) = scalar.as_primitive().as_::<u64>()
478+
{
479+
sum
480+
} else {
481+
sum(sizes, &mut LEGACY_SESSION.create_execution_ctx())?
482+
.as_primitive()
483+
.as_::<u64>()
484+
.unwrap()
485+
};
486+
487+
let estimate = (sizes_sum as f32 / n_elts as f32).clamp(0.0, 1.0);
488+
489+
Ok(Some(estimate))
490+
}
399491
}
400492
impl<T: TypedArrayRef<ListView>> ListViewArrayExt for T {}
401493

vortex-array/src/arrays/listview/rebuild.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use crate::aggregate_fn::fns::min_max::min_max;
1515
use crate::arrays::ConstantArray;
1616
use crate::arrays::ListViewArray;
1717
use crate::arrays::listview::ListViewArrayExt;
18+
use crate::arrays::listview::compute::REBUILD_DENSITY_THRESHOLD;
1819
use crate::builders::builder_with_capacity;
1920
use crate::builtins::ArrayBuiltins;
2021
use crate::dtype::DType;
@@ -376,6 +377,16 @@ impl ListViewArray {
376377
self.rebuild_zero_copy_to_list()
377378
}
378379
}
380+
381+
fn should_rebuild(&self, exact: bool) -> bool {
382+
let density = if exact {
383+
self.compute_density()
384+
} else {
385+
self.estimate_density().ok().flatten()
386+
};
387+
388+
density.unwrap_or(1.0) < REBUILD_DENSITY_THRESHOLD
389+
}
379390
}
380391

381392
#[cfg(test)]

0 commit comments

Comments
 (0)