Skip to content

Commit 3be9427

Browse files
authored
Add list_length scalar function (#8495)
Adds a `list_length` scalar function returning the number of elements in each list of a `List`-like array. - Computed purely from the list's offsets/sizes — it never reads elements. Different paths for `List`, `ListView`, and `FixedSizeList` arrays. - Returns a `U64` array; a null list yields a null length. - Registered as a built-in (`vortex.list.length`) alongside `list_contains`, and exposed via the `list_length(expr)` expression constructor. --------- Signed-off-by: Matt Katz <mhkatz97@gmail.com>
1 parent 3451cb0 commit 3be9427

6 files changed

Lines changed: 564 additions & 0 deletions

File tree

vortex-array/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,10 @@ harness = false
236236
name = "filter_bool"
237237
harness = false
238238

239+
[[bench]]
240+
name = "list_length"
241+
harness = false
242+
239243
[[bench]]
240244
name = "listview_rebuild"
241245
harness = false
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Benchmarks for the `list_length` scalar function over `List` and `ListView` inputs.
5+
//!
6+
//! `list_length` reads only the offsets/sizes (never the elements), so its cost scales with the
7+
//! number of lists.
8+
9+
#![expect(clippy::unwrap_used)]
10+
#![expect(clippy::cast_possible_truncation)]
11+
12+
use std::sync::LazyLock;
13+
14+
use divan::Bencher;
15+
use rand::RngExt;
16+
use rand::SeedableRng;
17+
use rand::distr::Uniform;
18+
use rand::rngs::StdRng;
19+
use vortex_array::ArrayRef;
20+
use vortex_array::Canonical;
21+
use vortex_array::IntoArray;
22+
use vortex_array::VortexSessionExecute;
23+
use vortex_array::arrays::BoolArray;
24+
use vortex_array::arrays::ListArray;
25+
use vortex_array::arrays::ListViewArray;
26+
use vortex_array::arrays::PrimitiveArray;
27+
use vortex_array::expr::list_length;
28+
use vortex_array::expr::root;
29+
use vortex_array::validity::Validity;
30+
use vortex_buffer::Buffer;
31+
use vortex_session::VortexSession;
32+
33+
fn main() {
34+
divan::main();
35+
}
36+
37+
static SESSION: LazyLock<VortexSession> = LazyLock::new(vortex_array::array_session);
38+
39+
const BASE_LIST_SIZE: usize = 8;
40+
41+
const SMALL: usize = 100;
42+
const MEDIUM: usize = 10_000;
43+
const LARGE: usize = 1_000_000;
44+
45+
/// A uniformly-random partition of `num_lists * BASE_LIST_SIZE` elements into `num_lists` lists,
46+
/// plus a validity mask with ~1/8 of lists null at random positions.
47+
fn random_lists(num_lists: usize) -> (Vec<i32>, Validity) {
48+
let mut rng = StdRng::seed_from_u64(num_lists as u64);
49+
let total = (num_lists * BASE_LIST_SIZE) as i32;
50+
51+
let cut_dist = Uniform::new_inclusive(0i32, total).unwrap();
52+
let mut cuts: Vec<i32> = (0..num_lists - 1).map(|_| rng.sample(cut_dist)).collect();
53+
cuts.sort_unstable();
54+
let mut sizes = Vec::with_capacity(num_lists);
55+
let mut prev = 0i32;
56+
for cut in cuts {
57+
sizes.push(cut - prev);
58+
prev = cut;
59+
}
60+
sizes.push(total - prev);
61+
62+
let null_dist = Uniform::new(0u32, 8).unwrap();
63+
let valid = (0..num_lists).map(|_| rng.sample(null_dist) != 0);
64+
(
65+
sizes,
66+
Validity::Array(BoolArray::from_iter(valid).into_array()),
67+
)
68+
}
69+
70+
/// A canonical `List<i32>` of `num_lists` variable-length lists, ~1/8 of them null.
71+
fn make_list(num_lists: usize) -> ArrayRef {
72+
let (sizes, validity) = random_lists(num_lists);
73+
let total: i32 = sizes.iter().sum();
74+
let elements = PrimitiveArray::from_iter(0..total).into_array();
75+
let offsets: Buffer<i32> = std::iter::once(0)
76+
.chain(sizes.iter().scan(0i32, |acc, &s| {
77+
*acc += s;
78+
Some(*acc)
79+
}))
80+
.collect();
81+
ListArray::try_new(elements, offsets.into_array(), validity)
82+
.unwrap()
83+
.into_array()
84+
}
85+
86+
/// A gapless `ListView<i32>` of `num_lists` variable-length lists, ~1/8 of them null.
87+
fn make_listview(num_lists: usize) -> ArrayRef {
88+
let (sizes, validity) = random_lists(num_lists);
89+
let total: i32 = sizes.iter().sum();
90+
let elements = PrimitiveArray::from_iter(0..total).into_array();
91+
let offsets: Buffer<i32> = sizes
92+
.iter()
93+
.scan(0i32, |acc, &s| {
94+
let start = *acc;
95+
*acc += s;
96+
Some(start)
97+
})
98+
.collect();
99+
let sizes: Buffer<i32> = sizes.into_iter().collect();
100+
ListViewArray::new(elements, offsets.into_array(), sizes.into_array(), validity).into_array()
101+
}
102+
103+
/// Apply `list_length(root())` and materialize the result.
104+
fn run(bencher: Bencher, array: ArrayRef) {
105+
let expr = list_length(root());
106+
bencher
107+
.with_inputs(|| (&array, SESSION.create_execution_ctx()))
108+
.bench_refs(|(array, ctx)| {
109+
array
110+
.clone()
111+
.apply(&expr)
112+
.unwrap()
113+
.execute::<Canonical>(ctx)
114+
.unwrap()
115+
});
116+
}
117+
118+
#[divan::bench]
119+
fn list_length_small(bencher: Bencher) {
120+
run(bencher, make_list(SMALL));
121+
}
122+
123+
#[divan::bench]
124+
fn list_length_medium(bencher: Bencher) {
125+
run(bencher, make_list(MEDIUM));
126+
}
127+
128+
#[divan::bench]
129+
fn list_length_large(bencher: Bencher) {
130+
run(bencher, make_list(LARGE));
131+
}
132+
133+
#[divan::bench]
134+
fn listview_length_small(bencher: Bencher) {
135+
run(bencher, make_listview(SMALL));
136+
}
137+
138+
#[divan::bench]
139+
fn listview_length_medium(bencher: Bencher) {
140+
run(bencher, make_listview(MEDIUM));
141+
}
142+
143+
#[divan::bench]
144+
fn listview_length_large(bencher: Bencher) {
145+
run(bencher, make_listview(LARGE));
146+
}

vortex-array/src/expr/exprs.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use crate::scalar_fn::fns::is_null::IsNull;
3737
use crate::scalar_fn::fns::like::Like;
3838
use crate::scalar_fn::fns::like::LikeOptions;
3939
use crate::scalar_fn::fns::list_contains::ListContains;
40+
use crate::scalar_fn::fns::list_length::ListLength;
4041
use crate::scalar_fn::fns::literal::Literal;
4142
use crate::scalar_fn::fns::mask::Mask;
4243
use crate::scalar_fn::fns::merge::DuplicateHandling;
@@ -750,3 +751,17 @@ pub fn byte_length(input: Expression) -> Expression {
750751
pub fn ext_storage(input: Expression) -> Expression {
751752
ExtStorage.new_expr(EmptyOptions, [input])
752753
}
754+
755+
// ---- ListLength ----
756+
757+
/// Creates an expression that computes the number of elements in each list
758+
/// for `List` and `FixedSizeList` inputs. This is akin to ANSI SQL `CARDINALITY()`,
759+
/// or DuckDB's `len()`/`array_length()`.
760+
///
761+
/// ```rust
762+
/// # use vortex_array::expr::{list_length, root};
763+
/// let expr = list_length(root());
764+
/// ```
765+
pub fn list_length(input: Expression) -> Expression {
766+
ListLength.new_expr(EmptyOptions, [input])
767+
}

0 commit comments

Comments
 (0)