Skip to content

Commit 89cffea

Browse files
authored
Pruning expressions can reference row count (#7589)
This lets us effectively prune expressions like IsNotNull. Fixes #7187. Signed-off-by: Robert Kruszewski <github@robertk.io>
1 parent 1689d7a commit 89cffea

20 files changed

Lines changed: 588 additions & 209 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

java/testfiles/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rust-toolchain.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[toolchain]
22
channel = "1.91.0"
33
components = ["rust-src", "rustfmt", "clippy", "rust-analyzer"]
4-
profile = "minimal"
4+
profile = "minimal"

vortex-array/public-api.lock

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17532,6 +17532,58 @@ pub trait vortex_array::scalar_fn::fns::zip::ZipReduce: vortex_array::VTable
1753217532

1753317533
pub fn vortex_array::scalar_fn::fns::zip::ZipReduce::zip(array: vortex_array::ArrayView<'_, Self>, if_false: &vortex_array::ArrayRef, mask: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
1753417534

17535+
pub mod vortex_array::scalar_fn::internal
17536+
17537+
pub mod vortex_array::scalar_fn::internal::row_count
17538+
17539+
pub struct vortex_array::scalar_fn::internal::row_count::RowCount
17540+
17541+
impl core::clone::Clone for vortex_array::scalar_fn::internal::row_count::RowCount
17542+
17543+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::clone(&self) -> vortex_array::scalar_fn::internal::row_count::RowCount
17544+
17545+
impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::internal::row_count::RowCount
17546+
17547+
pub type vortex_array::scalar_fn::internal::row_count::RowCount::Options = vortex_array::scalar_fn::EmptyOptions
17548+
17549+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::arity(&self, _options: &Self::Options) -> vortex_array::scalar_fn::Arity
17550+
17551+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::child_name(&self, _options: &Self::Options, _child_idx: usize) -> vortex_array::scalar_fn::ChildName
17552+
17553+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::coerce_args(&self, options: &Self::Options, args: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<alloc::vec::Vec<vortex_array::dtype::DType>>
17554+
17555+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::deserialize(&self, _metadata: &[u8], _session: &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
17556+
17557+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::execute(&self, _options: &Self::Options, _args: &dyn vortex_array::scalar_fn::ExecutionArgs, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::ArrayRef>
17558+
17559+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::fmt_sql(&self, _options: &Self::Options, _expr: &vortex_array::expr::Expression, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result
17560+
17561+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::id(&self) -> vortex_array::scalar_fn::ScalarFnId
17562+
17563+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::is_fallible(&self, _options: &Self::Options) -> bool
17564+
17565+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::is_null_sensitive(&self, _options: &Self::Options) -> bool
17566+
17567+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::reduce(&self, options: &Self::Options, node: &dyn vortex_array::scalar_fn::ReduceNode, ctx: &dyn vortex_array::scalar_fn::ReduceCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::scalar_fn::ReduceNodeRef>>
17568+
17569+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::return_dtype(&self, _options: &Self::Options, _args: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
17570+
17571+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::serialize(&self, options: &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
17572+
17573+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::simplify(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, ctx: &dyn vortex_array::scalar_fn::SimplifyCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
17574+
17575+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::simplify_untyped(&self, options: &Self::Options, expr: &vortex_array::expr::Expression) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
17576+
17577+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::stat_expression(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, stat: vortex_array::expr::stats::Stat, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option<vortex_array::expr::Expression>
17578+
17579+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::stat_falsification(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option<vortex_array::expr::Expression>
17580+
17581+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
17582+
17583+
pub fn vortex_array::scalar_fn::internal::row_count::contains_row_count(array: &vortex_array::ArrayRef) -> bool
17584+
17585+
pub fn vortex_array::scalar_fn::internal::row_count::substitute_row_count(array: vortex_array::ArrayRef, replacement: &vortex_array::ArrayRef) -> vortex_error::VortexResult<vortex_array::ArrayRef>
17586+
1753517587
pub mod vortex_array::scalar_fn::session
1753617588

1753717589
pub struct vortex_array::scalar_fn::session::ScalarFnSession
@@ -18680,6 +18732,44 @@ pub fn vortex_array::scalar_fn::fns::zip::Zip::stat_falsification(&self, options
1868018732

1868118733
pub fn vortex_array::scalar_fn::fns::zip::Zip::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
1868218734

18735+
impl vortex_array::scalar_fn::ScalarFnVTable for vortex_array::scalar_fn::internal::row_count::RowCount
18736+
18737+
pub type vortex_array::scalar_fn::internal::row_count::RowCount::Options = vortex_array::scalar_fn::EmptyOptions
18738+
18739+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::arity(&self, _options: &Self::Options) -> vortex_array::scalar_fn::Arity
18740+
18741+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::child_name(&self, _options: &Self::Options, _child_idx: usize) -> vortex_array::scalar_fn::ChildName
18742+
18743+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::coerce_args(&self, options: &Self::Options, args: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<alloc::vec::Vec<vortex_array::dtype::DType>>
18744+
18745+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::deserialize(&self, _metadata: &[u8], _session: &vortex_session::VortexSession) -> vortex_error::VortexResult<Self::Options>
18746+
18747+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::execute(&self, _options: &Self::Options, _args: &dyn vortex_array::scalar_fn::ExecutionArgs, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::ArrayRef>
18748+
18749+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::fmt_sql(&self, _options: &Self::Options, _expr: &vortex_array::expr::Expression, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result
18750+
18751+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::id(&self) -> vortex_array::scalar_fn::ScalarFnId
18752+
18753+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::is_fallible(&self, _options: &Self::Options) -> bool
18754+
18755+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::is_null_sensitive(&self, _options: &Self::Options) -> bool
18756+
18757+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::reduce(&self, options: &Self::Options, node: &dyn vortex_array::scalar_fn::ReduceNode, ctx: &dyn vortex_array::scalar_fn::ReduceCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::scalar_fn::ReduceNodeRef>>
18758+
18759+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::return_dtype(&self, _options: &Self::Options, _args: &[vortex_array::dtype::DType]) -> vortex_error::VortexResult<vortex_array::dtype::DType>
18760+
18761+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::serialize(&self, options: &Self::Options) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
18762+
18763+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::simplify(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, ctx: &dyn vortex_array::scalar_fn::SimplifyCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
18764+
18765+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::simplify_untyped(&self, options: &Self::Options, expr: &vortex_array::expr::Expression) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
18766+
18767+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::stat_expression(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, stat: vortex_array::expr::stats::Stat, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option<vortex_array::expr::Expression>
18768+
18769+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::stat_falsification(&self, options: &Self::Options, expr: &vortex_array::expr::Expression, catalog: &dyn vortex_array::expr::pruning::StatsCatalog) -> core::option::Option<vortex_array::expr::Expression>
18770+
18771+
pub fn vortex_array::scalar_fn::internal::row_count::RowCount::validity(&self, options: &Self::Options, expression: &vortex_array::expr::Expression) -> vortex_error::VortexResult<core::option::Option<vortex_array::expr::Expression>>
18772+
1868318773
pub trait vortex_array::scalar_fn::ScalarFnVTableExt: vortex_array::scalar_fn::ScalarFnVTable
1868418774

1868518775
pub fn vortex_array::scalar_fn::ScalarFnVTableExt::bind(&self, options: Self::Options) -> vortex_array::scalar_fn::ScalarFnRef

vortex-array/src/expr/pruning/pruning_expr.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ pub fn field_path_stat_field_name(field_path: &FieldPath, stat: Stat) -> FieldNa
8686
/// cannot hold, and false if it cannot be determined from stats alone whether the positions can
8787
/// be pruned.
8888
///
89+
/// Some rewrites, such as `is_not_null(...)`, emit
90+
/// [`row_count`][crate::scalar_fn::internal::row_count] placeholders. The evaluation layer must
91+
/// replace those placeholders with the row count for its current scope before
92+
/// executing the returned expression.
93+
///
8994
/// If the falsification logic attempts to access an unknown stat,
9095
/// this function will return `None`.
9196
pub fn checked_pruning_expr(

vortex-array/src/scalar_fn/fns/is_not_null.rs

Lines changed: 24 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
use std::fmt::Formatter;
55

6+
use vortex_array::scalar_fn::internal::row_count::RowCount;
67
use vortex_error::VortexResult;
78
use vortex_session::VortexSession;
89

@@ -14,17 +15,15 @@ use crate::dtype::DType;
1415
use crate::dtype::Nullability;
1516
use crate::expr::Expression;
1617
use crate::expr::StatsCatalog;
17-
use crate::expr::and;
1818
use crate::expr::eq;
19-
use crate::expr::gt;
20-
use crate::expr::lit;
2119
use crate::expr::stats::Stat;
2220
use crate::scalar_fn::Arity;
2321
use crate::scalar_fn::ChildName;
2422
use crate::scalar_fn::EmptyOptions;
2523
use crate::scalar_fn::ExecutionArgs;
2624
use crate::scalar_fn::ScalarFnId;
2725
use crate::scalar_fn::ScalarFnVTable;
26+
use crate::scalar_fn::ScalarFnVTableExt;
2827
use crate::validity::Validity;
2928

3029
/// Expression that checks for non-null values.
@@ -106,40 +105,42 @@ impl ScalarFnVTable for IsNotNull {
106105
expr: &Expression,
107106
catalog: &dyn StatsCatalog,
108107
) -> Option<Expression> {
109-
// is_not_null is falsified when ALL values are null, i.e. null_count == len.
110-
// Since there is no len stat in the zone map, we approximate using IsConstant:
111-
// if the zone is constant and has any nulls, then all values must be null.
112-
//
113-
// TODO(#7187): Add a len stat to enable the more general falsification:
114-
// null_count == len => is_not_null is all false.
115-
let null_count_expr = expr.child(0).stat_expression(Stat::NullCount, catalog)?;
116-
let is_constant_expr = expr.child(0).stat_expression(Stat::IsConstant, catalog)?;
117-
// If the zone is constant (is_constant == true) and has nulls (null_count > 0),
118-
// then all values must be null, so is_not_null is all false.
119-
Some(and(
120-
eq(is_constant_expr, lit(true)),
121-
gt(null_count_expr, lit(0u64)),
122-
))
108+
// is_not_null is falsified when ALL values are null, i.e. null_count == row_count.
109+
let child = expr.child(0);
110+
let null_count_expr = child.stat_expression(Stat::NullCount, catalog)?;
111+
Some(eq(null_count_expr, RowCount.new_expr(EmptyOptions, [])))
123112
}
124113
}
125114

126115
#[cfg(test)]
127116
mod tests {
128117
use vortex_buffer::buffer;
129118
use vortex_error::VortexExpect as _;
119+
use vortex_utils::aliases::hash_map::HashMap;
120+
use vortex_utils::aliases::hash_set::HashSet;
130121

131122
use crate::IntoArray;
132123
use crate::LEGACY_SESSION;
133124
use crate::VortexSessionExecute;
134125
use crate::arrays::PrimitiveArray;
135126
use crate::arrays::StructArray;
136127
use crate::dtype::DType;
128+
use crate::dtype::Field;
129+
use crate::dtype::FieldPath;
130+
use crate::dtype::FieldPathSet;
137131
use crate::dtype::Nullability;
132+
use crate::expr::col;
133+
use crate::expr::eq;
138134
use crate::expr::get_item;
139135
use crate::expr::is_not_null;
136+
use crate::expr::pruning::checked_pruning_expr;
140137
use crate::expr::root;
138+
use crate::expr::stats::Stat;
141139
use crate::expr::test_harness;
142140
use crate::scalar::Scalar;
141+
use crate::scalar_fn::EmptyOptions;
142+
use crate::scalar_fn::internal::row_count::RowCount;
143+
use crate::scalar_fn::vtable::ScalarFnVTableExt;
143144

144145
#[test]
145146
fn dtype() {
@@ -255,50 +256,29 @@ mod tests {
255256

256257
#[test]
257258
fn test_is_not_null_sensitive() {
258-
use crate::expr::col;
259259
assert!(is_not_null(col("a")).signature().is_null_sensitive());
260260
}
261261

262262
#[test]
263263
fn test_is_not_null_falsification() {
264-
use vortex_utils::aliases::hash_map::HashMap;
265-
use vortex_utils::aliases::hash_set::HashSet;
266-
267-
use crate::dtype::Field;
268-
use crate::dtype::FieldPath;
269-
use crate::dtype::FieldPathSet;
270-
use crate::expr::and;
271-
use crate::expr::col;
272-
use crate::expr::eq;
273-
use crate::expr::gt;
274-
use crate::expr::lit;
275-
use crate::expr::pruning::checked_pruning_expr;
276-
use crate::expr::stats::Stat;
277-
278264
let expr = is_not_null(col("a"));
279265

280266
let (pruning_expr, st) = checked_pruning_expr(
281267
&expr,
282-
&FieldPathSet::from_iter([
283-
FieldPath::from_iter([Field::Name("a".into()), Field::Name("null_count".into())]),
284-
FieldPath::from_iter([Field::Name("a".into()), Field::Name("is_constant".into())]),
285-
]),
268+
&FieldPathSet::from_iter([FieldPath::from_iter([
269+
Field::Name("a".into()),
270+
Field::Name("null_count".into()),
271+
])]),
286272
)
287273
.unwrap();
288274

289275
assert_eq!(
290276
&pruning_expr,
291-
&and(
292-
eq(col("a_is_constant"), lit(true)),
293-
gt(col("a_null_count"), lit(0u64)),
294-
)
277+
&eq(col("a_null_count"), RowCount.new_expr(EmptyOptions, []))
295278
);
296279
assert_eq!(
297280
st.map(),
298-
&HashMap::from_iter([(
299-
FieldPath::from_name("a"),
300-
HashSet::from([Stat::NullCount, Stat::IsConstant])
301-
)])
281+
&HashMap::from_iter([(FieldPath::from_name("a"), HashSet::from([Stat::NullCount]))])
302282
);
303283
}
304284
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
pub mod row_count;

0 commit comments

Comments
 (0)