Skip to content

Commit 8b46e1e

Browse files
committed
Restore Parquet pruning for ANY statements
1 parent 8741a77 commit 8b46e1e

4 files changed

Lines changed: 64 additions & 74 deletions

File tree

datafusion/sql/src/expr/mod.rs

Lines changed: 40 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,12 @@ use datafusion_expr::expr::SetQuantifier;
3636
use datafusion_expr::expr::{InList, WildcardOptions};
3737
use datafusion_expr::{
3838
Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal,
39-
Operator, TryCast, lit, when,
39+
Operator, TryCast, lit,
4040
};
4141

4242
use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
4343
use datafusion_functions_nested::expr_fn::{
44-
array_has, array_max, array_min, array_position, cardinality,
44+
array_has, array_max, array_min, cardinality,
4545
};
4646

4747
mod binary_op;
@@ -1259,64 +1259,59 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
12591259
}
12601260
}
12611261

1262-
/// Plans `needle <compare_op> ANY/ALL(haystack)` with proper SQL NULL semantics.
1263-
///
1264-
/// CASE/WHEN structure:
1265-
/// WHEN arr IS NULL → NULL
1266-
/// WHEN empty → vacuous_result (ANY:false, ALL:true)
1267-
/// WHEN lhs IS NULL → NULL
1268-
/// WHEN decisive_condition → decisive_result (ANY:true match found, ALL:false violation found)
1269-
/// WHEN has_nulls → NULL
1270-
/// ELSE → vacuous_result
1262+
/// Plans `needle <op> ANY/ALL(haystack)` by desugaring to `array_has`,
1263+
/// `array_min`, or `array_max`. Rewrites that use min/max get a cardinality guard
1264+
/// so empty arrays return the vacuous result (ANY → false, ALL → true) instead
1265+
/// of NULL.
12711266
fn plan_quantified_op(
12721267
needle: &Expr,
12731268
haystack: &Expr,
12741269
compare_op: &BinaryOperator,
12751270
quantifier: SetQuantifier,
12761271
) -> Result<Expr> {
1277-
let null_arr_check = haystack.clone().is_null();
1278-
let empty_check = cardinality(haystack.clone()).eq(lit(0u64));
1279-
let null_lhs_check = needle.clone().is_null();
1280-
// DataFusion's array_position uses is_null() checks internally (not equality),
1281-
// so it can locate NULL elements even though NULL = NULL is NULL in standard SQL.
1282-
let has_nulls =
1283-
array_position(haystack.clone(), lit(ScalarValue::Null), lit(1i64)).is_not_null();
1284-
1285-
let decisive_condition = match (compare_op, quantifier) {
1286-
(BinaryOperator::Eq, SetQuantifier::Any)
1287-
| (BinaryOperator::NotEq, SetQuantifier::All) => {
1288-
array_has(haystack.clone(), needle.clone())
1272+
let (cmp, needs_empty_guard) = match (compare_op, quantifier) {
1273+
(BinaryOperator::Eq, SetQuantifier::Any) => {
1274+
(array_has(haystack.clone(), needle.clone()), false)
12891275
}
1290-
(BinaryOperator::Eq, SetQuantifier::All)
1291-
| (BinaryOperator::NotEq, SetQuantifier::Any) => {
1292-
let all_equal = array_min(haystack.clone())
1276+
(BinaryOperator::NotEq, SetQuantifier::All) => (
1277+
Expr::Not(Box::new(array_has(haystack.clone(), needle.clone()))),
1278+
false,
1279+
),
1280+
(BinaryOperator::Eq, SetQuantifier::All) => (
1281+
array_min(haystack.clone())
12931282
.eq(needle.clone())
1294-
.and(array_max(haystack.clone()).eq(needle.clone()));
1295-
Expr::Not(Box::new(all_equal))
1296-
}
1283+
.and(array_max(haystack.clone()).eq(needle.clone())),
1284+
true,
1285+
),
1286+
(BinaryOperator::NotEq, SetQuantifier::Any) => (
1287+
array_min(haystack.clone())
1288+
.not_eq(needle.clone())
1289+
.or(array_max(haystack.clone()).not_eq(needle.clone())),
1290+
true,
1291+
),
12971292
(BinaryOperator::Gt, SetQuantifier::Any) => {
1298-
needle.clone().gt(array_min(haystack.clone()))
1293+
(needle.clone().gt(array_min(haystack.clone())), true)
12991294
}
13001295
(BinaryOperator::Gt, SetQuantifier::All) => {
1301-
Expr::Not(Box::new(needle.clone().gt(array_max(haystack.clone()))))
1296+
(needle.clone().gt(array_max(haystack.clone())), true)
13021297
}
13031298
(BinaryOperator::Lt, SetQuantifier::Any) => {
1304-
needle.clone().lt(array_max(haystack.clone()))
1299+
(needle.clone().lt(array_max(haystack.clone())), true)
13051300
}
13061301
(BinaryOperator::Lt, SetQuantifier::All) => {
1307-
Expr::Not(Box::new(needle.clone().lt(array_min(haystack.clone()))))
1302+
(needle.clone().lt(array_min(haystack.clone())), true)
13081303
}
13091304
(BinaryOperator::GtEq, SetQuantifier::Any) => {
1310-
needle.clone().gt_eq(array_min(haystack.clone()))
1305+
(needle.clone().gt_eq(array_min(haystack.clone())), true)
13111306
}
13121307
(BinaryOperator::GtEq, SetQuantifier::All) => {
1313-
Expr::Not(Box::new(needle.clone().gt_eq(array_max(haystack.clone()))))
1308+
(needle.clone().gt_eq(array_max(haystack.clone())), true)
13141309
}
13151310
(BinaryOperator::LtEq, SetQuantifier::Any) => {
1316-
needle.clone().lt_eq(array_max(haystack.clone()))
1311+
(needle.clone().lt_eq(array_max(haystack.clone())), true)
13171312
}
13181313
(BinaryOperator::LtEq, SetQuantifier::All) => {
1319-
Expr::Not(Box::new(needle.clone().lt_eq(array_min(haystack.clone()))))
1314+
(needle.clone().lt_eq(array_min(haystack.clone())), true)
13201315
}
13211316
_ => {
13221317
return plan_err!(
@@ -1325,18 +1320,15 @@ fn plan_quantified_op(
13251320
}
13261321
};
13271322

1328-
let (vacuous_result, decisive_result) = match quantifier {
1329-
SetQuantifier::Any => (false, true),
1330-
SetQuantifier::All => (true, false),
1323+
let expr = if needs_empty_guard {
1324+
match quantifier {
1325+
SetQuantifier::Any => cardinality(haystack.clone()).gt(lit(0u64)).and(cmp),
1326+
SetQuantifier::All => cardinality(haystack.clone()).eq(lit(0u64)).or(cmp),
1327+
}
1328+
} else {
1329+
cmp
13311330
};
1332-
1333-
let null_bool = lit(ScalarValue::Boolean(None));
1334-
when(null_arr_check, null_bool.clone())
1335-
.when(empty_check, lit(vacuous_result))
1336-
.when(null_lhs_check, null_bool.clone())
1337-
.when(decisive_condition, lit(decisive_result))
1338-
.when(has_nulls, null_bool)
1339-
.otherwise(lit(vacuous_result))
1331+
Ok(expr)
13401332
}
13411333

13421334
#[cfg(test)]

datafusion/sql/tests/cases/plan_to_sql.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ fn roundtrip_statement_postgres_any_array_expr() -> Result<(), DataFusionError>
369369
sql: "select left from array where 1 = any(left);",
370370
parser_dialect: GenericDialect {},
371371
unparser_dialect: UnparserPostgreSqlDialect {},
372-
expected: @r#"SELECT "array"."left" FROM "array" WHERE CASE WHEN "array"."left" IS NULL THEN NULL WHEN (cardinality("array"."left") = 0) THEN false WHEN 1 IS NULL THEN NULL WHEN 1 = ANY("array"."left") THEN true WHEN array_position("array"."left", NULL, 1) IS NOT NULL THEN NULL ELSE false END"#,
372+
expected: @r#"SELECT "array"."left" FROM "array" WHERE 1 = ANY("array"."left")"#,
373373
);
374374
Ok(())
375375
}

datafusion/sqllogictest/test_files/array/array_all.slt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ NULL
145145
query B
146146
select 5 <> ALL(make_array(NULL::INT, NULL::INT));
147147
----
148-
NULL
148+
true
149149

150150
query B
151151
select 5 > ALL(make_array(NULL::INT, NULL::INT));
@@ -171,22 +171,22 @@ NULL
171171
query B
172172
select 5 > ALL(make_array(3, NULL));
173173
----
174-
NULL
174+
true
175175

176176
query B
177177
select 5 >= ALL(make_array(5, NULL));
178178
----
179-
NULL
179+
true
180180

181181
query B
182182
select 1 < ALL(make_array(3, NULL));
183183
----
184-
NULL
184+
true
185185

186186
query B
187187
select 1 <= ALL(make_array(1, NULL));
188188
----
189-
NULL
189+
true
190190

191191
# Mixed NULL + non-NULL (not satisfying condition → FALSE wins over NULL)
192192
query B

datafusion/sqllogictest/test_files/array/array_has.slt

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -517,18 +517,16 @@ logical_plan
517517
03)----SubqueryAlias: test
518518
04)------SubqueryAlias: t
519519
05)--------Projection:
520-
06)----------Filter: __common_expr_3 IS NULL AND Boolean(NULL) OR __common_expr_3 IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) IS NOT DISTINCT FROM Boolean(true) AND __common_expr_3 IS NOT NULL
521-
07)------------Projection: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) AS __common_expr_3
522-
08)--------------TableScan: generate_series() projection=[value]
520+
06)----------Filter: substr(CAST(md5(CAST(generate_series().value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
521+
07)------------TableScan: generate_series() projection=[value]
523522
physical_plan
524523
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
525524
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
526525
03)----CoalescePartitionsExec
527526
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
528-
05)--------FilterExec: __common_expr_3@0 IS NULL AND NULL OR __common_expr_3@0 IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]) IS NOT DISTINCT FROM true AND __common_expr_3@0 IS NOT NULL, projection=[]
529-
06)----------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8View)), 1, 32) as __common_expr_3]
530-
07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
531-
08)--------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
527+
05)--------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN (SET) ([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), projection=[]
528+
06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
529+
07)------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
532530

533531
query I
534532
with test AS (SELECT substr(md5(i::text)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
@@ -724,7 +722,8 @@ select 5 <> any(make_array(5, 5, 5));
724722
----
725723
false
726724

727-
# Empty array: all operators should return false (no elements satisfy the condition)
725+
# Empty array: vacuously false. min/max-based rewrites are guarded by a cardinality check
726+
# so `value > any([])` and friends don't leak NULL.
728727
query B
729728
select 5 = any(make_array());
730729
----
@@ -755,27 +754,26 @@ select 5 <= any(make_array());
755754
----
756755
false
757756

758-
# Mixed NULL + non-NULL array where no non-NULL element satisfies the condition
759-
# These return NULL because NULLs leave the result indeterminate
757+
# Mixed NULL + non-NULL, no non-NULL element satisfies: false (min/max skip NULLs).
760758
query B
761759
select 5 > any(make_array(6, NULL));
762760
----
763-
NULL
761+
false
764762

765763
query B
766764
select 5 < any(make_array(3, NULL));
767765
----
768-
NULL
766+
false
769767

770768
query B
771769
select 5 >= any(make_array(6, NULL));
772770
----
773-
NULL
771+
false
774772

775773
query B
776774
select 5 <= any(make_array(3, NULL));
777775
----
778-
NULL
776+
false
779777

780778
# Mixed NULL + non-NULL array where a non-NULL element satisfies the condition
781779
query B
@@ -806,9 +804,9 @@ true
806804
query B
807805
select 5 <> any(make_array(5, NULL));
808806
----
809-
NULL
807+
false
810808

811-
# All-NULL array: all operators should return NULL (unknown comparison)
809+
# All-NULL array: NULL from min/max-based desugars; `= ANY` is false via array_has.
812810
query B
813811
select 5 > any(make_array(NULL::INT, NULL::INT));
814812
----
@@ -837,7 +835,7 @@ NULL
837835
query B
838836
select 5 = any(make_array(NULL::INT, NULL::INT));
839837
----
840-
NULL
838+
false
841839

842840
# NULL left operand: should return NULL for non-empty arrays
843841
query B
@@ -865,7 +863,7 @@ select NULL <> any(make_array(1, 2, 3));
865863
----
866864
NULL
867865

868-
# NULL left operand with empty array: should return false
866+
# NULL left operand + empty array: vacuous false (cardinality guard short-circuits).
869867
query B
870868
select NULL > any(make_array());
871869
----
@@ -920,11 +918,11 @@ select 5 = any(make_array(5, NULL));
920918
----
921919
true
922920

923-
# = ANY with mixed NULL (non-satisfying): NULLs leave result indeterminate
921+
# = ANY with mixed NULL, no match: false (array_has treats NULL as absent).
924922
query B
925923
select 5 = any(make_array(1, 2, NULL));
926924
----
927-
NULL
925+
false
928926

929927
statement ok
930928
DROP TABLE any_op_test;

0 commit comments

Comments
 (0)