From 0fd7fd5dbc12001734afd35de2e46fe8e771bb5c Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 21 May 2026 18:52:54 -0400 Subject: [PATCH 1/2] fix: tighten and clean up null-rejection analysis in EliminateOuterJoin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The null-rejection analysis used by `EliminateOuterJoin` to decide whether an outer join can be tightened had a soundness gap: `IS TRUE` / `IS FALSE` / `IS NOT UNKNOWN` recursed into their operand at any depth. These predicates return a definite FALSE on NULL input (never NULL), so they're null-rejecting at the top of a WHERE clause but not under a surrounding `NOT` (or any context that resets `top_level`), where the FALSE inverts to TRUE and accepts NULL rows. The existing `IS NOT NULL` arm already gated on `top_level`; the three others now share the same gate. While here, clean up several other rough edges: * Defer to `Operator::returns_null_on_null()` instead of maintaining a hand-rolled list of null-propagating binary operators in `extract_non_nullable_columns`. The hand list previously only covered the six comparison operators; the upstream method is the single source of truth and also picks up arithmetic / bitwise / regex / SIMILAR TO / array-containment operators that were missed. * Rename `extract_non_nullable_columns` to `extract_null_rejecting_columns`. The returned columns are columns the predicate rejects NULL on (a predicate property), not columns that are themselves non-nullable (a schema property). The two are distinct. * Rewrite the OR / nested-AND merge logic from an O(n*m) double-loop with break into bucketed per-side lookups. * Handle `Expr::Negative` alongside `Expr::Not` (both propagate NULL on their operand). * Unify `IsNotNull` / `IsTrue` / `IsFalse` / `IsNotUnknown` into a single match arm now that they share the same `top_level` gate. * Rewrite the function-level doc and inline comments. Tests: * 7 new unit tests covering the soundness fix (three `NOT(... IS TRUE/FALSE/NOT UNKNOWN)` regressions), the IS DISTINCT FROM / IS NOT DISTINCT FROM negative cases, and positive coverage for the arithmetic / unary-minus shapes that the operator-list extension enables. * 1 new sqllogictest case asserting both the logical plan (`Left Join` preserved) and the row set (LEFT-padded NULL rows survive) for `WHERE NOT((t2.y > 150) IS TRUE)` — the row-set assertion is what catches a future regression that an explain-only test would let through under snapshot updates. --- .../optimizer/src/eliminate_outer_join.rs | 401 +++++++++++++----- .../test_files/eliminate_outer_join.slt | 19 + 2 files changed, 318 insertions(+), 102 deletions(-) diff --git a/datafusion/optimizer/src/eliminate_outer_join.rs b/datafusion/optimizer/src/eliminate_outer_join.rs index cd060469b2990..748b04d5cf718 100644 --- a/datafusion/optimizer/src/eliminate_outer_join.rs +++ b/datafusion/optimizer/src/eliminate_outer_join.rs @@ -80,11 +80,11 @@ impl OptimizerRule for EliminateOuterJoin { match plan { LogicalPlan::Filter(mut filter) => match Arc::unwrap_or_clone(filter.input) { LogicalPlan::Join(join) => { - let mut non_nullable_cols: Vec = vec![]; + let mut null_rejecting_cols: Vec = vec![]; - extract_non_nullable_columns( + extract_null_rejecting_columns( &filter.predicate, - &mut non_nullable_cols, + &mut null_rejecting_cols, join.left.schema(), join.right.schema(), true, @@ -93,7 +93,7 @@ impl OptimizerRule for EliminateOuterJoin { let new_join_type = if join.join_type.is_outer() { let mut left_non_nullable = false; let mut right_non_nullable = false; - for col in non_nullable_cols.iter() { + for col in null_rejecting_cols.iter() { if join.left.schema().has_column(col) { left_non_nullable = true; } @@ -163,62 +163,49 @@ pub fn eliminate_outer( new_join_type } -/// Recursively traverses expr, if expr returns false when -/// any inputs are null, treats columns of both sides as non_nullable columns. +/// Find the columns that `expr` rejects NULL on. If any of these columns are +/// NULL, `expr` is guaranteed to evaluate to NULL or false, and the row +/// therefore cannot survive a WHERE clause. Matching columns are appended to +/// `null_rejecting_cols`. /// -/// For and/or expr, extracts from all sub exprs and merges the columns. -/// For or expr, if one of sub exprs returns true, discards all columns from or expr. -/// For IS NOT NULL/NOT expr, always returns false for NULL input. -/// extracts columns from these exprs. -/// For all other exprs, fall through -fn extract_non_nullable_columns( +/// The caller uses the result to decide whether an outer join's null-padded +/// rows could survive the predicate above the join: if a column from the +/// nullable side appears in `null_rejecting_cols`, it cannot, and the outer +/// join can be converted to an inner join. +/// +/// `left_schema` and `right_schema` are the join's two child schemas. +/// `top_level` is true at the root of the WHERE predicate and false on each +/// recursion. +fn extract_null_rejecting_columns( expr: &Expr, - non_nullable_cols: &mut Vec, + null_rejecting_cols: &mut Vec, left_schema: &Arc, right_schema: &Arc, top_level: bool, ) { match expr { Expr::Column(col) => { - non_nullable_cols.push(col.clone()); + null_rejecting_cols.push(col.clone()); } Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op { - // If one of the inputs are null for these operators, the results should be false. - Operator::Eq - | Operator::NotEq - | Operator::Lt - | Operator::LtEq - | Operator::Gt - | Operator::GtEq => { - extract_non_nullable_columns( - left, - non_nullable_cols, - left_schema, - right_schema, - false, - ); - extract_non_nullable_columns( - right, - non_nullable_cols, - left_schema, - right_schema, - false, - ) - } Operator::And | Operator::Or => { - // treat And as Or if does not from top level, such as - // not (c1 < 10 and c2 > 100) + // AND distributes only down a top-level AND chain in the WHERE + // clause: each conjunct is independently null- rejecting, so + // any column either side discovers is a column the WHERE + // rejects NULL on. Once an AND appears below any other context, + // we fall back to the per-side analysis used for OR, because + // the context might influence whether the row is filtered. if top_level && *op == Operator::And { - extract_non_nullable_columns( + extract_null_rejecting_columns( left, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, top_level, ); - extract_non_nullable_columns( + extract_null_rejecting_columns( right, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, top_level, @@ -226,124 +213,139 @@ fn extract_non_nullable_columns( return; } - let mut left_non_nullable_cols: Vec = vec![]; - let mut right_non_nullable_cols: Vec = vec![]; - - extract_non_nullable_columns( + // OR (and nested AND): a row survives if EITHER operand returns + // true. We can credit a join side as null-rejecting only when + // BOTH operands independently reject NULL on a column from that + // side — otherwise the other branch could let the NULL row + // through. + let mut left_cols: Vec = vec![]; + let mut right_cols: Vec = vec![]; + extract_null_rejecting_columns( left, - &mut left_non_nullable_cols, + &mut left_cols, left_schema, right_schema, top_level, ); - extract_non_nullable_columns( + extract_null_rejecting_columns( right, - &mut right_non_nullable_cols, + &mut right_cols, left_schema, right_schema, top_level, ); - // for query: select *** from a left join b where b.c1 ... or b.c2 ... - // this can be eliminated to inner join. - // for query: select *** from a left join b where a.c1 ... or b.c2 ... - // this can not be eliminated. - // If columns of relation exist in both sub exprs, any columns of this relation - // can be added to non nullable columns. - if !left_non_nullable_cols.is_empty() - && !right_non_nullable_cols.is_empty() - { - for left_col in &left_non_nullable_cols { - for right_col in &right_non_nullable_cols { - if (left_schema.has_column(left_col) - && left_schema.has_column(right_col)) - || (right_schema.has_column(left_col) - && right_schema.has_column(right_col)) - { - non_nullable_cols.push(left_col.clone()); - break; - } - } + let find_on = |cols: &[Column], schema: &DFSchema| { + cols.iter().find(|c| schema.has_column(c)).cloned() + }; + for schema in [left_schema, right_schema] { + if let (Some(c), Some(_)) = + (find_on(&left_cols, schema), find_on(&right_cols, schema)) + { + null_rejecting_cols.push(c); } } } + // Any other operator that DataFusion declares as NULL-on-NULL: + // recurse into both operands so we collect their columns. + op if op.returns_null_on_null() => { + extract_null_rejecting_columns( + left, + null_rejecting_cols, + left_schema, + right_schema, + false, + ); + extract_null_rejecting_columns( + right, + null_rejecting_cols, + left_schema, + right_schema, + false, + ) + } + // All other operators (notably including IS [ NOT ] DISTINCT FROM) + // are declared as not null-propagating, so they don't contribute + // any null-rejecting columns. _ => {} }, - Expr::Not(arg) => extract_non_nullable_columns( + Expr::Not(arg) | Expr::Negative(arg) => extract_null_rejecting_columns( arg, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ), - Expr::IsNotNull(arg) => { + // IS NOT NULL / IS TRUE / IS FALSE / IS NOT UNKNOWN all return FALSE on + // NULL input. At the top of a WHERE clause, that FALSE filters the row + // and so we can recurse; below the top level the surrounding context + // may transform that FALSE into something that accepts NULL rows, + // making the recursion unsound. + Expr::IsNotNull(arg) + | Expr::IsTrue(arg) + | Expr::IsFalse(arg) + | Expr::IsNotUnknown(arg) => { if !top_level { return; } - extract_non_nullable_columns( + extract_null_rejecting_columns( arg, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ) } Expr::Cast(Cast { expr, field: _ }) - | Expr::TryCast(TryCast { expr, field: _ }) => extract_non_nullable_columns( + | Expr::TryCast(TryCast { expr, field: _ }) => extract_null_rejecting_columns( expr, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ), // IN list and BETWEEN are null-rejecting on the input expression: - // if the input column is NULL, the result is NULL (filtered out), - // regardless of whether the list/range contains NULLs. - Expr::InList(InList { expr, .. }) => extract_non_nullable_columns( + // NULL input yields a NULL result, regardless of whether the list + // or range bounds themselves contain NULLs. + Expr::InList(InList { expr, .. }) => extract_null_rejecting_columns( expr, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ), - Expr::Between(between) => extract_non_nullable_columns( + Expr::Between(between) => extract_null_rejecting_columns( &between.expr, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ), - // LIKE is null-rejecting: if either the input column or the pattern - // is NULL, the result is NULL (filtered out by WHERE). Expr::Like(Like { expr, pattern, .. }) => { - extract_non_nullable_columns( + extract_null_rejecting_columns( expr, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ); - extract_non_nullable_columns( + extract_null_rejecting_columns( pattern, - non_nullable_cols, + null_rejecting_cols, left_schema, right_schema, false, ); } - // IS TRUE, IS FALSE, and IS NOT UNKNOWN are null-rejecting: - // if the input is NULL, they return false (filtered out by WHERE). - // Note: IS NOT TRUE, IS NOT FALSE, and IS UNKNOWN are NOT null-rejecting - // because they return true for NULL input. - Expr::IsTrue(arg) | Expr::IsFalse(arg) | Expr::IsNotUnknown(arg) => { - extract_non_nullable_columns( - arg, - non_nullable_cols, - left_schema, - right_schema, - false, - ) - } + // Anything not handled above contributes no null-rejecting + // columns. Two categories worth calling out: + // - IS NULL, IS NOT TRUE, IS NOT FALSE, IS UNKNOWN — return + // TRUE on NULL input, so they actively *accept* NULL rows + // and are intentionally excluded. + // - Function calls (scalar / aggregate / window / UDF), + // scalar subqueries, struct/list accessors, aliases, + // literals, etc. — we don't have a uniform NULL-propagation + // guarantee for these cases, so we conservatively skip them. _ => {} } } @@ -360,7 +362,7 @@ mod tests { Operator::{And, Or}, binary_expr, cast, col, lit, logical_plan::builder::LogicalPlanBuilder, - try_cast, + not, try_cast, }; macro_rules! assert_optimized_plan_equal { @@ -896,6 +898,83 @@ mod tests { ") } + #[test] + fn no_eliminate_left_with_not_is_true() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // NOT( IS TRUE) is equivalent to ( IS NOT TRUE): TRUE when + // is FALSE OR NULL. So `WHERE NOT((t2.b > 5) IS TRUE)` accepts + // rows where t2.b is NULL (because t2.b > 5 is NULL → IS TRUE is + // FALSE → NOT FALSE = TRUE). The LEFT JOIN must NOT be converted. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter(not(col("t2.b").gt(lit(5u32)).is_true()))? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: NOT t2.b > UInt32(5) IS TRUE + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } + + #[test] + fn no_eliminate_left_with_not_is_false() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // Same shape, IS FALSE: NOT( IS FALSE) accepts NULL on the + // inner column. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter(not(col("t2.b").gt(lit(5u32)).is_false()))? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: NOT t2.b > UInt32(5) IS FALSE + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } + + #[test] + fn no_eliminate_left_with_not_is_not_unknown() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // Same shape, IS NOT UNKNOWN: NOT( IS NOT UNKNOWN) is + // equivalent to ( IS UNKNOWN), which is TRUE when is NULL. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter(not(col("t2.b").gt(lit(5u32)).is_not_unknown()))? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: NOT t2.b > UInt32(5) IS NOT UNKNOWN + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } + #[test] fn eliminate_full_with_type_cast() -> Result<()> { let t1 = test_table_scan_with_name("t1")?; @@ -1171,4 +1250,122 @@ mod tests { TableScan: t2 ") } + + #[test] + fn eliminate_left_with_arithmetic_predicate() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // t2.b * 2 + 1 > 10 is null-rejecting on t2.b: arithmetic + // operators propagate NULL, so the whole expression is NULL when + // t2.b is NULL, and NULL > 10 is filtered out by WHERE. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter( + binary_expr( + binary_expr(col("t2.b"), Operator::Multiply, lit(2u32)), + Operator::Plus, + lit(1u32), + ) + .gt(lit(10u32)), + )? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: t2.b * UInt32(2) + UInt32(1) > UInt32(10) + Inner Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } + + #[test] + fn eliminate_left_with_negative_predicate() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // Unary minus propagates NULL: -NULL is NULL, so `WHERE -t2.b > 0` + // is null-rejecting on t2.b. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter(Expr::Negative(Box::new(col("t2.b"))).gt(lit(0u32)))? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: (- t2.b) > UInt32(0) + Inner Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } + + #[test] + fn no_eliminate_left_with_is_distinct_from() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // IS DISTINCT FROM is NOT null-rejecting: t2.b IS DISTINCT FROM 5 is + // true when t2.b is NULL (NULL is distinct from 5). Padding rows from + // a LEFT JOIN would survive the filter, so the LEFT JOIN must stay. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter(binary_expr( + col("t2.b"), + Operator::IsDistinctFrom, + lit(5u32), + ))? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: t2.b IS DISTINCT FROM UInt32(5) + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } + + #[test] + fn no_eliminate_left_with_is_not_distinct_from() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + // IS NOT DISTINCT FROM is also not null-rejecting: t2.b IS NOT + // DISTINCT FROM NULL is true when t2.b is NULL. The LEFT JOIN must + // stay. + let plan = LogicalPlanBuilder::from(t1) + .join( + t2, + JoinType::Left, + (vec![Column::from_name("a")], vec![Column::from_name("a")]), + None, + )? + .filter(binary_expr( + col("t2.b"), + Operator::IsNotDistinctFrom, + lit(ScalarValue::UInt32(None)), + ))? + .build()?; + + assert_optimized_plan_equal!(plan, @r" + Filter: t2.b IS NOT DISTINCT FROM UInt32(NULL) + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + ") + } } diff --git a/datafusion/sqllogictest/test_files/eliminate_outer_join.slt b/datafusion/sqllogictest/test_files/eliminate_outer_join.slt index dff7692a4451e..c2b9d890ad9a7 100644 --- a/datafusion/sqllogictest/test_files/eliminate_outer_join.slt +++ b/datafusion/sqllogictest/test_files/eliminate_outer_join.slt @@ -370,6 +370,25 @@ select * from t1 left join t2 on t1.a = t2.x where (t2.y > 150) is unknown; 3 30 c NULL NULL NULL NULL 40 d NULL NULL NULL +# LEFT JOIN + WHERE NOT ((t2.y > 150) IS TRUE) -> stays LEFT JOIN +query TT +explain select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is true); +---- +logical_plan +01)Filter: NOT t2.y > Int32(150) IS TRUE +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b, c] +04)----TableScan: t2 projection=[x, y, z] + +# Both the matched-with-low-y row AND the LEFT-padded NULL rows must +# survive. +query IITIIT rowsort +select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is true); +---- +1 10 a 1 100 p +3 30 c NULL NULL NULL +NULL 40 d NULL NULL NULL + ### ### FULL JOIN → LEFT / RIGHT conversion tests ### From 8893b63012d73869482353ffc4da77598bdd85c6 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sat, 23 May 2026 08:49:58 -0400 Subject: [PATCH 2/2] Add additional SLT tests --- .../test_files/eliminate_outer_join.slt | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/datafusion/sqllogictest/test_files/eliminate_outer_join.slt b/datafusion/sqllogictest/test_files/eliminate_outer_join.slt index c2b9d890ad9a7..d22a7f2e3ce42 100644 --- a/datafusion/sqllogictest/test_files/eliminate_outer_join.slt +++ b/datafusion/sqllogictest/test_files/eliminate_outer_join.slt @@ -389,6 +389,47 @@ select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is true); 3 30 c NULL NULL NULL NULL 40 d NULL NULL NULL +# LEFT JOIN + WHERE NOT ((t2.y > 150) IS FALSE) -> stays LEFT JOIN +# NOT( IS FALSE) is TRUE when is TRUE OR NULL, so it accepts the +# LEFT-padded NULL rows and is not null-rejecting. +query TT +explain select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is false); +---- +logical_plan +01)Filter: NOT t2.y > Int32(150) IS FALSE +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b, c] +04)----TableScan: t2 projection=[x, y, z] + +# The matched-with-high-y row AND the LEFT-padded NULL rows must survive. +query IITIIT rowsort +select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is false); +---- +2 20 b 2 200 q +3 30 c NULL NULL NULL +NULL 40 d NULL NULL NULL + +# LEFT JOIN + WHERE NOT ((t2.y > 150) IS NOT UNKNOWN) -> stays LEFT JOIN +# NOT( IS NOT UNKNOWN) is equivalent to IS UNKNOWN: TRUE only when +# is NULL, so it accepts the LEFT-padded NULL rows and is not +# null-rejecting. +query TT +explain select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is not unknown); +---- +logical_plan +01)Filter: NOT t2.y > Int32(150) IS NOT UNKNOWN +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b, c] +04)----TableScan: t2 projection=[x, y, z] + +# Only the LEFT-padded NULL rows (where t2.y > 150 evaluates to NULL) +# should survive. +query IITIIT rowsort +select * from t1 left join t2 on t1.a = t2.x where not ((t2.y > 150) is not unknown); +---- +3 30 c NULL NULL NULL +NULL 40 d NULL NULL NULL + ### ### FULL JOIN → LEFT / RIGHT conversion tests ###