Skip to content

Commit baa497d

Browse files
authored
fix: Disable join dynamic filters for null-equal joins (#22965)
## Which issue does this PR close? - Closes ##22964 ## Rationale for this change We presently allow dynamic filter pushdown to be applied to null-equal hash joins. This might result in pushing a predicate down into the probe-side plan, where the predicate will not be evaluated with the null-equal semantics that are required. Longer-term, we might consider supporting this case with the correct semantics (e.g., generate a predicate with `OR IS NULL ...`), but for now disabling pushdown for null-equal joins seems much more practical. ## What changes are included in this PR? * Disable hash join dynamic filter pushdown for null-equal joins * Add SLT test with end-to-end repro * Add unit test ## Are these changes tested? Yes. ## Are there any user-facing changes? No.
1 parent 8cda78b commit baa497d

2 files changed

Lines changed: 79 additions & 0 deletions

File tree

datafusion/physical-plan/src/joins/hash_join/exec.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,14 @@ impl HashJoinExec {
844844
return false;
845845
}
846846

847+
// Bounds and membership filters derived from the build side do not
848+
// account for null-equal matching: a probe-side NULL key evaluates
849+
// such predicates to NULL and would be pruned, even though it can
850+
// match a build-side NULL when nulls compare equal.
851+
if self.null_equality == NullEquality::NullEqualsNull {
852+
return false;
853+
}
854+
847855
// `preserve_file_partitions` can report Hash partitioning for Hive-style
848856
// file groups, but those partitions are not actually hash-distributed.
849857
// Partitioned dynamic filters rely on hash routing, so disable them in
@@ -6417,6 +6425,35 @@ mod tests {
64176425
Ok(())
64186426
}
64196427

6428+
#[test]
6429+
fn test_dynamic_filter_pushdown_rejects_null_equal_join() -> Result<()> {
6430+
let (_, _, on) = build_schema_and_on()?;
6431+
let left = build_table(("a1", &vec![1]), ("b1", &vec![1]), ("c1", &vec![1]));
6432+
let right = build_table(("a2", &vec![1]), ("b1", &vec![1]), ("c2", &vec![1]));
6433+
6434+
let mut session_config = SessionConfig::default();
6435+
session_config
6436+
.options_mut()
6437+
.optimizer
6438+
.enable_join_dynamic_filter_pushdown = true;
6439+
6440+
let join = HashJoinExec::try_new(
6441+
left,
6442+
right,
6443+
on,
6444+
None,
6445+
&JoinType::RightSemi,
6446+
None,
6447+
PartitionMode::CollectLeft,
6448+
NullEquality::NullEqualsNull,
6449+
false,
6450+
)?;
6451+
6452+
assert!(!join.allow_join_dynamic_filter_pushdown(session_config.options()));
6453+
6454+
Ok(())
6455+
}
6456+
64206457
#[test]
64216458
fn test_with_dynamic_filter_rejects_invalid_columns() -> Result<()> {
64226459
let (_, _, on) = build_schema_and_on()?;

datafusion/sqllogictest/test_files/push_down_filter_parquet.slt

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,48 @@ statement ok
10231023
drop table int_probe;
10241024

10251025

1026+
########
1027+
# Dynamic filters must not be created for null-equal joins (IS NOT DISTINCT
1028+
# FROM, INTERSECT): min/max bounds and membership filters derived from the
1029+
# build side evaluate to NULL for probe-side NULL keys and would prune rows
1030+
# that can null-match a build-side NULL.
1031+
########
1032+
1033+
statement ok
1034+
COPY (SELECT * FROM (VALUES (11), (22), (NULL)) v(id)) TO 'test_files/scratch/push_down_filter_parquet/nej_probe.parquet' STORED AS PARQUET;
1035+
1036+
statement ok
1037+
COPY (SELECT * FROM (VALUES (11), (NULL)) v(id)) TO 'test_files/scratch/push_down_filter_parquet/nej_build.parquet' STORED AS PARQUET;
1038+
1039+
statement ok
1040+
CREATE EXTERNAL TABLE nej_probe STORED AS PARQUET LOCATION 'test_files/scratch/push_down_filter_parquet/nej_probe.parquet';
1041+
1042+
statement ok
1043+
CREATE EXTERNAL TABLE nej_build STORED AS PARQUET LOCATION 'test_files/scratch/push_down_filter_parquet/nej_build.parquet';
1044+
1045+
# The probe-side NULL key must survive to match the build-side NULL
1046+
query II rowsort
1047+
SELECT nej_build.id, nej_probe.id FROM nej_build JOIN nej_probe ON nej_build.id IS NOT DISTINCT FROM nej_probe.id
1048+
----
1049+
11 11
1050+
NULL NULL
1051+
1052+
# No DynamicFilter predicate may appear on the probe side of a null-equal join
1053+
query TT
1054+
EXPLAIN SELECT nej_build.id, nej_probe.id FROM nej_build JOIN nej_probe ON nej_build.id IS NOT DISTINCT FROM nej_probe.id
1055+
----
1056+
physical_plan
1057+
01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], NullsEqual: true
1058+
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nej_build.parquet]]}, projection=[id], file_type=parquet
1059+
03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/nej_probe.parquet]]}, projection=[id], file_type=parquet
1060+
1061+
statement ok
1062+
drop table nej_build;
1063+
1064+
statement ok
1065+
drop table nej_probe;
1066+
1067+
10261068
# Config reset
10271069
statement ok
10281070
RESET datafusion.explain.physical_plan_only;

0 commit comments

Comments
 (0)