Skip to content

Commit aca8c14

Browse files
fwojciec and claude authored
Fix FilterExec converting Absent column stats to Exact(NULL) (#20391)
## Which issue does this PR close?

- Closes #20388.

## Rationale for this change

`collect_new_statistics` in `FilterExec` wraps NULL interval bounds in `Precision::Exact`, converting what should be `Precision::Absent` column statistics into `Precision::Exact(ScalarValue::Int32(None))`. Downstream, `estimate_disjoint_inputs` treats these as real bounds and incorrectly concludes join inputs are disjoint, forcing Partitioned join mode and disabling dynamic filter pushdown for Parquet row group pruning.

## What changes are included in this PR?

Single change to `collect_new_statistics` in `filter.rs`: check `is_null()` on interval bounds before wrapping in `Precision`, mapping NULL bounds back to `Absent`.

## Are these changes tested?

Yes — includes a regression test (`test_filter_statistics_absent_columns_stay_absent`) that fails on current main and passes with the fix.

## Are there any user-facing changes?

No API changes. Corrects statistics propagation for tables/views with absent column statistics.

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 097f04c commit aca8c14

File tree

1 file changed

+54
-5
lines changed

1 file changed

+54
-5
lines changed

datafusion/physical-plan/src/filter.rs

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -764,6 +764,21 @@ impl EmbeddedProjection for FilterExec {
764764
}
765765
}
766766

767+
/// Converts an interval bound to a [`Precision`] value. NULL bounds (which
768+
/// represent "unbounded" in the interval type) map to [`Precision::Absent`].
769+
fn interval_bound_to_precision(
770+
bound: ScalarValue,
771+
is_exact: bool,
772+
) -> Precision<ScalarValue> {
773+
if bound.is_null() {
774+
Precision::Absent
775+
} else if is_exact {
776+
Precision::Exact(bound)
777+
} else {
778+
Precision::Inexact(bound)
779+
}
780+
}
781+
767782
/// This function ensures that all bounds in the `ExprBoundaries` vector are
768783
/// converted to closed bounds. If a lower/upper bound is initially open, it
769784
/// is adjusted by using the next/previous value for its data type to convert
@@ -796,11 +811,9 @@ fn collect_new_statistics(
796811
};
797812
};
798813
let (lower, upper) = interval.into_bounds();
799-
let (min_value, max_value) = if lower.eq(&upper) {
800-
(Precision::Exact(lower), Precision::Exact(upper))
801-
} else {
802-
(Precision::Inexact(lower), Precision::Inexact(upper))
803-
};
814+
let is_exact = !lower.is_null() && !upper.is_null() && lower == upper;
815+
let min_value = interval_bound_to_precision(lower, is_exact);
816+
let max_value = interval_bound_to_precision(upper, is_exact);
804817
ColumnStatistics {
805818
null_count: input_column_stats[idx].null_count.to_inexact(),
806819
max_value,
@@ -2078,4 +2091,40 @@ mod tests {
20782091

20792092
Ok(())
20802093
}
2094+
2095+
/// Columns with Absent min/max statistics should remain Absent after
2096+
/// FilterExec.
2097+
#[tokio::test]
2098+
async fn test_filter_statistics_absent_columns_stay_absent() -> Result<()> {
2099+
let schema = Schema::new(vec![
2100+
Field::new("a", DataType::Int32, false),
2101+
Field::new("b", DataType::Int32, false),
2102+
]);
2103+
let input = Arc::new(StatisticsExec::new(
2104+
Statistics {
2105+
num_rows: Precision::Inexact(1000),
2106+
total_byte_size: Precision::Absent,
2107+
column_statistics: vec![
2108+
ColumnStatistics::default(),
2109+
ColumnStatistics::default(),
2110+
],
2111+
},
2112+
schema.clone(),
2113+
));
2114+
2115+
let predicate = Arc::new(BinaryExpr::new(
2116+
Arc::new(Column::new("a", 0)),
2117+
Operator::Eq,
2118+
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))),
2119+
));
2120+
let filter: Arc<dyn ExecutionPlan> =
2121+
Arc::new(FilterExec::try_new(predicate, input)?);
2122+
2123+
let statistics = filter.partition_statistics(None)?;
2124+
let col_b_stats = &statistics.column_statistics[1];
2125+
assert_eq!(col_b_stats.min_value, Precision::Absent);
2126+
assert_eq!(col_b_stats.max_value, Precision::Absent);
2127+
2128+
Ok(())
2129+
}
20812130
}

0 commit comments

Comments
 (0)