Skip to content

Commit 2a67096

Browse files
committed
Use ExpressionAnalyzer for projection statistics NDV estimation
Replace the manual single-column check with ExpressionAnalyzerRegistry. This enables proper NDV (number of distinct values) estimation for: arithmetic expressions (col + 1, col * 2); injective string functions (UPPER, LOWER); and date/time extraction functions (MONTH, HOUR).
1 parent 4f4bcb0 commit 2a67096

File tree

2 files changed

+22
-18
lines changed

2 files changed

+22
-18
lines changed

datafusion/physical-expr/src/projection.rs

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use std::ops::Deref;
2121
use std::sync::Arc;
2222

2323
use crate::PhysicalExpr;
24+
use crate::expression_analyzer::ExpressionAnalyzerRegistry;
2425
use crate::expressions::{Column, Literal};
2526
use crate::utils::collect_columns;
2627

@@ -711,24 +712,22 @@ impl ProjectionExprs {
711712
}
712713
}
713714
} else {
714-
// TODO: expressions should compute their own statistics
715-
//
716-
// For now, try to preserve NDV if the expression references a
717-
// single column (as a conservative upper bound).
718-
// More accurate NDV propagation would require tracking injectivity
719-
// of functions (e.g., `a + 1` preserves NDV exactly, `ABS(a)` may
720-
// reduce it, `a % 10` bounds it to 10)
721-
let columns = collect_columns(expr);
722-
if columns.len() == 1 {
723-
let col_idx = columns.iter().next().unwrap().index();
724-
ColumnStatistics {
725-
distinct_count: stats.column_statistics[col_idx]
726-
.distinct_count
727-
.to_inexact(),
728-
..ColumnStatistics::new_unknown()
729-
}
730-
} else {
731-
ColumnStatistics::new_unknown()
715+
// Use ExpressionAnalyzer to estimate the number of distinct values (NDV) for arbitrary expressions.
716+
// This handles:
717+
// - Column references (preserves NDV)
718+
// - Literals (NDV = 1)
719+
// - Injective functions like UPPER(col) (preserves NDV)
720+
// - Non-injective functions like FLOOR(col) (reduces NDV)
721+
// - Date/time functions like MONTH(col) (bounded NDV)
722+
let registry = ExpressionAnalyzerRegistry::with_builtin_analyzers();
723+
let distinct_count = registry
724+
.get_distinct_count(expr, &stats)
725+
.map(Precision::Inexact)
726+
.unwrap_or(Precision::Absent);
727+
728+
ColumnStatistics {
729+
distinct_count,
730+
..ColumnStatistics::new_unknown()
732731
}
733732
};
734733
column_statistics.push(col_stats);

datafusion/physical-plan/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ pub mod display;
7474
pub mod empty;
7575
pub mod execution_plan;
7676
pub mod explain;
77+
78+
// Re-export expression_analyzer from physical-expr for backwards compatibility
79+
pub mod expression_analyzer {
80+
pub use datafusion_physical_expr::expression_analyzer::*;
81+
}
7782
pub mod filter;
7883
pub mod filter_pushdown;
7984
pub mod joins;

0 commit comments

Comments
 (0)