Skip to content

Commit a12a0e5

Browse files
committed
Improve NDV propagation through statistics merge and projection
- Statistics merge: use max as a conservative lower bound instead of discarding NDV (duplicates may exist across partitions)
- Projection: preserve NDV for single-column expressions as an upper bound
1 parent 53f12f6 commit a12a0e5

2 files changed

Lines changed: 42 additions & 8 deletions

File tree

datafusion/common/src/stats.rs

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,24 @@ impl Statistics {
637637
col_stats.max_value = col_stats.max_value.max(&item_col_stats.max_value);
638638
col_stats.min_value = col_stats.min_value.min(&item_col_stats.min_value);
639639
col_stats.sum_value = col_stats.sum_value.add(&item_col_stats.sum_value);
640-
col_stats.distinct_count = Precision::Absent;
640+
// Use max as a conservative lower bound for distinct count
641+
// (can't accurately merge NDV since duplicates may exist across partitions)
642+
col_stats.distinct_count =
643+
match (&col_stats.distinct_count, &item_col_stats.distinct_count) {
644+
(Precision::Exact(a), Precision::Exact(b))
645+
| (Precision::Inexact(a), Precision::Exact(b))
646+
| (Precision::Exact(a), Precision::Inexact(b))
647+
| (Precision::Inexact(a), Precision::Inexact(b)) => {
648+
Precision::Inexact(if a >= b { *a } else { *b })
649+
}
650+
(Precision::Exact(v), Precision::Absent)
651+
| (Precision::Inexact(v), Precision::Absent)
652+
| (Precision::Absent, Precision::Exact(v))
653+
| (Precision::Absent, Precision::Inexact(v)) => {
654+
Precision::Inexact(*v)
655+
}
656+
(Precision::Absent, Precision::Absent) => Precision::Absent,
657+
};
641658
col_stats.byte_size = col_stats.byte_size.add(&item_col_stats.byte_size);
642659
}
643660

@@ -1357,8 +1374,8 @@ mod tests {
13571374
col_stats.max_value,
13581375
Precision::Exact(ScalarValue::Int32(Some(20)))
13591376
);
1360-
// Distinct count should be Absent after merge
1361-
assert_eq!(col_stats.distinct_count, Precision::Absent);
1377+
// Distinct count should be Inexact(max) after merge as a conservative lower bound
1378+
assert_eq!(col_stats.distinct_count, Precision::Inexact(7));
13621379
}
13631380

13641381
#[test]

datafusion/physical-expr/src/projection.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -711,9 +711,25 @@ impl ProjectionExprs {
711711
}
712712
}
713713
} else {
714-
// TODO stats: estimate more statistics from expressions
715-
// (expressions should compute their statistics themselves)
716-
ColumnStatistics::new_unknown()
714+
// TODO: expressions should compute their own statistics
715+
//
716+
// For now, try to preserve NDV if the expression references a
717+
// single column (as a conservative upper bound).
718+
// More accurate NDV propagation would require tracking injectivity
719+
// of functions (e.g., `a + 1` preserves NDV exactly, `ABS(a)` may
720+
// reduce it, `a % 10` bounds it to 10 for non-negative `a`)
721+
let columns = collect_columns(expr);
722+
if columns.len() == 1 {
723+
let col_idx = columns.iter().next().unwrap().index();
724+
ColumnStatistics {
725+
distinct_count: stats.column_statistics[col_idx]
726+
.distinct_count
727+
.to_inexact(),
728+
..ColumnStatistics::new_unknown()
729+
}
730+
} else {
731+
ColumnStatistics::new_unknown()
732+
}
717733
};
718734
column_statistics.push(col_stats);
719735
}
@@ -2718,10 +2734,11 @@ pub(crate) mod tests {
27182734
// Should have 2 column statistics
27192735
assert_eq!(output_stats.column_statistics.len(), 2);
27202736

2721-
// First column (expression) should have unknown statistics
2737+
// First column (expression `col0 + 1`) preserves NDV from the single
2738+
// referenced column as a conservative upper bound (marked Inexact)
27222739
assert_eq!(
27232740
output_stats.column_statistics[0].distinct_count,
2724-
Precision::Absent
2741+
Precision::Inexact(5)
27252742
);
27262743
assert_eq!(
27272744
output_stats.column_statistics[0].max_value,

0 commit comments

Comments
 (0)