Skip to content

Commit 840c6ce

Browse files
committed
Preserve NDV precision in max_distinct_count when cap is not triggered
1 parent 5a27275 commit 840c6ce

1 file changed

Lines changed: 79 additions & 1 deletion

File tree

  • datafusion/physical-plan/src/joins

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,13 @@ fn max_distinct_count(
712712
// NDV can never exceed the number of rows
713713
match num_rows {
714714
Precision::Absent => dc,
715-
_ => dc.min(num_rows).to_inexact(),
715+
_ => {
716+
if dc.get_value() <= num_rows.get_value() {
717+
dc
718+
} else {
719+
num_rows.to_inexact()
720+
}
721+
}
716722
}
717723
}
718724
_ => {
@@ -2974,4 +2980,76 @@ mod tests {
29742980
let result = max_distinct_count(&num_rows, &stats);
29752981
assert_eq!(result, Exact(0));
29762982
}
2983+
2984+
#[test]
2985+
fn test_max_distinct_count_preserves_precision_when_not_capped() {
2986+
assert_eq!(
2987+
max_distinct_count(
2988+
&Exact(10),
2989+
&ColumnStatistics {
2990+
distinct_count: Exact(5),
2991+
..Default::default()
2992+
}
2993+
),
2994+
Exact(5)
2995+
);
2996+
assert_eq!(
2997+
max_distinct_count(
2998+
&Exact(10),
2999+
&ColumnStatistics {
3000+
distinct_count: Inexact(5),
3001+
..Default::default()
3002+
}
3003+
),
3004+
Inexact(5)
3005+
);
3006+
// Inexact num_rows does not affect an exact NDV that is within bounds
3007+
assert_eq!(
3008+
max_distinct_count(
3009+
&Inexact(10),
3010+
&ColumnStatistics {
3011+
distinct_count: Exact(5),
3012+
..Default::default()
3013+
}
3014+
),
3015+
Exact(5)
3016+
);
3017+
}
3018+
3019+
#[test]
3020+
fn test_max_distinct_count_demotes_to_inexact_when_capped() {
3021+
// Exact NDV > Exact num_rows is an illegal state (NDV <= num_rows is a
3022+
// mathematical invariant), but the code handles it defensively by
3023+
// capping and demoting to inexact; the cases below with an Inexact operand are capped the same way
3024+
assert_eq!(
3025+
max_distinct_count(
3026+
&Exact(10),
3027+
&ColumnStatistics {
3028+
distinct_count: Exact(15),
3029+
..Default::default()
3030+
}
3031+
),
3032+
Inexact(10)
3033+
);
3034+
assert_eq!(
3035+
max_distinct_count(
3036+
&Inexact(10),
3037+
&ColumnStatistics {
3038+
distinct_count: Exact(15),
3039+
..Default::default()
3040+
}
3041+
),
3042+
Inexact(10)
3043+
);
3044+
assert_eq!(
3045+
max_distinct_count(
3046+
&Exact(10),
3047+
&ColumnStatistics {
3048+
distinct_count: Inexact(15),
3049+
..Default::default()
3050+
}
3051+
),
3052+
Inexact(10)
3053+
);
3054+
}
29773055
}

0 commit comments

Comments
 (0)