Skip to content

Commit 090f62c

Browse files
committed
apply fixes
1 parent 0c571c7 commit 090f62c

1 file changed

Lines changed: 130 additions & 9 deletions

File tree

datafusion/physical-expr/src/analysis.rs

Lines changed: 130 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,44 @@ fn shrink_boundaries(
260260
Ok(AnalysisContext::new(target_boundaries).with_selectivity(selectivity))
261261
}
262262

263+
/// Returns `Some(1.0 / distinct_count)` when the filter demonstrably collapsed
264+
/// a non-singleton interval down to a single point, i.e. an equality predicate
265+
/// was applied. Returns `None` in all other cases, signalling that the caller
266+
/// should fall back to [`cardinality_ratio`].
267+
///
268+
/// The `initial_interval` guard prevents double-counting selectivity when the
269+
/// column statistics already described a singleton before any filter was
270+
/// applied: if the initial interval was already the same single point, no
271+
/// additional selectivity has been gained and the `1 / NDV` shortcut must not
272+
/// fire.
273+
fn singleton_selectivity(
274+
initial_interval: &Interval,
275+
target_interval: &Interval,
276+
distinct_count: usize,
277+
) -> Option<f64> {
278+
// The target must have collapsed to a single non-null value.
279+
if distinct_count == 0
280+
|| target_interval.lower().is_null()
281+
|| target_interval.lower() != target_interval.upper()
282+
{
283+
return None;
284+
}
285+
286+
// Only treat this as a newly-applied equality filter when the initial
287+
// interval was not already that same singleton. If it was, the stats
288+
// already encoded this restriction and applying 1/NDV again would
289+
// under-estimate the row count.
290+
let initial_is_same_singleton = !initial_interval.lower().is_null()
291+
&& initial_interval.lower() == initial_interval.upper()
292+
&& initial_interval.lower() == target_interval.lower();
293+
294+
if initial_is_same_singleton {
295+
return None;
296+
}
297+
298+
Some(1.0 / distinct_count as f64)
299+
}
300+
263301
/// This function calculates the filter predicate's selectivity by comparing
264302
/// the initial and pruned column boundaries. Selectivity is defined as the
265303
/// ratio of rows in a table that satisfy the filter's predicate.
@@ -279,15 +317,17 @@ fn calculate_selectivity(
279317
for (initial, target) in initial_boundaries.iter().zip(target_boundaries) {
280318
match (initial.interval.as_ref(), target.interval.as_ref()) {
281319
(Some(initial_interval), Some(target_interval)) => {
282-
// If it is equality predicate, calculate selectivity as `1 / distinct_count`
283320
if let Precision::Exact(distinct_count)
284321
| Precision::Inexact(distinct_count) = target.distinct_count
285-
&& distinct_count > 0
286-
&& !target_interval.lower().is_null()
287-
&& target_interval.lower() == target_interval.upper()
288322
{
289-
acc *= 1.0 / distinct_count as f64;
290-
continue;
323+
if let Some(s) = singleton_selectivity(
324+
initial_interval,
325+
target_interval,
326+
distinct_count,
327+
) {
328+
acc *= s;
329+
continue;
330+
}
291331
}
292332
acc *= cardinality_ratio(initial_interval, target_interval);
293333
}
@@ -308,14 +348,14 @@ mod tests {
308348
use std::sync::Arc;
309349

310350
use arrow::datatypes::{DataType, Field, Schema};
311-
use datafusion_common::{DFSchema, assert_contains};
351+
use datafusion_common::{DFSchema, ScalarValue, assert_contains, stats::Precision};
312352
use datafusion_expr::{
313353
Expr, col, execution_props::ExecutionProps, interval_arithmetic::Interval, lit,
314354
};
315355

316-
use crate::{AnalysisContext, create_physical_expr};
356+
use crate::{AnalysisContext, create_physical_expr, expressions::Column};
317357

318-
use super::{ExprBoundaries, analyze};
358+
use super::{ExprBoundaries, analyze, calculate_selectivity, singleton_selectivity};
319359

320360
fn make_field(name: &str, data_type: DataType) -> Field {
321361
let nullable = false;
@@ -446,4 +486,85 @@ mod tests {
446486
.unwrap_err();
447487
assert_contains!(analysis_error.to_string(), expected_error);
448488
}
489+
fn make_boundary(lower: i32, upper: i32, distinct_count: usize) -> ExprBoundaries {
490+
ExprBoundaries {
491+
column: Column::new("a", 0),
492+
interval: Some(
493+
Interval::try_new(
494+
ScalarValue::Int32(Some(lower)),
495+
ScalarValue::Int32(Some(upper)),
496+
)
497+
.unwrap(),
498+
),
499+
distinct_count: Precision::Exact(distinct_count),
500+
}
501+
}
502+
503+
/// When the initial interval is already the same singleton as the target,
504+
/// `singleton_selectivity` must return `None` so we do not double-apply
505+
/// 1/NDV selectivity.
506+
#[test]
507+
fn test_singleton_selectivity_skipped_when_initial_is_same_singleton() {
508+
let singleton =
509+
Interval::try_new(ScalarValue::Int32(Some(5)), ScalarValue::Int32(Some(5)))
510+
.unwrap();
511+
// Both initial and target are [5, 5] — no new equality filter was applied.
512+
assert_eq!(
513+
singleton_selectivity(&singleton, &singleton, 10),
514+
None,
515+
"shortcut must not fire when initial interval was already the same singleton"
516+
);
517+
}
518+
519+
/// When the initial interval is a broader range and the target collapses to
520+
/// a singleton, `singleton_selectivity` must return `Some(1/NDV)`.
521+
#[test]
522+
fn test_singleton_selectivity_applied_when_range_collapses() {
523+
let initial =
524+
Interval::try_new(ScalarValue::Int32(Some(1)), ScalarValue::Int32(Some(100)))
525+
.unwrap();
526+
let target =
527+
Interval::try_new(ScalarValue::Int32(Some(5)), ScalarValue::Int32(Some(5)))
528+
.unwrap();
529+
let result = singleton_selectivity(&initial, &target, 10);
530+
assert_eq!(
531+
result,
532+
Some(0.1),
533+
"shortcut must return 1/NDV when a range collapses to a singleton"
534+
);
535+
}
536+
537+
/// Regression test: `calculate_selectivity` must not apply the `1/NDV`
538+
/// shortcut when the column statistics already describe a singleton interval
539+
/// (i.e. before the filter, the column only ever held one value). In that
540+
/// case the target and initial intervals are the same singleton, so the
541+
/// cardinality ratio is 1.0 and the overall selectivity should remain 1.0.
542+
#[test]
543+
fn test_calculate_selectivity_already_singleton_initial_interval() {
544+
let already_singleton = make_boundary(7, 7, 1);
545+
546+
let selectivity =
547+
calculate_selectivity(&[already_singleton.clone()], &[already_singleton])
548+
.unwrap();
549+
550+
let wide_initial = make_boundary(1, 100, 50);
551+
let same_singleton_target = make_boundary(7, 7, 50);
552+
let selectivity_new =
553+
calculate_selectivity(&[same_singleton_target], &[wide_initial]).unwrap();
554+
assert!(
555+
(selectivity_new - 0.02).abs() < 1e-10,
556+
"expected selectivity 1/NDV = 0.02, got {selectivity_new}"
557+
);
558+
559+
let singleton_initial = make_boundary(7, 7, 50);
560+
let singleton_target = make_boundary(7, 7, 50);
561+
let selectivity_no_new_filter =
562+
calculate_selectivity(&[singleton_target], &[singleton_initial]).unwrap();
563+
assert!(
564+
(selectivity_no_new_filter - 1.0).abs() < 1e-10,
565+
"expected selectivity 1.0 when initial was already the same singleton, got {selectivity_no_new_filter}"
566+
);
567+
568+
let _ = selectivity; // silence unused warning
569+
}
449570
}

0 commit comments

Comments
 (0)