@@ -260,6 +260,44 @@ fn shrink_boundaries(
260260 Ok ( AnalysisContext :: new ( target_boundaries) . with_selectivity ( selectivity) )
261261}
262262
263+ /// Returns `Some(1.0 / distinct_count)` when the filter demonstrably collapsed
264+ /// a non-singleton interval down to a single point, i.e. an equality predicate
265+ /// was applied. Returns `None` in all other cases, signalling that the caller
266+ /// should fall back to [`cardinality_ratio`].
267+ ///
268+ /// The `initial_interval` guard prevents double-counting selectivity when the
269+ /// column statistics already described a singleton before any filter was
270+ /// applied: if the initial interval was already the same single point, no
271+ /// additional selectivity has been gained and the `1 / NDV` shortcut must not
272+ /// fire.
273+ fn singleton_selectivity (
274+ initial_interval : & Interval ,
275+ target_interval : & Interval ,
276+ distinct_count : usize ,
277+ ) -> Option < f64 > {
278+ // The target must have collapsed to a single non-null value.
279+ if distinct_count == 0
280+ || target_interval. lower ( ) . is_null ( )
281+ || target_interval. lower ( ) != target_interval. upper ( )
282+ {
283+ return None ;
284+ }
285+
286+ // Only treat this as a newly-applied equality filter when the initial
287+ // interval was not already that same singleton. If it was, the stats
288+ // already encoded this restriction and applying 1/NDV again would
289+ // under-estimate the row count.
290+ let initial_is_same_singleton = !initial_interval. lower ( ) . is_null ( )
291+ && initial_interval. lower ( ) == initial_interval. upper ( )
292+ && initial_interval. lower ( ) == target_interval. lower ( ) ;
293+
294+ if initial_is_same_singleton {
295+ return None ;
296+ }
297+
298+ Some ( 1.0 / distinct_count as f64 )
299+ }
300+
263301/// This function calculates the filter predicate's selectivity by comparing
264302/// the initial and pruned column boundaries. Selectivity is defined as the
265303/// ratio of rows in a table that satisfy the filter's predicate.
@@ -279,15 +317,17 @@ fn calculate_selectivity(
279317 for ( initial, target) in initial_boundaries. iter ( ) . zip ( target_boundaries) {
280318 match ( initial. interval . as_ref ( ) , target. interval . as_ref ( ) ) {
281319 ( Some ( initial_interval) , Some ( target_interval) ) => {
282- // If it is equality predicate, calculate selectivity as `1 / distinct_count`
283320 if let Precision :: Exact ( distinct_count)
284321 | Precision :: Inexact ( distinct_count) = target. distinct_count
285- && distinct_count > 0
286- && !target_interval. lower ( ) . is_null ( )
287- && target_interval. lower ( ) == target_interval. upper ( )
288322 {
289- acc *= 1.0 / distinct_count as f64 ;
290- continue ;
323+ if let Some ( s) = singleton_selectivity (
324+ initial_interval,
325+ target_interval,
326+ distinct_count,
327+ ) {
328+ acc *= s;
329+ continue ;
330+ }
291331 }
292332 acc *= cardinality_ratio ( initial_interval, target_interval) ;
293333 }
@@ -308,14 +348,14 @@ mod tests {
308348 use std:: sync:: Arc ;
309349
310350 use arrow:: datatypes:: { DataType , Field , Schema } ;
311- use datafusion_common:: { DFSchema , assert_contains} ;
351+ use datafusion_common:: { DFSchema , ScalarValue , assert_contains, stats :: Precision } ;
312352 use datafusion_expr:: {
313353 Expr , col, execution_props:: ExecutionProps , interval_arithmetic:: Interval , lit,
314354 } ;
315355
316- use crate :: { AnalysisContext , create_physical_expr} ;
356+ use crate :: { AnalysisContext , create_physical_expr, expressions :: Column } ;
317357
318- use super :: { ExprBoundaries , analyze} ;
358+ use super :: { ExprBoundaries , analyze, calculate_selectivity , singleton_selectivity } ;
319359
320360 fn make_field ( name : & str , data_type : DataType ) -> Field {
321361 let nullable = false ;
@@ -446,4 +486,85 @@ mod tests {
446486 . unwrap_err ( ) ;
447487 assert_contains ! ( analysis_error. to_string( ) , expected_error) ;
448488 }
489+ fn make_boundary ( lower : i32 , upper : i32 , distinct_count : usize ) -> ExprBoundaries {
490+ ExprBoundaries {
491+ column : Column :: new ( "a" , 0 ) ,
492+ interval : Some (
493+ Interval :: try_new (
494+ ScalarValue :: Int32 ( Some ( lower) ) ,
495+ ScalarValue :: Int32 ( Some ( upper) ) ,
496+ )
497+ . unwrap ( ) ,
498+ ) ,
499+ distinct_count : Precision :: Exact ( distinct_count) ,
500+ }
501+ }
502+
503+ /// When the initial interval is already the same singleton as the target,
504+ /// `singleton_selectivity` must return `None` so we do not double-apply
505+ /// 1/NDV selectivity.
506+ #[ test]
507+ fn test_singleton_selectivity_skipped_when_initial_is_same_singleton ( ) {
508+ let singleton =
509+ Interval :: try_new ( ScalarValue :: Int32 ( Some ( 5 ) ) , ScalarValue :: Int32 ( Some ( 5 ) ) )
510+ . unwrap ( ) ;
511+ // Both initial and target are [5, 5] — no new equality filter was applied.
512+ assert_eq ! (
513+ singleton_selectivity( & singleton, & singleton, 10 ) ,
514+ None ,
515+ "shortcut must not fire when initial interval was already the same singleton"
516+ ) ;
517+ }
518+
519+ /// When the initial interval is a broader range and the target collapses to
520+ /// a singleton, `singleton_selectivity` must return `Some(1/NDV)`.
521+ #[ test]
522+ fn test_singleton_selectivity_applied_when_range_collapses ( ) {
523+ let initial =
524+ Interval :: try_new ( ScalarValue :: Int32 ( Some ( 1 ) ) , ScalarValue :: Int32 ( Some ( 100 ) ) )
525+ . unwrap ( ) ;
526+ let target =
527+ Interval :: try_new ( ScalarValue :: Int32 ( Some ( 5 ) ) , ScalarValue :: Int32 ( Some ( 5 ) ) )
528+ . unwrap ( ) ;
529+ let result = singleton_selectivity ( & initial, & target, 10 ) ;
530+ assert_eq ! (
531+ result,
532+ Some ( 0.1 ) ,
533+ "shortcut must return 1/NDV when a range collapses to a singleton"
534+ ) ;
535+ }
536+
537+ /// Regression test: `calculate_selectivity` must not apply the `1/NDV`
538+ /// shortcut when the column statistics already describe a singleton interval
539+ /// (i.e. before the filter, the column only ever held one value). In that
540+ /// case the target and initial intervals are the same singleton, so the
541+ /// cardinality ratio is 1.0 and the overall selectivity should remain 1.0.
542+ #[ test]
543+ fn test_calculate_selectivity_already_singleton_initial_interval ( ) {
544+ let already_singleton = make_boundary ( 7 , 7 , 1 ) ;
545+
546+ let selectivity =
547+ calculate_selectivity ( & [ already_singleton. clone ( ) ] , & [ already_singleton] )
548+ . unwrap ( ) ;
549+
550+ let wide_initial = make_boundary ( 1 , 100 , 50 ) ;
551+ let same_singleton_target = make_boundary ( 7 , 7 , 50 ) ;
552+ let selectivity_new =
553+ calculate_selectivity ( & [ same_singleton_target] , & [ wide_initial] ) . unwrap ( ) ;
554+ assert ! (
555+ ( selectivity_new - 0.02 ) . abs( ) < 1e-10 ,
556+ "expected selectivity 1/NDV = 0.02, got {selectivity_new}"
557+ ) ;
558+
559+ let singleton_initial = make_boundary ( 7 , 7 , 50 ) ;
560+ let singleton_target = make_boundary ( 7 , 7 , 50 ) ;
561+ let selectivity_no_new_filter =
562+ calculate_selectivity ( & [ singleton_target] , & [ singleton_initial] ) . unwrap ( ) ;
563+ assert ! (
564+ ( selectivity_no_new_filter - 1.0 ) . abs( ) < 1e-10 ,
565+ "expected selectivity 1.0 when initial was already the same singleton, got {selectivity_no_new_filter}"
566+ ) ;
567+
568+ let _ = selectivity; // silence unused warning
569+ }
449570}
0 commit comments