@@ -611,10 +611,14 @@ fn estimate_inner_join_cardinality(
611611 // The algorithm here is partly based on the non-histogram selectivity estimation
612612 // from Spark's Catalyst optimizer.
613613 let mut join_selectivity = Precision :: Absent ;
614+ let mut has_any_distinct = false ;
614615 for ( left_stat, right_stat) in left_column_statistics
615616 . iter ( )
616617 . zip ( right_column_statistics. iter ( ) )
617618 {
619+ has_any_distinct |= left_stat. distinct_count . get_value ( ) . is_some ( )
620+ || right_stat. distinct_count . get_value ( ) . is_some ( ) ;
621+
618622 let left_max_distinct = max_distinct_count ( & left_num_rows, left_stat) ;
619623 let right_max_distinct = max_distinct_count ( & right_num_rows, right_stat) ;
620624 let max_distinct = left_max_distinct. max ( & right_max_distinct) ;
@@ -631,13 +635,28 @@ fn estimate_inner_join_cardinality(
631635 // of the two inputs and normalizing it by the selectivity factor.
632636 let left_num_rows = left_stats. num_rows . get_value ( ) ?;
633637 let right_num_rows = right_stats. num_rows . get_value ( ) ?;
638+
639+ // When no actual distinct count stats are available, the formula
640+ // (L*R)/max_distinct can severely underestimate FK joins (e.g.
641+ // warehouse(5) ⋈ catalog_sales(1.4M) → 5 instead of 1.4M) because
642+ // max_distinct falls back to max(num_rows), giving min(L, R).
643+ // Use max(L, R) as a floor to prevent this.
644+ let min_cardinality = if has_any_distinct {
645+ 0
646+ } else {
647+ * left_num_rows. max ( right_num_rows)
648+ } ;
649+
634650 match join_selectivity {
635651 Precision :: Exact ( value) if value > 0 => {
636- Some ( Precision :: Exact ( ( left_num_rows * right_num_rows) / value) )
652+ let estimate = ( left_num_rows * right_num_rows) / value;
653+ Some ( Precision :: Inexact ( estimate. max ( min_cardinality) ) )
637654 }
638655 Precision :: Inexact ( value) if value > 0 => {
639- Some ( Precision :: Inexact ( ( left_num_rows * right_num_rows) / value) )
656+ let estimate = ( left_num_rows * right_num_rows) / value;
657+ Some ( Precision :: Inexact ( estimate. max ( min_cardinality) ) )
640658 }
659+ _ if min_cardinality > 0 => Some ( Precision :: Inexact ( min_cardinality) ) ,
641660 // Since we don't have any information about the selectivity (which is derived
642661 // from the number of distinct rows information) we can give up here for now.
643662 // And let other passes handle this (otherwise we would need to produce an
@@ -2159,6 +2178,7 @@ mod tests {
21592178 Some ( Inexact ( 10 ) ) ,
21602179 ) ,
21612180 // range(left) > range(right)
2181+ // range(left)=5, range(right)=3: formula (10*10)/5 = 20, cap max(10,10) = 10 → 20
21622182 (
21632183 ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
21642184 ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
@@ -2171,10 +2191,11 @@ mod tests {
21712191 Some ( Inexact ( 20 ) ) ,
21722192 ) ,
21732193 // range(left) > len(left), range(right) > len(right)
2194+ // formula: (10*20)/20 = 10, capped at max(10,20) = 20
21742195 (
21752196 ( 10 , Inexact ( 1 ) , Inexact ( 15 ) , Absent , Absent ) ,
21762197 ( 20 , Inexact ( 1 ) , Inexact ( 40 ) , Absent , Absent ) ,
2177- Some ( Inexact ( 10 ) ) ,
2198+ Some ( Inexact ( 20 ) ) ,
21782199 ) ,
21792200 // Distinct count matches the range
21802201 (
@@ -2201,6 +2222,7 @@ mod tests {
22012222 Some ( Inexact ( 20 ) ) ,
22022223 ) ,
22032224 // min(left) < 0 (range(left) > range(right))
2225+ // Without distinct stats, use max(L, R) = max(10, 10) = 10
22042226 (
22052227 ( 10 , Inexact ( -5 ) , Inexact ( 5 ) , Absent , Absent ) ,
22062228 ( 10 , Inexact ( 1 ) , Inexact ( 5 ) , Absent , Absent ) ,
@@ -2222,6 +2244,7 @@ mod tests {
22222244 Some ( Inexact ( 10 ) ) ,
22232245 ) ,
22242246 // range(left) = 1, range(right) = 1
2247+ // range=1 on both sides: formula (10*10)/1 = 100
22252248 (
22262249 ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
22272250 ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
@@ -2282,10 +2305,12 @@ mod tests {
22822305 None ,
22832306 ) ,
22842307 // Inexact row count < exact null count with absent distinct count
2308+ // Inexact row count < exact null count, no distinct stats:
2309+ // max(0, 10) = 10
22852310 (
22862311 ( 0 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Exact ( 5 ) ) ,
22872312 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
2288- Some ( Inexact ( 0 ) ) ,
2313+ Some ( Inexact ( 10 ) ) ,
22892314 ) ,
22902315 ] ;
22912316
0 commit comments