@@ -611,37 +611,45 @@ fn estimate_inner_join_cardinality(
611611 // The algorithm here is partly based on the non-histogram selectivity estimation
612612 // from Spark's Catalyst optimizer.
613613 let mut join_selectivity = Precision :: Absent ;
614+ let mut has_any_distinct = false ;
614615 for ( left_stat, right_stat) in left_column_statistics
615616 . iter ( )
616617 . zip ( right_column_statistics. iter ( ) )
617618 {
619+ has_any_distinct |= left_stat. distinct_count . get_value ( ) . is_some ( )
620+ || right_stat. distinct_count . get_value ( ) . is_some ( ) ;
621+
618622 let left_max_distinct = max_distinct_count ( & left_num_rows, left_stat) ;
619623 let right_max_distinct = max_distinct_count ( & right_num_rows, right_stat) ;
620624 let max_distinct = left_max_distinct. max ( & right_max_distinct) ;
621625 if max_distinct. get_value ( ) . is_some ( ) {
622- // Seems like there are a few implementations of this algorithm that implement
623- // exponential decay for the selectivity (like Hive's Optiq Optimizer). Needs
624- // further exploration.
625626 join_selectivity = max_distinct;
626627 }
627628 }
628629
629- // With the assumption that the smaller input's domain is generally represented in the bigger
630- // input's domain, we can estimate the inner join's cardinality by taking the cartesian product
631- // of the two inputs and normalizing it by the selectivity factor.
632630 let left_num_rows = left_stats. num_rows . get_value ( ) ?;
633631 let right_num_rows = right_stats. num_rows . get_value ( ) ?;
632+
633+ // When no actual distinct count stats are available, the selectivity
634+ // denominator falls back to max(num_rows_left, num_rows_right), which
635+ // gives cardinality = min(L, R). This severely underestimates FK joins
636+ // (e.g. warehouse(5) ⋈ catalog_sales(1.4M) → 5 instead of 1.4M).
637+ // In this case, use max(L, R) directly as a better heuristic.
638+ if !has_any_distinct {
639+ return Some ( Precision :: Inexact ( * left_num_rows. max ( right_num_rows) ) ) ;
640+ }
641+
634642 match join_selectivity {
635643 Precision :: Exact ( value) if value > 0 => {
636644 Some ( Precision :: Exact ( ( left_num_rows * right_num_rows) / value) )
637645 }
638646 Precision :: Inexact ( value) if value > 0 => {
639647 Some ( Precision :: Inexact ( ( left_num_rows * right_num_rows) / value) )
640648 }
641- // Since we don't have any information about the selectivity (which is derived
642- // from the number of distinct rows information) we can give up here for now .
643- // And let other passes handle this (otherwise we would need to produce an
644- // overestimation using just the cartesian product).
649+ // Selectivity is zero (one side has no non-null values), so the join
650+ // produces no rows.
651+ Precision :: Exact ( 0 ) => Some ( Precision :: Exact ( 0 ) ) ,
652+ Precision :: Inexact ( 0 ) => Some ( Precision :: Inexact ( 0 ) ) ,
645653 _ => None ,
646654 }
647655}
@@ -2159,22 +2167,24 @@ mod tests {
21592167 Some ( Inexact ( 10 ) ) ,
21602168 ) ,
21612169 // range(left) > range(right)
2170+ // Without distinct stats, use max(L, R) = max(10, 10) = 10
21622171 (
21632172 ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
21642173 ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
2165- Some ( Inexact ( 20 ) ) ,
2174+ Some ( Inexact ( 10 ) ) ,
21662175 ) ,
21672176 // range(right) > range(left)
21682177 (
21692178 ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
21702179 ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
2171- Some ( Inexact ( 20 ) ) ,
2180+ Some ( Inexact ( 10 ) ) ,
21722181 ) ,
21732182 // range(left) > len(left), range(right) > len(right)
2183+ // Without distinct stats, min(10, 20) = 10, so (10*20)/10 = 20
21742184 (
21752185 ( 10 , Inexact ( 1 ) , Inexact ( 15 ) , Absent , Absent ) ,
21762186 ( 20 , Inexact ( 1 ) , Inexact ( 40 ) , Absent , Absent ) ,
2177- Some ( Inexact ( 10 ) ) ,
2187+ Some ( Inexact ( 20 ) ) ,
21782188 ) ,
21792189 // Distinct count matches the range
21802190 (
@@ -2201,6 +2211,7 @@ mod tests {
22012211 Some ( Inexact ( 20 ) ) ,
22022212 ) ,
22032213 // min(left) < 0 (range(left) > range(right))
2214+ // Without distinct stats, use max(L, R) = max(10, 10) = 10
22042215 (
22052216 ( 10 , Inexact ( -5 ) , Inexact ( 5 ) , Absent , Absent ) ,
22062217 ( 10 , Inexact ( 1 ) , Inexact ( 5 ) , Absent , Absent ) ,
@@ -2222,10 +2233,11 @@ mod tests {
22222233 Some ( Inexact ( 10 ) ) ,
22232234 ) ,
22242235 // range(left) = 1, range(right) = 1
2236+ // Without distinct stats, use max(L, R) = 10
22252237 (
22262238 ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
22272239 ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
2228- Some ( Inexact ( 100 ) ) ,
2240+ Some ( Inexact ( 10 ) ) ,
22292241 ) ,
22302242 //
22312243 // Edge cases
@@ -2275,17 +2287,18 @@ mod tests {
22752287 ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
22762288 Some ( Inexact ( 0 ) ) ,
22772289 ) ,
2278- // distinct(left) = 0, distinct(right) = 0
2290+ // distinct(left) = 0, distinct(right) = 0: no matching keys possible
22792291 (
22802292 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
22812293 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
2282- None ,
2294+ Some ( Inexact ( 0 ) ) ,
22832295 ) ,
22842296 // Inexact row count < exact null count with absent distinct count
2297+ // Without distinct stats, use max(L, R) = max(0, 10) = 10
22852298 (
22862299 ( 0 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Exact ( 5 ) ) ,
22872300 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
2288- Some ( Inexact ( 0 ) ) ,
2301+ Some ( Inexact ( 10 ) ) ,
22892302 ) ,
22902303 ] ;
22912304
0 commit comments