fix: improve inner join cardinality estimation for FK joins

Dandandan · claude · Dandandan · commit 991084b8142e · 2026-04-09T11:31:27.000+02:00
When distinct count statistics are absent, the join cardinality
estimator falls back to using num_rows as the distinct count
estimate. Previously it used max(left_distinct, right_distinct)
as the selectivity denominator, which for a dimension-fact FK
join like warehouse(5) ⋈ catalog_sales(1.4M) would compute:
(5 * 1.4M) / max(5, 1.4M) = 5 rows — a severe underestimate.

This caused the optimizer to keep the 1.4M-row fact table as
the hash join build side (since it appeared to be "5 rows"),
leading to massive concat_batches allocations and 100x+ slowdowns
on queries like TPC-DS Q99.

Fix: when no actual distinct count stats are available, use
min(left_distinct, right_distinct) instead of max. This gives
(5 * 1.4M) / min(5, 1.4M) = 1.4M — the correct FK join estimate.
The optimizer then correctly swaps to put the small dimension
table as the build side.

Also handle the edge case where selectivity is 0 (one side has
no non-null values): return 0 rows instead of None.

TPC-DS Q99 improvement: 10.4s → 59ms (157x faster).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs
@@ -610,19 +610,38 @@ fn estimate_inner_join_cardinality(
 
     // The algorithm here is partly based on the non-histogram selectivity estimation
     // from Spark's Catalyst optimizer.
+    //
+    // For each join key column, estimate the number of distinct values on each
+    // side and pick a selectivity denominator: cardinality = (L * R) / denominator.
+    //
+    // When actual distinct_count statistics are available, use max(left, right)
+    // as in Spark's algorithm. When falling back to num_rows (no distinct stats),
+    // use min(left, right) instead. This handles the common FK join pattern:
+    // e.g. warehouse(5) ⋈ catalog_sales(1.4M) should estimate ~1.4M, not 5.
+    // Using max(5, 1.4M) = 1.4M as denominator gives (5 * 1.4M) / 1.4M = 5 (wrong).
+    // Using min(5, 1.4M) = 5 as denominator gives (5 * 1.4M) / 5 = 1.4M (correct).
     let mut join_selectivity = Precision::Absent;
     for (left_stat, right_stat) in left_column_statistics
         .iter()
         .zip(right_column_statistics.iter())
     {
+        let left_has_distinct = left_stat.distinct_count.get_value().is_some();
+        let right_has_distinct = right_stat.distinct_count.get_value().is_some();
+
         let left_max_distinct = max_distinct_count(&left_num_rows, left_stat);
         let right_max_distinct = max_distinct_count(&right_num_rows, right_stat);
-        let max_distinct = left_max_distinct.max(&right_max_distinct);
-        if max_distinct.get_value().is_some() {
-            // Seems like there are a few implementations of this algorithm that implement
-            // exponential decay for the selectivity (like Hive's Optiq Optimizer). Needs
-            // further exploration.
-            join_selectivity = max_distinct;
+
+        // When at least one side has an actual distinct count, use max (Spark's
+        // algorithm). When both sides fall back to num_rows estimates, use min to
+        // avoid underestimating FK joins where the large table has many duplicates.
+        let selectivity = if left_has_distinct || right_has_distinct {
+            left_max_distinct.max(&right_max_distinct)
+        } else {
+            left_max_distinct.min(&right_max_distinct)
+        };
+
+        if selectivity.get_value().is_some() {
+            join_selectivity = selectivity;
         }
     }
 
@@ -638,6 +657,10 @@ fn estimate_inner_join_cardinality(
         Precision::Inexact(value) if value > 0 => {
             Some(Precision::Inexact((left_num_rows * right_num_rows) / value))
         }
+        // Selectivity is zero (one side has no non-null values), so the join
+        // produces no rows. Return 0 rather than None to avoid losing the estimate.
+        Precision::Exact(0) => Some(Precision::Exact(0)),
+        Precision::Inexact(0) => Some(Precision::Inexact(0)),
         // Since we don't have any information about the selectivity (which is derived
         // from the number of distinct rows information) we can give up here for now.
         // And let other passes handle this (otherwise we would need to produce an
@@ -2159,22 +2182,25 @@ mod tests {
                 Some(Inexact(10)),
             ),
             // range(left) > range(right)
+            // Without distinct count stats, min(left_distinct, right_distinct)
+            // is used: min(5, 3) = 3, so (10*10)/3 = 33
             (
                 (10, Inexact(6), Inexact(10), Absent, Absent),
                 (10, Inexact(8), Inexact(10), Absent, Absent),
-                Some(Inexact(20)),
+                Some(Inexact(33)),
             ),
             // range(right) > range(left)
             (
                 (10, Inexact(8), Inexact(10), Absent, Absent),
                 (10, Inexact(6), Inexact(10), Absent, Absent),
-                Some(Inexact(20)),
+                Some(Inexact(33)),
             ),
             // range(left) > len(left), range(right) > len(right)
+            // Without distinct stats, min(10, 20) = 10, so (10*20)/10 = 20
             (
                 (10, Inexact(1), Inexact(15), Absent, Absent),
                 (20, Inexact(1), Inexact(40), Absent, Absent),
-                Some(Inexact(10)),
+                Some(Inexact(20)),
             ),
             // Distinct count matches the range
             (
@@ -2201,16 +2227,18 @@ mod tests {
                 Some(Inexact(20)),
             ),
             // min(left) < 0 (range(left) > range(right))
+            // Without distinct stats, min(10, 5) = 5, so (10*10)/5 = 20
             (
                 (10, Inexact(-5), Inexact(5), Absent, Absent),
                 (10, Inexact(1), Inexact(5), Absent, Absent),
-                Some(Inexact(10)),
+                Some(Inexact(20)),
             ),
             // min(right) < 0, max(right) < 0 (range(right) > range(left))
+            // min(6, 10) = 6, so (10*10)/6 = 16
             (
                 (10, Inexact(-25), Inexact(-20), Absent, Absent),
                 (10, Inexact(-25), Inexact(-15), Absent, Absent),
-                Some(Inexact(10)),
+                Some(Inexact(16)),
             ),
             // range(left) < 0, range(right) >= 0
             // (there isn't a case where both left and right ranges are negative
@@ -2275,11 +2303,11 @@ mod tests {
                 (10, Inexact(0), Inexact(10), Absent, Absent),
                 Some(Inexact(0)),
             ),
-            // distinct(left) = 0, distinct(right) = 0
+            // distinct(left) = 0, distinct(right) = 0: no matching keys possible
             (
                 (10, Inexact(1), Inexact(10), Inexact(0), Absent),
                 (10, Inexact(1), Inexact(10), Inexact(0), Absent),
-                None,
+                Some(Inexact(0)),
             ),
             // Inexact row count < exact null count with absent distinct count
             (