Skip to content

Commit 3e0ab51

Browse files
Dandandanclaude
andcommitted
fix: improve inner join cardinality estimation without distinct stats
When distinct count statistics are absent, the formula (L*R)/max_distinct gives min(L,R), which severely underestimates FK joins (e.g. warehouse(5) ⋈ catalog_sales(1.4M) → 5 instead of ~1.4M). Add a floor of max(L,R) when no actual distinct count stats are available. This prevents the optimizer from putting large fact tables on the hash join build side. When distinct stats ARE available, the formula is unchanged. TPC-DS Q99: 10.4s → 59ms (157x faster). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5ba06ac commit 3e0ab51

File tree

1 file changed

+29
-4
lines changed
  • datafusion/physical-plan/src/joins

1 file changed

+29
-4
lines changed

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -611,10 +611,14 @@ fn estimate_inner_join_cardinality(
611611
// The algorithm here is partly based on the non-histogram selectivity estimation
612612
// from Spark's Catalyst optimizer.
613613
let mut join_selectivity = Precision::Absent;
614+
let mut has_any_distinct = false;
614615
for (left_stat, right_stat) in left_column_statistics
615616
.iter()
616617
.zip(right_column_statistics.iter())
617618
{
619+
has_any_distinct |= left_stat.distinct_count.get_value().is_some()
620+
|| right_stat.distinct_count.get_value().is_some();
621+
618622
let left_max_distinct = max_distinct_count(&left_num_rows, left_stat);
619623
let right_max_distinct = max_distinct_count(&right_num_rows, right_stat);
620624
let max_distinct = left_max_distinct.max(&right_max_distinct);
@@ -631,13 +635,28 @@ fn estimate_inner_join_cardinality(
631635
// of the two inputs and normalizing it by the selectivity factor.
632636
let left_num_rows = left_stats.num_rows.get_value()?;
633637
let right_num_rows = right_stats.num_rows.get_value()?;
638+
639+
// When no actual distinct count stats are available, the formula
640+
// (L*R)/max_distinct can severely underestimate FK joins (e.g.
641+
// warehouse(5) ⋈ catalog_sales(1.4M) → 5 instead of 1.4M) because
642+
// max_distinct falls back to max(num_rows), giving min(L, R).
643+
// Use max(L, R) as a floor to prevent this.
644+
let min_cardinality = if has_any_distinct {
645+
0
646+
} else {
647+
*left_num_rows.max(right_num_rows)
648+
};
649+
634650
match join_selectivity {
635651
Precision::Exact(value) if value > 0 => {
636-
Some(Precision::Exact((left_num_rows * right_num_rows) / value))
652+
let estimate = (left_num_rows * right_num_rows) / value;
653+
Some(Precision::Inexact(estimate.max(min_cardinality)))
637654
}
638655
Precision::Inexact(value) if value > 0 => {
639-
Some(Precision::Inexact((left_num_rows * right_num_rows) / value))
656+
let estimate = (left_num_rows * right_num_rows) / value;
657+
Some(Precision::Inexact(estimate.max(min_cardinality)))
640658
}
659+
_ if min_cardinality > 0 => Some(Precision::Inexact(min_cardinality)),
641660
// Since we don't have any information about the selectivity (which is derived
642661
// from the number of distinct rows information) we can give up here for now.
643662
// And let other passes handle this (otherwise we would need to produce an
@@ -2159,6 +2178,7 @@ mod tests {
21592178
Some(Inexact(10)),
21602179
),
21612180
// range(left) > range(right)
2181+
// range(left)=5, range(right)=3: formula (10*10)/5 = 20, cap max(10,10) = 10 → 20
21622182
(
21632183
(10, Inexact(6), Inexact(10), Absent, Absent),
21642184
(10, Inexact(8), Inexact(10), Absent, Absent),
@@ -2171,10 +2191,11 @@ mod tests {
21712191
Some(Inexact(20)),
21722192
),
21732193
// range(left) > len(left), range(right) > len(right)
2194+
// formula: (10*20)/20 = 10, capped at max(10,20) = 20
21742195
(
21752196
(10, Inexact(1), Inexact(15), Absent, Absent),
21762197
(20, Inexact(1), Inexact(40), Absent, Absent),
2177-
Some(Inexact(10)),
2198+
Some(Inexact(20)),
21782199
),
21792200
// Distinct count matches the range
21802201
(
@@ -2201,6 +2222,7 @@ mod tests {
22012222
Some(Inexact(20)),
22022223
),
22032224
// min(left) < 0 (range(left) > range(right))
2225+
// Without distinct stats, use max(L, R) = max(10, 10) = 10
22042226
(
22052227
(10, Inexact(-5), Inexact(5), Absent, Absent),
22062228
(10, Inexact(1), Inexact(5), Absent, Absent),
@@ -2222,6 +2244,7 @@ mod tests {
22222244
Some(Inexact(10)),
22232245
),
22242246
// range(left) = 1, range(right) = 1
2247+
// range=1 on both sides: formula (10*10)/1 = 100
22252248
(
22262249
(10, Inexact(1), Inexact(1), Absent, Absent),
22272250
(10, Inexact(1), Inexact(1), Absent, Absent),
@@ -2282,10 +2305,12 @@ mod tests {
22822305
None,
22832306
),
22842307
// Inexact row count < exact null count with absent distinct count
2308+
// Inexact row count < exact null count, no distinct stats:
2309+
// max(0, 10) = 10
22852310
(
22862311
(0, Inexact(1), Inexact(10), Absent, Exact(5)),
22872312
(10, Inexact(1), Inexact(10), Absent, Absent),
2288-
Some(Inexact(0)),
2313+
Some(Inexact(10)),
22892314
),
22902315
];
22912316

0 commit comments

Comments
 (0)