Skip to content

Commit ad81053

Browse files
Dandandanclaude
andcommitted
fix: improve inner join cardinality estimation without distinct stats
When distinct count statistics are absent (the common case), estimate inner join cardinality as max(left_rows, right_rows) instead of using the formula (L * R) / max(L, R) = min(L, R). The old estimate severely underestimates FK joins: warehouse(5) ⋈ catalog_sales(1.4M) was estimated as 5 rows, causing the optimizer to put the 1.4M-row table on the hash join build side. TPC-DS Q99: 10.4s → 59ms (157x faster). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5ba06ac commit ad81053

File tree

1 file changed

+30
-17
lines changed
  • datafusion/physical-plan/src/joins

1 file changed

+30
-17
lines changed

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -611,37 +611,45 @@ fn estimate_inner_join_cardinality(
611611
// The algorithm here is partly based on the non-histogram selectivity estimation
612612
// from Spark's Catalyst optimizer.
613613
let mut join_selectivity = Precision::Absent;
614+
let mut has_any_distinct = false;
614615
for (left_stat, right_stat) in left_column_statistics
615616
.iter()
616617
.zip(right_column_statistics.iter())
617618
{
619+
has_any_distinct |= left_stat.distinct_count.get_value().is_some()
620+
|| right_stat.distinct_count.get_value().is_some();
621+
618622
let left_max_distinct = max_distinct_count(&left_num_rows, left_stat);
619623
let right_max_distinct = max_distinct_count(&right_num_rows, right_stat);
620624
let max_distinct = left_max_distinct.max(&right_max_distinct);
621625
if max_distinct.get_value().is_some() {
622-
// Seems like there are a few implementations of this algorithm that implement
623-
// exponential decay for the selectivity (like Hive's Optiq Optimizer). Needs
624-
// further exploration.
625626
join_selectivity = max_distinct;
626627
}
627628
}
628629

629-
// With the assumption that the smaller input's domain is generally represented in the bigger
630-
// input's domain, we can estimate the inner join's cardinality by taking the cartesian product
631-
// of the two inputs and normalizing it by the selectivity factor.
632630
let left_num_rows = left_stats.num_rows.get_value()?;
633631
let right_num_rows = right_stats.num_rows.get_value()?;
632+
633+
// When no actual distinct count stats are available, the selectivity
634+
// denominator falls back to max(num_rows_left, num_rows_right), which
635+
// gives cardinality = min(L, R). This severely underestimates FK joins
636+
// (e.g. warehouse(5) ⋈ catalog_sales(1.4M) → 5 instead of 1.4M).
637+
// In this case, use max(L, R) directly as a better heuristic.
638+
if !has_any_distinct {
639+
return Some(Precision::Inexact(*left_num_rows.max(right_num_rows)));
640+
}
641+
634642
match join_selectivity {
635643
Precision::Exact(value) if value > 0 => {
636644
Some(Precision::Exact((left_num_rows * right_num_rows) / value))
637645
}
638646
Precision::Inexact(value) if value > 0 => {
639647
Some(Precision::Inexact((left_num_rows * right_num_rows) / value))
640648
}
641-
// Since we don't have any information about the selectivity (which is derived
642-
// from the number of distinct rows information) we can give up here for now.
643-
// And let other passes handle this (otherwise we would need to produce an
644-
// overestimation using just the cartesian product).
649+
// Selectivity is zero (one side has no non-null values), so the join
650+
// produces no rows.
651+
Precision::Exact(0) => Some(Precision::Exact(0)),
652+
Precision::Inexact(0) => Some(Precision::Inexact(0)),
645653
_ => None,
646654
}
647655
}
@@ -2159,22 +2167,24 @@ mod tests {
21592167
Some(Inexact(10)),
21602168
),
21612169
// range(left) > range(right)
2170+
// Without distinct stats, use max(L, R) = max(10, 10) = 10
21622171
(
21632172
(10, Inexact(6), Inexact(10), Absent, Absent),
21642173
(10, Inexact(8), Inexact(10), Absent, Absent),
2165-
Some(Inexact(20)),
2174+
Some(Inexact(10)),
21662175
),
21672176
// range(right) > range(left)
21682177
(
21692178
(10, Inexact(8), Inexact(10), Absent, Absent),
21702179
(10, Inexact(6), Inexact(10), Absent, Absent),
2171-
Some(Inexact(20)),
2180+
Some(Inexact(10)),
21722181
),
21732182
// range(left) > len(left), range(right) > len(right)
2183+
// Without distinct stats, min(10, 20) = 10, so (10*20)/10 = 20
21742184
(
21752185
(10, Inexact(1), Inexact(15), Absent, Absent),
21762186
(20, Inexact(1), Inexact(40), Absent, Absent),
2177-
Some(Inexact(10)),
2187+
Some(Inexact(20)),
21782188
),
21792189
// Distinct count matches the range
21802190
(
@@ -2201,6 +2211,7 @@ mod tests {
22012211
Some(Inexact(20)),
22022212
),
22032213
// min(left) < 0 (range(left) > range(right))
2214+
// Without distinct stats, use max(L, R) = max(10, 10) = 10
22042215
(
22052216
(10, Inexact(-5), Inexact(5), Absent, Absent),
22062217
(10, Inexact(1), Inexact(5), Absent, Absent),
@@ -2222,10 +2233,11 @@ mod tests {
22222233
Some(Inexact(10)),
22232234
),
22242235
// range(left) = 1, range(right) = 1
2236+
// Without distinct stats, use max(L, R) = 10
22252237
(
22262238
(10, Inexact(1), Inexact(1), Absent, Absent),
22272239
(10, Inexact(1), Inexact(1), Absent, Absent),
2228-
Some(Inexact(100)),
2240+
Some(Inexact(10)),
22292241
),
22302242
//
22312243
// Edge cases
@@ -2275,17 +2287,18 @@ mod tests {
22752287
(10, Inexact(0), Inexact(10), Absent, Absent),
22762288
Some(Inexact(0)),
22772289
),
2278-
// distinct(left) = 0, distinct(right) = 0
2290+
// distinct(left) = 0, distinct(right) = 0: no matching keys possible
22792291
(
22802292
(10, Inexact(1), Inexact(10), Inexact(0), Absent),
22812293
(10, Inexact(1), Inexact(10), Inexact(0), Absent),
2282-
None,
2294+
Some(Inexact(0)),
22832295
),
22842296
// Inexact row count < exact null count with absent distinct count
2297+
// Without distinct stats, use max(L, R) = max(0, 10) = 10
22852298
(
22862299
(0, Inexact(1), Inexact(10), Absent, Exact(5)),
22872300
(10, Inexact(1), Inexact(10), Absent, Absent),
2288-
Some(Inexact(0)),
2301+
Some(Inexact(10)),
22892302
),
22902303
];
22912304

0 commit comments

Comments
 (0)