1515// specific language governing permissions and limitations
1616// under the License.
1717
18- use datafusion_common:: { Result , plan_err} ;
19- use datafusion_expr:: { JoinType , LogicalPlan } ;
18+ use datafusion_common:: { Column , Result , plan_err, stats :: Precision } ;
19+ use datafusion_expr:: { Expr , JoinType , LogicalPlan } ;
2020
2121use super :: join_graph:: Edge ;
2222
2323pub trait JoinCostEstimator : std:: fmt:: Debug {
24- fn cardinality ( & self , plan : & LogicalPlan ) -> Option < f64 > {
25- estimate_cardinality ( plan) . ok ( )
24+ /// Cardinality of `plan`.
25+ ///
26+ /// - `column = None`: number of output rows of `plan`.
27+ /// - `column = Some(c)`: number of distinct values of column `c`
28+ /// in `plan`'s output (NDV).
29+ fn cardinality ( & self , plan : & LogicalPlan , column : Option < & Column > ) -> Option < f64 > {
30+ estimate_cardinality ( plan, column) . ok ( )
2631 }
2732
28- fn selectivity ( & self , edge : & Edge ) -> f64 {
29- match edge. join_type {
33+ /// Estimated selectivity of joining `left` with `right` via `edge`.
34+ ///
35+ /// Default: `1 / max(NDV(left.key), NDV(right.key))` for inner equi-joins
36+ /// when both NDVs are available; otherwise a per-join-type constant.
37+ fn selectivity ( & self , edge : & Edge , left : & LogicalPlan , right : & LogicalPlan ) -> f64 {
38+ let fallback = match edge. join_type {
3039 JoinType :: Inner => 0.1 ,
3140 _ => 1.0 ,
41+ } ;
42+ if edge. join_type != JoinType :: Inner || edge. on . is_empty ( ) {
43+ return fallback;
44+ }
45+ // Use only the first equi-pair. Compounding pairwise selectivities
46+ // under independence assumptions overestimates selectivity when
47+ // composite-key columns are correlated, which is the common case.
48+ let ( a, b) = & edge. on [ 0 ] ;
49+ let ( Some ( col_a) , Some ( col_b) ) = ( key_column ( a) , key_column ( b) ) else {
50+ return fallback;
51+ } ;
52+ let ndv_a = ndv_for ( self , col_a, left, right) ;
53+ let ndv_b = ndv_for ( self , col_b, left, right) ;
54+ match ( ndv_a, ndv_b) {
55+ ( Some ( a) , Some ( b) ) if a. max ( b) > 0.0 => 1.0 / a. max ( b) ,
56+ _ => fallback,
3257 }
3358 }
3459
@@ -43,29 +68,114 @@ pub struct DefaultCostEstimator;
4368
4469impl JoinCostEstimator for DefaultCostEstimator { }
4570
46- fn estimate_cardinality ( plan : & LogicalPlan ) -> Result < f64 > {
71+ fn key_column ( expr : & Expr ) -> Option < & Column > {
72+ match expr {
73+ Expr :: Column ( c) => Some ( c) ,
74+ _ => None ,
75+ }
76+ }
77+
78+ /// Look up NDV of `column` on whichever side (left or right) owns it.
79+ fn ndv_for < E : JoinCostEstimator + ?Sized > (
80+ estimator : & E ,
81+ column : & Column ,
82+ left : & LogicalPlan ,
83+ right : & LogicalPlan ,
84+ ) -> Option < f64 > {
85+ if left. schema ( ) . has_column ( column) {
86+ estimator. cardinality ( left, Some ( column) )
87+ } else if right. schema ( ) . has_column ( column) {
88+ estimator. cardinality ( right, Some ( column) )
89+ } else {
90+ None
91+ }
92+ }
93+
94+ fn estimate_cardinality ( plan : & LogicalPlan , column : Option < & Column > ) -> Result < f64 > {
4795 match plan {
48- LogicalPlan :: Filter ( filter) => {
49- let input_cardinality = estimate_cardinality ( & filter. input ) ?;
50- Ok ( 0.1 * input_cardinality)
51- }
52- LogicalPlan :: Aggregate ( agg) => {
53- let input_cardinality = estimate_cardinality ( & agg. input ) ?;
54- Ok ( 0.1 * input_cardinality)
55- }
56- LogicalPlan :: TableScan ( _) => {
57- // The logical-plan-level `TableSource` trait doesn't expose row
58- // statistics. To use cardinalities for base relations, override
59- // `JoinCostEstimator::cardinality`.
60- plan_err ! (
61- "Default JoinCostEstimator cannot size TableScan; \
62- override `cardinality` to provide statistics"
63- )
96+ LogicalPlan :: Filter ( filter) => match column {
97+ None => {
98+ let input = estimate_cardinality ( & filter. input , None ) ?;
99+ Ok ( 0.1 * input)
100+ }
101+ Some ( c) => {
102+ // NDV is bounded above by the input's NDV and by the
103+ // surviving row count.
104+ let ndv_in = estimate_cardinality ( & filter. input , Some ( c) ) ?;
105+ let rows_out = estimate_cardinality ( plan, None ) . unwrap_or ( ndv_in) ;
106+ Ok ( ndv_in. min ( rows_out) )
107+ }
108+ } ,
109+ LogicalPlan :: Aggregate ( agg) => match column {
110+ None => {
111+ let input = estimate_cardinality ( & agg. input , None ) ?;
112+ Ok ( 0.1 * input)
113+ }
114+ Some ( c) => {
115+ // Group-by keys are unique in the aggregate's output, so
116+ // NDV(group_key) equals the post-aggregate row count.
117+ let is_group_key = agg. group_expr . iter ( ) . any ( |e| match e {
118+ Expr :: Column ( g) => g. name == c. name && g. relation == c. relation ,
119+ _ => false ,
120+ } ) ;
121+ if is_group_key {
122+ estimate_cardinality ( plan, None )
123+ } else {
124+ plan_err ! (
125+ "Cannot estimate NDV of non-group-by column \
126+ `{}` through Aggregate",
127+ c. name
128+ )
129+ }
130+ }
131+ } ,
132+ LogicalPlan :: TableScan ( scan) => {
133+ let stats = scan. source . statistics ( ) . ok_or_else ( || {
134+ datafusion_common:: DataFusionError :: Plan ( format ! (
135+ "TableSource for `{}` does not expose statistics" ,
136+ scan. table_name
137+ ) )
138+ } ) ?;
139+ match column {
140+ None => match stats. num_rows {
141+ Precision :: Exact ( n) | Precision :: Inexact ( n) => Ok ( n as f64 ) ,
142+ Precision :: Absent => plan_err ! (
143+ "TableSource for `{}` does not provide a row count" ,
144+ scan. table_name
145+ ) ,
146+ } ,
147+ Some ( c) => {
148+ // `column_statistics` is indexed by the source schema
149+ // (pre-projection), so resolve the column there.
150+ let idx = scan. source . schema ( ) . index_of ( & c. name ) . map_err ( |_| {
151+ datafusion_common:: DataFusionError :: Plan ( format ! (
152+ "Column `{}` not found in source schema of `{}`" ,
153+ c. name, scan. table_name
154+ ) )
155+ } ) ?;
156+ let col_stats =
157+ stats. column_statistics . get ( idx) . ok_or_else ( || {
158+ datafusion_common:: DataFusionError :: Plan ( format ! (
159+ "Column statistics missing for index {idx} \
160+ on `{}`",
161+ scan. table_name
162+ ) )
163+ } ) ?;
164+ match col_stats. distinct_count {
165+ Precision :: Exact ( n) | Precision :: Inexact ( n) => Ok ( n as f64 ) ,
166+ Precision :: Absent => plan_err ! (
167+ "Column `{}` on `{}` has no distinct-count statistic" ,
168+ c. name,
169+ scan. table_name
170+ ) ,
171+ }
172+ }
173+ }
64174 }
65175 x => {
66176 let inputs = x. inputs ( ) ;
67177 if inputs. len ( ) == 1 {
68- estimate_cardinality ( inputs[ 0 ] )
178+ estimate_cardinality ( inputs[ 0 ] , column )
69179 } else {
70180 plan_err ! ( "Cannot estimate cardinality for plan with multiple inputs" )
71181 }
0 commit comments