Skip to content

Commit c8f2fa6

Browse files
Dandandan and claude committed
fix: propagate column statistics through CAST in projections
When join keys involve CAST expressions (e.g. CAST(id AS Float64)), the column statistics were lost in ProjectionExec because it only propagated stats for plain Column and Literal expressions.

Propagate distinct_count and null_count through numeric CAST expressions (a CAST of a plain column to a numeric type) in ProjectionExprs::project_statistics. The distinct count is made Inexact since casting can reduce (but never increase) the number of distinct values. min/max/sum and byte_size are cleared since they may not be valid after type conversion.

This fixes join cardinality estimation for queries with type mismatches in join keys (common in TPC-DS, where fact table FKs are Float64 and dimension PKs are Int32).

TPC-DS Q99: 10.4s → ~60ms.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5ba06ac commit c8f2fa6

File tree

1 file changed

+64
-0
lines changed

1 file changed

+64
-0
lines changed

datafusion/physical-expr/src/projection.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,24 @@ impl ProjectionExprs {
714714
byte_size,
715715
}
716716
}
717+
} else if let Some(cast) =
718+
expr.as_any().downcast_ref::<crate::expressions::CastExpr>()
719+
&& let Some(col) = cast.expr.as_any().downcast_ref::<Column>()
720+
&& cast.cast_type().is_numeric()
721+
{
722+
// Numeric casts can only merge values (many-to-one), so the
723+
// source column's distinct count is an upper bound. Propagate
724+
// it as Inexact since the cast may reduce distinct values.
725+
let src = std::mem::take(&mut stats.column_statistics[col.index()]);
726+
ColumnStatistics {
727+
null_count: src.null_count,
728+
distinct_count: src.distinct_count.to_inexact(),
729+
// min/max/sum not valid after cast
730+
min_value: Precision::Absent,
731+
max_value: Precision::Absent,
732+
sum_value: Precision::Absent,
733+
byte_size: Precision::Absent,
734+
}
717735
} else {
718736
// TODO stats: estimate more statistics from expressions
719737
// (expressions should compute their statistics themselves)
@@ -3104,4 +3122,50 @@ pub(crate) mod tests {
31043122

31053123
Ok(())
31063124
}
3125+
3126+
/// Checks that `ProjectionExprs::project_statistics` carries column
/// statistics through a numeric `CAST` of a plain column.
///
/// The projection is `CAST(col0 AS Float64) AS col0_f64`. The expected
/// output is:
/// - `num_rows` unchanged,
/// - `distinct_count` kept but downgraded to `Inexact` (a cast may merge
///   distinct values),
/// - `null_count` passed through unchanged,
/// - `min_value`, `max_value`, `sum_value` and `byte_size` reset to
///   `Precision::Absent`, since they describe the pre-cast type.
#[test]
fn test_project_statistics_through_cast() -> Result<()> {
    let input_stats = get_stats();
    let input_schema = get_schema();

    // CAST(col0 AS Float64) — col0 has distinct_count=Exact(5)
    let cast_expr = Arc::new(crate::expressions::CastExpr::new(
        Arc::new(Column::new("col0", 0)),
        DataType::Float64,
        None,
    ));

    let projection = ProjectionExprs::new(vec![ProjectionExpr {
        expr: cast_expr,
        alias: "col0_f64".to_string(),
    }]);

    let output_stats = projection.project_statistics(
        input_stats,
        &projection.project_schema(&input_schema)?,
    )?;

    assert_eq!(output_stats.num_rows, Precision::Exact(5));
    assert_eq!(output_stats.column_statistics.len(), 1);

    let col_stats = &output_stats.column_statistics[0];

    // Distinct count preserved but made Inexact (cast may reduce it)
    assert_eq!(col_stats.distinct_count, Precision::Inexact(5));
    assert_eq!(col_stats.null_count, Precision::Exact(0));

    // min/max/sum cleared (not valid after cast)
    assert_eq!(col_stats.min_value, Precision::Absent);
    assert_eq!(col_stats.max_value, Precision::Absent);
    assert_eq!(col_stats.sum_value, Precision::Absent);

    // byte_size is also reset: the cast branch does not carry it over.
    assert_eq!(col_stats.byte_size, Precision::Absent);

    Ok(())
}
31073171
}

0 commit comments

Comments
 (0)