Skip to content

Commit 6edcb6d

Browse files
committed
refactor(stats): document and normalize sum_value widening
1 parent b35253c commit 6edcb6d

File tree

1 file changed

+54
-32
lines changed

1 file changed

+54
-32
lines changed

datafusion/common/src/stats.rs

Lines changed: 54 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,16 @@ impl Precision<ScalarValue> {
211211
}
212212
}
213213

214+
fn cast_scalar_to_sum_type(value: &ScalarValue) -> Result<ScalarValue> {
215+
let source_type = value.data_type();
216+
let target_type = Self::sum_data_type(&source_type);
217+
if source_type == target_type {
218+
Ok(value.clone())
219+
} else {
220+
value.cast_to(&target_type)
221+
}
222+
}
223+
214224
/// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
215225
/// conservatively propagating exactness information. If one of the input
216226
/// values is [`Precision::Absent`], the result is `Absent` too.
@@ -241,39 +251,21 @@ impl Precision<ScalarValue> {
241251
/// This narrows overflow risk when `sum_value` statistics are merged:
242252
/// `Int8/Int16/Int32 -> Int64` and `UInt8/UInt16/UInt32 -> UInt64`.
243253
pub fn cast_to_sum_type(&self) -> Precision<ScalarValue> {
244-
match self {
245-
Precision::Exact(value) => {
246-
let source_type = value.data_type();
247-
let target_type = Self::sum_data_type(&source_type);
248-
if source_type == target_type {
249-
Precision::Exact(value.clone())
250-
} else {
251-
value
252-
.cast_to(&target_type)
253-
.map(Precision::Exact)
254-
.unwrap_or(Precision::Absent)
255-
}
256-
}
257-
Precision::Inexact(value) => {
258-
let source_type = value.data_type();
259-
let target_type = Self::sum_data_type(&source_type);
260-
if source_type == target_type {
261-
Precision::Inexact(value.clone())
262-
} else {
263-
value
264-
.cast_to(&target_type)
265-
.map(Precision::Inexact)
266-
.unwrap_or(Precision::Absent)
267-
}
268-
}
269-
Precision::Absent => Precision::Absent,
254+
match (self.is_exact(), self.get_value()) {
255+
(Some(true), Some(value)) => Self::cast_scalar_to_sum_type(value)
256+
.map(Precision::Exact)
257+
.unwrap_or(Precision::Absent),
258+
(Some(false), Some(value)) => Self::cast_scalar_to_sum_type(value)
259+
.map(Precision::Inexact)
260+
.unwrap_or(Precision::Absent),
261+
(_, _) => Precision::Absent,
270262
}
271263
}
272264

273265
/// SUM-style addition with integer widening to match SQL `SUM` return
274266
/// types for smaller integral inputs.
275267
pub fn add_for_sum(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
276-
self.cast_to_sum_type().add(&other.cast_to_sum_type())
268+
precision_add(&self.cast_to_sum_type(), &other.cast_to_sum_type())
277269
}
278270

279271
/// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
@@ -926,7 +918,15 @@ pub struct ColumnStatistics {
926918
pub max_value: Precision<ScalarValue>,
927919
/// Minimum value of column
928920
pub min_value: Precision<ScalarValue>,
929-
/// Sum value of a column
921+
/// Sum value of a column.
922+
///
923+
/// For integral columns, values should be kept in SUM-compatible widened
924+
/// types (`Int8/Int16/Int32 -> Int64`, `UInt8/UInt16/UInt32 -> UInt64`) to
925+
/// reduce overflow risk during statistics propagation.
926+
///
927+
/// Callers should prefer [`ColumnStatistics::with_sum_value`] for setting
928+
/// this field and [`Precision<ScalarValue>::add_for_sum`] /
929+
/// [`Precision<ScalarValue>::cast_to_sum_type`] for sum arithmetic.
930930
pub sum_value: Precision<ScalarValue>,
931931
/// Number of distinct values
932932
pub distinct_count: Precision<usize>,
@@ -991,7 +991,19 @@ impl ColumnStatistics {
991991

992992
/// Set the sum value
993993
pub fn with_sum_value(mut self, sum_value: Precision<ScalarValue>) -> Self {
994-
self.sum_value = sum_value;
994+
self.sum_value = match sum_value {
995+
Precision::Exact(value) => {
996+
Precision::<ScalarValue>::cast_scalar_to_sum_type(&value)
997+
.map(Precision::Exact)
998+
.unwrap_or(Precision::Absent)
999+
}
1000+
Precision::Inexact(value) => {
1001+
Precision::<ScalarValue>::cast_scalar_to_sum_type(&value)
1002+
.map(Precision::Inexact)
1003+
.unwrap_or(Precision::Absent)
1004+
}
1005+
Precision::Absent => Precision::Absent,
1006+
};
9951007
self
9961008
}
9971009

@@ -2085,6 +2097,16 @@ mod tests {
20852097
assert_eq!(col_stats.byte_size, Precision::Exact(8192));
20862098
}
20872099

2100+
#[test]
2101+
fn test_with_sum_value_builder_widens_small_integers() {
2102+
let col_stats = ColumnStatistics::new_unknown()
2103+
.with_sum_value(Precision::Exact(ScalarValue::UInt32(Some(123))));
2104+
assert_eq!(
2105+
col_stats.sum_value,
2106+
Precision::Exact(ScalarValue::UInt64(Some(123)))
2107+
);
2108+
}
2109+
20882110
#[test]
20892111
fn test_with_fetch_scales_byte_size() {
20902112
// Test that byte_size is scaled by the row ratio in with_fetch
@@ -2232,7 +2254,7 @@ mod tests {
22322254
);
22332255
assert_eq!(
22342256
col1_stats.sum_value,
2235-
Precision::Exact(ScalarValue::Int32(Some(1100)))
2257+
Precision::Exact(ScalarValue::Int64(Some(1100)))
22362258
);
22372259

22382260
let col2_stats = &summary_stats.column_statistics[1];
@@ -2247,7 +2269,7 @@ mod tests {
22472269
);
22482270
assert_eq!(
22492271
col2_stats.sum_value,
2250-
Precision::Exact(ScalarValue::Int32(Some(2200)))
2272+
Precision::Exact(ScalarValue::Int64(Some(2200)))
22512273
);
22522274
}
22532275

@@ -2596,7 +2618,7 @@ mod tests {
25962618
);
25972619
assert_eq!(
25982620
col_stats.sum_value,
2599-
Precision::Inexact(ScalarValue::Int32(Some(1500)))
2621+
Precision::Inexact(ScalarValue::Int64(Some(1500)))
26002622
);
26012623
}
26022624
}

0 commit comments

Comments
 (0)