Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
33ccecc
Refactor approx_percentile_cont UDFs to use numeric coercion
theirix Mar 20, 2026
3b48e04
Update SLT of approx_percentile_cont to return float
theirix Mar 20, 2026
4637df2
Refactor test UDF stub to use numeric coercion
theirix Mar 20, 2026
968a851
Deprecate aggregates NUMERICS and INTEGERS arrays
theirix Mar 20, 2026
d2bccf3
Update unit test for approx_percentile_cont to return floats
theirix Mar 20, 2026
6fd2148
Update unit test for stubfunction
theirix Mar 20, 2026
ff1eb8b
Update aggregate SLT to use float
theirix Mar 20, 2026
c04052c
Fix flaky test
theirix Mar 20, 2026
0a17a19
Format
theirix Mar 20, 2026
add22d8
Debug flaky test
theirix Mar 20, 2026
98cdc25
Remove flaky assertion
theirix Mar 20, 2026
09c0b90
Use selection of numeric types for example
theirix Mar 22, 2026
abd8b09
Remove TODO as arrays are deprecated
theirix Mar 22, 2026
2ec7332
Revert "Use selection of numeric types for example"
theirix Mar 22, 2026
f19de5f
Merge branch 'main' into rm-aggregates-integers
theirix Mar 22, 2026
2fd9259
Better error for null parameter
theirix Mar 22, 2026
9d2790d
Merge branch 'main' into rm-aggregates-integers
theirix Mar 23, 2026
498a825
Merge branch 'main' into rm-aggregates-integers
theirix Mar 24, 2026
4995989
Merge branch 'main' into rm-aggregates-integers
theirix Apr 10, 2026
ae04043
Change median return type to float
theirix Apr 16, 2026
793609e
Remove integer code path for approx_percentile
theirix Apr 16, 2026
a08f2ff
Change signature of median (aliased to approx_percentile_cont)
theirix Apr 16, 2026
b7b3020
Update median unit tests
theirix Apr 16, 2026
462bfa8
Update median SLTs
theirix Apr 16, 2026
e242290
Clarify signature
theirix Apr 16, 2026
aa2c8c1
Merge branch 'main' into rm-aggregates-integers
theirix Apr 17, 2026
e6752c2
Add upgrade notes for 54
theirix Apr 17, 2026
858bbd8
Run prettier
theirix Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions datafusion/core/tests/dataframe/dataframe_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ async fn test_fn_approx_median() -> Result<()> {
+-----------------------+
| approx_median(test.b) |
+-----------------------+
| 10 |
| 10.0 |
+-----------------------+
");

Expand All @@ -422,7 +422,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+---------------------------------------------------------------------------+
| approx_percentile_cont(Float64(0.5)) WITHIN GROUP [test.b ASC NULLS LAST] |
+---------------------------------------------------------------------------+
| 10 |
| 10.0 |
+---------------------------------------------------------------------------+
");

Expand All @@ -437,7 +437,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+----------------------------------------------------------------------------+
| approx_percentile_cont(Float64(0.1)) WITHIN GROUP [test.b DESC NULLS LAST] |
+----------------------------------------------------------------------------+
| 100 |
| 100.0 |
+----------------------------------------------------------------------------+
");

Expand All @@ -457,7 +457,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+--------------------------------------------------------------------+
| approx_percentile_cont(arg_2) WITHIN GROUP [test.b ASC NULLS LAST] |
+--------------------------------------------------------------------+
| 10 |
| 10.0 |
+--------------------------------------------------------------------+
"
);
Expand All @@ -477,7 +477,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+---------------------------------------------------------------------+
| approx_percentile_cont(arg_2) WITHIN GROUP [test.b DESC NULLS LAST] |
+---------------------------------------------------------------------+
| 100 |
| 100.0 |
+---------------------------------------------------------------------+
"
);
Expand All @@ -494,7 +494,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+------------------------------------------------------------------------------------+
| approx_percentile_cont(Float64(0.5),Int32(2)) WITHIN GROUP [test.b ASC NULLS LAST] |
+------------------------------------------------------------------------------------+
| 30 |
| 30.25 |
+------------------------------------------------------------------------------------+
");

Expand All @@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+-------------------------------------------------------------------------------------+
| approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] |
+-------------------------------------------------------------------------------------+
| 69 |
| 69.85 |
+-------------------------------------------------------------------------------------+
");

Expand Down
40 changes: 20 additions & 20 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1204,26 +1204,26 @@ async fn window_using_aggregates() -> Result<()> {
| first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 |
+-------------+----------+-----------------+---------------+--------+-----+------+----+------+
| | | | | | | | 1 | -85 |
| -85 | -101 | 14 | -12 | -12 | 83 | -101 | 4 | -54 |
| -85 | -101 | 17 | -25 | -25 | 83 | -101 | 5 | -31 |
| -85 | -12 | 10 | -32 | -34 | 83 | -85 | 3 | 13 |
| -85 | -25 | 3 | -56 | -56 | -25 | -85 | 1 | -5 |
| -85 | -31 | 18 | -29 | -28 | 83 | -101 | 5 | 36 |
| -85 | -38 | 16 | -25 | -25 | 83 | -101 | 4 | 65 |
| -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 |
| -85 | -48 | 6 | -35 | -36 | 83 | -85 | 2 | -43 |
| -85 | -5 | 4 | -37 | -40 | -5 | -85 | 1 | 83 |
| -85 | -54 | 15 | -17 | -18 | 83 | -101 | 4 | -38 |
| -85 | -56 | 2 | -70 | -70 | -56 | -85 | 1 | -25 |
| -85 | -72 | 9 | -43 | -43 | 83 | -85 | 3 | -12 |
| -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 |
| -85 | 13 | 11 | -17 | -18 | 83 | -85 | 3 | 14 |
| -85 | 13 | 11 | -25 | -25 | 83 | -85 | 3 | 13 |
| -85 | 14 | 12 | -12 | -12 | 83 | -85 | 3 | 17 |
| -85 | 17 | 13 | -11 | -8 | 83 | -85 | 4 | -101 |
| -85 | 45 | 8 | -34 | -34 | 83 | -85 | 3 | -72 |
| -85 | 65 | 17 | -17 | -18 | 83 | -101 | 5 | -101 |
| -85 | 83 | 5 | -25 | -25 | 83 | -85 | 2 | -48 |
| -85 | -101 | 14 | -12.0 | -12 | 83 | -101 | 4 | -54 |
| -85 | -101 | 17 | -25.0 | -25 | 83 | -101 | 5 | -31 |
| -85 | -12 | 10 | -32.75 | -34 | 83 | -85 | 3 | 13 |
| -85 | -25 | 3 | -56.0 | -56 | -25 | -85 | 1 | -5 |
| -85 | -31 | 18 | -29.75 | -28 | 83 | -101 | 5 | 36 |
| -85 | -38 | 16 | -25.0 | -25 | 83 | -101 | 4 | 65 |
| -85 | -43 | 7 | -43.0 | -43 | 83 | -85 | 2 | 45 |
| -85 | -48 | 6 | -35.75 | -36 | 83 | -85 | 2 | -43 |
| -85 | -5 | 4 | -37.75 | -40 | -5 | -85 | 1 | 83 |
| -85 | -54 | 15 | -17.0 | -18 | 83 | -101 | 4 | -38 |
| -85 | -56 | 2 | -70.5 | -70 | -56 | -85 | 1 | -25 |
| -85 | -72 | 9 | -43.0 | -43 | 83 | -85 | 3 | -12 |
| -85 | -85 | 1 | -85.0 | -85 | -85 | -85 | 1 | -56 |
| -85 | 13 | 11 | -17.0 | -18 | 83 | -85 | 3 | 14 |
| -85 | 13 | 11 | -25.0 | -25 | 83 | -85 | 3 | 13 |
| -85 | 14 | 12 | -12.0 | -12 | 83 | -85 | 3 | 17 |
| -85 | 17 | 13 | -11.25 | -8 | 83 | -85 | 4 | -101 |
| -85 | 45 | 8 | -34.5 | -34 | 83 | -85 | 3 | -72 |
| -85 | 65 | 17 | -17.0 | -18 | 83 | -101 | 5 | -101 |
| -85 | 83 | 5 | -25.0 | -25 | 83 | -85 | 2 | -48 |
+-------------+----------+-----------------+---------------+--------+-----+------+----+------+
"
);
Expand Down
15 changes: 14 additions & 1 deletion datafusion/expr-common/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ use std::fmt::Display;
use std::hash::Hash;
use std::sync::Arc;

use crate::type_coercion::aggregates::NUMERICS;
use arrow::datatypes::{
DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, DECIMAL128_MAX_PRECISION, DataType,
Decimal128Type, DecimalType, Field, IntervalUnit, TimeUnit,
Expand Down Expand Up @@ -596,6 +595,20 @@ impl Display for ArrayFunctionArgument {
}
}

/// Primitive numeric Arrow data types: signed integers (`Int8`..`Int64`),
/// unsigned integers (`UInt8`..`UInt64`), then floats (`Float16`..`Float64`).
///
/// Decimal types are not part of this list.
// NOTE(review): this looks like a module-private copy of the deprecated
// `type_coercion::aggregates::NUMERICS` — confirm the two lists stay in sync.
static NUMERICS: &[DataType] = &[
DataType::Int8,
DataType::Int16,
DataType::Int32,
DataType::Int64,
DataType::UInt8,
DataType::UInt16,
DataType::UInt32,
DataType::UInt64,
DataType::Float16,
DataType::Float32,
DataType::Float64,
];

impl TypeSignature {
pub fn to_string_repr(&self) -> Vec<String> {
match self {
Expand Down
4 changes: 2 additions & 2 deletions datafusion/expr-common/src/type_coercion/aggregates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ use arrow::datatypes::{DataType, FieldRef};

use datafusion_common::{Result, internal_err, plan_err};

// TODO: remove usage of these (INTEGERS and NUMERICS) in favour of signatures
// see https://github.com/apache/datafusion/issues/18092
#[deprecated(since = "54.0.0", note = "Use functions signatures")]
Comment thread
Jefffrey marked this conversation as resolved.
pub static INTEGERS: &[DataType] = &[
DataType::Int8,
DataType::Int16,
Expand All @@ -33,6 +32,7 @@ pub static INTEGERS: &[DataType] = &[
DataType::UInt64,
];

#[deprecated(since = "54.0.0", note = "Use functions signatures")]
pub static NUMERICS: &[DataType] = &[
DataType::Int8,
DataType::Int16,
Expand Down
20 changes: 17 additions & 3 deletions datafusion/expr/src/test/function_stub.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@ use datafusion_common::plan_err;
use datafusion_common::{Result, exec_err, not_impl_err, utils::take_function_args};

use crate::Volatility::Immutable;
use crate::type_coercion::aggregates::NUMERICS;
use crate::{
Accumulator, AggregateUDFImpl, Expr, GroupsAccumulator, ReversedUDAF, Signature,
Accumulator, AggregateUDFImpl, Coercion, Expr, GroupsAccumulator, ReversedUDAF,
Signature, TypeSignature, TypeSignatureClass,
expr::AggregateFunction,
function::{AccumulatorArgs, StateFieldsArgs},
utils::AggregateOrderSensitivity,
};
use datafusion_common::types::{NativeType, logical_float64};

macro_rules! create_func {
($UDAF:ty, $AGGREGATE_UDF_FN:ident) => {
Expand Down Expand Up @@ -444,9 +445,22 @@ pub struct Avg {

impl Avg {
pub fn new() -> Self {
let signature = Signature::one_of(
vec![
TypeSignature::Coercible(vec![Coercion::new_exact(
TypeSignatureClass::Decimal,
)]),
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Native(logical_float64()),
vec![TypeSignatureClass::Integer, TypeSignatureClass::Float],
NativeType::Float64,
)]),
],
Immutable,
);
Self {
aliases: vec![String::from("mean")],
signature: Signature::uniform(1, NUMERICS.to_vec(), Immutable),
signature,
}
}
}
Expand Down
15 changes: 5 additions & 10 deletions datafusion/functions-aggregate/src/approx_median.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,11 @@ impl ApproxMedian {
pub fn new() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Coercible(vec![Coercion::new_exact(
TypeSignatureClass::Integer,
)]),
TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Float,
vec![TypeSignatureClass::Decimal],
NativeType::Float64,
)]),
],
vec![TypeSignature::Coercible(vec![Coercion::new_implicit(
TypeSignatureClass::Float,
vec![TypeSignatureClass::Numeric],
NativeType::Float64,
)])],
Volatility::Immutable,
),
}
Expand Down
115 changes: 42 additions & 73 deletions datafusion/functions-aggregate/src/approx_percentile_cont.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,20 @@ use arrow::array::{Array, Float16Array};
use arrow::compute::{filter, is_not_null};
use arrow::datatypes::FieldRef;
use arrow::{
array::{
ArrayRef, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
Int64Array, UInt8Array, UInt16Array, UInt32Array, UInt64Array,
},
array::{ArrayRef, Float32Array, Float64Array},
datatypes::{DataType, Field},
};
use datafusion_common::types::{NativeType, logical_float64};
use datafusion_common::{
DataFusionError, Result, ScalarValue, downcast_value, internal_err, not_impl_err,
plan_err,
};
use datafusion_expr::expr::{AggregateFunction, Sort};
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
use datafusion_expr::utils::format_state_name;
use datafusion_expr::{
Accumulator, AggregateUDFImpl, Documentation, Expr, Signature, TypeSignature,
Volatility,
Accumulator, AggregateUDFImpl, Coercion, Documentation, Expr, Signature,
TypeSignature, TypeSignatureClass, Volatility,
};
use datafusion_functions_aggregate_common::tdigest::{DEFAULT_MAX_SIZE, TDigest};
use datafusion_macros::user_doc;
Expand Down Expand Up @@ -132,22 +129,44 @@ impl Default for ApproxPercentileCont {
impl ApproxPercentileCont {
/// Create a new [`ApproxPercentileCont`] aggregate function.
pub fn new() -> Self {
let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() + 1));
// Accept any numeric value paired with a float64 percentile
for num in NUMERICS {
variants.push(TypeSignature::Exact(vec![num.clone(), DataType::Float64]));
// Additionally accept an integer number of centroids for T-Digest
for int in INTEGERS {
variants.push(TypeSignature::Exact(vec![
num.clone(),
DataType::Float64,
int.clone(),
]))
}
}
Self {
signature: Signature::one_of(variants, Volatility::Immutable),
}
let signature = Signature::one_of(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're now coercing to floats, we can remove some of the implementation code that handles integer types

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, removed some code. This also affected approx_median, which piggybacks on approx_percentile_cont, so I updated its signature and tests as well.

I clarified the scope of this PR in the description, since it changes more than originally expected.

vec![
// 2 args - numeric, percentile (float)
TypeSignature::Coercible(vec![
Coercion::new_implicit(
TypeSignatureClass::Float,
vec![TypeSignatureClass::Numeric],
NativeType::Float64,
),
Coercion::new_implicit(
TypeSignatureClass::Native(logical_float64()),
vec![TypeSignatureClass::Numeric],
NativeType::Float64,
),
]),
// 3 args - numeric, percentile (float), number of centroids for T-Digest (integer)
TypeSignature::Coercible(vec![
Coercion::new_implicit(
TypeSignatureClass::Float,
vec![TypeSignatureClass::Numeric],
NativeType::Float64,
),
Coercion::new_implicit(
TypeSignatureClass::Native(logical_float64()),
vec![TypeSignatureClass::Numeric],
NativeType::Float64,
),
Coercion::new_implicit(
TypeSignatureClass::Integer,
vec![TypeSignatureClass::Numeric],
NativeType::Int64,
),
]),
],
Volatility::Immutable,
);
Self { signature }
}

pub(crate) fn create_accumulator(
Expand Down Expand Up @@ -177,17 +196,7 @@ impl ApproxPercentileCont {

let data_type = args.expr_fields[0].data_type();
let accumulator: ApproxPercentileAccumulator = match data_type {
DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::Float16
| DataType::Float32
| DataType::Float64 => {
DataType::Float16 | DataType::Float32 | DataType::Float64 => {
if let Some(max_size) = tdigest_max_size {
ApproxPercentileAccumulator::new_with_max_size(
percentile,
Expand Down Expand Up @@ -374,38 +383,6 @@ impl ApproxPercentileAccumulator {
.map(|v| v.to_f64())
.collect::<Vec<_>>())
}
DataType::Int64 => {
let array = downcast_value!(values, Int64Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::Int32 => {
let array = downcast_value!(values, Int32Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::Int16 => {
let array = downcast_value!(values, Int16Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::Int8 => {
let array = downcast_value!(values, Int8Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::UInt64 => {
let array = downcast_value!(values, UInt64Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::UInt32 => {
let array = downcast_value!(values, UInt32Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::UInt16 => {
let array = downcast_value!(values, UInt16Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
DataType::UInt8 => {
let array = downcast_value!(values, UInt8Array);
Ok(array.values().iter().map(|v| *v as f64).collect::<Vec<_>>())
}
e => internal_err!(
"APPROX_PERCENTILE_CONT is not expected to receive the type {e:?}"
),
Expand Down Expand Up @@ -439,14 +416,6 @@ impl Accumulator for ApproxPercentileAccumulator {
// These acceptable return types MUST match the validation in
// ApproxPercentileCont::create_accumulator.
Ok(match &self.return_type {
DataType::Int8 => ScalarValue::Int8(Some(q as i8)),
DataType::Int16 => ScalarValue::Int16(Some(q as i16)),
DataType::Int32 => ScalarValue::Int32(Some(q as i32)),
DataType::Int64 => ScalarValue::Int64(Some(q as i64)),
DataType::UInt8 => ScalarValue::UInt8(Some(q as u8)),
DataType::UInt16 => ScalarValue::UInt16(Some(q as u16)),
DataType::UInt32 => ScalarValue::UInt32(Some(q as u32)),
DataType::UInt64 => ScalarValue::UInt64(Some(q as u64)),
DataType::Float16 => ScalarValue::Float16(Some(half::f16::from_f64(q))),
DataType::Float32 => ScalarValue::Float32(Some(q as f32)),
DataType::Float64 => ScalarValue::Float64(Some(q)),
Expand Down
Loading
Loading