From ba74cd22bfacfd1168ade5e6f04c3c5db448ea6c Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 18 Feb 2026 15:32:54 -0500 Subject: [PATCH 01/17] Revised approach to fixing numeric/string type coercion --- datafusion/core/src/physical_planner.rs | 6 +- datafusion/core/tests/expr_api/mod.rs | 31 +- datafusion/core/tests/sql/unparser.rs | 16 +- .../expr-common/src/interval_arithmetic.rs | 4 +- datafusion/expr-common/src/signature.rs | 21 +- .../expr-common/src/type_coercion/binary.rs | 316 +++++------- .../type_coercion/binary/tests/comparison.rs | 108 +++- .../type_coercion/binary/tests/dictionary.rs | 20 +- .../binary/tests/run_end_encoded.rs | 20 +- .../expr/src/type_coercion/functions.rs | 4 +- datafusion/expr/src/type_coercion/other.rs | 66 ++- datafusion/functions/src/core/nvl2.rs | 10 +- .../optimizer/src/analyzer/type_coercion.rs | 21 +- .../physical-expr/src/expressions/case.rs | 6 +- datafusion/sqllogictest/test_files/delete.slt | 4 +- .../sqllogictest/test_files/dictionary.slt | 7 +- .../test_files/push_down_filter_parquet.slt | 18 - .../test_files/string/string_query.slt.part | 33 +- .../test_files/string_numeric_coercion.slt | 479 ++++++++++++++++++ .../tests/cases/roundtrip_logical_plan.rs | 2 +- 20 files changed, 855 insertions(+), 337 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/string_numeric_coercion.slt diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 8d52f555bbf76..2bd3756e5f785 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -3436,8 +3436,9 @@ mod tests { #[tokio::test] async fn in_list_types() -> Result<()> { - // expression: "a in ('a', 1)" - let list = vec![lit("a"), lit(1i64)]; + // expression: "c1 in ('a', 'b')" where c1 is Utf8 + // Tests that IN list coercion works with compatible string types. + let list = vec![lit("a"), lit("b")]; let logical_plan = test_csv_scan() .await? // filter clause needs the type coercion rule applied @@ -3445,7 +3446,6 @@ mod tests { .project(vec![col("c1").in_list(list, false)])? .build()?; let execution_plan = plan(&logical_plan).await?; - // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index 91dd5de7fcd64..bd2288ae9ebae 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -342,20 +342,25 @@ fn test_create_physical_expr_nvl2() { #[tokio::test] async fn test_create_physical_expr_coercion() { - // create_physical_expr does apply type coercion and unwrapping in cast + // create_physical_expr applies type coercion (and can unwrap/fold + // literal casts). Comparison coercion prefers numeric types, so + // string/int comparisons cast the string side to the numeric type. // - // expect the cast on the literals - // compare string function to int `id = 1` - create_expr_test(col("id").eq(lit(1i32)), "id@0 = CAST(1 AS Utf8)"); - create_expr_test(lit(1i32).eq(col("id")), "CAST(1 AS Utf8) = id@0"); - // compare int col to string literal `i = '202410'` - // Note this casts the column (not the field) - create_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410"); - create_expr_test(lit("202410").eq(col("i")), "202410 = CAST(i@1 AS Utf8)"); - // however, when simplified the casts on i should removed - // https://github.com/apache/datafusion/issues/14944 - create_simplified_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410"); - create_simplified_expr_test(lit("202410").eq(col("i")), "CAST(i@1 AS Utf8) = 202410"); + // string column vs int literal: id (Utf8) is cast to Int32 + create_expr_test(col("id").eq(lit(1i32)), "CAST(id@0 AS Int32) = 1"); + create_expr_test(lit(1i32).eq(col("id")), "1 = CAST(id@0 AS Int32)"); + // int column vs string literal: the string literal is cast to Int64 + create_expr_test(col("i").eq(lit("202410")), "i@1 = CAST(202410 AS Int64)"); + create_expr_test(lit("202410").eq(col("i")), "CAST(202410 AS Int64) = i@1"); + // when simplified, the literal cast is constant-folded + create_simplified_expr_test( + col("i").eq(lit("202410")), + "i@1 = CAST(202410 AS Int64)", + ); + create_simplified_expr_test( + lit("202410").eq(col("i")), + "i@1 = CAST(202410 AS Int64)", + ); } /// Evaluates the specified expr as an aggregate and compares the result to the diff --git a/datafusion/core/tests/sql/unparser.rs b/datafusion/core/tests/sql/unparser.rs index ab1015b2d18d9..47d30871c268b 100644 --- a/datafusion/core/tests/sql/unparser.rs +++ b/datafusion/core/tests/sql/unparser.rs @@ -143,16 +143,26 @@ fn tpch_queries() -> Vec { } /// Create a new SessionContext for testing that has all Clickbench tables registered. +/// +/// Registers the raw Parquet as `hits_raw`, then creates a `hits` view that +/// casts `EventDate` from UInt16 (day-offset) to DATE. This mirrors the +/// approach used by the benchmark runner in `benchmarks/src/clickbench.rs`. async fn clickbench_test_context() -> Result { let ctx = SessionContext::new(); ctx.register_parquet( - "hits", + "hits_raw", "tests/data/clickbench_hits_10.parquet", ParquetReadOptions::default(), ) .await?; - // Sanity check we found the table by querying it's schema, it should not be empty - // Otherwise if the path is wrong the tests will all fail in confusing ways + ctx.sql( + r#"CREATE VIEW hits AS + SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" + FROM hits_raw"#, + ) + .await?; + // Sanity check we found the table by querying its schema let df = ctx.sql("SELECT * FROM hits LIMIT 1").await?; assert!( !df.schema().fields().is_empty(), diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index f93ef3b79595b..fb4f1f37b8ced 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -22,7 +22,7 @@ use std::fmt::{self, Display, Formatter}; use std::ops::{AddAssign, SubAssign}; use crate::operator::Operator; -use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion_numeric}; +use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion}; use arrow::compute::{CastOptions, cast_with_options}; use arrow::datatypes::{ @@ -734,7 +734,7 @@ impl Interval { (self.lower.clone(), self.upper.clone(), rhs.clone()) } else { let maybe_common_type = - comparison_coercion_numeric(&self.data_type(), &rhs.data_type()); + comparison_coercion(&self.data_type(), &rhs.data_type()); assert_or_internal_err!( maybe_common_type.is_some(), "Data types must be compatible for containment checks, lhs:{}, rhs:{}", diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 82759be9f75e8..f7ed270126470 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -158,7 +158,7 @@ pub enum Arity { pub enum TypeSignature { /// One or more arguments of a common type out of a list of valid types. /// - /// For functions that take no arguments (e.g. `random()` see [`TypeSignature::Nullary`]). + /// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]). /// /// # Examples /// @@ -184,7 +184,7 @@ pub enum TypeSignature { Uniform(usize, Vec), /// One or more arguments with exactly the specified types in order. /// - /// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. Exact(Vec), /// One or more arguments belonging to the [`TypeSignatureClass`], in order. /// @@ -192,12 +192,12 @@ pub enum TypeSignature { /// casts. For example, if you expect a function has string type, but you /// also allow it to be casted from binary type. /// - /// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]. Coercible(Vec), /// One or more arguments coercible to a single, comparable type. /// /// Each argument will be coerced to a single type using the - /// coercion rules described in [`comparison_coercion_numeric`]. + /// coercion rules described in [`comparison_coercion`]. /// /// # Examples /// @@ -205,17 +205,18 @@ pub enum TypeSignature { /// the types will both be coerced to `i64` before the function is invoked. /// /// If the `nullif('1', 2)` function is called with `Utf8` and `i64` arguments - /// the types will both be coerced to `Utf8` before the function is invoked. + /// the types will both be coerced to `Int64` before the function is invoked + /// (numeric is preferred over string). /// /// Note: - /// - For functions that take no arguments (e.g. `random()` see [`TypeSignature::Nullary`]). + /// - For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]). /// - If all arguments have type [`DataType::Null`], they are coerced to `Utf8` /// - /// [`comparison_coercion_numeric`]: crate::type_coercion::binary::comparison_coercion_numeric + /// [`comparison_coercion`]: crate::type_coercion::binary::comparison_coercion Comparable(usize), /// One or more arguments of arbitrary types. /// - /// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. Any(usize), /// Matches exactly one of a list of [`TypeSignature`]s. /// @@ -233,7 +234,7 @@ pub enum TypeSignature { /// /// See [`NativeType::is_numeric`] to know which type is considered numeric /// - /// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. /// /// [`NativeType::is_numeric`]: datafusion_common::types::NativeType::is_numeric Numeric(usize), @@ -246,7 +247,7 @@ pub enum TypeSignature { /// For example, if a function is called with (utf8, large_utf8), all /// arguments will be coerced to `LargeUtf8` /// - /// For functions that take no arguments (e.g. `random()` use [`TypeSignature::Nullary`]). + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]). String(usize), /// No arguments Nullary, diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index fa109e38a4382..086b06ce189f3 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -33,7 +33,6 @@ use arrow::datatypes::{ DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DataType, Field, FieldRef, Fields, TimeUnit, }; -use datafusion_common::types::NativeType; use datafusion_common::{ Diagnostic, Result, Span, Spans, exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, @@ -745,10 +744,10 @@ fn type_union_resolution_coercion( // Numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| list_coercion(lhs_type, rhs_type)) + .or_else(|| list_coercion(lhs_type, rhs_type, type_union_coercion)) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) + .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) .or_else(|| binary_coercion(lhs_type, rhs_type)) } } @@ -843,102 +842,100 @@ pub fn try_type_union_resolution_with_struct( Ok(final_struct_types) } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a -/// comparison operation +/// Coerce `lhs_type` and `rhs_type` to a common type for type unification +/// contexts — where two values must be brought to a common type but are not +/// being compared. Examples: UNION, CASE THEN/ELSE branches, NVL2. /// -/// Example comparison operations are `lhs = rhs` and `lhs > rhs` +/// Prefers strings over numeric types (e.g., `SELECT 1 UNION SELECT '2'` +/// coerces both sides to `Utf8`). /// -/// Binary comparison kernels require the two arguments to be the (exact) same -/// data type. However, users can write queries where the two arguments are -/// different data types. In such cases, the data types are automatically cast -/// (coerced) to a single data type to pass to the kernels. -/// -/// # Numeric comparisons -/// -/// When comparing numeric values, the lower precision type is coerced to the -/// higher precision type to avoid losing data. For example when comparing -/// `Int32` to `Int64` the coerced type is `Int64` so the `Int32` argument will -/// be cast. -/// -/// # Numeric / String comparisons -/// -/// When comparing numeric values and strings, both values will be coerced to -/// strings. For example when comparing `'2' > 1`, the arguments will be -/// coerced to `Utf8` for comparison -pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// For comparisons (`=`, `<`, `>`), use [`comparison_coercion`] instead. +pub fn type_union_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type.equals_datatype(rhs_type) { - // same type => equality is possible return Some(lhs_type.clone()); } binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, true)) - .or_else(|| ree_comparison_coercion(lhs_type, rhs_type, true)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true, type_union_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, true, type_union_coercion)) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| list_coercion(lhs_type, rhs_type)) + .or_else(|| list_coercion(lhs_type, rhs_type, type_union_coercion)) .or_else(|| null_coercion(lhs_type, rhs_type)) - .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) + .or_else(|| string_numeric_union_coercion(lhs_type, rhs_type)) .or_else(|| string_temporal_coercion(lhs_type, rhs_type)) .or_else(|| binary_coercion(lhs_type, rhs_type)) - .or_else(|| struct_coercion(lhs_type, rhs_type)) - .or_else(|| map_coercion(lhs_type, rhs_type)) + .or_else(|| struct_coercion(lhs_type, rhs_type, type_union_coercion)) + .or_else(|| map_coercion(lhs_type, rhs_type, type_union_coercion)) } -/// Similar to [`comparison_coercion`] but prefers numeric if compares with -/// numeric and string +/// Coerce `lhs_type` and `rhs_type` to a common type for comparison +/// contexts — any context where two values are compared rather than +/// unified. This includes binary comparison operators, IN lists, +/// CASE/WHEN conditions, and BETWEEN. +/// +/// When the two types differ, this function determines the common type +/// to cast to. /// /// # Numeric comparisons /// -/// When comparing numeric values and strings, the values will be coerced to the -/// numeric type. For example, `'2' > 1` if `1` is an `Int32`, the arguments -/// will be coerced to `Int32`. -pub fn comparison_coercion_numeric( - lhs_type: &DataType, - rhs_type: &DataType, -) -> Option { - if lhs_type == rhs_type { +/// The lower precision type is widened to the higher precision type +/// (e.g., `Int32` vs `Int64` → `Int64`). +/// +/// # Numeric / String comparisons +/// +/// Prefers the numeric type (e.g., `'2' > 1` where `1` is `Int32` coerces +/// `'2'` to `Int32`). +/// +/// For type unification contexts (UNION, CASE THEN/ELSE), use +/// [`type_union_coercion`] instead. +pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { + if lhs_type.equals_datatype(rhs_type) { // same type => equality is possible return Some(lhs_type.clone()); } binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_comparison_coercion_numeric(lhs_type, rhs_type, true)) - .or_else(|| ree_comparison_coercion_numeric(lhs_type, rhs_type, true)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true, comparison_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, true, comparison_coercion)) + .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) + .or_else(|| list_coercion(lhs_type, rhs_type, comparison_coercion)) .or_else(|| null_coercion(lhs_type, rhs_type)) - .or_else(|| string_numeric_coercion_as_numeric(lhs_type, rhs_type)) + .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) + .or_else(|| string_temporal_coercion(lhs_type, rhs_type)) + .or_else(|| binary_coercion(lhs_type, rhs_type)) + .or_else(|| struct_coercion(lhs_type, rhs_type, comparison_coercion)) + .or_else(|| map_coercion(lhs_type, rhs_type, comparison_coercion)) } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// where one is numeric and one is `Utf8`/`LargeUtf8`. +/// Coerce a numeric/string pair to the numeric type. +/// +/// Used by [`comparison_coercion`]. fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Utf8, _) if rhs_type.is_numeric() => Some(Utf8), - (LargeUtf8, _) if rhs_type.is_numeric() => Some(LargeUtf8), - (Utf8View, _) if rhs_type.is_numeric() => Some(Utf8View), - (_, Utf8) if lhs_type.is_numeric() => Some(Utf8), - (_, LargeUtf8) if lhs_type.is_numeric() => Some(LargeUtf8), - (_, Utf8View) if lhs_type.is_numeric() => Some(Utf8View), + (lhs, Utf8 | LargeUtf8 | Utf8View) if lhs.is_numeric() => Some(lhs.clone()), + (Utf8 | LargeUtf8 | Utf8View, rhs) if rhs.is_numeric() => Some(rhs.clone()), _ => None, } } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// where one is numeric and one is `Utf8`/`LargeUtf8`. -fn string_numeric_coercion_as_numeric( +/// Coerce a numeric/string pair to the string type. +/// +/// Used by [`type_union_coercion`]. +fn string_numeric_union_coercion( lhs_type: &DataType, rhs_type: &DataType, ) -> Option { - let lhs_logical_type = NativeType::from(lhs_type); - let rhs_logical_type = NativeType::from(rhs_type); - if lhs_logical_type.is_numeric() && rhs_logical_type == NativeType::String { - return Some(lhs_type.to_owned()); - } - if rhs_logical_type.is_numeric() && lhs_logical_type == NativeType::String { - return Some(rhs_type.to_owned()); + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (lhs @ (Utf8 | LargeUtf8 | Utf8View), _) if rhs_type.is_numeric() => { + Some(lhs.clone()) + } + (_, rhs @ (Utf8 | LargeUtf8 | Utf8View)) if lhs_type.is_numeric() => { + Some(rhs.clone()) + } + _ => None, } - - None } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation @@ -1230,7 +1227,13 @@ fn coerce_numeric_type_to_decimal256(numeric_type: &DataType) -> Option Option { +/// Coerce two struct types by recursively coercing their fields using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). +fn struct_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -1254,10 +1257,10 @@ fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option // // See docs/source/user-guide/sql/struct_coercion.md for detailed examples. if fields_have_same_names(lhs_fields, rhs_fields) { - return coerce_struct_by_name(lhs_fields, rhs_fields); + return coerce_struct_by_name(lhs_fields, rhs_fields, coerce_fn); } - coerce_struct_by_position(lhs_fields, rhs_fields) + coerce_struct_by_position(lhs_fields, rhs_fields, coerce_fn) } _ => None, } @@ -1300,8 +1303,13 @@ fn fields_have_same_names(lhs_fields: &Fields, rhs_fields: &Fields) -> bool { .all(|lf| rhs_names.contains(lf.name().as_str())) } -/// Coerce two structs by matching fields by name. Assumes the name-sets match. -fn coerce_struct_by_name(lhs_fields: &Fields, rhs_fields: &Fields) -> Option { +/// Coerce two structs by matching fields by name using `coerce_fn`. +/// Assumes the name-sets match. +fn coerce_struct_by_name( + lhs_fields: &Fields, + rhs_fields: &Fields, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; let rhs_by_name: HashMap<&str, &FieldRef> = @@ -1311,7 +1319,7 @@ fn coerce_struct_by_name(lhs_fields: &Fields, rhs_fields: &Fields) -> Option Option Option, ) -> Option { use arrow::datatypes::DataType::*; @@ -1335,7 +1344,7 @@ fn coerce_struct_by_position( let coerced_types: Vec = lhs_fields .iter() .zip(rhs_fields.iter()) - .map(|(l, r)| comparison_coercion(l.data_type(), r.data_type())) + .map(|(l, r)| coerce_fn(l.data_type(), r.data_type())) .collect::>>()?; // Build final fields preserving left-side names and combined nullability. @@ -1356,13 +1365,17 @@ fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> Field Arc::new(Field::new(name, common_type, is_nullable)) } -/// coerce two types if they are Maps by coercing their inner 'entries' fields' types -/// using struct coercion -fn map_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Coerce two Map types by coercing their inner entry fields using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). +fn map_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { (Map(lhs_field, lhs_ordered), Map(rhs_field, rhs_ordered)) => { - struct_coercion(lhs_field.data_type(), rhs_field.data_type()).map( + struct_coercion(lhs_field.data_type(), rhs_field.data_type(), coerce_fn).map( |key_value_type| { Map( Arc::new((**lhs_field).clone().with_data_type(key_value_type)), @@ -1483,15 +1496,12 @@ fn both_numeric_or_null_and_numeric(lhs_type: &DataType, rhs_type: &DataType) -> } } -/// Generic coercion rules for Dictionaries: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. +/// Coerce two Dictionary types by coercing their value types using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). /// -/// Not all operators support dictionaries, if `preserve_dictionaries` is true -/// dictionaries will be preserved if possible. -/// -/// The `coerce_fn` parameter determines which comparison coercion function to use -/// for comparing the dictionary value types. -fn dictionary_comparison_coercion_generic( +/// If `preserve_dictionaries` is true, dictionaries will be preserved +/// when possible. +fn dictionary_coercion( lhs_type: &DataType, rhs_type: &DataType, preserve_dictionaries: bool, @@ -1515,52 +1525,11 @@ fn dictionary_comparison_coercion_generic( } } -/// Coercion rules for Dictionaries: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. -/// -/// Not all operators support dictionaries, if `preserve_dictionaries` is true -/// dictionaries will be preserved if possible -fn dictionary_comparison_coercion( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_dictionaries: bool, -) -> Option { - dictionary_comparison_coercion_generic( - lhs_type, - rhs_type, - preserve_dictionaries, - comparison_coercion, - ) -} - -/// Coercion rules for Dictionaries with numeric preference: similar to -/// [`dictionary_comparison_coercion`] but uses [`comparison_coercion_numeric`] -/// which prefers numeric types over strings when both are present. -/// -/// This is used by [`comparison_coercion_numeric`] to maintain consistent -/// numeric-preferring semantics when dealing with dictionary types. -fn dictionary_comparison_coercion_numeric( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_dictionaries: bool, -) -> Option { - dictionary_comparison_coercion_generic( - lhs_type, - rhs_type, - preserve_dictionaries, - comparison_coercion_numeric, - ) -} - -/// Coercion rules for RunEndEncoded: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. -/// -/// Not all operators support REE, if `preserve_ree` is true -/// REE will be preserved if possible +/// Coerce two RunEndEncoded types using `coerce_fn` +/// (either [`comparison_coercion`] or [`type_union_coercion`]). /// -/// The `coerce_fn` parameter determines which comparison coercion function to use -/// for comparing the REE value types. -fn ree_comparison_coercion_generic( +/// If `preserve_ree` is true, REE will be preserved when possible. +fn ree_coercion( lhs_type: &DataType, rhs_type: &DataType, preserve_ree: bool, @@ -1587,38 +1556,6 @@ fn ree_comparison_coercion_generic( } } -/// Coercion rules for RunEndEncoded: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. -/// -/// Not all operators support REE, if `preserve_ree` is true -/// REE will be preserved if possible -fn ree_comparison_coercion( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_ree: bool, -) -> Option { - ree_comparison_coercion_generic(lhs_type, rhs_type, preserve_ree, comparison_coercion) -} - -/// Coercion rules for RunEndEncoded with numeric preference: similar to -/// [`ree_comparison_coercion`] but uses [`comparison_coercion_numeric`] -/// which prefers numeric types over strings when both are present. -/// -/// This is used by [`comparison_coercion_numeric`] to maintain consistent -/// numeric-preferring semantics when dealing with REE types. -fn ree_comparison_coercion_numeric( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_ree: bool, -) -> Option { - ree_comparison_coercion_generic( - lhs_type, - rhs_type, - preserve_ree, - comparison_coercion_numeric, - ) -} - /// Coercion rules for string concat. /// This is a union of string coercion rules and specified rules: /// 1. At least one side of lhs and rhs should be string type (Utf8 / LargeUtf8) @@ -1683,32 +1620,28 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; - match (lhs_type, rhs_type) { - (Utf8 | LargeUtf8 | Utf8View, other_type) - | (other_type, Utf8 | LargeUtf8 | Utf8View) - if other_type.is_numeric() => - { - Some(other_type.clone()) - } - _ => None, - } -} - -/// Coerces two fields together, ensuring the field data (name and nullability) is correctly set. -fn coerce_list_children(lhs_field: &FieldRef, rhs_field: &FieldRef) -> Option { - let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()]; +/// Coerce two list element fields to a common type using `coerce_fn`. +fn coerce_list_children( + lhs_field: &FieldRef, + rhs_field: &FieldRef, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { + let coerced_type = coerce_fn(lhs_field.data_type(), rhs_field.data_type())?; Some(Arc::new( (**lhs_field) .clone() - .with_data_type(type_union_resolution(&data_types)?) + .with_data_type(coerced_type) .with_nullable(lhs_field.is_nullable() || rhs_field.is_nullable()), )) } -/// Coercion rules for list types. -fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Coerce two list types by coercing their element types using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). +fn list_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { // Coerce to the left side FixedSizeList type if the list lengths are the same, @@ -1716,11 +1649,11 @@ fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { (FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) => { if ls == rs { Some(FixedSizeList( - coerce_list_children(lhs_field, rhs_field)?, + coerce_list_children(lhs_field, rhs_field, coerce_fn)?, *rs, )) } else { - Some(List(coerce_list_children(lhs_field, rhs_field)?)) + Some(List(coerce_list_children(lhs_field, rhs_field, coerce_fn)?)) } } // LargeList on any side @@ -1728,13 +1661,13 @@ fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { LargeList(lhs_field), List(rhs_field) | LargeList(rhs_field) | FixedSizeList(rhs_field, _), ) - | (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => { - Some(LargeList(coerce_list_children(lhs_field, rhs_field)?)) - } + | (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => Some( + LargeList(coerce_list_children(lhs_field, rhs_field, coerce_fn)?), + ), // Lists on both sides (List(lhs_field), List(rhs_field) | FixedSizeList(rhs_field, _)) | (FixedSizeList(lhs_field, _), List(rhs_field)) => { - Some(List(coerce_list_children(lhs_field, rhs_field)?)) + Some(List(coerce_list_children(lhs_field, rhs_field, coerce_fn)?)) } _ => None, } @@ -1803,8 +1736,8 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { string_coercion(lhs_type, rhs_type) .or_else(|| binary_to_string_coercion(lhs_type, rhs_type)) - .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false)) - .or_else(|| ree_comparison_coercion(lhs_type, rhs_type, false)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, false, string_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, false, string_coercion)) .or_else(|| regex_null_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) } @@ -1821,10 +1754,11 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { string_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, false, string_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, false, string_coercion)) .or_else(|| regex_null_coercion(lhs_type, rhs_type)) } diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs index 5d1b3bea75b0a..317b022238f67 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs @@ -654,7 +654,7 @@ fn test_list_coercion() { let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true))); - let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap(); + let coerced_type = list_coercion(&lhs_type, &rhs_type, comparison_coercion).unwrap(); assert_eq!( coerced_type, DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true))) @@ -791,3 +791,109 @@ fn test_decimal_cross_variant_comparison_coercion() -> Result<()> { Ok(()) } + +/// Tests that `comparison_coercion` prefers the numeric type when one side is +/// numeric and the other is a string (e.g., `numeric_col < '123'`). +#[test] +fn test_comparison_coercion_prefers_numeric() { + assert_eq!( + comparison_coercion(&DataType::Int32, &DataType::Utf8), + Some(DataType::Int32) + ); + assert_eq!( + comparison_coercion(&DataType::Utf8, &DataType::Int32), + Some(DataType::Int32) + ); + assert_eq!( + comparison_coercion(&DataType::Utf8, &DataType::Float64), + Some(DataType::Float64) + ); + assert_eq!( + comparison_coercion(&DataType::Float64, &DataType::Utf8), + Some(DataType::Float64) + ); + assert_eq!( + comparison_coercion(&DataType::Int64, &DataType::LargeUtf8), + Some(DataType::Int64) + ); + assert_eq!( + comparison_coercion(&DataType::Utf8View, &DataType::Int16), + Some(DataType::Int16) + ); + // String-string stays string + assert_eq!( + comparison_coercion(&DataType::Utf8, &DataType::Utf8), + Some(DataType::Utf8) + ); + // Numeric-numeric stays numeric + assert_eq!( + comparison_coercion(&DataType::Int32, &DataType::Int64), + Some(DataType::Int64) + ); +} + +/// Tests that `type_union_coercion` prefers the string type when unifying +/// numeric and string types (for UNION, CASE THEN/ELSE, etc.). +#[test] +fn test_type_union_coercion_prefers_string() { + assert_eq!( + type_union_coercion(&DataType::Int32, &DataType::Utf8), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Utf8, &DataType::Int32), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Float64, &DataType::Utf8), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Utf8, &DataType::Float64), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Int64, &DataType::LargeUtf8), + Some(DataType::LargeUtf8) + ); + assert_eq!( + type_union_coercion(&DataType::Utf8View, &DataType::Int16), + Some(DataType::Utf8View) + ); + // String-string stays string + assert_eq!( + type_union_coercion(&DataType::Utf8, &DataType::Utf8), + Some(DataType::Utf8) + ); + // Numeric-numeric stays numeric + assert_eq!( + type_union_coercion(&DataType::Int32, &DataType::Int64), + Some(DataType::Int64) + ); +} + +/// Tests that comparison operators coerce to numeric when comparing +/// numeric and string types. +#[test] +fn test_binary_comparison_string_numeric_coercion() -> Result<()> { + let comparison_ops = [ + Operator::Eq, + Operator::NotEq, + Operator::Lt, + Operator::LtEq, + Operator::Gt, + Operator::GtEq, + ]; + for op in &comparison_ops { + let (lhs, rhs) = BinaryTypeCoercer::new(&DataType::Int64, op, &DataType::Utf8) + .get_input_types()?; + assert_eq!(lhs, DataType::Int64, "Op {op}: Int64 vs Utf8 -> lhs"); + assert_eq!(rhs, DataType::Int64, "Op {op}: Int64 vs Utf8 -> rhs"); + + let (lhs, rhs) = BinaryTypeCoercer::new(&DataType::Utf8, op, &DataType::Float64) + .get_input_types()?; + assert_eq!(lhs, DataType::Float64, "Op {op}: Utf8 vs Float64 -> lhs"); + assert_eq!(rhs, DataType::Float64, "Op {op}: Utf8 vs Float64 -> rhs"); + } + Ok(()) +} diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs b/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs index 0fb56a4a2c536..f0aadfd3ce3a5 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs @@ -24,49 +24,49 @@ fn test_dictionary_type_coercion() { let lhs_type = Dictionary(Box::new(Int8), Box::new(Int32)); let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Int32) ); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, false), + dictionary_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Int32) ); - // Since we can coerce values of Int16 to Utf8 can support this + // In comparison context, numeric is preferred over string let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), - Some(Utf8) + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), + Some(Int16) ); // Since we can coerce values of Utf8 to Binary can support this let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let rhs_type = Dictionary(Box::new(Int8), Box::new(Binary)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Binary) ); let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let rhs_type = Utf8; assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, false), + dictionary_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(lhs_type.clone()) ); let lhs_type = Utf8; let rhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, false), + dictionary_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(rhs_type.clone()) ); } diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs b/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs index 9997db7a82688..dab42214d755c 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs @@ -30,15 +30,15 @@ fn test_ree_type_coercion() { Arc::new(Field::new("values", Int16, false)), ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Int32) ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, false), + ree_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Int32) ); - // Since we can coerce values of Int16 to Utf8 can support this: Coercion of Int16 to Utf8 + // In comparison context, numeric is preferred over string let lhs_type = RunEndEncoded( Arc::new(Field::new("run_ends", Int8, false)), Arc::new(Field::new("values", Utf8, false)), @@ -48,8 +48,8 @@ fn test_ree_type_coercion() { Arc::new(Field::new("values", Int16, false)), ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), - Some(Utf8) + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), + Some(Int16) ); // Since we can coerce values of Utf8 to Binary can support this @@ -62,7 +62,7 @@ fn test_ree_type_coercion() { Arc::new(Field::new("values", Binary, false)), ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Binary) ); let lhs_type = RunEndEncoded( @@ -72,12 +72,12 @@ fn test_ree_type_coercion() { let rhs_type = Utf8; // Don't preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, false), + ree_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); // Preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(lhs_type.clone()) ); @@ -88,12 +88,12 @@ fn test_ree_type_coercion() { ); // Don't preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, false), + ree_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); // Preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(rhs_type.clone()) ); } diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index a79f2c66830bf..be097f3a8520f 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -33,7 +33,7 @@ use datafusion_expr_common::signature::ArrayFunctionArgument; use datafusion_expr_common::type_coercion::binary::type_union_resolution; use datafusion_expr_common::{ signature::{ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD}, - type_coercion::binary::comparison_coercion_numeric, + type_coercion::binary::comparison_coercion, type_coercion::binary::string_coercion, }; use itertools::Itertools as _; @@ -593,7 +593,7 @@ fn get_valid_types( function_length_check(function_name, current_types.len(), *num)?; let mut target_type = current_types[0].to_owned(); for data_type in current_types.iter().skip(1) { - if let Some(dt) = comparison_coercion_numeric(&target_type, data_type) { + if let Some(dt) = comparison_coercion(&target_type, data_type) { target_type = dt; } else { return plan_err!( diff --git a/datafusion/expr/src/type_coercion/other.rs b/datafusion/expr/src/type_coercion/other.rs index 634558094ae79..aa21323c217df 100644 --- a/datafusion/expr/src/type_coercion/other.rs +++ b/datafusion/expr/src/type_coercion/other.rs @@ -17,38 +17,58 @@ use arrow::datatypes::DataType; -use super::binary::comparison_coercion; +use super::binary::{comparison_coercion, type_union_coercion}; + +/// Fold `coerce_fn` over `types`, starting from `initial_type`. +fn fold_coerce( + initial_type: &DataType, + types: &[DataType], + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { + types + .iter() + .try_fold(initial_type.clone(), |left_type, right_type| { + coerce_fn(&left_type, right_type) + }) +} /// Attempts to coerce the types of `list_types` to be comparable with the -/// `expr_type`. -/// Returns the common data type for `expr_type` and `list_types` +/// `expr_type` for IN list predicates. +/// Returns the common data type for `expr_type` and `list_types`. +/// +/// Uses comparison coercion because `x IN (a, b)` is semantically equivalent +/// to `x = a OR x = b`. pub fn get_coerce_type_for_list( expr_type: &DataType, list_types: &[DataType], ) -> Option { - list_types - .iter() - .try_fold(expr_type.clone(), |left_type, right_type| { - comparison_coercion(&left_type, right_type) - }) + fold_coerce(expr_type, list_types, comparison_coercion) +} + +/// Find a common coerceable type for `CASE expr WHEN val1 WHEN val2 ...` +/// conditions. Returns the common type for `case_type` and all `when_types`. +/// +/// Uses comparison coercion because `CASE expr WHEN val` is semantically +/// equivalent to `expr = val`. +pub fn get_coerce_type_for_case_when( + when_types: &[DataType], + case_type: &DataType, +) -> Option { + fold_coerce(case_type, when_types, comparison_coercion) } -/// Find a common coerceable type for all `when_or_then_types` as well -/// and the `case_or_else_type`, if specified. -/// Returns the common data type for `when_or_then_types` and `case_or_else_type` +/// Find a common coerceable type for CASE THEN/ELSE result expressions. +/// Returns the common data type for `then_types` and `else_type`. +/// +/// Uses type union coercion because the result branches must be brought to a +/// common type (like UNION), not compared. pub fn get_coerce_type_for_case_expression( - when_or_then_types: &[DataType], - case_or_else_type: Option<&DataType>, + then_types: &[DataType], + else_type: Option<&DataType>, ) -> Option { - let case_or_else_type = match case_or_else_type { - None => when_or_then_types[0].clone(), - Some(data_type) => data_type.clone(), + let (initial_type, remaining) = match else_type { + None => (&then_types[0], &then_types[1..]), + Some(data_type) => (data_type, then_types), }; - when_or_then_types - .iter() - .try_fold(case_or_else_type, |left_type, right_type| { - // TODO: now just use the `equal` coercion rule for case when. If find the issue, and - // refactor again. - comparison_coercion(&left_type, right_type) - }) + fold_coerce(initial_type, remaining, type_union_coercion) } diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index 0b092c44d502b..6be861b129122 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -22,7 +22,7 @@ use datafusion_expr::{ ScalarUDFImpl, Signature, Volatility, conditional_expressions::CaseBuilder, simplify::{ExprSimplifyResult, SimplifyContext}, - type_coercion::binary::comparison_coercion, + type_coercion::binary::type_union_coercion, }; use datafusion_macros::user_doc; @@ -133,11 +133,9 @@ impl ScalarUDFImpl for NVL2Func { [if_non_null, if_null] .iter() .try_fold(tested.clone(), |acc, x| { - // The coerced types found by `comparison_coercion` are not guaranteed to be - // coercible for the arguments. `comparison_coercion` returns more loose - // types that can be coerced to both `acc` and `x` for comparison purpose. - // See `maybe_data_types` for the actual coercion. - let coerced_type = comparison_coercion(&acc, x); + // `type_union_coercion` finds a loose common type; the actual + // coercion is done by `maybe_data_types`. + let coerced_type = type_union_coercion(&acc, x); if let Some(coerced_type) = coerced_type { Ok(coerced_type) } else { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index efc9984acb9b0..baffa23258b60 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -41,11 +41,14 @@ use datafusion_expr::expr::{ use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; use datafusion_expr::expr_schema::cast_subquery; use datafusion_expr::logical_plan::Subquery; -use datafusion_expr::type_coercion::binary::{comparison_coercion, like_coercion}; +use datafusion_expr::type_coercion::binary::{ + comparison_coercion, like_coercion, type_union_coercion, +}; use datafusion_expr::type_coercion::functions::{UDFCoercionExt, fields_with_udf}; use datafusion_expr::type_coercion::is_datetime; use datafusion_expr::type_coercion::other::{ - get_coerce_type_for_case_expression, get_coerce_type_for_list, + get_coerce_type_for_case_expression, get_coerce_type_for_case_when, + get_coerce_type_for_list, }; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ @@ -1028,8 +1031,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { .iter() .map(|(when, _then)| when.get_type(schema)) .collect::>>()?; - let coerced_type = - get_coerce_type_for_case_expression(&when_types, Some(case_type)); + let coerced_type = get_coerce_type_for_case_when(&when_types, case_type); coerced_type.ok_or_else(|| { plan_datafusion_err!( "Failed to coerce case ({case_type}) and when ({}) \ @@ -1107,7 +1109,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { /// **Field-level metadata merging**: Later fields take precedence for duplicate metadata keys. /// /// **Type coercion precedence**: The coerced type is determined by iteratively applying -/// `comparison_coercion()` between the accumulated type and each new input's type. The +/// `type_union_coercion()` between the accumulated type and each new input's type. The /// result depends on type coercion rules, not input order. /// /// **Nullability merging**: Nullability is accumulated using logical OR (`||`). @@ -1130,7 +1132,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { /// ``` /// /// **Precedence Summary**: -/// - **Datatypes**: Determined by `comparison_coercion()` rules, not input order +/// - **Datatypes**: Determined by `type_union_coercion()` rules, not input order /// - **Nullability**: Later inputs can add nullability but cannot remove it (logical OR) /// - **Metadata**: Later inputs take precedence for same keys (HashMap::extend semantics) pub fn coerce_union_schema(inputs: &[Arc]) -> Result { @@ -1180,7 +1182,7 @@ fn coerce_union_schema_with_schema( plan_schema.fields().iter() ) { let coerced_type = - comparison_coercion(union_datatype, plan_field.data_type()).ok_or_else( + type_union_coercion(union_datatype, plan_field.data_type()).ok_or_else( || { plan_datafusion_err!( "Incompatible inputs for Union: Previous inputs were \ @@ -2345,6 +2347,9 @@ mod test { let actual = coerce_case_expression(case, &schema)?; assert_eq!(expected, actual); + // CASE string WHEN float/integer/string: comparison coercion + // prefers numeric, so the common type for the CASE expr and + // WHEN values is Float32. let case = Case { expr: Some(Box::new(col("string"))), when_then_expr: vec![ @@ -2354,7 +2359,7 @@ mod test { ], else_expr: Some(Box::new(col("string"))), }; - let case_when_common_type = Utf8; + let case_when_common_type = DataType::Float32; let then_else_common_type = Utf8; let expected = cast_helper( case.clone(), diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index f1d867dddf369..bbb1b3ec25d79 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -1431,7 +1431,7 @@ mod tests { use datafusion_common::cast::{as_float64_array, as_int32_array}; use datafusion_common::plan_err; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; - use datafusion_expr::type_coercion::binary::comparison_coercion; + use datafusion_expr::type_coercion::binary::type_union_coercion; use datafusion_expr_common::operator::Operator; use datafusion_physical_expr_common::physical_expr::fmt_sql; use half::f16; @@ -2381,9 +2381,7 @@ mod tests { thens_type .iter() .try_fold(else_type, |left_type, right_type| { - // TODO: now just use the `equal` coercion rule for case when. If find the issue, and - // refactor again. - comparison_coercion(&left_type, right_type) + type_union_coercion(&left_type, right_type) }) } diff --git a/datafusion/sqllogictest/test_files/delete.slt b/datafusion/sqllogictest/test_files/delete.slt index b01eb6f5e9ec7..be6d5739e1f0f 100644 --- a/datafusion/sqllogictest/test_files/delete.slt +++ b/datafusion/sqllogictest/test_files/delete.slt @@ -45,7 +45,7 @@ explain delete from t1 where a = 1 and b = 2 and c > 3 and d != 4; ---- logical_plan 01)Dml: op=[Delete] table=[t1] -02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Int64(2) AS Utf8View) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) +02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND CAST(t1.b AS Int64) = Int64(2) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) 03)----TableScan: t1 physical_plan 01)CooperativeExec @@ -58,7 +58,7 @@ explain delete from t1 where t1.a = 1 and b = 2 and t1.c > 3 and d != 4; ---- logical_plan 01)Dml: op=[Delete] table=[t1] -02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Int64(2) AS Utf8View) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) +02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND CAST(t1.b AS Int64) = Int64(2) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) 03)----TableScan: t1 physical_plan 01)CooperativeExec diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index 511061cf82f06..5ec95260e8357 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -426,7 +426,8 @@ physical_plan 02)--DataSourceExec: partitions=1, partition_sizes=[1] -# Now query using an integer which must be coerced into a dictionary string +# Query using an integer literal: comparison coercion prefers numeric types, +# so the dictionary string column is cast to Int64 query TT SELECT * from test where column2 = 1; ---- @@ -436,10 +437,10 @@ query TT explain SELECT * from test where column2 = 1; ---- logical_plan -01)Filter: test.column2 = Dictionary(Int32, Utf8("1")) +01)Filter: CAST(test.column2 AS Int64) = Int64(1) 02)--TableScan: test projection=[column1, column2] physical_plan -01)FilterExec: column2@1 = 1 +01)FilterExec: CAST(column2@1 AS Int64) = 1 02)--DataSourceExec: partitions=1, partition_sizes=[1] # Window Functions diff --git a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt index e1c83c8c330d8..d612b01b5427b 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt @@ -122,24 +122,6 @@ explain select a from t where a != '100'; ---- physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)] -# The predicate should still have the column cast when the value is a NOT valid i32 -query TT -explain select a from t where a = '99999999999'; ----- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999 - -# The predicate should still have the column cast when the value is a NOT valid i32 -query TT -explain select a from t where a = '99.99'; ----- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99 - -# The predicate should still have the column cast when the value is a NOT valid i32 -query TT -explain select a from t where a = ''; ----- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = - # The predicate should not have a column cast when the operator is = or != and the literal can be round-trip casted without losing information. query TT explain select a from t where cast(a as string) = '100'; diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 2884c3518610d..679ba0aa8a888 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -41,38 +41,17 @@ NULL R NULL 🔥 # -------------------------------------- # test type coercion (compare to int) -# queries should not error +# +# Comparing a string column to an integer literal is allowed but will fail +# at runtime if the string column contains any values that can't be cast +# to integers. # -------------------------------------- -query BB +statement error Arrow error: Cast error: Cannot cast string 'Andrew' to value of Int64 type select ascii_1 = 1 as col1, 1 = ascii_1 as col2 from test_basic_operator; ----- -false false -false false -false false -false false -false false -false false -false false -false false -false false -NULL NULL -NULL NULL -query BB +statement error Arrow error: Cast error: Cannot cast string 'Andrew' to value of Int64 type select ascii_1 <> 1 as col1, 1 <> ascii_1 as col2 from test_basic_operator; ----- -true true -true true -true true -true true -true true -true true -true true -true true -true true -NULL NULL -NULL NULL # Coercion to date/time query BBB diff --git a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt new file mode 100644 index 0000000000000..8cd3f1db81235 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt @@ -0,0 +1,479 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Tests for string-numeric comparison coercion +## Verifies that when comparing a numeric column to a string literal, +## the comparison is performed numerically (not lexicographically). +## See: https://github.com/apache/datafusion/issues/15161 +########## + +# Setup test data +statement ok +CREATE TABLE t_int AS VALUES (1), (5), (325), (499), (1000); + +statement ok +CREATE TABLE t_float AS VALUES (1.5), (5.0), (325.7), (499.9), (1000.1); + +# ------------------------------------------------- +# Integer column with comparison operators vs string literals. +# Ensure that the comparison is done with numeric semantics, +# not lexicographically. +# ------------------------------------------------- + +query I rowsort +SELECT * FROM t_int WHERE column1 < '5'; +---- +1 + +query I rowsort +SELECT * FROM t_int WHERE column1 > '5'; +---- +1000 +325 +499 + +query I rowsort +SELECT * FROM t_int WHERE column1 <= '5'; +---- +1 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 >= '5'; +---- +1000 +325 +499 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 = '5'; +---- +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 != '5'; +---- +1 +1000 +325 +499 + +query I rowsort +SELECT * FROM t_int WHERE column1 < '10'; +---- +1 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 <= '100'; +---- +1 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 > '100'; +---- +1000 +325 +499 + +# ------------------------------------------------- +# Float column with comparison operators vs string literals +# ------------------------------------------------- + +query R rowsort +SELECT * FROM t_float WHERE column1 < '5'; +---- +1.5 + +query R rowsort +SELECT * FROM t_float WHERE column1 > '5'; +---- +1000.1 +325.7 +499.9 + +query R rowsort +SELECT * FROM t_float WHERE column1 = '5'; +---- +5 + +query R rowsort +SELECT * FROM t_float WHERE column1 = '5.0'; +---- +5 + +# ------------------------------------------------- +# Error on strings that cannot be cast to the numeric column type +# ------------------------------------------------- + +# Non-numeric string against integer column +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT * FROM t_int WHERE column1 < 'hello'; + +# Non-numeric string against float column +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Float64 type +SELECT * FROM t_float WHERE column1 < 'hello'; + +# Float string against integer column +statement error Arrow error: Cast error: Cannot cast string '99.99' to value of Int64 type +SELECT * FROM t_int WHERE column1 = '99.99'; + +# Empty string against integer column +statement error Arrow error: Cast error: Cannot cast string '' to value of Int64 type +SELECT * FROM t_int WHERE column1 = ''; + +# Empty string against float column +statement error Arrow error: Cast error: Cannot cast string '' to value of Float64 type +SELECT * FROM t_float WHERE column1 = ''; + +# Overflow +statement error Arrow error: Cast error: Cannot cast string '99999999999999999999' to value of Int64 type +SELECT * FROM t_int WHERE column1 = '99999999999999999999'; + + +# ------------------------------------------------- +# UNION still uses string coercion (type unification context) +# ------------------------------------------------- + +statement ok +CREATE TABLE t_str AS VALUES ('one'), ('two'), ('three'); + +query T rowsort +SELECT column1 FROM t_int UNION ALL SELECT column1 FROM t_str; +---- +1 +1000 +325 +499 +5 +one +three +two + +# Verify the UNION coerces to Utf8 (not numeric) +query TT +EXPLAIN SELECT column1 FROM t_int UNION ALL SELECT column1 FROM t_str; +---- +logical_plan +01)Union +02)--Projection: CAST(t_int.column1 AS Utf8) AS column1 +03)----TableScan: t_int projection=[column1] +04)--TableScan: t_str projection=[column1] +physical_plan +01)UnionExec +02)--ProjectionExec: expr=[CAST(column1@0 AS Utf8) as column1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)--DataSourceExec: partitions=1, partition_sizes=[1] + +# ------------------------------------------------- +# BETWEEN uses comparison coercion (numeric preferred) +# ------------------------------------------------- + +query I rowsort +SELECT * FROM t_int WHERE column1 BETWEEN '5' AND '100'; +---- +5 + +# ------------------------------------------------- +# IN list uses comparison coercion (numeric preferred) +# `x IN (a, b)` is semantically equivalent to `x = a OR x = b` +# ------------------------------------------------- + +# Basic IN list with string literals against integer column +query I rowsort +SELECT * FROM t_int WHERE column1 IN ('5', '325'); +---- +325 +5 + +# IN list with a value where numeric coercion matters +query I rowsort +SELECT * FROM t_int WHERE column1 IN ('1000'); +---- +1000 + +# IN list with NOT +query I rowsort +SELECT * FROM t_int WHERE column1 NOT IN ('1', '5'); +---- +1000 +325 +499 + +# IN list with float column +query R rowsort +SELECT * FROM t_float WHERE column1 IN ('5.0', '325.7'); +---- +325.7 +5 + +# Verify the plan shows numeric coercion (not CAST to Utf8) +query TT +EXPLAIN SELECT * FROM t_int WHERE column1 IN ('5', '325'); +---- +logical_plan +01)Filter: t_int.column1 = Int64(5) OR t_int.column1 = Int64(325) +02)--TableScan: t_int projection=[column1] +physical_plan +01)FilterExec: column1@0 = 5 OR column1@0 = 325 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# Error on invalid string in IN list +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT * FROM t_int WHERE column1 IN ('5', 'hello'); + +# Mixed numeric literal and string literal in IN list against integer column +query I rowsort +SELECT * FROM t_int WHERE column1 IN (5, '325'); +---- +325 +5 + +# Mixed numeric literal and string literal in IN list against float column +query R rowsort +SELECT * FROM t_float WHERE column1 IN (5, '325.7'); +---- +325.7 +5 + +# String and numeric literal order reversed +query R rowsort +SELECT * FROM t_float WHERE column1 IN ('5', 325.7); +---- +325.7 +5 + +# Float string literal against integer column errors (cannot cast '10.0' to Int64) +statement error Arrow error: Cast error: Cannot cast string '10.0' to value of Int64 type +SELECT * FROM t_int WHERE column1 IN (5, '10.0'); + +# Non-numeric string in mixed IN list still errors +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT * FROM t_int WHERE column1 IN ('hello', 5); + +# ------------------------------------------------- +# CASE WHEN uses comparison coercion for conditions (numeric preferred) +# `CASE expr WHEN val` is semantically equivalent to `expr = val` +# ------------------------------------------------- + +# Basic CASE with integer column and string WHEN values +query T rowsort +SELECT CASE column1 WHEN '5' THEN 'five' WHEN '1000' THEN 'thousand' ELSE 'other' END FROM t_int; +---- +five +other +other +other +thousand + +# CASE with float column: '5' is cast to 5.0 numerically, matching the row. +# (Under string comparison, '5.0' != '5' would fail to match.) +query T rowsort +SELECT CASE column1 WHEN '5' THEN 'matched' ELSE 'no match' END FROM t_float; +---- +matched +no match +no match +no match +no match + +# THEN/ELSE results still use type union coercion (string preferred), +# so mixing numeric and string coerces to string +query T rowsort +SELECT CASE WHEN column1 > 500 THEN column1 ELSE 'small' END FROM t_int; +---- +1000 +small +small +small +small + +# ------------------------------------------------- +# Nested struct/map/list comparisons use comparison coercion +# (numeric preferred) for their field/element types +# ------------------------------------------------- + +statement ok +CREATE TABLE t_struct_int AS SELECT named_struct('val', column1) as s FROM (VALUES (1), (5), (10)); + +statement ok +CREATE TABLE t_struct_str AS SELECT named_struct('val', column1) as s FROM (VALUES ('5'), ('10')); + +# Struct comparison: the string field is cast to Int64 (numeric preferred). +query ? rowsort +SELECT t1.s FROM t_struct_int t1, t_struct_str t2 WHERE t1.s = t2.s; +---- +{val: 10} +{val: 5} + +# Struct in UNION uses type union coercion (string preferred). +# The integer struct field is cast to Utf8. +query ? rowsort +SELECT s FROM t_struct_int UNION ALL SELECT s FROM t_struct_str; +---- +{val: 10} +{val: 10} +{val: 1} +{val: 5} +{val: 5} + +statement ok +DROP TABLE t_struct_int; + +statement ok +DROP TABLE t_struct_str; + +# List comparison: string elements are cast to Int64 (numeric preferred). +statement ok +CREATE TABLE t_list_int AS SELECT column1 as l FROM (VALUES ([1, 5, 10]), ([20, 30])); + +statement ok +CREATE TABLE t_list_str AS SELECT column1 as l FROM (VALUES (['5', '10']), (['20', '30'])); + +# Verify the element types are Int64 and Utf8 respectively +query T +SELECT arrow_typeof(l) FROM t_list_int LIMIT 1; +---- +List(Int64) + +query T +SELECT arrow_typeof(l) FROM t_list_str LIMIT 1; +---- +List(Utf8) + +query ? rowsort +SELECT t1.l FROM t_list_int t1, t_list_str t2 WHERE t1.l = t2.l; +---- +[20, 30] + +# List in UNION uses type union coercion (string preferred). +# The integer list elements are cast to Utf8. +query ? rowsort +SELECT l FROM t_list_int UNION ALL SELECT l FROM t_list_str; +---- +[1, 5, 10] +[20, 30] +[20, 30] +[5, 10] + +statement ok +DROP TABLE t_list_int; + +statement ok +DROP TABLE t_list_str; + +# Map comparison: string values are cast to Int64 (numeric preferred). +statement ok +CREATE TABLE t_map_int AS SELECT MAP {'a': 1, 'b': 5} as m; + +statement ok +CREATE TABLE t_map_str AS SELECT MAP {'a': '1', 'b': '5'} as m; + +# Verify the value types are Int64 and Utf8 respectively +query T +SELECT arrow_typeof(m) FROM t_map_int LIMIT 1; +---- +Map("entries": non-null Struct("key": non-null Utf8, "value": Int64), unsorted) + +query T +SELECT arrow_typeof(m) FROM t_map_str LIMIT 1; +---- +Map("entries": non-null Struct("key": non-null Utf8, "value": Utf8), unsorted) + +query ? rowsort +SELECT t1.m FROM t_map_int t1, t_map_str t2 WHERE t1.m = t2.m; +---- +{a: 1, b: 5} + +# Map in UNION uses type union coercion (string preferred). +# The integer map values are cast to Utf8. +query ? rowsort +SELECT m FROM t_map_int UNION ALL SELECT m FROM t_map_str; +---- +{a: 1, b: 5} +{a: 1, b: 5} + +statement ok +DROP TABLE t_map_int; + +statement ok +DROP TABLE t_map_str; + +# ------------------------------------------------- +# LIKE / regex on dictionary-encoded numeric columns should error, +# consistent with LIKE on plain numeric columns +# ------------------------------------------------- + +# Plain integer column: LIKE is not supported +statement error There isn't a common type to coerce Int64 and Utf8 in LIKE expression +SELECT * FROM t_int WHERE column1 LIKE '%5%'; + +# Dictionary-encoded integer column: should also error +statement error There isn't a common type to coerce Dictionary\(Int32, Int64\) and Utf8 in LIKE expression +SELECT arrow_cast(column1, 'Dictionary(Int32, Int64)') LIKE '%5%' FROM t_int; + +# Dictionary-encoded string column: LIKE works as normal +query B rowsort +SELECT arrow_cast('hello', 'Dictionary(Int32, Utf8)') LIKE '%ell%'; +---- +true + +# REE-encoded integer column: LIKE should also error +statement error There isn't a common type to coerce RunEndEncoded.* and Utf8 in LIKE expression +SELECT arrow_cast(column1, 'RunEndEncoded("run_ends": non-null Int32, "values": Int64)') LIKE '%5%' FROM t_int; + +# REE-encoded string column: LIKE works as normal +query B rowsort +SELECT arrow_cast('hello', 'RunEndEncoded("run_ends": non-null Int32, "values": Utf8)') LIKE '%ell%'; +---- +true + +# Dictionary-encoded integer column: regex should error +statement error Cannot infer common argument type for regex operation +SELECT arrow_cast(column1, 'Dictionary(Int32, Int64)') ~ '5' FROM t_int; + +# Dictionary-encoded string column: regex works as normal +query B rowsort +SELECT arrow_cast('hello', 'Dictionary(Int32, Utf8)') ~ 'ell'; +---- +true + +# REE-encoded integer column: regex should error +statement error Cannot infer common argument type for regex operation +SELECT arrow_cast(column1, 'RunEndEncoded("run_ends": non-null Int32, "values": Int64)') ~ '5' FROM t_int; + +# REE-encoded string column: regex works as normal +query B rowsort +SELECT arrow_cast('hello', 'RunEndEncoded("run_ends": non-null Int32, "values": Utf8)') ~ 'ell'; +---- +true + +# ------------------------------------------------- +# Cleanup +# ------------------------------------------------- + +statement ok +DROP TABLE t_int; + +statement ok +DROP TABLE t_float; + +statement ok +DROP TABLE t_str; diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 926eb8a343f01..18b6f8560d552 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -568,7 +568,7 @@ async fn try_cast_decimal_to_int() -> Result<()> { #[tokio::test] async fn try_cast_decimal_to_string() -> Result<()> { - roundtrip("SELECT * FROM data WHERE a = TRY_CAST(b AS string)").await + roundtrip("SELECT * FROM data WHERE f = TRY_CAST(b AS string)").await } #[tokio::test] From 4bf9fe47ecc3cd7e28031fcfd898e0d76a2d9c5a Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 5 Mar 2026 21:41:40 -0500 Subject: [PATCH 02/17] Revert map key coercion regression, fix other tests --- .../expr-common/src/type_coercion/binary.rs | 36 +++++++--------- .../type_coercion/binary/tests/comparison.rs | 2 +- datafusion/sqllogictest/test_files/expr.slt | 42 ++++++------------- .../test_files/string_numeric_coercion.slt | 17 ++++++++ 4 files changed, 46 insertions(+), 51 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 086b06ce189f3..b2df18c17fd0f 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -744,7 +744,7 @@ fn type_union_resolution_coercion( // Numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| list_coercion(lhs_type, rhs_type, type_union_coercion)) + .or_else(|| list_coercion(lhs_type, rhs_type)) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) @@ -859,7 +859,7 @@ pub fn type_union_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option Option Option, ) -> Option { - let coerced_type = coerce_fn(lhs_field.data_type(), rhs_field.data_type())?; + let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()]; Some(Arc::new( (**lhs_field) .clone() - .with_data_type(coerced_type) + .with_data_type(type_union_resolution(&data_types)?) .with_nullable(lhs_field.is_nullable() || rhs_field.is_nullable()), )) } -/// Coerce two list types by coercing their element types using -/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). -fn list_coercion( - lhs_type: &DataType, - rhs_type: &DataType, - coerce_fn: fn(&DataType, &DataType) -> Option, -) -> Option { +/// Coerce two list types by coercing their element types via +/// [`type_union_resolution`]. +fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { // Coerce to the left side FixedSizeList type if the list lengths are the same, @@ -1649,11 +1645,11 @@ fn list_coercion( (FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) => { if ls == rs { Some(FixedSizeList( - coerce_list_children(lhs_field, rhs_field, coerce_fn)?, + coerce_list_children(lhs_field, rhs_field)?, *rs, )) } else { - Some(List(coerce_list_children(lhs_field, rhs_field, coerce_fn)?)) + Some(List(coerce_list_children(lhs_field, rhs_field)?)) } } // LargeList on any side @@ -1661,13 +1657,13 @@ fn list_coercion( LargeList(lhs_field), List(rhs_field) | LargeList(rhs_field) | FixedSizeList(rhs_field, _), ) - | (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => Some( - LargeList(coerce_list_children(lhs_field, rhs_field, coerce_fn)?), - ), + | (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => { + Some(LargeList(coerce_list_children(lhs_field, rhs_field)?)) + } // Lists on both sides (List(lhs_field), List(rhs_field) | FixedSizeList(rhs_field, _)) | (FixedSizeList(lhs_field, _), List(rhs_field)) => { - Some(List(coerce_list_children(lhs_field, rhs_field, coerce_fn)?)) + Some(List(coerce_list_children(lhs_field, rhs_field)?)) } _ => None, } diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs index 317b022238f67..166b308710e0e 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs @@ -654,7 +654,7 @@ fn test_list_coercion() { let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true))); - let coerced_type = list_coercion(&lhs_type, &rhs_type, comparison_coercion).unwrap(); + let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap(); assert_eq!( coerced_type, DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true))) diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index a6341bc686f74..9365f3896b618 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1086,55 +1086,37 @@ SELECT 0.3 NOT IN (0.0,0.1,0.2,NULL) ---- NULL -query B +# Mixed string/integer IN lists: comparison coercion picks the numeric +# type, so non-numeric strings like 'a' fail to cast to Int64. +query error Cannot cast string 'a' to value of Int64 type SELECT '1' IN ('a','b',1) ----- -true -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' IN ('a','b',1) ----- -false -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' NOT IN ('a','b',1) ----- -true -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '1' NOT IN ('a','b',1) ----- -false -query B +query error Cannot cast string 'a' to value of Int64 type SELECT NULL IN ('a','b',1) ----- -NULL -query B +query error Cannot cast string 'a' to value of Int64 type SELECT NULL NOT IN ('a','b',1) ----- -NULL -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '1' IN ('a','b',NULL,1) ----- -true -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' IN ('a','b',NULL,1) ----- -NULL -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '1' NOT IN ('a','b',NULL,1) ----- -false -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' NOT IN ('a','b',NULL,1) ----- -NULL # ======================================================================== # Comprehensive IN LIST tests with NULL handling diff --git a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt index 8cd3f1db81235..27b1be1074769 100644 --- a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt +++ b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt @@ -477,3 +477,20 @@ DROP TABLE t_float; statement ok DROP TABLE t_str; + +# ------------------------------------------------- +# List element coercion should reject mixed +# numeric/string categories (same as array literals) +# ------------------------------------------------- + +# Array literal with mixed numeric/string elements errors +query error Cannot cast string 'a' to value of Int64 type +SELECT [1, 'a']; + +# MAP with mixed-category list keys should also error +query error +SELECT MAP {[1,2,3]:1, ['a', 'b']:2}; + +# MAP with mixed-category list values should also error +query error +SELECT MAP {'a':[1,2,3], 'b':['a', 'b']}; From 1cd260266d96dba2d664d36ec093e74e6bf38180 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 5 Mar 2026 21:49:39 -0500 Subject: [PATCH 03/17] cargo fmt --- datafusion/expr-common/src/type_coercion/binary.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index b2df18c17fd0f..5543841264b0e 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -1622,10 +1622,7 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { +fn coerce_list_children(lhs_field: &FieldRef, rhs_field: &FieldRef) -> Option { let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()]; Some(Arc::new( (**lhs_field) From e2272f05fb41346e6c3a5d6a79656707ad83312a Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:17:01 -0700 Subject: [PATCH 04/17] More rationale for type union behavior --- datafusion/expr-common/src/type_coercion/binary.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 5543841264b0e..a6e36a7e4eeac 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -844,12 +844,16 @@ pub fn try_type_union_resolution_with_struct( /// Coerce `lhs_type` and `rhs_type` to a common type for type unification /// contexts — where two values must be brought to a common type but are not -/// being compared. Examples: UNION, CASE THEN/ELSE branches, NVL2. +/// being compared. Examples: UNION, CASE THEN/ELSE branches, NVL2. For other +/// contexts, [`comparison_coercion`] should typically be used instead. /// -/// Prefers strings over numeric types (e.g., `SELECT 1 UNION SELECT '2'` -/// coerces both sides to `Utf8`). -/// -/// For comparisons (`=`, `<`, `>`), use [`comparison_coercion`] instead. +/// The intuition is that we try to find the "widest" type that can represent +/// all values from both sides. When one side is a string and the other is +/// numeric, this prefers strings because every number has a textual +/// representation but not every string can be parsed as a number (e.g., `SELECT +/// 1 UNION SELECT 'a'` coerces both sides to a string). This is in contrast to +/// [`comparison_coercion`], which prefers numeric types so that ordering and +/// equality follow numeric semantics. pub fn type_union_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type.equals_datatype(rhs_type) { return Some(lhs_type.clone()); From e52f506841c02f3b4822d0c8fd09f3528d0bf23a Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:18:49 -0700 Subject: [PATCH 05/17] Fix comment typos Co-authored-by: Martin Grigorov --- datafusion/expr-common/src/signature.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index f7ed270126470..42d4de939a8e8 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -158,7 +158,7 @@ pub enum Arity { pub enum TypeSignature { /// One or more arguments of a common type out of a list of valid types. /// - /// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]). + /// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]. /// /// # Examples /// @@ -209,7 +209,7 @@ pub enum TypeSignature { /// (numeric is preferred over string). /// /// Note: - /// - For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]). + /// - For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]. /// - If all arguments have type [`DataType::Null`], they are coerced to `Utf8` /// /// [`comparison_coercion`]: crate::type_coercion::binary::comparison_coercion @@ -247,7 +247,7 @@ pub enum TypeSignature { /// For example, if a function is called with (utf8, large_utf8), all /// arguments will be coerced to `LargeUtf8` /// - /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]). + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. String(usize), /// No arguments Nullary, From 8084010fe96d10267750fb2723a1341f672e8b6f Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:25:44 -0700 Subject: [PATCH 06/17] Apply suggestion from @martin-g Co-authored-by: Martin Grigorov --- datafusion/expr/src/type_coercion/other.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/other.rs b/datafusion/expr/src/type_coercion/other.rs index aa21323c217df..48125b661e2ca 100644 --- a/datafusion/expr/src/type_coercion/other.rs +++ b/datafusion/expr/src/type_coercion/other.rs @@ -67,7 +67,7 @@ pub fn get_coerce_type_for_case_expression( else_type: Option<&DataType>, ) -> Option { let (initial_type, remaining) = match else_type { - None => (&then_types[0], &then_types[1..]), + None => then_types.split_first()?, Some(data_type) => (data_type, then_types), }; fold_coerce(initial_type, remaining, type_union_coercion) From 7667bc19c0beff5c2c96a7cf90916f3cbbc9fe84 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:33:15 -0700 Subject: [PATCH 07/17] Per review, explicitly test for IN list type mismatch error --- datafusion/core/src/physical_planner.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 2bd3756e5f785..062a3d14a0d86 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -3435,21 +3435,17 @@ mod tests { } #[tokio::test] - async fn in_list_types() -> Result<()> { - // expression: "c1 in ('a', 'b')" where c1 is Utf8 - // Tests that IN list coercion works with compatible string types. - let list = vec![lit("a"), lit("b")]; + async fn in_list_types_mixed_string_int_error() -> Result<()> { + // expression: "c1 in ('a', 1)" where c1 is Utf8 + let list = vec![lit("a"), lit(1i64)]; let logical_plan = test_csv_scan() .await? - // filter clause needs the type coercion rule applied .filter(col("c12").lt(lit(0.05)))? .project(vec![col("c1").in_list(list, false)])? .build()?; - let execution_plan = plan(&logical_plan).await?; - - let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; + let e = plan(&logical_plan).await.unwrap_err().to_string(); - assert_contains!(format!("{execution_plan:?}"), expected); + assert_contains!(&e, "Cannot cast string 'a' to value of Int64 type"); Ok(()) } From 43657dd18d0bff9c896afec95bd1aab861d886e4 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:44:55 -0700 Subject: [PATCH 08/17] Add upgrade notes for change --- .../library-user-guide/upgrading/53.0.0.md | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index ef5f5743f5ea6..2d1efd96c4c32 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -440,6 +440,68 @@ let sort_proto = serialize_physical_sort_expr( ); ``` +### String/numeric comparison coercion now prefers numeric types + +Previously, comparing a numeric column with a string value (e.g., +`WHERE int_col > '100'`) coerced both sides to strings and performed a +lexicographic comparison. This produced incorrect results — for example, +`325 > '100'` was `false` under string comparison because `'3' < '1'` is +`false` lexicographically. + +DataFusion now coerces the string side to the numeric type in comparison +contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). + +**Who is affected:** + +- Queries that compare numeric values with string values +- Queries that use `IN` lists with mixed string and numeric types +- Queries that use `CASE expr WHEN` with mixed string and numeric types + +**Behavioral changes (old → new):** + +| Expression | Old behavior | New behavior | +|---|---|---| +| `int_col > '100'` | Lexicographic (incorrect) | Numeric (correct) | +| `float_col = '5'` | String `'5' != '5.0'` (incorrect) | Numeric `5.0 = 5.0` (correct) | +| `int_col = 'hello'` | String comparison, always false | Cast error | +| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | +| `float_col IN ('1.0')` | String `'1.0' != '1'` (incorrect) | Numeric `1.0 = 1.0` (correct) | +| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | +| `SELECT 1 UNION SELECT 'a'` | Coerce to Utf8 | Coerce to Utf8 (unchanged) | + +**Migration guide:** + +Most queries will produce more correct results with no changes needed. +However, queries that relied on the old string-comparison behavior may need +adjustment: + +- **Queries comparing numeric columns with non-numeric strings** (e.g., + `int_col = 'hello'` or `int_col > text_col` where `text_col` contains + non-numeric values) will now produce a cast error instead of silently + returning no rows. +- **Mixed-type `IN` lists** (e.g., `str_col IN ('a', 1)`) are now rejected. Use + consistent types in the list, or add an explicit `CAST`. +- **Queries comparing integer columns with decimal strings** (e.g., + `int_col = '99.99'`) will now produce a cast error because `'99.99'` + cannot be cast to an integer. Use a float column or adjust the literal. + +See [#15161](https://github.com/apache/datafusion/issues/15161) and +[PR #20426](https://github.com/apache/datafusion/pull/20426) for details. + +### `comparison_coercion_numeric` removed, replaced by `comparison_coercion` + +The `comparison_coercion_numeric` function has been removed. Its behavior +(preferring numeric types for string/numeric comparisons) is now the default in +`comparison_coercion`. A new function, `type_union_coercion`, handles contexts +where string types are preferred (`UNION`, `CASE THEN/ELSE`, `NVL2`). + +**Who is affected:** + +- Crates that call `comparison_coercion_numeric` directly +- Crates that call `comparison_coercion` and relied on its old + string-preferring behavior +- Crates that call `get_coerce_type_for_case_expression` + ### `generate_series` and `range` table functions changed The `generate_series` and `range` table functions now return an empty set when the interval is invalid, instead of an error. From 36cd3df8e7152be78355ffc92e26425bb33a869f Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:46:22 -0700 Subject: [PATCH 09/17] Fix docs prettier lint --- .../library-user-guide/upgrading/53.0.0.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index 2d1efd96c4c32..5dd812a2e376a 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -459,15 +459,15 @@ contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). **Behavioral changes (old → new):** -| Expression | Old behavior | New behavior | -|---|---|---| -| `int_col > '100'` | Lexicographic (incorrect) | Numeric (correct) | -| `float_col = '5'` | String `'5' != '5.0'` (incorrect) | Numeric `5.0 = 5.0` (correct) | -| `int_col = 'hello'` | String comparison, always false | Cast error | -| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | -| `float_col IN ('1.0')` | String `'1.0' != '1'` (incorrect) | Numeric `1.0 = 1.0` (correct) | -| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | -| `SELECT 1 UNION SELECT 'a'` | Coerce to Utf8 | Coerce to Utf8 (unchanged) | +| Expression | Old behavior | New behavior | +| --------------------------- | --------------------------------- | ------------------------------------------ | +| `int_col > '100'` | Lexicographic (incorrect) | Numeric (correct) | +| `float_col = '5'` | String `'5' != '5.0'` (incorrect) | Numeric `5.0 = 5.0` (correct) | +| `int_col = 'hello'` | String comparison, always false | Cast error | +| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | +| `float_col IN ('1.0')` | String `'1.0' != '1'` (incorrect) | Numeric `1.0 = 1.0` (correct) | +| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | +| `SELECT 1 UNION SELECT 'a'` | Coerce to Utf8 | Coerce to Utf8 (unchanged) | **Migration guide:** From e12bad2c7f8294a4ccac7b3d63646bef06f069ac Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:49:53 -0700 Subject: [PATCH 10/17] Fix think in docs --- docs/source/library-user-guide/upgrading/53.0.0.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index 5dd812a2e376a..a6e6ace4c9dc8 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -445,8 +445,8 @@ let sort_proto = serialize_physical_sort_expr( Previously, comparing a numeric column with a string value (e.g., `WHERE int_col > '100'`) coerced both sides to strings and performed a lexicographic comparison. This produced incorrect results — for example, -`325 > '100'` was `false` under string comparison because `'3' < '1'` is -`false` lexicographically. +`5 > '100'` was `true` under string comparison because `'5' > '1'` +lexicographically, even though `5 > 100` is `false` numerically. DataFusion now coerces the string side to the numeric type in comparison contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). From 6bdfc37763fabf1c1ec2e9d10ef8d8b7c281d268 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 9 Mar 2026 15:53:36 -0700 Subject: [PATCH 11/17] Doc tweaks --- .../library-user-guide/upgrading/53.0.0.md | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index a6e6ace4c9dc8..8d13ba97a4c64 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -457,17 +457,16 @@ contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). - Queries that use `IN` lists with mixed string and numeric types - Queries that use `CASE expr WHEN` with mixed string and numeric types -**Behavioral changes (old → new):** - -| Expression | Old behavior | New behavior | -| --------------------------- | --------------------------------- | ------------------------------------------ | -| `int_col > '100'` | Lexicographic (incorrect) | Numeric (correct) | -| `float_col = '5'` | String `'5' != '5.0'` (incorrect) | Numeric `5.0 = 5.0` (correct) | -| `int_col = 'hello'` | String comparison, always false | Cast error | -| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | -| `float_col IN ('1.0')` | String `'1.0' != '1'` (incorrect) | Numeric `1.0 = 1.0` (correct) | -| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | -| `SELECT 1 UNION SELECT 'a'` | Coerce to Utf8 | Coerce to Utf8 (unchanged) | +**Behavioral changes:** + +| Expression | Old behavior | New behavior | +| ----------------------- | ------------------------------- | ------------------------------------------ | +| `int_col > '100'` | Lexicographic | Numeric | +| `float_col = '5'` | String `'5' != '5.0'` | Numeric `5.0 = 5.0` | +| `int_col = 'hello'` | String comparison, always false | Cast error | +| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | +| `float_col IN ('1.0')` | String `'1.0' != '1'` | Numeric `1.0 = 1.0` (correct) | +| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | **Migration guide:** @@ -480,7 +479,7 @@ adjustment: non-numeric values) will now produce a cast error instead of silently returning no rows. - **Mixed-type `IN` lists** (e.g., `str_col IN ('a', 1)`) are now rejected. Use - consistent types in the list, or add an explicit `CAST`. + consistent types for the `IN` list or add an explicit `CAST`. - **Queries comparing integer columns with decimal strings** (e.g., `int_col = '99.99'`) will now produce a cast error because `'99.99'` cannot be cast to an integer. Use a float column or adjust the literal. From 48fc73cd5ef1232ef614fa818890c2f21418568b Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 07:11:13 -0400 Subject: [PATCH 12/17] Address review comments --- .../test_files/string_numeric_coercion.slt | 4 +- .../library-user-guide/upgrading/53.0.0.md | 61 ------------------- .../library-user-guide/upgrading/54.0.0.md | 61 +++++++++++++++++++ 3 files changed, 63 insertions(+), 63 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt index 27b1be1074769..a66752aaa2748 100644 --- a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt +++ b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt @@ -488,9 +488,9 @@ query error Cannot cast string 'a' to value of Int64 type SELECT [1, 'a']; # MAP with mixed-category list keys should also error -query error +query error Cannot cast string 'a' to value of Int64 type SELECT MAP {[1,2,3]:1, ['a', 'b']:2}; # MAP with mixed-category list values should also error -query error +query error Cannot cast string 'a' to value of Int64 type SELECT MAP {'a':[1,2,3], 'b':['a', 'b']}; diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index 8d13ba97a4c64..ef5f5743f5ea6 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -440,67 +440,6 @@ let sort_proto = serialize_physical_sort_expr( ); ``` -### String/numeric comparison coercion now prefers numeric types - -Previously, comparing a numeric column with a string value (e.g., -`WHERE int_col > '100'`) coerced both sides to strings and performed a -lexicographic comparison. This produced incorrect results — for example, -`5 > '100'` was `true` under string comparison because `'5' > '1'` -lexicographically, even though `5 > 100` is `false` numerically. - -DataFusion now coerces the string side to the numeric type in comparison -contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). - -**Who is affected:** - -- Queries that compare numeric values with string values -- Queries that use `IN` lists with mixed string and numeric types -- Queries that use `CASE expr WHEN` with mixed string and numeric types - -**Behavioral changes:** - -| Expression | Old behavior | New behavior | -| ----------------------- | ------------------------------- | ------------------------------------------ | -| `int_col > '100'` | Lexicographic | Numeric | -| `float_col = '5'` | String `'5' != '5.0'` | Numeric `5.0 = 5.0` | -| `int_col = 'hello'` | String comparison, always false | Cast error | -| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | -| `float_col IN ('1.0')` | String `'1.0' != '1'` | Numeric `1.0 = 1.0` (correct) | -| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | - -**Migration guide:** - -Most queries will produce more correct results with no changes needed. -However, queries that relied on the old string-comparison behavior may need -adjustment: - -- **Queries comparing numeric columns with non-numeric strings** (e.g., - `int_col = 'hello'` or `int_col > text_col` where `text_col` contains - non-numeric values) will now produce a cast error instead of silently - returning no rows. -- **Mixed-type `IN` lists** (e.g., `str_col IN ('a', 1)`) are now rejected. Use - consistent types for the `IN` list or add an explicit `CAST`. -- **Queries comparing integer columns with decimal strings** (e.g., - `int_col = '99.99'`) will now produce a cast error because `'99.99'` - cannot be cast to an integer. Use a float column or adjust the literal. - -See [#15161](https://github.com/apache/datafusion/issues/15161) and -[PR #20426](https://github.com/apache/datafusion/pull/20426) for details. - -### `comparison_coercion_numeric` removed, replaced by `comparison_coercion` - -The `comparison_coercion_numeric` function has been removed. Its behavior -(preferring numeric types for string/numeric comparisons) is now the default in -`comparison_coercion`. A new function, `type_union_coercion`, handles contexts -where string types are preferred (`UNION`, `CASE THEN/ELSE`, `NVL2`). - -**Who is affected:** - -- Crates that call `comparison_coercion_numeric` directly -- Crates that call `comparison_coercion` and relied on its old - string-preferring behavior -- Crates that call `get_coerce_type_for_case_expression` - ### `generate_series` and `range` table functions changed The `generate_series` and `range` table functions now return an empty set when the interval is invalid, instead of an error. diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index 77b4fb6f71a35..ca6da40b83272 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -25,6 +25,67 @@ in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. +### String/numeric comparison coercion now prefers numeric types + +Previously, comparing a numeric column with a string value (e.g., +`WHERE int_col > '100'`) coerced both sides to strings and performed a +lexicographic comparison. This produced incorrect results — for example, +`5 > '100'` was `true` under string comparison because `'5' > '1'` +lexicographically, even though `5 > 100` is `false` numerically. + +DataFusion now coerces the string side to the numeric type in comparison +contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). + +**Who is affected:** + +- Queries that compare numeric values with string values +- Queries that use `IN` lists with mixed string and numeric types +- Queries that use `CASE expr WHEN` with mixed string and numeric types + +**Behavioral changes:** + +| Expression | Old behavior | New behavior | +| ----------------------- | ------------------------------- | ------------------------------------------ | +| `int_col > '100'` | Lexicographic | Numeric | +| `float_col = '5'` | String `'5' != '5.0'` | Numeric `5.0 = 5.0` | +| `int_col = 'hello'` | String comparison, always false | Cast error | +| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | +| `float_col IN ('1.0')` | String `'1.0' != '1'` | Numeric `1.0 = 1.0` (correct) | +| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | + +**Migration guide:** + +Most queries will produce more correct results with no changes needed. +However, queries that relied on the old string-comparison behavior may need +adjustment: + +- **Queries comparing numeric columns with non-numeric strings** (e.g., + `int_col = 'hello'` or `int_col > text_col` where `text_col` contains + non-numeric values) will now produce a cast error instead of silently + returning no rows. +- **Mixed-type `IN` lists** (e.g., `str_col IN ('a', 1)`) are now rejected. Use + consistent types for the `IN` list or add an explicit `CAST`. +- **Queries comparing integer columns with non-integer numeric string literals** (e.g., + `int_col = '99.99'`) will now produce a cast error because `'99.99'` + cannot be cast to an integer. Use a float column or adjust the literal. + +See [#15161](https://github.com/apache/datafusion/issues/15161) and +[PR #20426](https://github.com/apache/datafusion/pull/20426) for details. + +### `comparison_coercion_numeric` removed, replaced by `comparison_coercion` + +The `comparison_coercion_numeric` function has been removed. Its behavior +(preferring numeric types for string/numeric comparisons) is now the default in +`comparison_coercion`. A new function, `type_union_coercion`, handles contexts +where string types are preferred (`UNION`, `CASE THEN/ELSE`, `NVL2`). + +**Who is affected:** + +- Crates that call `comparison_coercion_numeric` directly +- Crates that call `comparison_coercion` and relied on its old + string-preferring behavior +- Crates that call `get_coerce_type_for_case_expression` + ### `ExecutionPlan::apply_expressions` is now a required method `apply_expressions` has been added as a **required** method on the `ExecutionPlan` trait (no default implementation). The same applies to the `FileSource` and `DataSource` traits. Any custom implementation of these traits must now implement `apply_expressions`. From 3527d2fcb3ce46fadab8c728ad5a263253dd04d8 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 07:41:36 -0400 Subject: [PATCH 13/17] Address more review comments --- .../expr-common/src/type_coercion/binary.rs | 45 +++++++++++-------- .../type_coercion/binary/tests/comparison.rs | 2 +- .../test_files/string_numeric_coercion.slt | 30 +++++++++++++ .../library-user-guide/upgrading/54.0.0.md | 9 ++-- 4 files changed, 63 insertions(+), 23 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index a6e36a7e4eeac..cdcc71973fc1d 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -744,7 +744,9 @@ fn type_union_resolution_coercion( // Numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| list_coercion(lhs_type, rhs_type)) + .or_else(|| { + list_coercion(lhs_type, rhs_type, type_union_resolution_coercion) + }) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) @@ -863,7 +865,7 @@ pub fn type_union_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option Option Option { - let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()]; +/// Coerce two list element fields to a common type using the provided +/// coercion function for element types. +fn coerce_list_children( + lhs_field: &FieldRef, + rhs_field: &FieldRef, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { Some(Arc::new( (**lhs_field) .clone() - .with_data_type(type_union_resolution(&data_types)?) + .with_data_type(coerce_fn(lhs_field.data_type(), rhs_field.data_type())?) .with_nullable(lhs_field.is_nullable() || rhs_field.is_nullable()), )) } -/// Coerce two list types by coercing their element types via -/// [`type_union_resolution`]. -fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Coerce two list types by coercing their element types via `coerce_fn` +/// (either [`comparison_coercion`] or [`type_union_coercion`]). +fn list_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { // Coerce to the left side FixedSizeList type if the list lengths are the same, @@ -1646,11 +1655,11 @@ fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { (FixedSizeList(lhs_field, ls), FixedSizeList(rhs_field, rs)) => { if ls == rs { Some(FixedSizeList( - coerce_list_children(lhs_field, rhs_field)?, + coerce_list_children(lhs_field, rhs_field, coerce_fn)?, *rs, )) } else { - Some(List(coerce_list_children(lhs_field, rhs_field)?)) + Some(List(coerce_list_children(lhs_field, rhs_field, coerce_fn)?)) } } // LargeList on any side @@ -1658,13 +1667,13 @@ fn list_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { LargeList(lhs_field), List(rhs_field) | LargeList(rhs_field) | FixedSizeList(rhs_field, _), ) - | (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => { - Some(LargeList(coerce_list_children(lhs_field, rhs_field)?)) - } + | (List(lhs_field) | FixedSizeList(lhs_field, _), LargeList(rhs_field)) => Some( + LargeList(coerce_list_children(lhs_field, rhs_field, coerce_fn)?), + ), // Lists on both sides (List(lhs_field), List(rhs_field) | FixedSizeList(rhs_field, _)) | (FixedSizeList(lhs_field, _), List(rhs_field)) => { - Some(List(coerce_list_children(lhs_field, rhs_field)?)) + Some(List(coerce_list_children(lhs_field, rhs_field, coerce_fn)?)) } _ => None, } diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs index 166b308710e0e..317b022238f67 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs @@ -654,7 +654,7 @@ fn test_list_coercion() { let rhs_type = DataType::List(Arc::new(Field::new("rhs", DataType::Int64, true))); - let coerced_type = list_coercion(&lhs_type, &rhs_type).unwrap(); + let coerced_type = list_coercion(&lhs_type, &rhs_type, comparison_coercion).unwrap(); assert_eq!( coerced_type, DataType::List(Arc::new(Field::new("lhs", DataType::Int64, true))) diff --git a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt index a66752aaa2748..fa24ec3633700 100644 --- a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt +++ b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt @@ -363,6 +363,24 @@ SELECT t1.l FROM t_list_int t1, t_list_str t2 WHERE t1.l = t2.l; ---- [20, 30] +# List comparison casts the string list to the numeric element type (Int64), +# not the other way around. +query TT +EXPLAIN SELECT t1.l FROM t_list_int t1, t_list_str t2 WHERE t1.l = t2.l; +---- +logical_plan +01)Projection: t1.l +02)--Inner Join: t1.l = CAST(t2.l AS List(Int64)) +03)----SubqueryAlias: t1 +04)------TableScan: t_list_int projection=[l] +05)----SubqueryAlias: t2 +06)------TableScan: t_list_str projection=[l] +physical_plan +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(l@0, CAST(t2.l AS List(Int64))@1)], projection=[l@0] +02)--DataSourceExec: partitions=1, partition_sizes=[1] +03)--ProjectionExec: expr=[l@0 as l, CAST(l@0 AS List(Int64)) as CAST(t2.l AS List(Int64))] +04)----DataSourceExec: partitions=1, partition_sizes=[1] + # List in UNION uses type union coercion (string preferred). # The integer list elements are cast to Utf8. query ? rowsort @@ -373,6 +391,12 @@ SELECT l FROM t_list_int UNION ALL SELECT l FROM t_list_str; [20, 30] [5, 10] +# Verify the UNION result type has Utf8 elements (not Int64). +query T +SELECT arrow_typeof(l) FROM (SELECT l FROM t_list_int UNION ALL SELECT l FROM t_list_str) LIMIT 1; +---- +List(Utf8) + statement ok DROP TABLE t_list_int; @@ -410,6 +434,12 @@ SELECT m FROM t_map_int UNION ALL SELECT m FROM t_map_str; {a: 1, b: 5} {a: 1, b: 5} +# Verify the UNION result type has Utf8 values (not Int64). +query T +SELECT arrow_typeof(m) FROM (SELECT m FROM t_map_int UNION ALL SELECT m FROM t_map_str) LIMIT 1; +---- +Map("entries": non-null Struct("key": non-null Utf8, "value": Utf8), unsorted) + statement ok DROP TABLE t_map_int; diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index ca6da40b83272..da3c2d9054dad 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -29,12 +29,13 @@ to the main branch and are awaiting release in this version. Previously, comparing a numeric column with a string value (e.g., `WHERE int_col > '100'`) coerced both sides to strings and performed a -lexicographic comparison. This produced incorrect results — for example, -`5 > '100'` was `true` under string comparison because `'5' > '1'` -lexicographically, even though `5 > 100` is `false` numerically. +lexicographic comparison. This produced surprising results — for example, +`5 > '100'` yielded `true` because `'5' > '1'` lexicographically, even +though `5 > 100` is `false` numerically. DataFusion now coerces the string side to the numeric type in comparison -contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). +contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). For +example, `5 > '100'` will now yield `false`. **Who is affected:** From 3eae07d4326ce10ce326ef2ddeff38ef210c95cb Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 07:56:34 -0400 Subject: [PATCH 14/17] Use comparison coercion for GREATEST, LEAST --- .../src/core/greatest_least_utils.rs | 18 +++--- .../test_files/string_numeric_coercion.slt | 58 +++++++++++++++++++ .../library-user-guide/upgrading/54.0.0.md | 7 ++- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/datafusion/functions/src/core/greatest_least_utils.rs b/datafusion/functions/src/core/greatest_least_utils.rs index 5f8b4a51186fe..2714a01832175 100644 --- a/datafusion/functions/src/core/greatest_least_utils.rs +++ b/datafusion/functions/src/core/greatest_least_utils.rs @@ -20,7 +20,7 @@ use arrow::compute::kernels::zip::zip; use arrow::datatypes::DataType; use datafusion_common::{Result, ScalarValue, assert_or_internal_err, plan_err}; use datafusion_expr_common::columnar_value::ColumnarValue; -use datafusion_expr_common::type_coercion::binary::type_union_resolution; +use datafusion_expr_common::type_coercion::binary::comparison_coercion; use std::sync::Arc; pub(super) trait GreatestLeastOperator { @@ -120,13 +120,17 @@ pub(super) fn find_coerced_type( data_types: &[DataType], ) -> Result { if data_types.is_empty() { - plan_err!( + return plan_err!( "{} was called without any arguments. It requires at least 1.", Op::NAME - ) - } else if let Some(coerced_type) = type_union_resolution(data_types) { - Ok(coerced_type) - } else { - plan_err!("Cannot find a common type for arguments") + ); + } + let mut coerced = data_types[0].clone(); + for dt in &data_types[1..] { + let Some(next) = comparison_coercion(&coerced, dt) else { + return plan_err!("Cannot find a common type for arguments to {}", Op::NAME); + }; + coerced = next; } + Ok(coerced) } diff --git a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt index fa24ec3633700..1567a149bcdf4 100644 --- a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt +++ b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt @@ -305,6 +305,64 @@ small small small +# ------------------------------------------------- +# GREATEST / LEAST use comparison coercion (numeric preferred) +# ------------------------------------------------- + +# GREATEST with mixed int and string: numeric comparison, not lexicographic. +query I +SELECT GREATEST(10, '9'); +---- +10 + +query T +SELECT arrow_typeof(GREATEST(10, '9')); +---- +Int64 + +# LEAST with mixed int and string: numeric comparison. +query I +SELECT LEAST(10, '9'); +---- +9 + +query T +SELECT arrow_typeof(LEAST(10, '9')); +---- +Int64 + +# GREATEST with multiple mixed args +query I +SELECT GREATEST(1, '20', 3); +---- +20 + +# Non-numeric string in GREATEST errors +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT GREATEST(1, 'hello'); + +# ------------------------------------------------- +# NULLIF uses comparison coercion (numeric preferred) +# ------------------------------------------------- + +# NULLIF with mixed int and string: numeric comparison. +# 10 != 9 numerically, so returns 10. +query I +SELECT NULLIF(10, '9'); +---- +10 + +query T +SELECT arrow_typeof(NULLIF(10, '9')); +---- +Int64 + +# NULLIF with matching values: 10 = 10 numerically, so returns NULL. +query I +SELECT NULLIF(10, '10'); +---- +NULL + # ------------------------------------------------- # Nested struct/map/list comparisons use comparison coercion # (numeric preferred) for their field/element types diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index da3c2d9054dad..ec4045607c042 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -34,14 +34,15 @@ lexicographic comparison. This produced surprising results — for example, though `5 > 100` is `false` numerically. DataFusion now coerces the string side to the numeric type in comparison -contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). For -example, `5 > '100'` will now yield `false`. +contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`, +`GREATEST`, `LEAST`). For example, `5 > '100'` will now yield `false`. **Who is affected:** - Queries that compare numeric values with string values - Queries that use `IN` lists with mixed string and numeric types - Queries that use `CASE expr WHEN` with mixed string and numeric types +- Queries that use `GREATEST` or `LEAST` with mixed string and numeric types **Behavioral changes:** @@ -53,6 +54,8 @@ example, `5 > '100'` will now yield `false`. | `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | | `float_col IN ('1.0')` | String `'1.0' != '1'` | Numeric `1.0 = 1.0` (correct) | | `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | +| `GREATEST(10, '9')` | Utf8 `'9'` (lexicographic) | Int64 `10` (numeric) | +| `LEAST(10, '9')` | Utf8 `10` (lexicographic) | Int64 `9` (numeric) | **Migration guide:** From 9322665b393e2dab305b36404f5e07959084784e Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 09:24:57 -0400 Subject: [PATCH 15/17] Fix for change to list element coercion --- datafusion/expr-common/src/type_coercion/binary.rs | 3 +-- datafusion/sqllogictest/test_files/struct.slt | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index cdcc71973fc1d..c64509ddab9de 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -741,14 +741,13 @@ fn type_union_resolution_coercion( Some(DataType::Struct(fields.into())) } _ => { - // Numeric coercion is the same as comparison coercion, both find the narrowest type - // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) .or_else(|| { list_coercion(lhs_type, rhs_type, type_union_resolution_coercion) }) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) + .or_else(|| null_coercion(lhs_type, rhs_type)) .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) .or_else(|| binary_coercion(lhs_type, rhs_type)) } diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index fcc7805372674..5cf6e4817d475 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -1116,7 +1116,7 @@ select [ query ? select [[{x: 1, y: 2}], [{y: 4, x: 3}]]; ---- -[[{x: 1, y: 2}], [{x: 3, y: 4}]] +[[{y: 2, x: 1}], [{y: 4, x: 3}]] # Test array literal with float type coercion across elements query ? From d9c9e8ae3fdde490aaeb67f7b9eb146fe5a69ce2 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 09:28:17 -0400 Subject: [PATCH 16/17] cargo fmt --- .../expr-common/src/type_coercion/binary.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index c64509ddab9de..ecf79b2114310 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -740,17 +740,13 @@ fn type_union_resolution_coercion( .collect(); Some(DataType::Struct(fields.into())) } - _ => { - binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| { - list_coercion(lhs_type, rhs_type, type_union_resolution_coercion) - }) - .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) - .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| null_coercion(lhs_type, rhs_type)) - .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) - .or_else(|| binary_coercion(lhs_type, rhs_type)) - } + _ => binary_numeric_coercion(lhs_type, rhs_type) + .or_else(|| list_coercion(lhs_type, rhs_type, type_union_resolution_coercion)) + .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) + .or_else(|| string_coercion(lhs_type, rhs_type)) + .or_else(|| null_coercion(lhs_type, rhs_type)) + .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) + .or_else(|| binary_coercion(lhs_type, rhs_type)), } } From 91d5bfaf5ac7a7b95753249eea159e8ea40a21f4 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Thu, 26 Mar 2026 09:35:46 -0400 Subject: [PATCH 17/17] Update comments --- datafusion/core/tests/expr_api/mod.rs | 3 ++- datafusion/expr-common/src/type_coercion/binary.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index ebd811dcd8cb9..19ff3933193de 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -352,7 +352,8 @@ async fn test_create_physical_expr_coercion() { // int column vs string literal: the string literal is cast to Int64 create_expr_test(col("i").eq(lit("202410")), "i@1 = CAST(202410 AS Int64)"); create_expr_test(lit("202410").eq(col("i")), "CAST(202410 AS Int64) = i@1"); - // when simplified, the literal cast is constant-folded + // The simplifier operates on the logical expression before type + // coercion adds the CAST, so the output is unchanged. create_simplified_expr_test( col("i").eq(lit("202410")), "i@1 = CAST(202410 AS Int64)", diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index ecf79b2114310..eda99dcc32075 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -579,8 +579,8 @@ impl From<&DataType> for TypeCategory { } /// Coerce dissimilar data types to a single data type. -/// UNION, INTERSECT, EXCEPT, CASE, ARRAY, VALUES, and the GREATEST and LEAST functions are -/// examples that has the similar resolution rules. +/// ARRAY literals, VALUES, COALESCE, and array concatenation are examples +/// of contexts that use this function. /// See for more information. /// The rules in the document provide a clue, but adhering strictly to them doesn't precisely /// align with the behavior of Postgres. Therefore, we've made slight adjustments to the rules