diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index e4f341efa8..42da809206 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -36,6 +36,19 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { private[comet] val negativeScaleDecimalToStringReason: String = "Negative-scale decimal requires spark.sql.legacy.allowNegativeScaleOfDecimal=true" + // When `spark.sql.legacy.castComplexTypesToString.enabled` is true, Spark wraps maps and + // structs with `[]` (instead of `{}`) when casting to string, and omits NULL elements of + // structs/maps/arrays (instead of rendering them as the literal "null"). Comet only + // implements the default formatting, so fall back to Spark for any array/map/struct to-string + // cast when the flag is enabled. The flag is internal in Spark 4.0 and defaults to false. + private[comet] val legacyCastComplexTypesToStringReason: String = + "spark.sql.legacy.castComplexTypesToString.enabled=true is not supported" + + private def legacyCastComplexTypesToString: Boolean = + SQLConf.get + .getConfString("spark.sql.legacy.castComplexTypesToString.enabled", "false") + .toBoolean + def supportedTypes: Seq[DataType] = Seq( DataTypes.BooleanType, @@ -150,6 +163,12 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { return Compatible() } + if (toType == DataTypes.StringType && legacyCastComplexTypesToString && (fromType + .isInstanceOf[ArrayType] || fromType.isInstanceOf[StructType] || + fromType.isInstanceOf[MapType])) { + return Unsupported(Some(legacyCastComplexTypesToStringReason)) + } + (fromType, toType) match { case (dt: ArrayType, _: ArrayType) if dt.elementType == NullType => Compatible() case (ArrayType(DataTypes.DateType, _), ArrayType(toElementType, _)) diff --git a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala index e261ac45d1..50cd0927b4 100644 --- a/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala +++ b/spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala @@ -870,7 +870,10 @@ case class CometExecRule(session: SparkSession) if (groupingExpressions.isEmpty && aggregateExpressions.isEmpty) return false - if (groupingExpressions.exists(e => QueryPlanSerde.containsMapType(e.dataType))) return false + if (groupingExpressions.exists(e => + SupportLevel.containsType(e.dataType, classOf[MapType]))) { + return false + } if (!groupingExpressions.forall(e => QueryPlanSerde.exprToProto(e, agg.child.output).isDefined)) { diff --git a/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala b/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala index 3dcd67a65d..346bb454eb 100644 --- a/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala +++ b/spark/src/main/scala/org/apache/comet/serde/CometSortOrder.scala @@ -32,18 +32,11 @@ object CometSortOrder extends CometExpressionSerde[SortOrder] { " floating-point types is not 100% compatible with Spark") override def getSupportLevel(expr: SortOrder): SupportLevel = { - - if (CometConf.COMET_EXEC_STRICT_FLOATING_POINT.get() && - SupportLevel.containsFloatingPoint(expr.child.dataType)) { - // https://github.com/apache/datafusion-comet/issues/2626 - Incompatible( - Some( - "Sorting on floating-point is not 100% compatible with Spark, and Comet is running " + - s"with ${CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key}=true. " + - s"${CometConf.COMPAT_GUIDE}")) - } else { - Compatible() - } + // https://github.com/apache/datafusion-comet/issues/2626 + SupportLevel + .strictFloatingPointReason(expr.child.dataType, "Sorting on floating-point") + .map(reason => Incompatible(Some(reason))) + .getOrElse(Compatible()) } override def convert( diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index a21d930226..a3d9944485 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -490,19 +490,6 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { false } - /** - * Returns true if the given data type is or contains a `MapType` at any nesting level. Arrow's - * row format (used by DataFusion's grouped hash aggregate for composite group keys) does not - * support `Map`, so grouping on any type that transitively contains a map would crash in native - * execution. - */ - def containsMapType(dt: DataType): Boolean = dt match { - case _: MapType => true - case a: ArrayType => containsMapType(a.elementType) - case s: StructType => s.fields.exists(f => containsMapType(f.dataType)) - case _ => false - } - /** * Serializes Spark datatype to protobuf. Note that, a datatype can be serialized by this method * doesn't mean it is supported by Comet native execution, i.e., `supportedDataType` may return diff --git a/spark/src/main/scala/org/apache/comet/serde/SupportLevel.scala b/spark/src/main/scala/org/apache/comet/serde/SupportLevel.scala index cb78c7d2d4..84db46923e 100644 --- a/spark/src/main/scala/org/apache/comet/serde/SupportLevel.scala +++ b/spark/src/main/scala/org/apache/comet/serde/SupportLevel.scala @@ -21,6 +21,9 @@ package org.apache.comet.serde import org.apache.spark.sql.types._ +import org.apache.comet.CometConf +import org.apache.comet.CometConf.COMET_EXEC_STRICT_FLOATING_POINT + sealed trait SupportLevel /** @@ -46,14 +49,41 @@ case class Unsupported(notes: Option[String] = None) extends SupportLevel object SupportLevel { /** - * Returns true if the given data type contains FloatType or DoubleType at any nesting level. + * Returns true if `dt` is, or transitively contains, an instance of any of the given `DataType` + * classes. Walks `ArrayType` element, `StructType` fields, and `MapType` key/value at every + * nesting level. + */ + def containsType(dt: DataType, classes: Class[_ <: DataType]*): Boolean = { + if (classes.exists(_.isInstance(dt))) { + true + } else { + dt match { + case ArrayType(elementType, _) => containsType(elementType, classes: _*) + case StructType(fields) => fields.exists(f => containsType(f.dataType, classes: _*)) + case MapType(keyType, valueType, _) => + containsType(keyType, classes: _*) || containsType(valueType, classes: _*) + case _ => false + } + } + } + + /** + * Gate for [[CometConf.COMET_EXEC_STRICT_FLOATING_POINT]]: returns the standard incompatibility + * reason when strict mode is enabled and `dt` contains a float or double (at any nesting + * level), and `None` otherwise. Callers wrap the reason with `Incompatible` or pass it to + * `withFallbackReason` as appropriate. + * + * `what` describes the operation being gated, e.g. "Sorting on floating-point" or "MapSort on + * floating-point key", and is interpolated into the returned message. */ - def containsFloatingPoint(dt: DataType): Boolean = dt match { - case FloatType | DoubleType => true - case ArrayType(elementType, _) => containsFloatingPoint(elementType) - case StructType(fields) => fields.exists(f => containsFloatingPoint(f.dataType)) - case MapType(keyType, valueType, _) => - containsFloatingPoint(keyType) || containsFloatingPoint(valueType) - case _ => false + def strictFloatingPointReason(dt: DataType, what: String): Option[String] = { + if (COMET_EXEC_STRICT_FLOATING_POINT.get() && + containsType(dt, classOf[FloatType], classOf[DoubleType])) { + Some( + s"$what is not 100% compatible with Spark, and Comet is running with " + + s"${COMET_EXEC_STRICT_FLOATING_POINT.key}=true. ${CometConf.COMPAT_GUIDE}") + } else { + None + } } } diff --git a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala index bd5737b54c..cf392e4214 100644 --- a/spark/src/main/scala/org/apache/comet/serde/aggregates.scala +++ b/spark/src/main/scala/org/apache/comet/serde/aggregates.scala @@ -715,17 +715,13 @@ object CometCollectSet extends CometAggregateExpressionSerde[CollectSet] { " `spark.comet.expression.CollectSet.allowIncompatible=true` is set.") override def getSupportLevel(expr: CollectSet): SupportLevel = { - if (COMET_EXEC_STRICT_FLOATING_POINT.get() && - SupportLevel.containsFloatingPoint(expr.children.head.dataType)) { - Incompatible( - Some( - "collect_set on floating-point types is not 100% compatible with Spark " + - "(Comet deduplicates NaN values while Spark treats each NaN as distinct), " + - s"and Comet is running with ${COMET_EXEC_STRICT_FLOATING_POINT.key}=true. " + - s"${CometConf.COMPAT_GUIDE}")) - } else { - Compatible() - } + SupportLevel + .strictFloatingPointReason( + expr.children.head.dataType, + "collect_set on floating-point types " + + "(Comet deduplicates NaN values while Spark treats each NaN as distinct)") + .map(reason => Incompatible(Some(reason))) + .getOrElse(Compatible()) } override def convert( diff --git a/spark/src/main/scala/org/apache/comet/serde/arrays.scala b/spark/src/main/scala/org/apache/comet/serde/arrays.scala index 690bc376c7..7f8dd7b40f 100644 --- a/spark/src/main/scala/org/apache/comet/serde/arrays.scala +++ b/spark/src/main/scala/org/apache/comet/serde/arrays.scala @@ -152,15 +152,11 @@ object CometSortArray extends CometExpressionSerde[SortArray] { if (!supportedSortArrayElementType(elementType)) { Unsupported(Some(s"Sort on array element type $elementType is not supported")) - } else if (CometConf.COMET_EXEC_STRICT_FLOATING_POINT.get() && - SupportLevel.containsFloatingPoint(elementType)) { - Incompatible( - Some( - "Sorting on floating-point is not 100% compatible with Spark, and Comet is running " + - s"with ${CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key}=true. " + - s"${CometConf.COMPAT_GUIDE}")) } else { - Compatible() + SupportLevel + .strictFloatingPointReason(elementType, "Sorting on floating-point") + .map(reason => Incompatible(Some(reason))) + .getOrElse(Compatible()) } } @@ -553,17 +549,8 @@ object CometArrayReverse extends CometExpressionSerde[Reverse] with ArraysBase { override def getIncompatibleReasons(): Seq[String] = Seq(unsupportedReason) - @tailrec - private def containsBinary(dt: DataType): Boolean = { - dt match { - case BinaryType => true - case ArrayType(elementType, _) => containsBinary(elementType) - case _ => false - } - } - override def getSupportLevel(expr: Reverse): SupportLevel = { - if (containsBinary(expr.child.dataType)) { + if (SupportLevel.containsType(expr.child.dataType, classOf[BinaryType])) { Incompatible(Some(unsupportedReason)) } else { Compatible(None) diff --git a/spark/src/main/scala/org/apache/comet/serde/maps.scala b/spark/src/main/scala/org/apache/comet/serde/maps.scala index ab388c7ad0..accf0407a7 100644 --- a/spark/src/main/scala/org/apache/comet/serde/maps.scala +++ b/spark/src/main/scala/org/apache/comet/serde/maps.scala @@ -144,20 +144,11 @@ object CometMapFromEntries override def getIncompatibleReasons(): Seq[String] = Seq(keyUnsupportedReason, valueUnsupportedReason) - private def containsBinary(dataType: DataType): Boolean = { - dataType match { - case BinaryType => true - case StructType(fields) => fields.exists(field => containsBinary(field.dataType)) - case ArrayType(elementType, _) => containsBinary(elementType) - case _ => false - } - } - override def getSupportLevel(expr: MapFromEntries): SupportLevel = { - if (containsBinary(expr.dataType.keyType)) { + if (SupportLevel.containsType(expr.dataType.keyType, classOf[BinaryType])) { return Incompatible(Some(keyUnsupportedReason)) } - if (containsBinary(expr.dataType.valueType)) { + if (SupportLevel.containsType(expr.dataType.valueType, classOf[BinaryType])) { return Incompatible(Some(valueUnsupportedReason)) } Compatible(None) diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala index 53b09e92b3..ebb22d2361 100644 --- a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala +++ b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala @@ -1546,7 +1546,8 @@ trait CometBaseAggregate { return None } - if (groupingExpressions.exists(expr => QueryPlanSerde.containsMapType(expr.dataType))) { + if (groupingExpressions.exists(expr => + SupportLevel.containsType(expr.dataType, classOf[MapType]))) { withFallbackReason(aggregate, "Grouping on map-containing types is not supported") return None } diff --git a/spark/src/main/spark-4.x/org/apache/comet/serde/CometMapSort.scala b/spark/src/main/spark-4.x/org/apache/comet/serde/CometMapSort.scala index 4aecda4547..341c8fc24e 100644 --- a/spark/src/main/spark-4.x/org/apache/comet/serde/CometMapSort.scala +++ b/spark/src/main/spark-4.x/org/apache/comet/serde/CometMapSort.scala @@ -39,15 +39,11 @@ object CometMapSort extends CometExpressionSerde[MapSort] { val keyType = expr.dataType.asInstanceOf[MapType].keyType if (!supportedScalarSortElementType(keyType)) { Unsupported(Some(s"MapSort on map with key type $keyType is not supported")) - } else if (CometConf.COMET_EXEC_STRICT_FLOATING_POINT.get() && - SupportLevel.containsFloatingPoint(keyType)) { - Incompatible( - Some( - "MapSort on floating-point key is not 100% compatible with Spark, and Comet is " + - s"running with ${CometConf.COMET_EXEC_STRICT_FLOATING_POINT.key}=true. " + - s"${CometConf.COMPAT_GUIDE}")) } else { - Compatible(None) + SupportLevel + .strictFloatingPointReason(keyType, "MapSort on floating-point key") + .map(reason => Incompatible(Some(reason))) + .getOrElse(Compatible(None)) } } diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_complex_types_to_string.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_complex_types_to_string.sql new file mode 100644 index 0000000000..8b1d989ae7 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_complex_types_to_string.sql @@ -0,0 +1,345 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Default (non-legacy) formatting for array / map / struct → string: +-- `{f1, f2, ...}` for structs, `[e1, e2, ...]` for arrays, `{k1 -> v1, k2 -> v2}` for maps, +-- with NULL elements rendered as the literal "null". The legacy `[...]`-wrapped / +-- NULL-omitting mode is covered separately in cast_complex_types_to_string_legacy.sql. + +-- Config: spark.sql.legacy.castComplexTypesToString.enabled=false +-- ConfigMatrix: parquet.enable.dictionary=false,true + +statement +CREATE TABLE test_cast_struct_to_string( + id int, + s_unnamed struct, + s_named struct, + s_floats struct, + s_bounds struct, + s_decimal struct, + s_temporal struct, + s_binary struct, + s_nested struct, tag: string>, + s_with_array struct, label: string>, + s_all_null struct +) USING parquet + +statement +INSERT INTO test_cast_struct_to_string VALUES + ( + 1, + named_struct('col1', 1, 'col2', 'hello'), + named_struct('a', 42, 'b', 'world', 'c', true), + named_struct('f', cast(1.5 as float), 'd', cast(2.5 as double)), + named_struct('b', cast(127 as tinyint), 's', cast(32767 as smallint), 'i', 2147483647, 'l', 9223372036854775807), + named_struct('d1', cast('12345678.90' as decimal(10, 2)), 'd2', cast('1.234567890123456789' as decimal(38, 18))), + named_struct('dt', date '2024-01-15', 'ts', timestamp '2024-01-15 10:30:45'), + named_struct('b', X'616263'), + named_struct('inner', named_struct('x', 10, 'y', 'inner'), 'tag', 'outer'), + named_struct('arr', array(1, 2, 3), 'label', 'three'), + named_struct('a', 1, 'b', 'present') + ), + ( + 2, + named_struct('col1', cast(null as int), 'col2', 'with-null-int'), + named_struct('a', 0, 'b', cast(null as string), 'c', false), + named_struct('f', cast('NaN' as float), 'd', cast('NaN' as double)), + named_struct('b', cast(-128 as tinyint), 's', cast(-32768 as smallint), 'i', -2147483648, 'l', -9223372036854775808), + named_struct('d1', cast('-12345678.90' as decimal(10, 2)), 'd2', cast('-1.234567890123456789' as decimal(38, 18))), + named_struct('dt', date '1970-01-01', 'ts', timestamp '1970-01-01 00:00:00'), + named_struct('b', X''), + named_struct('inner', named_struct('x', cast(null as int), 'y', cast(null as string)), 'tag', cast(null as string)), + named_struct('arr', array(cast(null as int), 1, cast(null as int)), 'label', cast(null as string)), + named_struct('a', cast(null as int), 'b', cast(null as string)) + ), + ( + 3, + named_struct('col1', -1, 'col2', ''), + named_struct('a', cast(null as int), 'b', '', 'c', cast(null as boolean)), + named_struct('f', cast('Infinity' as float), 'd', cast('-Infinity' as double)), + named_struct('b', cast(0 as tinyint), 's', cast(0 as smallint), 'i', 0, 'l', cast(0 as bigint)), + named_struct('d1', cast(0 as decimal(10, 2)), 'd2', cast(0 as decimal(38, 18))), + named_struct('dt', date '9999-12-31', 'ts', timestamp '9999-12-31 23:59:59'), + named_struct('b', X'00FF7F80'), + named_struct('inner', named_struct('x', 0, 'y', ''), 'tag', ''), + named_struct('arr', cast(array() as array), 'label', ''), + cast(null as struct) + ), + ( + 4, + named_struct('col1', cast(null as int), 'col2', cast(null as string)), + named_struct('a', cast(null as int), 'b', cast(null as string), 'c', cast(null as boolean)), + named_struct('f', cast(-0.0 as float), 'd', cast(-0.0 as double)), + named_struct('b', cast(null as tinyint), 's', cast(null as smallint), 'i', cast(null as int), 'l', cast(null as bigint)), + named_struct('d1', cast(null as decimal(10, 2)), 'd2', cast(null as decimal(38, 18))), + named_struct('dt', cast(null as date), 'ts', cast(null as timestamp)), + named_struct('b', cast(null as binary)), + named_struct('inner', cast(null as struct), 'tag', cast(null as string)), + named_struct('arr', cast(null as array), 'label', cast(null as string)), + cast(null as struct) + ) + +-- Anonymous struct fields are auto-named col1, col2, ... by `struct(...)`. +query +SELECT cast(s_unnamed as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Named struct fields propagate user-supplied names into the formatted output. +query +SELECT cast(s_named as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Floating-point: NaN, ±0, ±Infinity, NULL. +query +SELECT cast(s_floats as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Integer min/max for byte, short, int, long. +query +SELECT cast(s_bounds as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Decimal at the small and the 38-precision limit, positive / negative / zero / NULL. +query +SELECT cast(s_decimal as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Date and timestamp at common values plus the date range edges. +query +SELECT cast(s_temporal as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Binary content including empty bytes and non-printable values. +query +SELECT cast(s_binary as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Nested struct: inner struct rendered inside the outer braces. +query +SELECT cast(s_nested as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Struct containing an array field. +query +SELECT cast(s_with_array as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Whole struct is NULL vs. all inner fields NULL. +query +SELECT cast(s_all_null as string), id FROM test_cast_struct_to_string ORDER BY id + +-- Literal anonymous struct, mixed types with NULL. +query +SELECT cast(struct(1, 'two', cast(null as double)) as string) + +-- Literal named struct, mixed types. +query +SELECT cast(named_struct('k', 'key', 'v', 100, 'flag', true) as string) + +-- Deeply nested literal struct (3 levels). +query +SELECT cast(named_struct('a', named_struct('b', named_struct('c', 1, 'd', 'leaf'))) as string) + +-- Empty-string and whitespace string-field rendering. +query +SELECT cast(named_struct('s1', '', 's2', ' ', 's3', cast(null as string)) as string) + +-- Map-valued field: not supported, falls back to Spark. +query expect_fallback(to StringType is not supported) +SELECT cast(named_struct('m', map('k', 1)) as string) + +-- ---------------------------------------------------------------------------- +-- Array → string +-- ---------------------------------------------------------------------------- + +statement +CREATE TABLE test_cast_array_to_string( + id int, + a_int array, + a_string array, + a_bool array, + a_bounds array, + a_decimal array, + a_date array, + a_ts array, + a_binary array, + a_struct array>, + a_nested array> +) USING parquet + +statement +INSERT INTO test_cast_array_to_string VALUES + ( + 1, + array(1, 2, 3), + array('a', 'b', 'c'), + array(true, false, true), + array(9223372036854775807, -9223372036854775808, 0), + array(cast('1.234567890123456789' as decimal(38, 18)), cast('-1.234567890123456789' as decimal(38, 18))), + array(date '2024-01-15', date '1970-01-01'), + array(timestamp '2024-01-15 10:30:45', timestamp '1970-01-01 00:00:00'), + array(X'616263', X'', X'00FF7F80'), + array(named_struct('x', 1, 'y', 'first'), named_struct('x', 2, 'y', 'second')), + array(array(1, 2), array(3, 4, 5)) + ), + ( + 2, + array(cast(null as int), 1, cast(null as int)), + array(cast(null as string), '', ' '), + array(cast(null as boolean), true), + array(cast(null as bigint), 0), + array(cast(null as decimal(38, 18))), + array(cast(null as date)), + array(cast(null as timestamp)), + array(cast(null as binary), X'00'), + array(named_struct('x', cast(null as int), 'y', cast(null as string)), cast(null as struct)), + array(cast(null as array), array(cast(null as int))) + ), + ( + 3, + cast(array() as array), + cast(array() as array), + cast(array() as array), + cast(array() as array), + cast(array() as array), + cast(array() as array), + cast(array() as array), + cast(array() as array), + cast(array() as array>), + cast(array() as array>) + ), + ( + 4, + cast(null as array), + cast(null as array), + cast(null as array), + cast(null as array), + cast(null as array), + cast(null as array), + cast(null as array), + cast(null as array), + cast(null as array>), + cast(null as array>) + ) + +query +SELECT cast(a_int as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_string as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_bool as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_bounds as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_decimal as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_date as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_ts as string), id FROM test_cast_array_to_string ORDER BY id + +query +SELECT cast(a_binary as string), id FROM test_cast_array_to_string ORDER BY id + +-- Array of structs: each element rendered as `{f1, f2, ...}`. +query +SELECT cast(a_struct as string), id FROM test_cast_array_to_string ORDER BY id + +-- Nested array>: outer `[...]` containing inner `[...]`. +query +SELECT cast(a_nested as string), id FROM test_cast_array_to_string ORDER BY id + +-- Array of floats / doubles with NaN / ±0 / ±Infinity / NULL. +query +SELECT cast(array(cast(1.5 as float), cast('NaN' as float), cast(-0.0 as float), cast(null as float)) as string) + +query +SELECT cast(array(cast(1.5 as double), cast('NaN' as double), cast('-Infinity' as double), cast(null as double)) as string) + +-- Deeply nested literal array (3 levels). +query +SELECT cast(array(array(array(1, 2), array(3)), array(array(cast(null as int)))) as string) + +-- Array of map: not supported, falls back to Spark. +query expect_fallback(to StringType is not supported) +SELECT cast(array(map('k', 1)) as string) + +-- ---------------------------------------------------------------------------- +-- Map → string +-- ---------------------------------------------------------------------------- +-- Comet does not implement map-to-string casts, so every map → string falls back to Spark. +-- Note: maps materialized through parquet have nondeterministic entry order, so map column +-- tests use literal maps directly rather than reading from a parquet table. + +-- Map with string keys, int values. +query expect_fallback(Cast from MapType) +SELECT cast(map('a', 1, 'b', 2, 'c', 3) as string) + +-- Map with NULL values rendered as "null". +query expect_fallback(Cast from MapType) +SELECT cast(map('a', 1, 'b', cast(null as int), 'c', 3) as string) + +-- Map with int keys, string values. +query expect_fallback(Cast from MapType) +SELECT cast(map(1, 'one', 2, 'two', 3, 'three') as string) + +-- Map with boolean values. +query expect_fallback(Cast from MapType) +SELECT cast(map('t', true, 'f', false, 'n', cast(null as boolean)) as string) + +-- Map with bigint values at min/max. +query expect_fallback(Cast from MapType) +SELECT cast(map('max', 9223372036854775807, 'min', -9223372036854775808, 'zero', cast(0 as bigint)) as string) + +-- Map with decimal values. +query expect_fallback(Cast from MapType) +SELECT cast(map('pos', cast('1.234567890123456789' as decimal(38, 18)), 'neg', cast('-1.234567890123456789' as decimal(38, 18)), 'null', cast(null as decimal(38, 18))) as string) + +-- Map with date and timestamp values. +query expect_fallback(Cast from MapType) +SELECT cast(map('a', date '2024-01-15', 'b', date '1970-01-01', 'c', cast(null as date)) as string) + +query expect_fallback(Cast from MapType) +SELECT cast(map('a', timestamp '2024-01-15 10:30:45', 'b', cast(null as timestamp)) as string) + +-- Map with binary values. +query expect_fallback(Cast from MapType) +SELECT cast(map('a', X'616263', 'b', X'', 'c', cast(null as binary)) as string) + +-- Map with float / double values: NaN / ±0 / ±Infinity / NULL. +query expect_fallback(Cast from MapType) +SELECT cast(map('nan', cast('NaN' as float), 'neg0', cast(-0.0 as float), 'null', cast(null as float)) as string) + +query expect_fallback(Cast from MapType) +SELECT cast(map('nan', cast('NaN' as double), 'inf', cast('Infinity' as double), 'ninf', cast('-Infinity' as double), 'null', cast(null as double)) as string) + +-- Map with struct values: each value rendered as `{f1, f2, ...}`. +query expect_fallback(Cast from MapType) +SELECT cast(map('a', named_struct('x', 1, 'y', 'first'), 'b', cast(null as struct)) as string) + +-- Map with array values. +query expect_fallback(Cast from MapType) +SELECT cast(map('a', array(1, 2, 3), 'b', array(cast(null as int)), 'c', cast(null as array)) as string) + +-- Empty map. +query expect_fallback(Cast from MapType) +SELECT cast(map() as string) + +-- NULL map: Spark constant-folds this to a literal NULL, so the cast never reaches Comet +-- and there is no fallback. +query +SELECT cast(cast(null as map) as string) + +-- Map of map. +query expect_fallback(Cast from MapType) +SELECT cast(map('outer', map('inner', 1)) as string) diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_complex_types_to_string_legacy.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_complex_types_to_string_legacy.sql new file mode 100644 index 0000000000..2c0bc19b3b --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_complex_types_to_string_legacy.sql @@ -0,0 +1,40 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- When `spark.sql.legacy.castComplexTypesToString.enabled` is true Spark wraps maps and +-- structs with `[...]` (instead of `{...}`) and omits NULL elements of structs/maps/arrays +-- (instead of rendering them as the literal "null"). Comet only implements the default +-- formatting, so any array/map/struct → string cast must fall back to Spark. +-- The flag is internal in Spark 4.0 and defaults to false. + +-- Config: spark.sql.legacy.castComplexTypesToString.enabled=true + +-- Struct → string falls back. +query expect_fallback(spark.sql.legacy.castComplexTypesToString.enabled=true is not supported) +SELECT CAST(struct(1, 2, null) AS STRING) + +-- Array → string falls back (NULL elements rendered differently between modes). +query expect_fallback(spark.sql.legacy.castComplexTypesToString.enabled=true is not supported) +SELECT CAST(array(1, 2, null) AS STRING) + +-- Map → string falls back (`[]` vs `{}` wrapping differs between modes). +query expect_fallback(spark.sql.legacy.castComplexTypesToString.enabled=true is not supported) +SELECT CAST(map('a', 1, 'b', null) AS STRING) + +-- Nested complex types still fall back through the outer type. +query expect_fallback(spark.sql.legacy.castComplexTypesToString.enabled=true is not supported) +SELECT CAST(struct(array(1, null), map('k', null)) AS STRING)