fix: mark non-UTF8_BINARY collations as Incompatible for concat and reverse (#4567)

andygrove · web-flow · commit ddd08eeffb9a · 2026-06-02T16:33:33.000-06:00
diff --git a/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala b/spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala
@@ -23,15 +23,25 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Reverse}
 import org.apache.spark.sql.types.ArrayType
 
 import org.apache.comet.serde.ExprOuterClass.Expr
+import org.apache.comet.shims.CometTypeShim
 
-object CometReverse extends CometScalarFunction[Reverse]("reverse") {
+object CometReverse extends CometScalarFunction[Reverse]("reverse") with CometTypeShim {
+
+  // Spark 4.0 widens the string branch of Reverse to accept collated strings and propagates the
+  // collation through dataType. The native reverse UDF reverses code units and produces UTF8
+  // (UTF8_BINARY semantics), so a non-default collation diverges from Spark.
+  private val collationReason =
+    "reverse does not support non-UTF8_BINARY collations " +
+      "(https://github.com/apache/datafusion-comet/issues/2190)"
 
   override def getIncompatibleReasons(): Seq[String] =
-    CometArrayReverse.getIncompatibleReasons()
+    CometArrayReverse.getIncompatibleReasons() :+ collationReason
 
   override def getSupportLevel(expr: Reverse): SupportLevel = {
     if (expr.child.dataType.isInstanceOf[ArrayType]) {
       CometArrayReverse.getSupportLevel(expr)
+    } else if (hasNonDefaultStringCollation(expr.child.dataType)) {
+      Incompatible(Some(collationReason))
     } else {
       Compatible()
     }
diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala
@@ -30,6 +30,7 @@ import org.apache.comet.CometSparkSessionExtensions.withFallbackReason
 import org.apache.comet.expressions.{CometCast, CometEvalMode, RegExp}
 import org.apache.comet.serde.ExprOuterClass.Expr
 import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProto, scalarFunctionExprToProtoWithReturnType}
+import org.apache.comet.shims.CometTypeShim
 
 object CometStringRepeat extends CometExpressionSerde[StringRepeat] {
 
@@ -244,16 +245,32 @@ object CometRight extends CometExpressionSerde[Right] {
   }
 }
 
-object CometConcat extends CometScalarFunction[Concat]("concat") {
+object CometConcat extends CometScalarFunction[Concat]("concat") with CometTypeShim {
   private val unsupportedReason = "CONCAT supports only string input parameters"
 
+  // Spark 4.0 widens Concat to accept collated strings and preserves the collation in the merged
+  // result type. The native concat UDF always produces UTF8 (UTF8_BINARY semantics), so a
+  // non-default collation diverges from Spark.
+  private val collationReason =
+    "concat does not support non-UTF8_BINARY collations " +
+      "(https://github.com/apache/datafusion-comet/issues/2190)"
+
   override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedReason)
 
+  override def getIncompatibleReasons(): Seq[String] = Seq(collationReason)
+
   override def getSupportLevel(expr: Concat): SupportLevel = {
-    if (expr.children.forall(_.dataType == DataTypes.StringType)) {
-      Compatible()
-    } else {
+    // Use isInstanceOf rather than `== DataTypes.StringType`: a collated string is a StringType
+    // with a non-default collationId, so it does not equal the default StringType. Matching on
+    // the type keeps collated strings recognised as string input and routes them to the collation
+    // check below instead of reporting them as an unsupported input type.
+    if (!expr.children.forall(_.dataType.isInstanceOf[StringType])) {
       Unsupported(Some(unsupportedReason))
+    } else if (hasNonDefaultStringCollation(expr.dataType) ||
+      expr.children.exists(c => hasNonDefaultStringCollation(c.dataType))) {
+      Incompatible(Some(collationReason))
+    } else {
+      Compatible()
     }
   }
 }
diff --git a/spark/src/test/resources/sql-tests/expressions/string/collation.sql b/spark/src/test/resources/sql-tests/expressions/string/collation.sql
@@ -31,3 +31,20 @@ SELECT collation('hello' COLLATE UTF8_BINARY)
 -- collation of a NULL string
 query
 SELECT collation(CAST(NULL AS STRING))
+
+-- concat preserves a non-default collation in its result type, but Comet's native concat produces
+-- UTF8_BINARY, so it is Incompatible and falls back to Spark by default.
+query expect_fallback(concat does not support non-UTF8_BINARY collations)
+SELECT concat('Hello' COLLATE UTF8_LCASE, 'World' COLLATE UTF8_LCASE)
+
+-- reverse on a collated string is likewise Incompatible and falls back to Spark by default.
+query expect_fallback(reverse does not support non-UTF8_BINARY collations)
+SELECT reverse('Hello' COLLATE UTF8_LCASE)
+
+-- A standard ICU collation (UNICODE_CI) falls back the same way, confirming the gate covers
+-- any non-UTF8_BINARY collation rather than just UTF8_LCASE.
+query expect_fallback(concat does not support non-UTF8_BINARY collations)
+SELECT concat('Hello' COLLATE UNICODE_CI, 'World' COLLATE UNICODE_CI)
+
+query expect_fallback(reverse does not support non-UTF8_BINARY collations)
+SELECT reverse('Hello' COLLATE UNICODE_CI)