@@ -30,6 +30,7 @@ import org.apache.comet.CometSparkSessionExtensions.withFallbackReason
3030import org .apache .comet .expressions .{CometCast , CometEvalMode , RegExp }
3131import org .apache .comet .serde .ExprOuterClass .Expr
3232import org .apache .comet .serde .QueryPlanSerde .{createBinaryExpr , exprToProtoInternal , optExprWithFallbackReason , scalarFunctionExprToProto , scalarFunctionExprToProtoWithReturnType }
33+ import org .apache .comet .shims .CometTypeShim
3334
3435object CometStringRepeat extends CometExpressionSerde [StringRepeat ] {
3536
@@ -244,16 +245,32 @@ object CometRight extends CometExpressionSerde[Right] {
244245 }
245246}
246247
/**
 * Serde for Spark's `Concat` expression, mapped onto the native `concat` scalar function.
 *
 * Support is decided in two steps: every child must be a string type, and neither the children
 * nor the result may carry a non-default string collation.
 */
object CometConcat extends CometScalarFunction[Concat]("concat") with CometTypeShim {
  private val unsupportedReason = "CONCAT supports only string input parameters"

  // Spark 4.0 widens Concat to accept collated strings and preserves the collation in the merged
  // result type. The native concat UDF always produces UTF8 (UTF8_BINARY semantics), so a
  // non-default collation diverges from Spark.
  private val collationReason =
    "concat does not support non-UTF8_BINARY collations " +
      "(https://github.com/apache/datafusion-comet/issues/2190)"

  override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedReason)

  override def getIncompatibleReasons(): Seq[String] = Seq(collationReason)

  /**
   * Classifies a `Concat` call.
   *
   * @param expr
   *   the Concat expression being planned
   * @return
   *   `Unsupported` when any child is not a string type, `Incompatible` when the result type or
   *   any child type has a non-default string collation, `Compatible` otherwise
   */
  override def getSupportLevel(expr: Concat): SupportLevel = {
    // Use isInstanceOf rather than `== DataTypes.StringType` so that collated strings (a
    // StringType with a non-default collationId, which is not == the default StringType) are
    // still recognised as string input and routed to the collation check below rather than
    // reported as an unsupported input type.
    if (!expr.children.forall(_.dataType.isInstanceOf[StringType])) {
      Unsupported(Some(unsupportedReason))
    } else if (hasNonDefaultStringCollation(expr.dataType) ||
      expr.children.exists(c => hasNonDefaultStringCollation(c.dataType))) {
      Incompatible(Some(collationReason))
    } else {
      Compatible()
    }
  }
}
0 commit comments