Skip to content

Commit ddd08ee

Browse files
authored
fix: mark non-UTF8_BINARY collations as Incompatible for concat and reverse (#4567)
1 parent 779e427 commit ddd08ee

3 files changed

Lines changed: 50 additions & 6 deletions

File tree

spark/src/main/scala/org/apache/comet/serde/collectionOperations.scala

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,25 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, Reverse}
2323
import org.apache.spark.sql.types.ArrayType
2424

2525
import org.apache.comet.serde.ExprOuterClass.Expr
26+
import org.apache.comet.shims.CometTypeShim
2627

27-
object CometReverse extends CometScalarFunction[Reverse]("reverse") {
28+
object CometReverse extends CometScalarFunction[Reverse]("reverse") with CometTypeShim {
29+
30+
// Spark 4.0 widens the string branch of Reverse to accept collated strings and propagates the
31+
// collation through dataType. The native reverse UDF reverses code units and produces UTF8
32+
// (UTF8_BINARY semantics), so a non-default collation diverges from Spark.
33+
private val collationReason =
34+
"reverse does not support non-UTF8_BINARY collations " +
35+
"(https://github.com/apache/datafusion-comet/issues/2190)"
2836

2937
override def getIncompatibleReasons(): Seq[String] =
30-
CometArrayReverse.getIncompatibleReasons()
38+
CometArrayReverse.getIncompatibleReasons() :+ collationReason
3139

3240
override def getSupportLevel(expr: Reverse): SupportLevel = {
3341
if (expr.child.dataType.isInstanceOf[ArrayType]) {
3442
CometArrayReverse.getSupportLevel(expr)
43+
} else if (hasNonDefaultStringCollation(expr.child.dataType)) {
44+
Incompatible(Some(collationReason))
3545
} else {
3646
Compatible()
3747
}

spark/src/main/scala/org/apache/comet/serde/strings.scala

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import org.apache.comet.CometSparkSessionExtensions.withFallbackReason
3030
import org.apache.comet.expressions.{CometCast, CometEvalMode, RegExp}
3131
import org.apache.comet.serde.ExprOuterClass.Expr
3232
import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProto, scalarFunctionExprToProtoWithReturnType}
33+
import org.apache.comet.shims.CometTypeShim
3334

3435
object CometStringRepeat extends CometExpressionSerde[StringRepeat] {
3536

@@ -244,16 +245,32 @@ object CometRight extends CometExpressionSerde[Right] {
244245
}
245246
}
246247

247-
object CometConcat extends CometScalarFunction[Concat]("concat") {
248+
object CometConcat extends CometScalarFunction[Concat]("concat") with CometTypeShim {
248249
private val unsupportedReason = "CONCAT supports only string input parameters"
249250

251+
// Spark 4.0 widens Concat to accept collated strings and preserves the collation in the merged
252+
// result type. The native concat UDF always produces UTF8 (UTF8_BINARY semantics), so a
253+
// non-default collation diverges from Spark.
254+
private val collationReason =
255+
"concat does not support non-UTF8_BINARY collations " +
256+
"(https://github.com/apache/datafusion-comet/issues/2190)"
257+
250258
override def getUnsupportedReasons(): Seq[String] = Seq(unsupportedReason)
251259

260+
override def getIncompatibleReasons(): Seq[String] = Seq(collationReason)
261+
252262
override def getSupportLevel(expr: Concat): SupportLevel = {
253-
if (expr.children.forall(_.dataType == DataTypes.StringType)) {
254-
Compatible()
255-
} else {
263+
// Use isInstanceOf rather than `== DataTypes.StringType` so that collated strings (a
264+
// StringType with a non-default collationId, which is not == the default StringType) are still
265+
// recognised as string input and routed to the collation check below rather than reported as
266+
// an unsupported input type.
267+
if (!expr.children.forall(_.dataType.isInstanceOf[StringType])) {
256268
Unsupported(Some(unsupportedReason))
269+
} else if (hasNonDefaultStringCollation(expr.dataType) ||
270+
expr.children.exists(c => hasNonDefaultStringCollation(c.dataType))) {
271+
Incompatible(Some(collationReason))
272+
} else {
273+
Compatible()
257274
}
258275
}
259276
}

spark/src/test/resources/sql-tests/expressions/string/collation.sql

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,20 @@ SELECT collation('hello' COLLATE UTF8_BINARY)
3131
-- collation of a NULL string
3232
query
3333
SELECT collation(CAST(NULL AS STRING))
34+
35+
-- concat preserves a non-default collation in its result type, but Comet's native concat produces
36+
-- UTF8_BINARY, so it is Incompatible and falls back to Spark by default.
37+
query expect_fallback(concat does not support non-UTF8_BINARY collations)
38+
SELECT concat('Hello' COLLATE UTF8_LCASE, 'World' COLLATE UTF8_LCASE)
39+
40+
-- reverse on a collated string is likewise Incompatible and falls back to Spark by default.
41+
query expect_fallback(reverse does not support non-UTF8_BINARY collations)
42+
SELECT reverse('Hello' COLLATE UTF8_LCASE)
43+
44+
-- A standard ICU collation (UNICODE_CI) falls back the same way, confirming the gate covers
45+
-- any non-UTF8_BINARY collation rather than just UTF8_LCASE.
46+
query expect_fallback(concat does not support non-UTF8_BINARY collations)
47+
SELECT concat('Hello' COLLATE UNICODE_CI, 'World' COLLATE UNICODE_CI)
48+
49+
query expect_fallback(reverse does not support non-UTF8_BINARY collations)
50+
SELECT reverse('Hello' COLLATE UNICODE_CI)

0 commit comments

Comments
 (0)