Skip to content

Commit 779e427

Browse files
authored
fix: route StringReplace through codegen dispatcher to fix empty-search divergence (#4537)
1 parent 13f5d7b commit 779e427

3 files changed

Lines changed: 39 additions & 4 deletions

File tree

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
192192
classOf[StartsWith] -> CometScalarFunction("starts_with"),
193193
classOf[StringInstr] -> CometScalarFunction("instr"),
194194
classOf[StringRepeat] -> CometStringRepeat,
195-
classOf[StringReplace] -> CometScalarFunction("replace"),
195+
classOf[StringReplace] -> CometStringReplace,
196196
classOf[StringRPad] -> CometStringRPad,
197197
classOf[StringLPad] -> CometStringLPad,
198198
classOf[StringSpace] -> CometScalarFunction("space"),

spark/src/main/scala/org/apache/comet/serde/strings.scala

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.comet.serde
2121

2222
import java.util.Locale
2323

24-
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, StringRPad, StringSplit, Substring, SubstringIndex, Upper}
24+
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, Substring, SubstringIndex, Upper}
2525
import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType}
2626
import org.apache.spark.unsafe.types.UTF8String
2727

@@ -102,6 +102,27 @@ object CometInitCap extends CometScalarFunction[InitCap]("initcap") {
102102
}
103103
}
104104

105+
object CometStringReplace extends CometScalarFunction[StringReplace]("replace") {
106+
107+
override def getSupportLevel(expr: StringReplace): SupportLevel = Compatible()
108+
109+
override def convert(
110+
expr: StringReplace,
111+
inputs: Seq[Attribute],
112+
binding: Boolean): Option[Expr] = {
113+
if (CometConf.isExprAllowIncompat(getExprConfigName(expr))) {
114+
// The native DataFusion `replace` avoids the JVM allocations of the codegen
115+
// dispatcher but is not Spark-compatible for an empty search string, so it is
116+
// only used when incompatibility is explicitly allowed.
117+
super.convert(expr, inputs, binding)
118+
} else {
119+
// Run Spark's own generated code inside the Comet pipeline so the result matches Spark
120+
// exactly. Falls back to Spark when the codegen dispatcher is disabled.
121+
CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding)
122+
}
123+
}
124+
}
125+
105126
object CometSubstring extends CometExpressionSerde[Substring] {
106127

107128
override def convert(

spark/src/test/resources/sql-tests/expressions/string/string_replace.sql

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,28 @@ statement
1919
CREATE TABLE test_str_replace(s string, search string, replace string) USING parquet
2020

2121
statement
22-
INSERT INTO test_str_replace VALUES ('hello world', 'world', 'there'), ('aaa', 'a', 'bb'), ('hello', 'xyz', 'abc'), ('', 'a', 'b'), (NULL, 'a', 'b')
22+
INSERT INTO test_str_replace VALUES ('hello world', 'world', 'there'), ('aaa', 'a', 'bb'), ('hello', 'xyz', 'abc'), ('', 'a', 'b'), (NULL, 'a', 'b'), ('hello', '', 'x')
2323

2424
query
2525
SELECT replace(s, search, replace) FROM test_str_replace
2626

27-
query ignore(https://github.com/apache/datafusion-comet/issues/3344)
27+
-- Empty literal search: DataFusion's replace diverges from Spark
28+
-- (Spark short-circuits and returns the source unchanged). The custom
29+
-- CometStringReplace serde routes through the codegen dispatcher so
30+
-- Spark's own doGenCode handles this case.
31+
-- https://github.com/apache/datafusion-comet/issues/4497
32+
query
2833
SELECT replace('hello', '', 'x')
2934

35+
query
36+
SELECT replace('', '', 'x')
37+
38+
query
39+
SELECT replace(NULL, '', 'x')
40+
41+
query
42+
SELECT replace('hello', '', NULL)
43+
3044
-- column + literal + literal
3145
query
3246
SELECT replace(s, 'world', 'there') FROM test_str_replace

0 commit comments

Comments
 (0)