Skip to content

Commit 03c1478

Browse files
authored
fix: gate bit_length/octet_length on BinaryType and downgrade translate (apache#4594)
1 parent 9a76677 commit 03c1478

8 files changed

Lines changed: 124 additions & 15 deletions

File tree

docs/source/user-guide/latest/expressions.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the
589589
| `to_char` || |
590590
| `to_number` || |
591591
| `to_varchar` || |
592-
| `translate` || |
592+
| `translate` || Falls back by default; opt-in via allowIncompatible ([#4463](https://github.com/apache/datafusion-comet/issues/4463)) |
593593
| `trim` || |
594594
| `try_to_binary` | 🔜 | Lowers to `TryEval(...)`, which falls back |
595595
| `try_to_number` | 🔜 | TRY variant of `to_number` |

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
208208
// when `++` is applied directly to a `Map(...)` literal.
209209
val base: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map(
210210
classOf[Ascii] -> CometScalarFunction("ascii"),
211-
classOf[BitLength] -> CometScalarFunction("bit_length"),
211+
classOf[BitLength] -> CometBitLength,
212212
classOf[Chr] -> CometScalarFunction("char"),
213213
classOf[ConcatWs] -> CometConcatWs,
214214
classOf[Concat] -> CometConcat,
@@ -220,7 +220,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
220220
classOf[Levenshtein] -> CometLevenshtein,
221221
classOf[Like] -> CometLike,
222222
classOf[Lower] -> CometLower,
223-
classOf[OctetLength] -> CometScalarFunction("octet_length"),
223+
classOf[OctetLength] -> CometOctetLength,
224224
classOf[RegExpExtract] -> CometRegExpExtract,
225225
classOf[RegExpExtractAll] -> CometRegExpExtractAll,
226226
classOf[RegExpInStr] -> CometRegExpInStr,
@@ -235,7 +235,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
235235
classOf[StringLPad] -> CometStringLPad,
236236
classOf[StringSpace] -> CometScalarFunction("space"),
237237
classOf[StringSplit] -> CometStringSplit,
238-
classOf[StringTranslate] -> CometScalarFunction("translate"),
238+
classOf[StringTranslate] -> CometStringTranslate,
239239
classOf[StringTrim] -> CometScalarFunction("trim"),
240240
classOf[StringTrimLeft] -> CometScalarFunction("ltrim"),
241241
classOf[StringTrimRight] -> CometScalarFunction("rtrim"),

spark/src/main/scala/org/apache/comet/serde/strings.scala

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.comet.serde
2121

2222
import java.util.Locale
2323

24-
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Elt, Expression, FindInSet, FormatNumber, FormatString, GetJsonObject, If, InitCap, IsNull, Left, Length, Levenshtein, Like, Literal, Lower, Overlay, RegExpExtract, RegExpExtractAll, RegExpInStr, RegExpReplace, Right, RLike, SoundEx, StringLocate, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, Substring, SubstringIndex, ToCharacter, ToNumber, UnBase64, Upper}
24+
import org.apache.spark.sql.catalyst.expressions.{Attribute, BitLength, Cast, Concat, ConcatWs, Elt, Expression, FindInSet, FormatNumber, FormatString, GetJsonObject, If, InitCap, IsNull, Left, Length, Levenshtein, Like, Literal, Lower, OctetLength, Overlay, RegExpExtract, RegExpExtractAll, RegExpInStr, RegExpReplace, Right, RLike, SoundEx, StringLocate, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, StringTranslate, Substring, SubstringIndex, ToCharacter, ToNumber, UnBase64, Upper}
2525
import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType}
2626
import org.apache.spark.unsafe.types.UTF8String
2727

@@ -84,6 +84,35 @@ object CometLength extends CometScalarFunction[Length]("length") {
8484
}
8585
}
8686

87+
object CometBitLength extends CometScalarFunction[BitLength]("bit_length") {
88+
override def getUnsupportedReasons(): Seq[String] = Seq("`BinaryType` input is not supported")
89+
90+
override def getSupportLevel(expr: BitLength): SupportLevel = expr.child.dataType match {
91+
case _: BinaryType => Unsupported(Some("bit_length on BinaryType is not supported"))
92+
case _ => Compatible()
93+
}
94+
}
95+
96+
object CometOctetLength extends CometScalarFunction[OctetLength]("octet_length") {
97+
override def getUnsupportedReasons(): Seq[String] = Seq("`BinaryType` input is not supported")
98+
99+
override def getSupportLevel(expr: OctetLength): SupportLevel = expr.child.dataType match {
100+
case _: BinaryType => Unsupported(Some("octet_length on BinaryType is not supported"))
101+
case _ => Compatible()
102+
}
103+
}
104+
105+
object CometStringTranslate extends CometScalarFunction[StringTranslate]("translate") {
106+
private val incompatReason =
107+
"DataFusion's translate iterates over Unicode graphemes (Spark uses code points) and" +
108+
" substitutes U+0000 instead of treating it as a deletion sentinel"
109+
110+
override def getIncompatibleReasons(): Seq[String] = Seq(incompatReason)
111+
112+
override def getSupportLevel(expr: StringTranslate): SupportLevel = Incompatible(
113+
Some(incompatReason))
114+
}
115+
87116
object CometInitCap extends CometScalarFunction[InitCap]("initcap") {
88117

89118
override def getSupportLevel(expr: InitCap): SupportLevel = Compatible()

spark/src/test/resources/sql-tests/expressions/string/bit_length.sql

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,17 @@ SELECT bit_length(s) FROM test_bit_length
2727
-- literal arguments
2828
query
2929
SELECT bit_length('hello'), bit_length(''), bit_length(NULL)
30+
31+
-- BinaryType input falls back to Spark; the native DataFusion impl rejects Binary at runtime,
32+
-- so the serde gates Binary as Unsupported (matching the existing CometLength shape).
33+
statement
34+
CREATE TABLE test_bit_length_binary(b binary) USING parquet
35+
36+
statement
37+
INSERT INTO test_bit_length_binary VALUES (X'48656c6c6f'), (X''), (NULL), (X'FF')
38+
39+
query expect_fallback(bit_length on BinaryType is not supported)
40+
SELECT bit_length(b) FROM test_bit_length_binary
41+
42+
query expect_fallback(bit_length on BinaryType is not supported)
43+
SELECT bit_length(X'48656c6c6f'), bit_length(CAST(NULL AS BINARY))

spark/src/test/resources/sql-tests/expressions/string/octet_length.sql

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,17 @@ SELECT octet_length(s) FROM test_octet_length
2727
-- literal arguments
2828
query
2929
SELECT octet_length('hello'), octet_length(''), octet_length(NULL)
30+
31+
-- BinaryType input falls back to Spark; the native DataFusion impl rejects Binary at runtime,
32+
-- so the serde gates Binary as Unsupported (matching the existing CometLength shape).
33+
statement
34+
CREATE TABLE test_octet_length_binary(b binary) USING parquet
35+
36+
statement
37+
INSERT INTO test_octet_length_binary VALUES (X'48656c6c6f'), (X''), (NULL), (X'FF')
38+
39+
query expect_fallback(octet_length on BinaryType is not supported)
40+
SELECT octet_length(b) FROM test_octet_length_binary
41+
42+
query expect_fallback(octet_length on BinaryType is not supported)
43+
SELECT octet_length(X'48656c6c6f'), octet_length(CAST(NULL AS BINARY))

spark/src/test/resources/sql-tests/expressions/string/string_translate.sql

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,29 @@
1515
-- specific language governing permissions and limitations
1616
-- under the License.
1717

18+
-- translate is gated as Incompatible by default. DataFusion's translate iterates over Unicode
19+
-- graphemes (Spark uses code points) and substitutes U+0000 instead of treating it as a deletion
20+
-- sentinel, so the native path silently diverges from Spark for combining-mark inputs and for
21+
-- to=NUL. These default-config tests assert that the expression falls back cleanly to Spark.
22+
-- See string_translate_enabled.sql for the opt-in native path.
23+
1824
statement
1925
CREATE TABLE test_translate(s string, from_str string, to_str string) USING parquet
2026

2127
statement
2228
INSERT INTO test_translate VALUES ('hello', 'el', 'ip'), ('hello', 'aeiou', '12345'), ('', 'a', 'b'), (NULL, 'a', 'b'), ('hello', '', ''), ('abc', 'abc', 'x')
2329

24-
query
30+
query expect_fallback(is not fully compatible with Spark)
2531
SELECT translate(s, from_str, to_str) FROM test_translate
2632

2733
-- column + literal + literal
28-
query
34+
query expect_fallback(is not fully compatible with Spark)
2935
SELECT translate(s, 'el', 'ip') FROM test_translate
3036

3137
-- literal + column + column
32-
query
38+
query expect_fallback(is not fully compatible with Spark)
3339
SELECT translate('hello', from_str, to_str) FROM test_translate
3440

3541
-- literal + literal + literal
36-
query
42+
query expect_fallback(is not fully compatible with Spark)
3743
SELECT translate('hello', 'el', 'ip'), translate('hello', 'aeiou', '12345'), translate('', 'a', 'b'), translate(NULL, 'a', 'b')
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- Tests for the native translate path, which the user must opt into via
19+
-- spark.comet.expression.StringTranslate.allowIncompatible=true (gated as Incompatible because
20+
-- DataFusion's translate diverges from Spark on combining-mark inputs and on to=NUL deletion).
21+
-- These ASCII-only tests run under inputs where the two implementations agree.
22+
-- Config: spark.comet.expression.StringTranslate.allowIncompatible=true
23+
24+
statement
25+
CREATE TABLE test_translate_enabled(s string, from_str string, to_str string) USING parquet
26+
27+
statement
28+
INSERT INTO test_translate_enabled VALUES ('hello', 'el', 'ip'), ('hello', 'aeiou', '12345'), ('', 'a', 'b'), (NULL, 'a', 'b'), ('hello', '', ''), ('abc', 'abc', 'x')
29+
30+
query
31+
SELECT translate(s, from_str, to_str) FROM test_translate_enabled
32+
33+
-- column + literal + literal
34+
query
35+
SELECT translate(s, 'el', 'ip') FROM test_translate_enabled
36+
37+
-- literal + column + column
38+
query
39+
SELECT translate('hello', from_str, to_str) FROM test_translate_enabled
40+
41+
-- literal + literal + literal
42+
query
43+
SELECT translate('hello', 'el', 'ip'), translate('hello', 'aeiou', '12345'), translate('', 'a', 'b'), translate(NULL, 'a', 'b')

spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -373,12 +373,15 @@ class CometStringExpressionSuite extends CometTestBase {
373373

374374
test("length, reverse, instr, replace, translate") {
375375
val table = "test"
376-
withTable(table) {
377-
sql(s"create table $table(col string) using parquet")
378-
sql(
379-
s"insert into $table values('Spark SQL '), (NULL), (''), ('苹果手机'), ('Spark SQL '), (NULL), (''), ('苹果手机')")
380-
checkSparkAnswerAndOperator("select length(col), reverse(col), instr(col, 'SQL'), instr(col, '手机'), replace(col, 'SQL', '123')," +
381-
s" replace(col, 'SQL'), replace(col, '手机', '平板'), translate(col, 'SL苹', '123') from $table")
376+
withSQLConf("spark.comet.expression.StringTranslate.allowIncompatible" -> "true") {
377+
withTable(table) {
378+
sql(s"create table $table(col string) using parquet")
379+
sql(
380+
s"insert into $table values('Spark SQL '), (NULL), (''), ('苹果手机'), ('Spark SQL '), (NULL), (''), ('苹果手机')")
381+
checkSparkAnswerAndOperator(
382+
"select length(col), reverse(col), instr(col, 'SQL'), instr(col, '手机'), replace(col, 'SQL', '123')," +
383+
s" replace(col, 'SQL'), replace(col, '手机', '平板'), translate(col, 'SL苹', '123') from $table")
384+
}
382385
}
383386
}
384387

0 commit comments

Comments
 (0)