Skip to content

Commit af134cf

Browse files
authored
fix: fall back to Spark for str_to_map when legacy regex split truncation is enabled (apache#4627)
1 parent 7ce412e commit af134cf

2 files changed

Lines changed: 58 additions & 1 deletion

File tree

spark/src/main/scala/org/apache/comet/serde/maps.scala

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.apache.comet.serde
2121

2222
import org.apache.spark.sql.catalyst.expressions._
23+
import org.apache.spark.sql.internal.SQLConf
2324
import org.apache.spark.sql.types._
2425

2526
import org.apache.comet.serde.QueryPlanSerde.{createBinaryExpr, exprToProtoInternal, optExprWithFallbackReason, scalarFunctionExprToProto}
@@ -164,7 +165,22 @@ object CometMapFromEntries
164165
}
165166
}
166167

167-
object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map")
168+
object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") {
169+
170+
// Spark 4.1.1+ honours spark.sql.legacy.truncateForEmptyRegexSplit by truncating trailing
171+
// empty entries from the split result. Comet's native str_to_map always behaves as if the flag
172+
// were false, so it is incompatible when legacy truncation is enabled. Read by string key so it
173+
// resolves on older Spark versions where the config is not registered.
174+
private val legacyTruncateConfig = "spark.sql.legacy.truncateForEmptyRegexSplit"
175+
176+
override def getSupportLevel(expr: StringToMap): SupportLevel = {
177+
if (SQLConf.get.getConfString(legacyTruncateConfig, "false").toBoolean) {
178+
Incompatible(Some(s"$legacyTruncateConfig is enabled"))
179+
} else {
180+
Compatible(None)
181+
}
182+
}
183+
}
168184

169185
object CometMapFilter extends CometCodegenDispatch[MapFilter]
170186

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- Tests that str_to_map falls back to Spark when
19+
-- spark.sql.legacy.truncateForEmptyRegexSplit is enabled. In legacy mode Spark truncates trailing
20+
-- empty entries from the split result, which Comet's native str_to_map does not honour.
21+
-- See https://github.com/apache/datafusion-comet/issues/4477
22+
23+
-- Config: spark.sql.legacy.truncateForEmptyRegexSplit=true
24+
25+
-- trailing pair delimiter: legacy mode truncates the trailing empty entry, so Comet must fall
26+
-- back to Spark
27+
query expect_fallback(truncateForEmptyRegexSplit)
28+
SELECT str_to_map('a:1,b:2,', ',', ':')
29+
30+
-- column input also falls back
31+
statement
32+
CREATE TABLE test_str_to_map_legacy(s STRING, pair_delim STRING, key_value_delim STRING) USING parquet
33+
34+
statement
35+
INSERT INTO test_str_to_map_legacy VALUES
36+
('a:1,b:2,', ',', ':'),
37+
('x:1;y:2;', ';', ':'),
38+
(NULL, ',', ':')
39+
40+
query expect_fallback(truncateForEmptyRegexSplit)
41+
SELECT str_to_map(s, pair_delim, key_value_delim) FROM test_str_to_map_legacy

0 commit comments

Comments
 (0)