Skip to content

Commit f10613b

Browse files
authored
feat: Support Spark expression str_to_map (#3654)
1 parent 61bcf31 commit f10613b

5 files changed

Lines changed: 120 additions & 1 deletion

File tree

docs/source/user-guide/latest/expressions.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ Comet supports using the following aggregate functions within window contexts wi
264264
| MapEntries |
265265
| MapValues |
266266
| MapFromArrays |
267+
| StringToMap |
267268

268269
## Struct Expressions
269270

native/core/src/execution/jni_api.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ use datafusion_spark::function::hash::crc32::SparkCrc32;
5454
use datafusion_spark::function::hash::sha1::SparkSha1;
5555
use datafusion_spark::function::hash::sha2::SparkSha2;
5656
use datafusion_spark::function::map::map_from_entries::MapFromEntries;
57+
use datafusion_spark::function::map::str_to_map::SparkStrToMap;
5758
use datafusion_spark::function::math::expm1::SparkExpm1;
5859
use datafusion_spark::function::math::hex::SparkHex;
5960
use datafusion_spark::function::math::width_bucket::SparkWidthBucket;
@@ -565,6 +566,7 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) {
565566
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitCount::default()));
566567
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkArrayContains::default()));
567568
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBin::default()));
569+
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkStrToMap::default()));
568570
}
569571

570572
/// Prepares arrow arrays for output.

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
135135
classOf[MapValues] -> CometMapValues,
136136
classOf[MapFromArrays] -> CometMapFromArrays,
137137
classOf[MapContainsKey] -> CometMapContainsKey,
138-
classOf[MapFromEntries] -> CometMapFromEntries)
138+
classOf[MapFromEntries] -> CometMapFromEntries,
139+
classOf[StringToMap] -> CometStrToMap)
139140

140141
private[comet] val structExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] =
141142
Map(

spark/src/main/scala/org/apache/comet/serde/maps.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,10 @@ object CometMapFromEntries extends CometScalarFunction[MapFromEntries]("map_from
159159
Compatible(None)
160160
}
161161
}
162+
163+
object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") {
164+
165+
override def getSupportLevel(expr: StringToMap): SupportLevel = {
166+
Compatible(None)
167+
}
168+
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- Tests for Spark-compatible str_to_map function
19+
-- https://spark.apache.org/docs/latest/api/sql/index.html#str_to_map
20+
--
21+
-- Test cases derived from Spark test("StringToMap"):
22+
-- https://github.com/apache/spark/blob/v4.0.0/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala#L525-L618
23+
24+
-- ConfigMatrix: parquet.enable.dictionary=false,true
25+
26+
statement
27+
CREATE TABLE test_str_to_map(
28+
s STRING,
29+
pair_delim STRING,
30+
key_value_delim STRING,
31+
s_char CHAR(16),
32+
pair_delim_char CHAR(4),
33+
key_value_delim_char CHAR(4),
34+
s_varchar VARCHAR(16),
35+
pair_delim_varchar VARCHAR(4),
36+
key_value_delim_varchar VARCHAR(4)
37+
) USING parquet
38+
39+
statement
40+
INSERT INTO test_str_to_map VALUES
41+
('a:1,b:2,c:3', ',', ':', 'a:1,b:2,c:3', ',', ':', 'a:1,b:2,c:3', ',', ':'),
42+
('x=9;y=8', ';', '=', 'x=9;y=8', ';', '=', 'x=9;y=8', ';', '='),
43+
(NULL, ',', ':', NULL, ',', ':', NULL, ',', ':')
44+
45+
-- s0: Basic test with default delimiters
46+
query
47+
SELECT str_to_map('a:1,b:2,c:3')
48+
49+
-- s1: Preserve spaces in values
50+
query
51+
SELECT str_to_map('a: ,b:2')
52+
53+
-- s2: Custom key-value delimiter '='
54+
query
55+
SELECT str_to_map('a=1,b=2,c=3', ',', '=')
56+
57+
-- s3: Empty string returns map with empty key and NULL value
58+
query
59+
SELECT str_to_map('', ',', '=')
60+
61+
-- s4: Custom pair delimiter '_'
62+
query
63+
SELECT str_to_map('a:1_b:2_c:3', '_', ':')
64+
65+
-- s5: Single key without value returns NULL value
66+
query
67+
SELECT str_to_map('a')
68+
69+
-- s6: Custom delimiters '&' and '='
70+
query
71+
SELECT str_to_map('a=1&b=2&c=3', '&', '=')
72+
73+
-- Duplicate keys: EXCEPTION policy (Spark 3.0+ default)
74+
-- TODO: Add LAST_WIN policy tests when spark.sql.mapKeyDedupPolicy config is supported
75+
-- query
76+
-- SELECT str_to_map('a:1,b:2,a:3')
77+
78+
-- NULL input returns NULL
79+
query
80+
SELECT str_to_map(NULL, ',', ':')
81+
82+
-- Explicit 3-arg form
83+
query
84+
SELECT str_to_map('a:1,b:2,c:3', ',', ':')
85+
86+
-- Missing key-value delimiter results in NULL value
87+
query
88+
SELECT str_to_map('a,b:2', ',', ':')
89+
90+
-- Multi-row test
91+
query
92+
SELECT str_to_map(s) FROM test_str_to_map
93+
94+
-- Rows with per-row delimiters
95+
query
96+
SELECT str_to_map(s, pair_delim, key_value_delim) FROM test_str_to_map
97+
98+
-- STRING input with literal delimiters
99+
query
100+
SELECT str_to_map(s, ',', ':') FROM test_str_to_map
101+
102+
-- CHAR input and delimiters with per-row delimiter values
103+
query
104+
SELECT str_to_map(s_char, pair_delim_char, key_value_delim_char) FROM test_str_to_map
105+
106+
-- VARCHAR input and delimiters with per-row delimiter values
107+
query
108+
SELECT str_to_map(s_varchar, pair_delim_varchar, key_value_delim_varchar) FROM test_str_to_map

0 commit comments

Comments
 (0)