Skip to content

Commit b1c586a

Browse files
authored
feat: add from_utc_timestamp and to_utc_timestamp expressions (apache#4308)
1 parent 3209b98 commit b1c586a

6 files changed

Lines changed: 206 additions & 3 deletions

File tree

docs/source/contributor-guide/spark_expressions_support.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,11 @@
238238
- [x] dayofyear
239239
- [x] extract
240240
- [x] from_unixtime
241-
- [ ] from_utc_timestamp
241+
- [x] from_utc_timestamp
242+
- Spark 3.4.3 (audited 2026-05-12): identical to 3.5.8.
243+
- Spark 3.5.8 (audited 2026-05-12): baseline.
244+
- Spark 4.0.1 (audited 2026-05-12): `inputTypes` widened to `StringTypeWithCollation`; behaviour unchanged for ASCII timezone strings.
245+
- Known divergence: Comet's native timezone parser does not accept Spark's legacy zone forms (`GMT+1`, `UTC+1`, three-letter abbreviations like `PST`). Such timezones throw a native parse error at execution.
242246
- [x] hour
243247
- [x] last_day
244248
- [x] localtimestamp
@@ -270,7 +274,11 @@
270274
- [ ] to_timestamp_ltz
271275
- [ ] to_timestamp_ntz
272276
- [ ] to_unix_timestamp
273-
- [ ] to_utc_timestamp
277+
- [x] to_utc_timestamp
278+
- Spark 3.4.3 (audited 2026-05-12): identical to 3.5.8.
279+
- Spark 3.5.8 (audited 2026-05-12): baseline.
280+
- Spark 4.0.1 (audited 2026-05-12): `inputTypes` widened to `StringTypeWithCollation`; behaviour unchanged for ASCII timezone strings.
281+
- Known divergence: Comet's native timezone parser does not accept Spark's legacy zone forms (`GMT+1`, `UTC+1`, three-letter abbreviations like `PST`). Such timezones throw a native parse error at execution.
274282
- [x] trunc
275283
- [ ] try_make_interval
276284
- [ ] try_make_timestamp

native/core/src/execution/jni_api.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,10 @@ use datafusion_spark::function::bitwise::bit_get::SparkBitGet;
4848
use datafusion_spark::function::bitwise::bitwise_not::SparkBitwiseNot;
4949
use datafusion_spark::function::datetime::date_add::SparkDateAdd;
5050
use datafusion_spark::function::datetime::date_sub::SparkDateSub;
51+
use datafusion_spark::function::datetime::from_utc_timestamp::SparkFromUtcTimestamp;
5152
use datafusion_spark::function::datetime::last_day::SparkLastDay;
5253
use datafusion_spark::function::datetime::next_day::SparkNextDay;
54+
use datafusion_spark::function::datetime::to_utc_timestamp::SparkToUtcTimestamp;
5355
use datafusion_spark::function::hash::crc32::SparkCrc32;
5456
use datafusion_spark::function::hash::sha1::SparkSha1;
5557
use datafusion_spark::function::hash::sha2::SparkSha2;
@@ -574,8 +576,10 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) {
574576
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitGet::default()));
575577
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateAdd::default()));
576578
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkDateSub::default()));
579+
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkFromUtcTimestamp::default()));
577580
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkLastDay::default()));
578581
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkNextDay::default()));
582+
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkToUtcTimestamp::default()));
579583
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSha1::default()));
580584
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkConcat::default()));
581585
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitwiseNot::default()));

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
225225
classOf[DateSub] -> CometDateSub,
226226
classOf[UnixDate] -> CometUnixDate,
227227
classOf[FromUnixTime] -> CometFromUnixTime,
228+
classOf[FromUTCTimestamp] -> CometFromUTCTimestamp,
229+
classOf[ToUTCTimestamp] -> CometToUTCTimestamp,
228230
classOf[LastDay] -> CometLastDay,
229231
classOf[Hour] -> CometHour,
230232
classOf[MakeDate] -> CometMakeDate,

spark/src/main/scala/org/apache/comet/serde/datetime.scala

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.comet.serde
2121

2222
import java.util.Locale
2323

24-
import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year}
24+
import org.apache.spark.sql.catalyst.expressions.{Attribute, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, Hour, Hours, LastDay, Literal, MakeDate, Minute, Month, NextDay, Quarter, Second, SecondsToTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixTimestamp, WeekDay, WeekOfYear, Year}
2525
import org.apache.spark.sql.internal.SQLConf
2626
import org.apache.spark.sql.types.{DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType}
2727
import org.apache.spark.unsafe.types.UTF8String
@@ -364,6 +364,51 @@ object CometDateAdd extends CometScalarFunction[DateAdd]("date_add")
364364

365365
object CometDateSub extends CometScalarFunction[DateSub]("date_sub")
366366

367+
private object UTCTimestampSerde {
368+
val tzParseIncompatReason: String =
369+
"Comet's native timezone parser only accepts IANA zone IDs (e.g." +
370+
" `America/Los_Angeles`) and fixed offsets in `+HH:MM` form. Spark also" +
371+
" accepts forms such as `GMT+1`, `UTC+1`, or three-letter abbreviations like" +
372+
" `PST`; queries using those forms will throw a native parse error at" +
373+
" execution time. See https://github.com/apache/datafusion-comet/issues/2013."
374+
}
375+
376+
object CometFromUTCTimestamp extends CometExpressionSerde[FromUTCTimestamp] {
377+
378+
override def getSupportLevel(expr: FromUTCTimestamp): SupportLevel =
379+
Incompatible(Some(UTCTimestampSerde.tzParseIncompatReason))
380+
381+
override def getIncompatibleReasons(): Seq[String] =
382+
Seq(UTCTimestampSerde.tzParseIncompatReason)
383+
384+
override def convert(
385+
expr: FromUTCTimestamp,
386+
inputs: Seq[Attribute],
387+
binding: Boolean): Option[ExprOuterClass.Expr] = {
388+
val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding))
389+
val optExpr = scalarFunctionExprToProto("from_utc_timestamp", childExprs: _*)
390+
optExprWithInfo(optExpr, expr, expr.children: _*)
391+
}
392+
}
393+
394+
object CometToUTCTimestamp extends CometExpressionSerde[ToUTCTimestamp] {
395+
396+
override def getSupportLevel(expr: ToUTCTimestamp): SupportLevel =
397+
Incompatible(Some(UTCTimestampSerde.tzParseIncompatReason))
398+
399+
override def getIncompatibleReasons(): Seq[String] =
400+
Seq(UTCTimestampSerde.tzParseIncompatReason)
401+
402+
override def convert(
403+
expr: ToUTCTimestamp,
404+
inputs: Seq[Attribute],
405+
binding: Boolean): Option[ExprOuterClass.Expr] = {
406+
val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding))
407+
val optExpr = scalarFunctionExprToProto("to_utc_timestamp", childExprs: _*)
408+
optExprWithInfo(optExpr, expr, expr.children: _*)
409+
}
410+
}
411+
367412
object CometNextDay extends CometScalarFunction[NextDay]("next_day")
368413

369414
object CometMakeDate extends CometScalarFunction[MakeDate]("make_date")
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- The result of from_utc_timestamp is a shift of the underlying microsecond
19+
-- value, so it must not depend on the session timezone. Verify across two
20+
-- representative session zones.
21+
-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles
22+
23+
-- CometFromUTCTimestamp is marked Incompatible because Comet's native timezone
24+
-- parser does not accept Spark's legacy timezone forms; enable allowIncompatible
25+
-- to force the native path for the timezones covered here.
26+
-- Config: spark.comet.expression.FromUTCTimestamp.allowIncompatible=true
27+
28+
statement
29+
CREATE TABLE test_from_utc_timestamp(ts timestamp, tz string) USING parquet
30+
31+
-- Includes a summer and a winter row for the LA timezone so that both DST
32+
-- branches are exercised. The third row uses a fixed-offset tz to match the
33+
-- form Spark's own DateFunctionsSuite covers via CEST = +02:00.
34+
statement
35+
INSERT INTO test_from_utc_timestamp VALUES
36+
(timestamp('2015-07-24 00:00:00'), 'America/Los_Angeles'),
37+
(timestamp('2015-01-24 00:00:00'), 'America/Los_Angeles'),
38+
(timestamp('2024-06-15 10:30:45'), '+02:00'),
39+
(timestamp('2024-01-01 00:00:00'), 'Asia/Seoul'),
40+
(timestamp('1969-12-31 23:59:59'), 'UTC'),
41+
(NULL, 'UTC'),
42+
(timestamp('2024-06-15 10:30:45'), NULL),
43+
(NULL, NULL)
44+
45+
-- column timestamp, literal IANA timezone
46+
query
47+
SELECT from_utc_timestamp(ts, 'America/Los_Angeles') FROM test_from_utc_timestamp
48+
49+
query
50+
SELECT from_utc_timestamp(ts, 'Asia/Seoul') FROM test_from_utc_timestamp
51+
52+
query
53+
SELECT from_utc_timestamp(ts, 'UTC') FROM test_from_utc_timestamp
54+
55+
-- column timestamp, literal fixed-offset timezone
56+
query
57+
SELECT from_utc_timestamp(ts, '+02:00') FROM test_from_utc_timestamp
58+
59+
-- column timestamp, column timezone (mix of IANA and fixed-offset values)
60+
query
61+
SELECT from_utc_timestamp(ts, tz) FROM test_from_utc_timestamp
62+
63+
-- literal arguments
64+
query
65+
SELECT from_utc_timestamp(timestamp('2017-07-14 02:40:00'), 'Etc/GMT-1')
66+
67+
query
68+
SELECT from_utc_timestamp(timestamp('2016-08-31 00:00:00'), 'Asia/Seoul')
69+
70+
-- null handling
71+
query
72+
SELECT from_utc_timestamp(NULL, 'UTC'), from_utc_timestamp(timestamp('2024-01-01 00:00:00'), NULL), from_utc_timestamp(NULL, NULL)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- The result of to_utc_timestamp is a shift of the underlying microsecond
19+
-- value, so it must not depend on the session timezone. Verify across two
20+
-- representative session zones.
21+
-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles
22+
23+
-- CometToUTCTimestamp is marked Incompatible because Comet's native timezone
24+
-- parser does not accept Spark's legacy timezone forms; enable allowIncompatible
25+
-- to force the native path for the timezones covered here.
26+
-- Config: spark.comet.expression.ToUTCTimestamp.allowIncompatible=true
27+
28+
statement
29+
CREATE TABLE test_to_utc_timestamp(ts timestamp, tz string) USING parquet
30+
31+
-- Includes a summer and a winter row for the LA timezone so that both DST
32+
-- branches are exercised. The third row uses a fixed-offset tz to match the
33+
-- form Spark's own DateFunctionsSuite covers via CEST = +02:00.
34+
statement
35+
INSERT INTO test_to_utc_timestamp VALUES
36+
(timestamp('2015-07-24 00:00:00'), 'America/Los_Angeles'),
37+
(timestamp('2015-01-24 00:00:00'), 'America/Los_Angeles'),
38+
(timestamp('2024-06-15 10:30:45'), '+02:00'),
39+
(timestamp('2024-01-01 00:00:00'), 'Asia/Seoul'),
40+
(timestamp('1969-12-31 23:59:59'), 'UTC'),
41+
(NULL, 'UTC'),
42+
(timestamp('2024-06-15 10:30:45'), NULL),
43+
(NULL, NULL)
44+
45+
-- column timestamp, literal IANA timezone
46+
query
47+
SELECT to_utc_timestamp(ts, 'America/Los_Angeles') FROM test_to_utc_timestamp
48+
49+
query
50+
SELECT to_utc_timestamp(ts, 'Asia/Seoul') FROM test_to_utc_timestamp
51+
52+
query
53+
SELECT to_utc_timestamp(ts, 'UTC') FROM test_to_utc_timestamp
54+
55+
-- column timestamp, literal fixed-offset timezone
56+
query
57+
SELECT to_utc_timestamp(ts, '+02:00') FROM test_to_utc_timestamp
58+
59+
-- column timestamp, column timezone (mix of IANA and fixed-offset values)
60+
query
61+
SELECT to_utc_timestamp(ts, tz) FROM test_to_utc_timestamp
62+
63+
-- literal arguments
64+
query
65+
SELECT to_utc_timestamp(timestamp('2017-07-14 02:40:00'), 'Etc/GMT-1')
66+
67+
query
68+
SELECT to_utc_timestamp(timestamp('2016-08-31 00:00:00'), 'Asia/Seoul')
69+
70+
-- null handling
71+
query
72+
SELECT to_utc_timestamp(NULL, 'UTC'), to_utc_timestamp(timestamp('2024-01-01 00:00:00'), NULL), to_utc_timestamp(NULL, NULL)

0 commit comments

Comments
 (0)