Skip to content

Commit fafb7e2

Browse files
committed
use match_to_type produces timestamp
1 parent c27b1e1 commit fafb7e2

2 files changed

Lines changed: 118 additions & 9 deletions

File tree

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 113 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow::datatypes::{
2525
i256, is_validate_decimal_precision, DataType, Date32Type, Decimal256Type, Float32Type,
2626
Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMicrosecondType,
2727
};
28-
use chrono::{DateTime, NaiveDate, TimeZone, Timelike};
28+
use chrono::{DateTime, NaiveDate, TimeZone, Timelike, Utc};
2929
use num::traits::CheckedNeg;
3030
use num::{CheckedSub, Integer};
3131
use regex::Regex;
@@ -34,9 +34,13 @@ use std::str::FromStr;
3434
use std::sync::Arc;
3535

3636
macro_rules! cast_utf8_to_timestamp {
37-
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{
37+
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $with_tz:expr) => {{
3838
let len = $array.len();
39-
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC");
39+
let mut cast_array = if let Some(tz_str) = $with_tz {
40+
PrimitiveArray::<$array_type>::builder(len).with_timezone(tz_str)
41+
} else {
42+
PrimitiveArray::<$array_type>::builder(len)
43+
};
4044
for i in 0..len {
4145
if $array.is_null(i) {
4246
cast_array.append_null()
@@ -665,16 +669,28 @@ pub(crate) fn cast_string_to_timestamp(
665669
.downcast_ref::<GenericStringArray<i32>>()
666670
.expect("Expected a string array");
667671

668-
let tz = &timezone::Tz::from_str(timezone_str).unwrap();
669-
670672
let cast_array: ArrayRef = match to_type {
671-
DataType::Timestamp(_, _) => {
673+
DataType::Timestamp(_, Some(_)) => {
674+
let tz = &timezone::Tz::from_str(timezone_str).unwrap();
672675
cast_utf8_to_timestamp!(
673676
string_array,
674677
eval_mode,
675678
TimestampMicrosecondType,
676679
timestamp_parser,
677-
tz
680+
tz,
681+
Some("UTC")
682+
)
683+
}
684+
DataType::Timestamp(_, None) => {
685+
// TimestampNTZ: reuse timestamp_parser with Utc (identity timezone),
686+
// but don't set timezone metadata on the output array
687+
cast_utf8_to_timestamp!(
688+
string_array,
689+
eval_mode,
690+
TimestampMicrosecondType,
691+
timestamp_parser,
692+
&Utc,
693+
None::<&str>
678694
)
679695
}
680696
_ => unreachable!("Invalid data type {:?} in cast from string", to_type),
@@ -1340,7 +1356,8 @@ mod tests {
13401356
eval_mode,
13411357
TimestampMicrosecondType,
13421358
timestamp_parser,
1343-
tz
1359+
tz,
1360+
Some("UTC")
13441361
);
13451362

13461363
assert_eq!(
@@ -1350,6 +1367,94 @@ mod tests {
13501367
assert_eq!(result.len(), 4);
13511368
}
13521369

1370+
#[test]
1371+
#[cfg_attr(miri, ignore)]
1372+
fn test_cast_string_to_timestamp_ntz() {
1373+
let array: ArrayRef = Arc::new(StringArray::from(vec![
1374+
Some("2020-01-01T12:34:56.123456"),
1375+
Some("not_a_timestamp"),
1376+
Some("2020-01-01"),
1377+
]));
1378+
1379+
let string_array = array
1380+
.as_any()
1381+
.downcast_ref::<GenericStringArray<i32>>()
1382+
.expect("Expected a string array");
1383+
1384+
let eval_mode = EvalMode::Legacy;
1385+
let result = cast_utf8_to_timestamp!(
1386+
&string_array,
1387+
eval_mode,
1388+
TimestampMicrosecondType,
1389+
timestamp_parser,
1390+
&Utc,
1391+
None::<&str>
1392+
);
1393+
1394+
// Key assertion: TimestampNTZ should have NO timezone
1395+
assert_eq!(
1396+
result.data_type(),
1397+
&DataType::Timestamp(TimeUnit::Microsecond, None)
1398+
);
1399+
assert_eq!(result.len(), 3);
1400+
1401+
// Verify the actual values are not timezone-converted
1402+
let ts_array = result
1403+
.as_any()
1404+
.downcast_ref::<PrimitiveArray<TimestampMicrosecondType>>()
1405+
.expect("Expected a timestamp array");
1406+
1407+
// 2020-01-01T12:34:56.123456 as naive epoch micros
1408+
assert_eq!(ts_array.value(0), 1577882096123456);
1409+
// "not_a_timestamp" is invalid and should be null
1410+
assert!(ts_array.is_null(1));
1411+
// 2020-01-01 as naive epoch micros (midnight)
1412+
assert_eq!(ts_array.value(2), 1577836800000000);
1413+
}
1414+
1415+
#[test]
1416+
fn test_cast_string_to_timestamp_ntz_via_cast_array() -> DataFusionResult<()> {
1417+
let array: ArrayRef = Arc::new(StringArray::from(vec![
1418+
Some("2020-01-01T12:34:56.123456"),
1419+
Some("T2"),
1420+
]));
1421+
1422+
let timezone = "America/New_York".to_string();
1423+
let cast_options = SparkCastOptions::new(EvalMode::Legacy, &timezone, false);
1424+
1425+
// Cast to Timestamp with timezone
1426+
let result_tz = cast_array(
1427+
Arc::clone(&array),
1428+
&DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())),
1429+
&cast_options,
1430+
)?;
1431+
assert_eq!(
1432+
*result_tz.data_type(),
1433+
DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into()))
1434+
);
1435+
1436+
// Cast to TimestampNTZ (no timezone)
1437+
let result_ntz = cast_array(
1438+
Arc::clone(&array),
1439+
&DataType::Timestamp(TimeUnit::Microsecond, None),
1440+
&cast_options,
1441+
)?;
1442+
assert_eq!(
1443+
*result_ntz.data_type(),
1444+
DataType::Timestamp(TimeUnit::Microsecond, None)
1445+
);
1446+
1447+
// The NTZ result should NOT have timezone metadata
1448+
let ntz_array = result_ntz
1449+
.as_any()
1450+
.downcast_ref::<PrimitiveArray<TimestampMicrosecondType>>()
1451+
.expect("Expected a timestamp array");
1452+
// 2020-01-01T12:34:56.123456 stored as-is (no timezone conversion)
1453+
assert_eq!(ntz_array.value(0), 1577882096123456);
1454+
1455+
Ok(())
1456+
}
1457+
13531458
#[test]
13541459
fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> {
13551460
// prepare input data

spark/src/main/scala/org/apache/comet/expressions/CometCast.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.comet.expressions
2121

2222
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal}
2323
import org.apache.spark.sql.internal.SQLConf
24-
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampType}
24+
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampNTZType, TimestampType}
2525

2626
import org.apache.comet.CometConf
2727
import org.apache.comet.CometSparkSessionExtensions.withInfo
@@ -222,6 +222,10 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
222222
case DataTypes.TimestampType =>
223223
// https://github.com/apache/datafusion-comet/issues/328
224224
Incompatible(Some("Not all valid formats are supported"))
225+
case _: TimestampNTZType if evalMode == CometEvalMode.ANSI =>
226+
Incompatible(Some("ANSI mode not supported"))
227+
case _: TimestampNTZType =>
228+
Incompatible(Some("Not all valid formats are supported"))
225229
case _ =>
226230
unsupported(DataTypes.StringType, toType)
227231
}

0 commit comments

Comments
 (0)