Skip to content

Commit e8e1369

Browse files
committed
fix: add timezone and special formats support for cast string to timestamp
1 parent a712414 commit e8e1369

2 files changed

Lines changed: 254 additions & 57 deletions

File tree

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 231 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@ use std::str::FromStr;
3434
use std::sync::Arc;
3535

3636
macro_rules! cast_utf8_to_timestamp {
37-
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{
37+
// $tz is a Timezone:Tz object and contains the session timezone.
38+
// $to_tz_str is a string containing the to_type timezone
39+
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $to_tz_str:expr) => {{
3840
let len = $array.len();
39-
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC");
41+
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone($to_tz_str);
4042
let mut cast_err: Option<SparkError> = None;
4143
for i in 0..len {
4244
if $array.is_null(i) {
@@ -675,16 +677,21 @@ pub(crate) fn cast_string_to_timestamp(
675677
.downcast_ref::<GenericStringArray<i32>>()
676678
.expect("Expected a string array");
677679

678-
let tz = &timezone::Tz::from_str(timezone_str).unwrap();
680+
let tz = &timezone::Tz::from_str(timezone_str)
681+
.map_err(|_| SparkError::Internal(format!("Invalid timezone string: {timezone_str}")))?;
679682

680683
let cast_array: ArrayRef = match to_type {
681-
DataType::Timestamp(_, _) => cast_utf8_to_timestamp!(
682-
string_array,
683-
eval_mode,
684-
TimestampMicrosecondType,
685-
timestamp_parser,
686-
tz
687-
)?,
684+
DataType::Timestamp(_, tz_opt) => {
685+
let to_tz = tz_opt.as_deref().unwrap_or("UTC");
686+
cast_utf8_to_timestamp!(
687+
string_array,
688+
eval_mode,
689+
TimestampMicrosecondType,
690+
timestamp_parser,
691+
tz,
692+
to_tz
693+
)?
694+
}
688695
_ => unreachable!("Invalid data type {:?} in cast from string", to_type),
689696
};
690697
Ok(cast_array)
@@ -967,8 +974,14 @@ fn get_timestamp_values<T: TimeZone>(
967974
timestamp_type: &str,
968975
tz: &T,
969976
) -> SparkResult<Option<i64>> {
970-
let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
971-
let year = values[0].parse::<i32>().unwrap_or_default();
977+
// Handle negative year: strip leading '-' and remember the sign.
978+
let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') {
979+
(-1i32, stripped)
980+
} else {
981+
(1i32, value)
982+
};
983+
let values: Vec<_> = date_part.split(['T', ' ', '-', ':', '.']).collect();
984+
let year = sign * values[0].parse::<i32>().unwrap_or_default();
972985

973986
// NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142.
974987
if !(-262143..=262142).contains(&year) {
@@ -1041,28 +1054,19 @@ fn parse_timestamp_to_micros<T: TimeZone>(
10411054
timestamp_info.second,
10421055
);
10431056

1044-
// Check if datetime is not None
1045-
let tz_datetime = match datetime.single() {
1057+
// Spark uses the offset before daylight savings change so we need to use earliest()
1058+
// Return None for LocalResult::None which is the invalid time in a DST spring forward gap).
1059+
let tz_datetime = match datetime.earliest() {
10461060
Some(dt) => dt
10471061
.with_timezone(tz)
10481062
.with_nanosecond(timestamp_info.microsecond * 1000),
1049-
None => {
1050-
return Err(SparkError::Internal(
1051-
"Failed to parse timestamp".to_string(),
1052-
));
1053-
}
1054-
};
1055-
1056-
let result = match tz_datetime {
1057-
Some(dt) => dt.timestamp_micros(),
1058-
None => {
1059-
return Err(SparkError::Internal(
1060-
"Failed to parse timestamp".to_string(),
1061-
));
1062-
}
1063+
None => return Ok(None),
10631064
};
10641065

1065-
Ok(Some(result))
1066+
match tz_datetime {
1067+
Some(dt) => Ok(Some(dt.timestamp_micros())),
1068+
None => Ok(None),
1069+
}
10661070
}
10671071

10681072
fn parse_str_to_year_timestamp<T: TimeZone>(value: &str, tz: &T) -> SparkResult<Option<i64>> {
@@ -1105,40 +1109,119 @@ fn timestamp_parser<T: TimeZone>(
11051109
if value.is_empty() {
11061110
return Ok(None);
11071111
}
1108-
// Define regex patterns and corresponding parsing functions
1109-
let patterns = &[
1112+
1113+
// Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit fixed offset.
1114+
if let Some((stripped, offset_secs)) = extract_offset_suffix(value) {
1115+
let fixed_tz = chrono::FixedOffset::east_opt(offset_secs)
1116+
.ok_or_else(|| SparkError::Internal("Invalid timezone offset".to_string()))?;
1117+
return timestamp_parser_with_tz(stripped, eval_mode, &fixed_tz);
1118+
}
1119+
1120+
timestamp_parser_with_tz(value, eval_mode, tz)
1121+
}
1122+
1123+
/// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), returns the
1124+
/// stripped string and the offset in seconds. Returns `None` if no offset suffix is present.
1125+
fn extract_offset_suffix(value: &str) -> Option<(&str, i32)> {
1126+
if let Some(stripped) = value.strip_suffix('Z') {
1127+
return Some((stripped, 0));
1128+
}
1129+
// Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 digits)
1130+
if value.len() >= 6 {
1131+
let suffix_start = value.len() - 6;
1132+
let suffix = &value[suffix_start..];
1133+
let sign_byte = suffix.as_bytes()[0];
1134+
if (sign_byte == b'+' || sign_byte == b'-') && suffix.as_bytes()[3] == b':' {
1135+
if let (Ok(h), Ok(m)) = (suffix[1..3].parse::<i32>(), suffix[4..6].parse::<i32>()) {
1136+
let sign = if sign_byte == b'+' { 1i32 } else { -1i32 };
1137+
return Some((&value[..suffix_start], sign * (h * 3600 + m * 60)));
1138+
}
1139+
}
1140+
}
1141+
None
1142+
}
1143+
1144+
type TimestampParsePattern<T> = (Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
1145+
1146+
fn timestamp_parser_with_tz<T: TimeZone>(
1147+
value: &str,
1148+
eval_mode: EvalMode,
1149+
tz: &T,
1150+
) -> SparkResult<Option<i64>> {
1151+
// Define regex patterns and corresponding parsing functions.
1152+
// Both T-separator and space-separator date-time forms are supported.
1153+
// Negative years are handled by get_timestamp_values detecting a leading '-'.
1154+
let patterns: &[TimestampParsePattern<T>] = &[
1155+
// Year only: 4-7 digits, optionally negative
11101156
(
1111-
Regex::new(r"^\d{4,7}$").unwrap(),
1157+
Regex::new(r"^-?\d{4,7}$").unwrap(),
11121158
parse_str_to_year_timestamp as fn(&str, &T) -> SparkResult<Option<i64>>,
11131159
),
1160+
// Year-month
11141161
(
1115-
Regex::new(r"^\d{4,7}-\d{2}$").unwrap(),
1162+
Regex::new(r"^-?\d{4,7}-\d{2}$").unwrap(),
11161163
parse_str_to_month_timestamp,
11171164
),
1165+
// Year-month-day
11181166
(
1119-
Regex::new(r"^\d{4,7}-\d{2}-\d{2}$").unwrap(),
1167+
Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}$").unwrap(),
11201168
parse_str_to_day_timestamp,
11211169
),
1170+
// Date T-or-space hour (1 or 2 digits)
11221171
(
1123-
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{1,2}$").unwrap(),
1172+
Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{1,2}$").unwrap(),
11241173
parse_str_to_hour_timestamp,
11251174
),
1175+
// Date T-or-space hour:minute
11261176
(
1127-
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap(),
1177+
Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}$").unwrap(),
11281178
parse_str_to_minute_timestamp,
11291179
),
1180+
// Date T-or-space hour:minute:second
11301181
(
1131-
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap(),
1182+
Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}$").unwrap(),
11321183
parse_str_to_second_timestamp,
11331184
),
1185+
// Date T-or-space hour:minute:second.fraction
11341186
(
1135-
Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(),
1187+
Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(),
11361188
parse_str_to_microsecond_timestamp,
11371189
),
1190+
// Time-only: T hour (1 or 2 digits, no colon)
11381191
(
11391192
Regex::new(r"^T\d{1,2}$").unwrap(),
11401193
parse_str_to_time_only_timestamp,
11411194
),
1195+
// Time-only: T hour:minute
1196+
(
1197+
Regex::new(r"^T\d{1,2}:\d{2}$").unwrap(),
1198+
parse_str_to_time_only_timestamp,
1199+
),
1200+
// Time-only: T hour:minute:second
1201+
(
1202+
Regex::new(r"^T\d{1,2}:\d{2}:\d{2}$").unwrap(),
1203+
parse_str_to_time_only_timestamp,
1204+
),
1205+
// Time-only: T hour:minute:second.fraction
1206+
(
1207+
Regex::new(r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(),
1208+
parse_str_to_time_only_timestamp,
1209+
),
1210+
// Bare time-only: hour:minute (without T prefix)
1211+
(
1212+
Regex::new(r"^\d{1,2}:\d{2}$").unwrap(),
1213+
parse_str_to_time_only_timestamp,
1214+
),
1215+
// Bare time-only: hour:minute:second
1216+
(
1217+
Regex::new(r"^\d{1,2}:\d{2}:\d{2}$").unwrap(),
1218+
parse_str_to_time_only_timestamp,
1219+
),
1220+
// Bare time-only: hour:minute:second.fraction
1221+
(
1222+
Regex::new(r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(),
1223+
parse_str_to_time_only_timestamp,
1224+
),
11421225
];
11431226

11441227
let mut timestamp = None;
@@ -1167,23 +1250,43 @@ fn timestamp_parser<T: TimeZone>(
11671250
}
11681251

11691252
fn parse_str_to_time_only_timestamp<T: TimeZone>(value: &str, tz: &T) -> SparkResult<Option<i64>> {
1170-
let values: Vec<&str> = value.split('T').collect();
1171-
let time_values: Vec<u32> = values[1]
1172-
.split(':')
1173-
.map(|v| v.parse::<u32>().unwrap_or(0))
1174-
.collect();
1253+
// The 'T' is optional in the time format; strip it if specified.
1254+
let time_part = value.strip_prefix('T').unwrap_or(value);
1255+
1256+
// Parse time components: hour[:minute[:second[.fraction]]]
1257+
// Use splitn(3) so "12:34:56.789" splits into ["12", "34", "56.789"].
1258+
let colon_parts: Vec<&str> = time_part.splitn(3, ':').collect();
1259+
let hour: u32 = colon_parts[0].parse().unwrap_or(0);
1260+
let minute: u32 = colon_parts.get(1).and_then(|s| s.parse().ok()).unwrap_or(0);
1261+
let (second, nanosecond) = if let Some(sec_frac) = colon_parts.get(2) {
1262+
let dot_idx = sec_frac.find('.');
1263+
let sec: u32 = sec_frac[..dot_idx.unwrap_or(sec_frac.len())]
1264+
.parse()
1265+
.unwrap_or(0);
1266+
let ns: u32 = if let Some(dot) = dot_idx {
1267+
let frac = &sec_frac[dot + 1..];
1268+
// Interpret up to 6 digits as microseconds, padding with trailing zeros.
1269+
let trimmed = &frac[..frac.len().min(6)];
1270+
let padded = format!("{:0<6}", trimmed);
1271+
padded.parse::<u32>().unwrap_or(0) * 1000
1272+
} else {
1273+
0
1274+
};
1275+
(sec, ns)
1276+
} else {
1277+
(0, 0)
1278+
};
11751279

11761280
let datetime = tz.from_utc_datetime(&chrono::Utc::now().naive_utc());
1177-
let timestamp = datetime
1281+
let result = datetime
11781282
.with_timezone(tz)
1179-
.with_hour(time_values.first().copied().unwrap_or_default())
1180-
.and_then(|dt| dt.with_minute(*time_values.get(1).unwrap_or(&0)))
1181-
.and_then(|dt| dt.with_second(*time_values.get(2).unwrap_or(&0)))
1182-
.and_then(|dt| dt.with_nanosecond(*time_values.get(3).unwrap_or(&0) * 1_000))
1183-
.map(|dt| dt.timestamp_micros())
1184-
.unwrap_or_default();
1185-
1186-
Ok(Some(timestamp))
1283+
.with_hour(hour)
1284+
.and_then(|dt| dt.with_minute(minute))
1285+
.and_then(|dt| dt.with_second(second))
1286+
.and_then(|dt| dt.with_nanosecond(nanosecond))
1287+
.map(|dt| dt.timestamp_micros());
1288+
1289+
Ok(result)
11871290
}
11881291

11891292
//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate.
@@ -1353,7 +1456,8 @@ mod tests {
13531456
eval_mode,
13541457
TimestampMicrosecondType,
13551458
timestamp_parser,
1356-
tz
1459+
tz,
1460+
"UTC"
13571461
)
13581462
.unwrap();
13591463

@@ -1383,7 +1487,8 @@ mod tests {
13831487
eval_mode,
13841488
TimestampMicrosecondType,
13851489
timestamp_parser,
1386-
tz
1490+
tz,
1491+
"UTC"
13871492
);
13881493
assert!(
13891494
result.is_err(),
@@ -1507,6 +1612,78 @@ mod tests {
15071612
timestamp_parser("10000-01-01T12:34:56.123456", EvalMode::Legacy, tz).unwrap(),
15081613
Some(253402346096123456)
15091614
);
1615+
// Space separator (same values as T separator)
1616+
assert_eq!(
1617+
timestamp_parser("2020-01-01 12", EvalMode::Legacy, tz).unwrap(),
1618+
Some(1577880000000000)
1619+
);
1620+
assert_eq!(
1621+
timestamp_parser("2020-01-01 12:34", EvalMode::Legacy, tz).unwrap(),
1622+
Some(1577882040000000)
1623+
);
1624+
assert_eq!(
1625+
timestamp_parser("2020-01-01 12:34:56", EvalMode::Legacy, tz).unwrap(),
1626+
Some(1577882096000000)
1627+
);
1628+
assert_eq!(
1629+
timestamp_parser("2020-01-01 12:34:56.123456", EvalMode::Legacy, tz).unwrap(),
1630+
Some(1577882096123456)
1631+
);
1632+
// Z suffix (UTC)
1633+
assert_eq!(
1634+
timestamp_parser("2020-01-01T12:34:56Z", EvalMode::Legacy, tz).unwrap(),
1635+
Some(1577882096000000)
1636+
);
1637+
// Positive offset suffix
1638+
assert_eq!(
1639+
timestamp_parser("2020-01-01T12:34:56+05:30", EvalMode::Legacy, tz).unwrap(),
1640+
Some(1577862296000000) // 12:34:56 UTC+5:30 = 07:04:56 UTC
1641+
);
1642+
// T-prefixed time-only with colon
1643+
assert_eq!(
1644+
timestamp_parser("T12:34", EvalMode::Legacy, tz)
1645+
.unwrap()
1646+
.is_some(),
1647+
true
1648+
);
1649+
assert_eq!(
1650+
timestamp_parser("T12:34:56", EvalMode::Legacy, tz)
1651+
.unwrap()
1652+
.is_some(),
1653+
true
1654+
);
1655+
assert_eq!(
1656+
timestamp_parser("T12:34:56.123456", EvalMode::Legacy, tz)
1657+
.unwrap()
1658+
.is_some(),
1659+
true
1660+
);
1661+
// Bare time-only (hour:minute without T prefix)
1662+
assert_eq!(
1663+
timestamp_parser("12:34", EvalMode::Legacy, tz)
1664+
.unwrap()
1665+
.is_some(),
1666+
true
1667+
);
1668+
assert_eq!(
1669+
timestamp_parser("12:34:56", EvalMode::Legacy, tz)
1670+
.unwrap()
1671+
.is_some(),
1672+
true
1673+
);
1674+
// Negative year
1675+
assert_eq!(
1676+
timestamp_parser("-0001", EvalMode::Legacy, tz)
1677+
.unwrap()
1678+
.is_some(),
1679+
true
1680+
);
1681+
assert_eq!(
1682+
timestamp_parser("-0001-01-01T12:34:56", EvalMode::Legacy, tz)
1683+
.unwrap()
1684+
.is_some(),
1685+
true
1686+
);
15101687
}
15111688

15121689
#[test]

0 commit comments

Comments
 (0)