@@ -34,9 +34,11 @@ use std::str::FromStr;
3434use std:: sync:: Arc ;
3535
3636macro_rules! cast_utf8_to_timestamp {
37- ( $array: expr, $eval_mode: expr, $array_type: ty, $cast_method: ident, $tz: expr) => { {
37+ // $tz is a Timezone:Tz object and contains the session timezone.
38+ // $to_tz_str is a string containing the to_type timezone
39+ ( $array: expr, $eval_mode: expr, $array_type: ty, $cast_method: ident, $tz: expr, $to_tz_str: expr) => { {
3840 let len = $array. len( ) ;
39- let mut cast_array = PrimitiveArray :: <$array_type>:: builder( len) . with_timezone( "UTC" ) ;
41+ let mut cast_array = PrimitiveArray :: <$array_type>:: builder( len) . with_timezone( $to_tz_str ) ;
4042 let mut cast_err: Option <SparkError > = None ;
4143 for i in 0 ..len {
4244 if $array. is_null( i) {
@@ -675,16 +677,21 @@ pub(crate) fn cast_string_to_timestamp(
675677 . downcast_ref :: < GenericStringArray < i32 > > ( )
676678 . expect ( "Expected a string array" ) ;
677679
678- let tz = & timezone:: Tz :: from_str ( timezone_str) . unwrap ( ) ;
680+ let tz = & timezone:: Tz :: from_str ( timezone_str)
681+ . map_err ( |_| SparkError :: Internal ( format ! ( "Invalid timezone string: {timezone_str}" ) ) ) ?;
679682
680683 let cast_array: ArrayRef = match to_type {
681- DataType :: Timestamp ( _, _) => cast_utf8_to_timestamp ! (
682- string_array,
683- eval_mode,
684- TimestampMicrosecondType ,
685- timestamp_parser,
686- tz
687- ) ?,
684+ DataType :: Timestamp ( _, tz_opt) => {
685+ let to_tz = tz_opt. as_deref ( ) . unwrap_or ( "UTC" ) ;
686+ cast_utf8_to_timestamp ! (
687+ string_array,
688+ eval_mode,
689+ TimestampMicrosecondType ,
690+ timestamp_parser,
691+ tz,
692+ to_tz
693+ ) ?
694+ }
688695 _ => unreachable ! ( "Invalid data type {:?} in cast from string" , to_type) ,
689696 } ;
690697 Ok ( cast_array)
@@ -967,8 +974,14 @@ fn get_timestamp_values<T: TimeZone>(
967974 timestamp_type : & str ,
968975 tz : & T ,
969976) -> SparkResult < Option < i64 > > {
970- let values: Vec < _ > = value. split ( [ 'T' , '-' , ':' , '.' ] ) . collect ( ) ;
971- let year = values[ 0 ] . parse :: < i32 > ( ) . unwrap_or_default ( ) ;
977+ // Handle negative year: strip leading '-' and remember the sign.
978+ let ( sign, date_part) = if let Some ( stripped) = value. strip_prefix ( '-' ) {
979+ ( -1i32 , stripped)
980+ } else {
981+ ( 1i32 , value)
982+ } ;
983+ let values: Vec < _ > = date_part. split ( [ 'T' , ' ' , '-' , ':' , '.' ] ) . collect ( ) ;
984+ let year = sign * values[ 0 ] . parse :: < i32 > ( ) . unwrap_or_default ( ) ;
972985
973986 // NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142.
974987 if !( -262143 ..=262142 ) . contains ( & year) {
@@ -1041,28 +1054,19 @@ fn parse_timestamp_to_micros<T: TimeZone>(
10411054 timestamp_info. second ,
10421055 ) ;
10431056
1044- // Check if datetime is not None
1045- let tz_datetime = match datetime. single ( ) {
1057+ // Spark uses the offset before daylight savings change so we need to use earliest()
1058+ // Return None for LocalResult::None which is the invalid time in a DST spring forward gap).
1059+ let tz_datetime = match datetime. earliest ( ) {
10461060 Some ( dt) => dt
10471061 . with_timezone ( tz)
10481062 . with_nanosecond ( timestamp_info. microsecond * 1000 ) ,
1049- None => {
1050- return Err ( SparkError :: Internal (
1051- "Failed to parse timestamp" . to_string ( ) ,
1052- ) ) ;
1053- }
1054- } ;
1055-
1056- let result = match tz_datetime {
1057- Some ( dt) => dt. timestamp_micros ( ) ,
1058- None => {
1059- return Err ( SparkError :: Internal (
1060- "Failed to parse timestamp" . to_string ( ) ,
1061- ) ) ;
1062- }
1063+ None => return Ok ( None ) ,
10631064 } ;
10641065
1065- Ok ( Some ( result) )
1066+ match tz_datetime {
1067+ Some ( dt) => Ok ( Some ( dt. timestamp_micros ( ) ) ) ,
1068+ None => Ok ( None ) ,
1069+ }
10661070}
10671071
10681072fn parse_str_to_year_timestamp < T : TimeZone > ( value : & str , tz : & T ) -> SparkResult < Option < i64 > > {
@@ -1105,40 +1109,119 @@ fn timestamp_parser<T: TimeZone>(
11051109 if value. is_empty ( ) {
11061110 return Ok ( None ) ;
11071111 }
1108- // Define regex patterns and corresponding parsing functions
1109- let patterns = & [
1112+
1113+ // Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit fixed offset.
1114+ if let Some ( ( stripped, offset_secs) ) = extract_offset_suffix ( value) {
1115+ let fixed_tz = chrono:: FixedOffset :: east_opt ( offset_secs)
1116+ . ok_or_else ( || SparkError :: Internal ( "Invalid timezone offset" . to_string ( ) ) ) ?;
1117+ return timestamp_parser_with_tz ( stripped, eval_mode, & fixed_tz) ;
1118+ }
1119+
1120+ timestamp_parser_with_tz ( value, eval_mode, tz)
1121+ }
1122+
1123+ /// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), returns the
1124+ /// stripped string and the offset in seconds. Returns `None` if no offset suffix is present.
1125+ fn extract_offset_suffix ( value : & str ) -> Option < ( & str , i32 ) > {
1126+ if let Some ( stripped) = value. strip_suffix ( 'Z' ) {
1127+ return Some ( ( stripped, 0 ) ) ;
1128+ }
1129+ // Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 digits)
1130+ if value. len ( ) >= 6 {
1131+ let suffix_start = value. len ( ) - 6 ;
1132+ let suffix = & value[ suffix_start..] ;
1133+ let sign_byte = suffix. as_bytes ( ) [ 0 ] ;
1134+ if ( sign_byte == b'+' || sign_byte == b'-' ) && suffix. as_bytes ( ) [ 3 ] == b':' {
1135+ if let ( Ok ( h) , Ok ( m) ) = ( suffix[ 1 ..3 ] . parse :: < i32 > ( ) , suffix[ 4 ..6 ] . parse :: < i32 > ( ) ) {
1136+ let sign = if sign_byte == b'+' { 1i32 } else { -1i32 } ;
1137+ return Some ( ( & value[ ..suffix_start] , sign * ( h * 3600 + m * 60 ) ) ) ;
1138+ }
1139+ }
1140+ }
1141+ None
1142+ }
1143+
1144+ type TimestampParsePattern < T > = ( Regex , fn ( & str , & T ) -> SparkResult < Option < i64 > > ) ;
1145+
1146+ fn timestamp_parser_with_tz < T : TimeZone > (
1147+ value : & str ,
1148+ eval_mode : EvalMode ,
1149+ tz : & T ,
1150+ ) -> SparkResult < Option < i64 > > {
1151+ // Define regex patterns and corresponding parsing functions.
1152+ // Both T-separator and space-separator date-time forms are supported.
1153+ // Negative years are handled by get_timestamp_values detecting a leading '-'.
1154+ let patterns: & [ TimestampParsePattern < T > ] = & [
1155+ // Year only: 4-7 digits, optionally negative
11101156 (
1111- Regex :: new ( r"^\d{4,7}$" ) . unwrap ( ) ,
1157+ Regex :: new ( r"^-? \d{4,7}$" ) . unwrap ( ) ,
11121158 parse_str_to_year_timestamp as fn ( & str , & T ) -> SparkResult < Option < i64 > > ,
11131159 ) ,
1160+ // Year-month
11141161 (
1115- Regex :: new ( r"^\d{4,7}-\d{2}$" ) . unwrap ( ) ,
1162+ Regex :: new ( r"^-? \d{4,7}-\d{2}$" ) . unwrap ( ) ,
11161163 parse_str_to_month_timestamp,
11171164 ) ,
1165+ // Year-month-day
11181166 (
1119- Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}$" ) . unwrap ( ) ,
1167+ Regex :: new ( r"^-? \d{4,7}-\d{2}-\d{2}$" ) . unwrap ( ) ,
11201168 parse_str_to_day_timestamp,
11211169 ) ,
1170+ // Date T-or-space hour (1 or 2 digits)
11221171 (
1123- Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T \d{1,2}$" ) . unwrap ( ) ,
1172+ Regex :: new ( r"^-? \d{4,7}-\d{2}-\d{2}[T ] \d{1,2}$" ) . unwrap ( ) ,
11241173 parse_str_to_hour_timestamp,
11251174 ) ,
1175+ // Date T-or-space hour:minute
11261176 (
1127- Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T \d{2}:\d{2}$" ) . unwrap ( ) ,
1177+ Regex :: new ( r"^-? \d{4,7}-\d{2}-\d{2}[T ] \d{2}:\d{2}$" ) . unwrap ( ) ,
11281178 parse_str_to_minute_timestamp,
11291179 ) ,
1180+ // Date T-or-space hour:minute:second
11301181 (
1131- Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T \d{2}:\d{2}:\d{2}$" ) . unwrap ( ) ,
1182+ Regex :: new ( r"^-? \d{4,7}-\d{2}-\d{2}[T ] \d{2}:\d{2}:\d{2}$" ) . unwrap ( ) ,
11321183 parse_str_to_second_timestamp,
11331184 ) ,
1185+ // Date T-or-space hour:minute:second.fraction
11341186 (
1135- Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T \d{2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ,
1187+ Regex :: new ( r"^-? \d{4,7}-\d{2}-\d{2}[T ] \d{2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ,
11361188 parse_str_to_microsecond_timestamp,
11371189 ) ,
1190+ // Time-only: T hour (1 or 2 digits, no colon)
11381191 (
11391192 Regex :: new ( r"^T\d{1,2}$" ) . unwrap ( ) ,
11401193 parse_str_to_time_only_timestamp,
11411194 ) ,
1195+ // Time-only: T hour:minute
1196+ (
1197+ Regex :: new ( r"^T\d{1,2}:\d{2}$" ) . unwrap ( ) ,
1198+ parse_str_to_time_only_timestamp,
1199+ ) ,
1200+ // Time-only: T hour:minute:second
1201+ (
1202+ Regex :: new ( r"^T\d{1,2}:\d{2}:\d{2}$" ) . unwrap ( ) ,
1203+ parse_str_to_time_only_timestamp,
1204+ ) ,
1205+ // Time-only: T hour:minute:second.fraction
1206+ (
1207+ Regex :: new ( r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ,
1208+ parse_str_to_time_only_timestamp,
1209+ ) ,
1210+ // Bare time-only: hour:minute (without T prefix)
1211+ (
1212+ Regex :: new ( r"^\d{1,2}:\d{2}$" ) . unwrap ( ) ,
1213+ parse_str_to_time_only_timestamp,
1214+ ) ,
1215+ // Bare time-only: hour:minute:second
1216+ (
1217+ Regex :: new ( r"^\d{1,2}:\d{2}:\d{2}$" ) . unwrap ( ) ,
1218+ parse_str_to_time_only_timestamp,
1219+ ) ,
1220+ // Bare time-only: hour:minute:second.fraction
1221+ (
1222+ Regex :: new ( r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ,
1223+ parse_str_to_time_only_timestamp,
1224+ ) ,
11421225 ] ;
11431226
11441227 let mut timestamp = None ;
@@ -1167,23 +1250,43 @@ fn timestamp_parser<T: TimeZone>(
11671250}
11681251
11691252fn parse_str_to_time_only_timestamp < T : TimeZone > ( value : & str , tz : & T ) -> SparkResult < Option < i64 > > {
1170- let values: Vec < & str > = value. split ( 'T' ) . collect ( ) ;
1171- let time_values: Vec < u32 > = values[ 1 ]
1172- . split ( ':' )
1173- . map ( |v| v. parse :: < u32 > ( ) . unwrap_or ( 0 ) )
1174- . collect ( ) ;
1253+ // The 'T' is optional in the time format; strip it if specified.
1254+ let time_part = value. strip_prefix ( 'T' ) . unwrap_or ( value) ;
1255+
1256+ // Parse time components: hour[:minute[:second[.fraction]]]
1257+ // Use splitn(3) so "12:34:56.789" splits into ["12", "34", "56.789"].
1258+ let colon_parts: Vec < & str > = time_part. splitn ( 3 , ':' ) . collect ( ) ;
1259+ let hour: u32 = colon_parts[ 0 ] . parse ( ) . unwrap_or ( 0 ) ;
1260+ let minute: u32 = colon_parts. get ( 1 ) . and_then ( |s| s. parse ( ) . ok ( ) ) . unwrap_or ( 0 ) ;
1261+ let ( second, nanosecond) = if let Some ( sec_frac) = colon_parts. get ( 2 ) {
1262+ let dot_idx = sec_frac. find ( '.' ) ;
1263+ let sec: u32 = sec_frac[ ..dot_idx. unwrap_or ( sec_frac. len ( ) ) ]
1264+ . parse ( )
1265+ . unwrap_or ( 0 ) ;
1266+ let ns: u32 = if let Some ( dot) = dot_idx {
1267+ let frac = & sec_frac[ dot + 1 ..] ;
1268+ // Interpret up to 6 digits as microseconds, padding with trailing zeros.
1269+ let trimmed = & frac[ ..frac. len ( ) . min ( 6 ) ] ;
1270+ let padded = format ! ( "{:0<6}" , trimmed) ;
1271+ padded. parse :: < u32 > ( ) . unwrap_or ( 0 ) * 1000
1272+ } else {
1273+ 0
1274+ } ;
1275+ ( sec, ns)
1276+ } else {
1277+ ( 0 , 0 )
1278+ } ;
11751279
11761280 let datetime = tz. from_utc_datetime ( & chrono:: Utc :: now ( ) . naive_utc ( ) ) ;
1177- let timestamp = datetime
1281+ let result = datetime
11781282 . with_timezone ( tz)
1179- . with_hour ( time_values. first ( ) . copied ( ) . unwrap_or_default ( ) )
1180- . and_then ( |dt| dt. with_minute ( * time_values. get ( 1 ) . unwrap_or ( & 0 ) ) )
1181- . and_then ( |dt| dt. with_second ( * time_values. get ( 2 ) . unwrap_or ( & 0 ) ) )
1182- . and_then ( |dt| dt. with_nanosecond ( * time_values. get ( 3 ) . unwrap_or ( & 0 ) * 1_000 ) )
1183- . map ( |dt| dt. timestamp_micros ( ) )
1184- . unwrap_or_default ( ) ;
1185-
1186- Ok ( Some ( timestamp) )
1283+ . with_hour ( hour)
1284+ . and_then ( |dt| dt. with_minute ( minute) )
1285+ . and_then ( |dt| dt. with_second ( second) )
1286+ . and_then ( |dt| dt. with_nanosecond ( nanosecond) )
1287+ . map ( |dt| dt. timestamp_micros ( ) ) ;
1288+
1289+ Ok ( result)
11871290}
11881291
11891292//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate.
@@ -1353,7 +1456,8 @@ mod tests {
13531456 eval_mode,
13541457 TimestampMicrosecondType ,
13551458 timestamp_parser,
1356- tz
1459+ tz,
1460+ "UTC"
13571461 )
13581462 . unwrap ( ) ;
13591463
@@ -1383,7 +1487,8 @@ mod tests {
13831487 eval_mode,
13841488 TimestampMicrosecondType ,
13851489 timestamp_parser,
1386- tz
1490+ tz,
1491+ "UTC"
13871492 ) ;
13881493 assert ! (
13891494 result. is_err( ) ,
@@ -1507,6 +1612,78 @@ mod tests {
15071612 timestamp_parser( "10000-01-01T12:34:56.123456" , EvalMode :: Legacy , tz) . unwrap( ) ,
15081613 Some ( 253402346096123456 )
15091614 ) ;
1615+ // Space separator (same values as T separator)
1616+ assert_eq ! (
1617+ timestamp_parser( "2020-01-01 12" , EvalMode :: Legacy , tz) . unwrap( ) ,
1618+ Some ( 1577880000000000 )
1619+ ) ;
1620+ assert_eq ! (
1621+ timestamp_parser( "2020-01-01 12:34" , EvalMode :: Legacy , tz) . unwrap( ) ,
1622+ Some ( 1577882040000000 )
1623+ ) ;
1624+ assert_eq ! (
1625+ timestamp_parser( "2020-01-01 12:34:56" , EvalMode :: Legacy , tz) . unwrap( ) ,
1626+ Some ( 1577882096000000 )
1627+ ) ;
1628+ assert_eq ! (
1629+ timestamp_parser( "2020-01-01 12:34:56.123456" , EvalMode :: Legacy , tz) . unwrap( ) ,
1630+ Some ( 1577882096123456 )
1631+ ) ;
1632+ // Z suffix (UTC)
1633+ assert_eq ! (
1634+ timestamp_parser( "2020-01-01T12:34:56Z" , EvalMode :: Legacy , tz) . unwrap( ) ,
1635+ Some ( 1577882096000000 )
1636+ ) ;
1637+ // Positive offset suffix
1638+ assert_eq ! (
1639+ timestamp_parser( "2020-01-01T12:34:56+05:30" , EvalMode :: Legacy , tz) . unwrap( ) ,
1640+ Some ( 1577862296000000 ) // 12:34:56 UTC+5:30 = 07:04:56 UTC
1641+ ) ;
1642+ // T-prefixed time-only with colon
1643+ assert_eq ! (
1644+ timestamp_parser( "T12:34" , EvalMode :: Legacy , tz)
1645+ . unwrap( )
1646+ . is_some( ) ,
1647+ true
1648+ ) ;
1649+ assert_eq ! (
1650+ timestamp_parser( "T12:34:56" , EvalMode :: Legacy , tz)
1651+ . unwrap( )
1652+ . is_some( ) ,
1653+ true
1654+ ) ;
1655+ assert_eq ! (
1656+ timestamp_parser( "T12:34:56.123456" , EvalMode :: Legacy , tz)
1657+ . unwrap( )
1658+ . is_some( ) ,
1659+ true
1660+ ) ;
1661+ // Bare time-only (hour:minute without T prefix)
1662+ assert_eq ! (
1663+ timestamp_parser( "12:34" , EvalMode :: Legacy , tz)
1664+ . unwrap( )
1665+ . is_some( ) ,
1666+ true
1667+ ) ;
1668+ assert_eq ! (
1669+ timestamp_parser( "12:34:56" , EvalMode :: Legacy , tz)
1670+ . unwrap( )
1671+ . is_some( ) ,
1672+ true
1673+ ) ;
1674+ // Negative year
1675+ assert_eq ! (
1676+ timestamp_parser( "-0001" , EvalMode :: Legacy , tz)
1677+ . unwrap( )
1678+ . is_some( ) ,
1679+ true
1680+ ) ;
1681+ assert_eq ! (
1682+ timestamp_parser( "-0001-01-01T12:34:56" , EvalMode :: Legacy , tz)
1683+ . unwrap( )
1684+ . is_some( ) ,
1685+ true
1686+ ) ;
15101687 }
15111688
15121689 #[ test]
0 commit comments