diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml
index 6ca095328af1..c540922b43af 100644
--- a/.github/workflows/comment_bot.yml
+++ b/.github/workflows/comment_bot.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: "3.12"
       - name: Install Archery and Crossbow dependencies
         run: pip install -e dev/archery[bot]
       - name: Handle Github comment event
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 51569c0029a7..b1acea2c136b 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8
+          python-version: "3.12"
       - name: Audit licenses
         run: ./dev/release/run-rat.sh .
 
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 41b1dcbe8eb9..c0bd7c13c856 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: "3.12"
       - name: Setup Archery
         run: pip install -e dev/archery[docker]
       - name: Execute Docker Build
@@ -64,12 +64,12 @@ jobs:
           rustup default ${{ matrix.rust }}
           rustup component add rustfmt clippy
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /home/runner/.cargo
           key: cargo-maturin-cache-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /home/runner/target
           # this key is not equal because maturin uses different compilation flags.
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index d8681b7aaca1..43e8fd0c5ceb 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -41,14 +41,14 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           # these represent dependencies downloaded by cargo
           # and thus do not depend on the OS, arch nor rust version.
           path: /github/home/.cargo
           key: cargo-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           # these represent compiled steps of both dependencies and arrow
           # and thus are specific for a particular OS, arch and rust version.
@@ -86,13 +86,13 @@ jobs:
         with:
           submodules: true
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           # this key equals the ones on `linux-build-lib` for re-use
           key: cargo-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           # this key equals the ones on `linux-build-lib` for re-use
@@ -154,12 +154,12 @@ jobs:
         with:
           submodules: true
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           key: cargo-nightly-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           key: ${{ runner.os }}-${{ matrix.arch }}-target-nightly-cache3-${{ matrix.rust }}
@@ -205,6 +205,7 @@ jobs:
           export PARQUET_TEST_DATA=$(pwd)/parquet-testing/data
           # do not produce debug symbols to keep memory usage down
           export RUSTFLAGS="-C debuginfo=0"
+          export CMAKE_POLICY_VERSION_MINIMUM=3.5
           cargo test
 
   clippy:
@@ -226,13 +227,13 @@ jobs:
         with:
           submodules: true
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           # this key equals the ones on `linux-build-lib` for re-use
           key: cargo-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           # this key equals the ones on `linux-build-lib` for re-use
@@ -268,13 +269,13 @@ jobs:
         with:
           submodules: true
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           # this key equals the ones on `linux-build-lib` for re-use
           key: cargo-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           # this key equals the ones on `linux-build-lib` for re-use
@@ -321,13 +322,13 @@ jobs:
           rustup default ${{ matrix.rust }}
           rustup component add rustfmt clippy
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /home/runner/.cargo
           # this key is not equal because the user is different than on a container (runner vs github)
           key: cargo-coverage-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /home/runner/target
           # this key is not equal because coverage uses different compilation flags.
@@ -369,12 +370,12 @@ jobs:
         with:
           submodules: true
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           key: cargo-wasm32-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache3-${{ matrix.rust }}
@@ -414,14 +415,14 @@ jobs:
       - name: Install python dev
         run: |
           apt update
-          apt install -y libpython3.11-dev
+          apt install -y libpython3.12-dev
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           key: cargo-nightly-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           key: ${{ runner.os }}-${{ matrix.arch }}-target-nightly-cache3-${{ matrix.rust }}
@@ -453,13 +454,13 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Cache Cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/.cargo
           # this key equals the ones on `linux-build-lib` for re-use
           key: cargo-cache3-
       - name: Cache Rust dependencies
-        uses: actions/cache@v2
+        uses: actions/cache@v4
         with:
           path: /github/home/target
           # this key equals the ones on `linux-build-lib` for re-use
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index ca15d9c1edde..a8a63cd6c679 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -3043,7 +3043,8 @@ mod tests {
     fn test_cast_utf8_to_i32() {
         let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]);
         let array = Arc::new(a) as ArrayRef;
-        let b = cast(&array, &DataType::Int32).unwrap();
+        let b =
+            cast_with_options(&array, &DataType::Int32, &DEFAULT_CAST_OPTIONS).unwrap();
         let c = b.as_any().downcast_ref::<Int32Array>().unwrap();
         assert_eq!(5, c.value(0));
         assert_eq!(6, c.value(1));
@@ -3073,7 +3074,9 @@ mod tests {
         let strings = Arc::new(StringArray::from(vec![
             "true", "false", "invalid", " Y ", "",
         ])) as ArrayRef;
-        let casted = cast(&strings, &DataType::Boolean).unwrap();
+        let casted =
+            cast_with_options(&strings, &DataType::Boolean, &DEFAULT_CAST_OPTIONS)
+                .unwrap();
         let expected =
             BooleanArray::from(vec![Some(true), Some(false), None, Some(true), None]);
         assert_eq!(*as_boolean_array(&casted), expected);
@@ -3257,8 +3260,12 @@ mod tests {
             None,
         ])) as ArrayRef;
         for array in &[a1, a2] {
-            let b =
-                cast(array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap();
+            let b = cast_with_options(
+                array,
+                &DataType::Timestamp(TimeUnit::Nanosecond, None),
+                &DEFAULT_CAST_OPTIONS,
+            )
+            .unwrap();
             let c = b
                 .as_any()
                 .downcast_ref::<TimestampNanosecondArray>()
@@ -3284,7 +3291,8 @@ mod tests {
             None,
         ])) as ArrayRef;
         for array in &[a1, a2] {
-            let b = cast(array, &DataType::Date32).unwrap();
+            let b = cast_with_options(array, &DataType::Date32, &DEFAULT_CAST_OPTIONS)
+                .unwrap();
             let c = b.as_any().downcast_ref::<Date32Array>().unwrap();
             assert_eq!(17890, c.value(0));
             assert_eq!(17891, c.value(1));
@@ -3306,7 +3314,8 @@ mod tests {
             None,
         ])) as ArrayRef;
         for array in &[a1, a2] {
-            let b = cast(array, &DataType::Date64).unwrap();
+            let b = cast_with_options(array, &DataType::Date64, &DEFAULT_CAST_OPTIONS)
+                .unwrap();
             let c = b.as_any().downcast_ref::<Date64Array>().unwrap();
             assert_eq!(1599566400000, c.value(0));
             assert!(c.is_null(1));
@@ -5026,7 +5035,8 @@ mod tests {
             "2000", // just a year is invalid
         ]);
         let array = Arc::new(a) as ArrayRef;
-        let b = cast(&array, &DataType::Date32).unwrap();
+        let b =
+            cast_with_options(&array, &DataType::Date32, &DEFAULT_CAST_OPTIONS).unwrap();
         let c = b.as_any().downcast_ref::<Date32Array>().unwrap();
 
         // test valid inputs
@@ -5067,7 +5077,8 @@ mod tests {
             "2000-01-01", // just a date is invalid
         ]);
         let array = Arc::new(a) as ArrayRef;
-        let b = cast(&array, &DataType::Date64).unwrap();
+        let b =
+            cast_with_options(&array, &DataType::Date64, &DEFAULT_CAST_OPTIONS).unwrap();
         let c = b.as_any().downcast_ref::<Date64Array>().unwrap();
 
         // test valid inputs
@@ -5097,7 +5108,8 @@ mod tests {
         for array in get_arrays_of_all_types() {
             for to_type in &all_types {
                 println!("Test casting {:?} --> {:?}", array.data_type(), to_type);
-                let cast_result = cast(&array, to_type);
+                let cast_result =
+                    cast_with_options(&array, to_type, &DEFAULT_CAST_OPTIONS);
                 let reported_cast_ability = can_cast_types(array.data_type(), to_type);
 
                 // check for mismatch
diff --git a/arrow/src/compute/kernels/cast_utils.rs b/arrow/src/compute/kernels/cast_utils.rs
index e430bd56dcef..96c42982b7a3 100644
--- a/arrow/src/compute/kernels/cast_utils.rs
+++ b/arrow/src/compute/kernels/cast_utils.rs
@@ -93,6 +93,14 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64> {
         return Ok(ts.timestamp_nanos());
     }
 
+    // Try to split off a trailing timezone offset and parse it manually.
+    // Covers variants chrono's format specifiers don't accept on parse,
+    // such as `+HH`, `+HHMM`, and `+HH:MM:SS` (chrono's `%:z`/`%::z`/`%:::z`
+    // all only parse `+HH:MM`).
+    if let Some(ts) = parse_timestamp_with_manual_offset(s) {
+        return Ok(ts);
+    }
+
     // Support timestamps without an explicit timezone offset, again
     // to be compatible with what Apache Spark SQL does.
 
@@ -133,6 +141,96 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64> {
     )))
 }
 
+/// Parse a timezone offset suffix as a chrono `FixedOffset`.
+///
+/// Accepts: `Z`, `+HH`, `-HH`, `+HHMM`, `-HHMM`, `+HH:MM`, `-HH:MM`,
+/// `+HH:MM:SS`, `-HH:MM:SS`. Colon-separated fields go through
+/// `str::parse::<i32>`, so they need not be exactly two digits (`+5:30`
+/// is accepted). Returns `None` when the suffix does not parse or when
+/// hours/minutes/seconds fall outside `0..24`/`0..60`/`0..60`.
+fn parse_fixed_offset(s: &str) -> Option<FixedOffset> {
+    if s == "Z" {
+        return FixedOffset::east_opt(0);
+    }
+    let sign: i32 = match s.as_bytes().first()? {
+        b'+' => 1,
+        b'-' => -1,
+        _ => return None,
+    };
+    let body = &s[1..];
+    let (h, m, sec) = if body.contains(':') {
+        let mut parts = body.split(':');
+        let h = parts.next()?.parse::<i32>().ok()?;
+        let m = parts.next()?.parse::<i32>().ok()?;
+        let sec = match parts.next() {
+            Some(p) => p.parse::<i32>().ok()?,
+            None => 0,
+        };
+        if parts.next().is_some() {
+            return None;
+        }
+        (h, m, sec)
+    } else {
+        match body.len() {
+            2 => (body.parse::<i32>().ok()?, 0, 0),
+            // `get` instead of indexing: a multi-byte character straddling
+            // byte 2 must yield `None`, not a char-boundary panic.
+            4 => (
+                body.get(..2)?.parse::<i32>().ok()?,
+                body.get(2..)?.parse::<i32>().ok()?,
+                0,
+            ),
+            _ => return None,
+        }
+    };
+    if !(0..24).contains(&h) || !(0..60).contains(&m) || !(0..60).contains(&sec) {
+        return None;
+    }
+    FixedOffset::east_opt(sign * (h * 3600 + m * 60 + sec))
+}
+
+/// Find a trailing timezone offset (if any) and split the string into
+/// `(datetime_part, FixedOffset)`. The offset starts at the last `+`, `-`,
+/// or `Z` that appears after the date portion (index >= 11). Returns
+/// `None` if no plausible offset suffix is found or it fails to parse.
+fn split_timestamp_offset(s: &str) -> Option<(&str, FixedOffset)> {
+    if let Some(stripped) = s.strip_suffix('Z') {
+        return Some((stripped, FixedOffset::east_opt(0)?));
+    }
+    let bytes = s.as_bytes();
+    // Date prefix is `YYYY-MM-DD` (10 chars) + separator, so an offset
+    // sign must appear at index >= 11 to not collide with date `-`.
+    for i in (11..bytes.len()).rev() {
+        match bytes[i] {
+            b'+' | b'-' => {
+                let offset = parse_fixed_offset(&s[i..])?;
+                return Some((&s[..i], offset));
+            }
+            _ => {}
+        }
+    }
+    None
+}
+
+/// Parse `s` as a naive datetime followed by a timezone offset recognised by
+/// `split_timestamp_offset`, returning nanoseconds since the Unix epoch.
+/// Returns `None` when no offset is found or no datetime format matches.
+fn parse_timestamp_with_manual_offset(s: &str) -> Option<i64> {
+    let (datetime_part, offset) = split_timestamp_offset(s)?;
+    for fmt in &[
+        "%Y-%m-%dT%H:%M:%S%.f",
+        "%Y-%m-%dT%H:%M:%S",
+        "%Y-%m-%d %H:%M:%S%.f",
+        "%Y-%m-%d %H:%M:%S",
+    ] {
+        if let Ok(naive) = NaiveDateTime::parse_from_str(datetime_part, fmt) {
+            let dt = offset.from_local_datetime(&naive).single()?;
+            return Some(dt.timestamp_nanos());
+        }
+    }
+    None
+}
+
 /// Converts the naive datetime (which has no specific timezone) to a
 /// nanosecond epoch timestamp relative to UTC.
 fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result<i64> {
@@ -407,6 +505,49 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn string_to_timestamp_timezone_offset_variants() -> Result<()> {
+        const BASE: i64 = 1_599_572_549_000_000_000; // 2020-09-08T13:42:29Z
+
+        // `+HH` (postgres-style short offset)
+        assert_eq!(BASE, parse_timestamp("2020-09-08T13:42:29+00")?);
+        assert_eq!(BASE, parse_timestamp("2020-09-08 13:42:29+00")?);
+        assert_eq!(
+            BASE + 5 * 3600 * 1_000_000_000,
+            parse_timestamp("2020-09-08 13:42:29-05")?
+        );
+
+        // `+HH:MM:SS` (postgres-style extended offset)
+        assert_eq!(BASE, parse_timestamp("2020-09-08T13:42:29+00:00:00")?);
+        assert_eq!(BASE, parse_timestamp("2020-09-08 13:42:29+00:00:00")?);
+        assert_eq!(
+            BASE + (5 * 3600 + 30 * 60) * 1_000_000_000,
+            parse_timestamp("2020-09-08 13:42:29-05:30:00")?
+        );
+
+        // `+HHMM` (compact form)
+        assert_eq!(BASE, parse_timestamp("2020-09-08 13:42:29+0000")?);
+        assert_eq!(
+            BASE + (5 * 3600 + 30 * 60) * 1_000_000_000,
+            parse_timestamp("2020-09-08 13:42:29-0530")?
+        );
+
+        // With fractional seconds.
+        assert_eq!(
+            BASE + 190_855_000,
+            parse_timestamp("2020-09-08 13:42:29.190855+00")?
+        );
+        assert_eq!(
+            BASE + 190_855_000,
+            parse_timestamp("2020-09-08 13:42:29.190855+00:00:00")?
+        );
+
+        // A malformed, non-ASCII offset is an error rather than a panic.
+        assert!(parse_timestamp("2020-09-08 13:42:29+1\u{e9}2").is_err());
+
+        Ok(())
+    }
+
     #[test]
     fn string_to_timestamp_timezone_space() -> Result<()> {
         // Ensure space rather than T between time and date is accepted