Skip to content

Commit 7c6e3ab

Browse files
rampage644 and claude committed
fix(arrow/transform): accept TIMESTAMP_TZ for day/hour/month/year transforms
`transform_arrow()` only matched `DataType::Timestamp(TimeUnit::Microsecond, None)` for the day/hour/month/year arms, so any `timestamptz` column fell through to the catchall and raised `Compute error: Failed to perform transform for datatype`. Embucket's MERGE write path on `events_hooli` — whose `collector_tstamp` is `TIMESTAMP_TZ` partitioned by `day(collector_tstamp)` — tripped this every time.

Iceberg's day/hour/month/year transforms are defined on the absolute instant (microseconds since the Unix epoch), so the Arrow timezone metadata is irrelevant to the numeric result. Widen each arm to `Timestamp(Microsecond, _)`. For month and year the existing `date_part` call used a named-timezone path that requires `chrono-tz`; cast to `Timestamp(Microsecond, None)` first so we run on a naive variant that works without that feature flag.

Adds 4 regression tests exercising all four transforms with a `TimestampMicrosecondArray::with_timezone("UTC")` input to lock the fix in.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 841aaca commit 7c6e3ab

1 file changed

Lines changed: 90 additions & 7 deletions

File tree

iceberg-rust/src/arrow/transform.rs

Lines changed: 90 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -72,25 +72,36 @@ pub fn transform_arrow(array: ArrayRef, transform: &Transform) -> Result<ArrayRe
7272
)?),
7373
datepart_to_years,
7474
))),
75-
(DataType::Timestamp(TimeUnit::Microsecond, None), Transform::Hour) => {
75+
// `_` for the timezone parameter so both `timestamp` (None) and
76+
// `timestamptz` (Some(tz)) match. Iceberg partition transforms are
77+
// defined on the absolute instant (microseconds since Unix epoch in
78+
// UTC), so the attached tz metadata is irrelevant to the numeric
79+
// result — we just need to read the underlying i64.
80+
(DataType::Timestamp(TimeUnit::Microsecond, _), Transform::Hour) => {
7681
Ok(Arc::new(unary::<_, _, Int32Type>(
7782
as_primitive_array::<Int64Type>(&cast(&array, &DataType::Int64)?),
7883
micros_to_hours,
7984
)) as Arc<dyn Array>)
8085
}
81-
(DataType::Timestamp(TimeUnit::Microsecond, None), Transform::Day) => {
86+
(DataType::Timestamp(TimeUnit::Microsecond, _), Transform::Day) => {
8287
Ok(Arc::new(unary::<_, _, Int32Type>(
8388
as_primitive_array::<Int64Type>(&cast(&array, &DataType::Int64)?),
8489
micros_to_days,
8590
)) as Arc<dyn Array>)
8691
}
87-
(DataType::Timestamp(TimeUnit::Microsecond, None), Transform::Month) => {
92+
(DataType::Timestamp(TimeUnit::Microsecond, _), Transform::Month) => {
93+
// date_part requires chrono-tz for named timezones like "UTC".
94+
// Iceberg computes month/year over the absolute instant, so the
95+
// tz metadata only affects display, not the result. Strip the tz
96+
// by casting to Timestamp(Microsecond, None) first so date_part
97+
// runs on a plain value.
98+
let naive = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None))?;
8899
let year = date_part(
89-
as_primitive_array::<TimestampMicrosecondType>(&array),
100+
as_primitive_array::<TimestampMicrosecondType>(&naive),
90101
DatePart::Year,
91102
)?;
92103
let month = date_part(
93-
as_primitive_array::<TimestampMicrosecondType>(&array),
104+
as_primitive_array::<TimestampMicrosecondType>(&naive),
94105
DatePart::Month,
95106
)?;
96107
Ok(Arc::new(binary::<_, _, _, Int32Type>(
@@ -99,10 +110,12 @@ pub fn transform_arrow(array: ArrayRef, transform: &Transform) -> Result<ArrayRe
99110
datepart_to_months,
100111
)?))
101112
}
102-
(DataType::Timestamp(TimeUnit::Microsecond, None), Transform::Year) => {
113+
(DataType::Timestamp(TimeUnit::Microsecond, _), Transform::Year) => {
114+
// Same tz-stripping rationale as Month above.
115+
let naive = cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None))?;
103116
Ok(Arc::new(unary::<_, _, Int32Type>(
104117
as_primitive_array::<Int32Type>(&date_part(
105-
as_primitive_array::<TimestampMicrosecondType>(&array),
118+
as_primitive_array::<TimestampMicrosecondType>(&naive),
106119
DatePart::Year,
107120
)?),
108121
datepart_to_years,
@@ -520,4 +533,74 @@ mod tests {
520533
"Compute error: Failed to perform transform for datatype"
521534
);
522535
}
536+
537+
/// Returns the same three representative microsecond values as
538+
/// `create_timestamp_micro_array()` but wrapped in a `TimestampMicrosecondArray`
539+
/// with a `"UTC"` timezone attached. This matches how Arrow encodes
540+
/// Iceberg's `timestamptz` type, which is what Embucket hands to the
541+
/// partition transforms for tables like `events_hooli` whose
542+
/// `collector_tstamp` is `TIMESTAMP_TZ`.
543+
fn create_timestamp_micro_tz_array() -> ArrayRef {
544+
Arc::new(
545+
TimestampMicrosecondArray::from(vec![
546+
Some(1682937000000000),
547+
Some(1686840330000000),
548+
Some(1704067200000000),
549+
None,
550+
])
551+
.with_timezone("UTC"),
552+
) as ArrayRef
553+
}
554+
555+
#[test]
556+
fn test_timestamp_tz_day_transform() {
557+
let array = create_timestamp_micro_tz_array();
558+
let result = transform_arrow(array, &Transform::Day).unwrap();
559+
let expected = Arc::new(arrow::array::Int32Array::from(vec![
560+
Some(19478),
561+
Some(19523),
562+
Some(19723),
563+
None,
564+
])) as ArrayRef;
565+
assert_eq!(&expected, &result);
566+
}
567+
568+
#[test]
569+
fn test_timestamp_tz_hour_transform() {
570+
let array = create_timestamp_micro_tz_array();
571+
let result = transform_arrow(array, &Transform::Hour).unwrap();
572+
let expected = Arc::new(arrow::array::Int32Array::from(vec![
573+
Some(467482),
574+
Some(468566),
575+
Some(473352),
576+
None,
577+
])) as ArrayRef;
578+
assert_eq!(&expected, &result);
579+
}
580+
581+
#[test]
582+
fn test_timestamp_tz_month_transform() {
583+
let array = create_timestamp_micro_tz_array();
584+
let result = transform_arrow(array, &Transform::Month).unwrap();
585+
let expected = Arc::new(arrow::array::Int32Array::from(vec![
586+
Some(641),
587+
Some(642),
588+
Some(649),
589+
None,
590+
])) as ArrayRef;
591+
assert_eq!(&expected, &result);
592+
}
593+
594+
#[test]
595+
fn test_timestamp_tz_year_transform() {
596+
let array = create_timestamp_micro_tz_array();
597+
let result = transform_arrow(array, &Transform::Year).unwrap();
598+
let expected = Arc::new(arrow::array::Int32Array::from(vec![
599+
Some(53),
600+
Some(53),
601+
Some(54),
602+
None,
603+
])) as ArrayRef;
604+
assert_eq!(&expected, &result);
605+
}
523606
}

0 commit comments

Comments
 (0)