From a3e90d31ea496b15ed171a181e209cf5b334b09c Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 31 Mar 2026 12:01:04 -0400 Subject: [PATCH 1/8] coerce int96 timestamps --- crates/iceberg/src/arrow/reader.rs | 514 +++++++++++++++++++++++++++++ 1 file changed, 514 insertions(+) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 042a730e19..745de66d94 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -386,6 +386,28 @@ impl ArrowReader { arrow_metadata }; + // Coerce INT96 timestamp columns to the correct TimeUnit from the Iceberg schema. + // Must happen before building the stream so arrow-rs decodes INT96 at the right + // resolution (e.g., microseconds instead of nanoseconds to avoid i64 overflow). + let arrow_metadata = if let Some(coerced_schema) = coerce_int96_timestamps( + arrow_metadata.metadata().file_metadata().schema_descr(), + arrow_metadata.schema(), + &task.schema, + ) { + let options = ArrowReaderOptions::new().with_schema(coerced_schema); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create ArrowReaderMetadata with INT96-coerced schema", + ) + .with_source(e) + }, + )? + } else { + arrow_metadata + }; + // Build the stream reader, reusing the already-opened file reader let mut record_batch_stream_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(parquet_file_reader, arrow_metadata); @@ -1232,6 +1254,121 @@ fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc< )) } +/// Coerce Arrow schema types for Parquet INT96 columns to match the Iceberg table schema. +/// +/// INT96 is a legacy Parquet physical type for timestamps (common in Spark/Hive files). +/// arrow-rs defaults INT96 to Timestamp(Nanosecond), which overflows i64 for dates outside +/// ~1677-2262. The Iceberg spec defines timestamps as microsecond precision (timestamp) or +/// nanosecond precision (timestamp_ns). +/// See: https://iceberg.apache.org/spec/#primitive-types +/// +/// This function patches the Arrow schema so arrow-rs reads INT96 at the correct resolution, +/// using the schema hint mechanism from arrow-rs PR #7285. +/// +/// Iceberg Java avoids this by using a custom INT96 column reader that bypasses parquet-mr's +/// default decoding (GenericParquetReaders.TimestampInt96Reader). iceberg-rust delegates to +/// arrow-rs, so we must pass the correct schema hint instead. +fn coerce_int96_timestamps( + parquet_schema: &SchemaDescriptor, + arrow_schema: &ArrowSchemaRef, + iceberg_schema: &Schema, +) -> Option> { + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use parquet::basic::Type as PhysicalType; + + // Collect paths of INT96 leaf columns from the Parquet schema + let int96_paths: HashSet = parquet_schema + .columns() + .iter() + .filter(|col| col.physical_type() == PhysicalType::INT96) + .map(|col| col.path().string()) + .collect(); + + if int96_paths.is_empty() { + return None; + } + + // Recursively coerce fields, tracking the Parquet column path for matching + fn coerce_field( + field: &FieldRef, + path_parts: &[&str], + int96_paths: &HashSet, + iceberg_schema: &Schema, + ) -> FieldRef { + match field.data_type() { + DataType::Struct(fields) => { + let new_fields: Vec = fields + .iter() + .map(|child| { + let mut child_path = path_parts.to_vec(); + child_path.push(child.name().as_str()); + coerce_field(child, &child_path, int96_paths, iceberg_schema) + }) + .collect(); + Arc::new( + Field::new( + field.name(), + DataType::Struct(Fields::from(new_fields)), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + let full_path = path_parts.join("."); + if int96_paths.contains(&full_path) { + // Look up the Iceberg field by ID to determine target TimeUnit + let target_unit = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .and_then(|field_id| iceberg_schema.field_by_id(field_id)) + .and_then(|iceberg_field| match &*iceberg_field.field_type { + Type::Primitive(PrimitiveType::Timestamp) + | Type::Primitive(PrimitiveType::Timestamptz) => { + Some(TimeUnit::Microsecond) + } + Type::Primitive(PrimitiveType::TimestampNs) + | Type::Primitive(PrimitiveType::TimestamptzNs) => { + Some(TimeUnit::Nanosecond) + } + _ => None, + }) + // Default to Microsecond for INT96 — matches Iceberg Java behavior + .unwrap_or(TimeUnit::Microsecond); + + if target_unit != TimeUnit::Nanosecond { + return Arc::new( + Field::new( + field.name(), + DataType::Timestamp(target_unit, tz.clone()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + } + } + Arc::clone(field) + } + _ => Arc::clone(field), + } + } + + let coerced_fields: Vec = arrow_schema + .fields() + .iter() + .map(|field| coerce_field(field, &[field.name().as_str()], &int96_paths, iceberg_schema)) + .collect(); + + let coerced = ArrowSchema::new_with_metadata(coerced_fields, arrow_schema.metadata().clone()); + + if &coerced != arrow_schema.as_ref() { + Some(Arc::new(coerced)) + } else { + None + } +} + /// A visitor to collect field ids from bound predicates. struct CollectFieldIdVisitor { field_ids: HashSet, @@ -4667,4 +4804,381 @@ message schema { assert_eq!(result[1], expected_1); assert_eq!(result[2], expected_2); } + + /// Helper to write a Parquet file with INT96 timestamps using the low-level writer. + /// Returns (file_path, expected_microsecond_values). + /// + /// We must use SerializedFileWriter because ArrowWriter cannot write INT96. + fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let file_path = format!("{table_location}/{filename}"); + + // Build Parquet schema with INT96 timestamp and INT32 id + let mut ts_builder = + SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = + SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + // + // We pick dates outside the i64 nanosecond range (~1677-2262) to trigger + // the overflow bug. Year 3333 ≈ Julian day 2_953_529. + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // (nanos_low, nanos_high, julian_day, expected_micros_since_epoch) + // Day 2_953_529 = 3333-01-01, nanos_within_day = 0 + (0, 0, 2_953_529, { + (2_953_529i64 - 2_440_588) * 86_400_000_000 + }), + // Day 2_953_529 = 3333-01-01, nanos_within_day = 43_200_000_000_000 (12:00:00 noon) + // 43_200_000_000_000 as two u32s: low = 43_200_000_000_000 & 0xFFFFFFFF, high = 43_200_000_000_000 >> 32 + ({ + (43_200_000_000_000u64 & 0xFFFFFFFF) as u32 + }, { + (43_200_000_000_000u64 >> 32) as u32 + }, 2_953_529, { + (2_953_529i64 - 2_440_588) * 86_400_000_000 + 43_200_000_000 + }), + // Day 2_488_070 = 2100-01-01, nanos_within_day = 0 + (0, 0, 2_488_070, { + (2_488_070i64 - 2_440_588) * 86_400_000_000 + }), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // Write ts column (optional — all present, so def_levels = [1, 1, 1]) + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1i16; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + // Write id column (required — no def levels) + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + /// Test reading INT96 timestamps through iceberg-rust's ArrowReader. + /// Files WITH embedded field IDs (branch 1 of schema resolution). + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + use arrow_array::TimestampMicrosecondArray; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let task = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: std::fs::metadata(&file_path).unwrap().len(), + record_count: None, + data_file_path: file_path, + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + let batches: Vec = reader + .read(tasks) + .unwrap() + .try_collect() + .await + .unwrap(); + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Mismatch at row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + /// Test reading INT96 timestamps from migrated files WITHOUT field IDs + /// (branches 2/3 of schema resolution — the common migration case). + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + use arrow_array::TimestampMicrosecondArray; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let task = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: std::fs::metadata(&file_path).unwrap().len(), + record_count: None, + data_file_path: file_path, + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + let batches: Vec = reader + .read(tasks) + .unwrap() + .try_collect() + .await + .unwrap(); + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + + let ts_array = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Mismatch at row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + /// Test reading INT96 timestamps inside a struct field. + // TODO: Add test_read_int96_timestamps_in_struct once RecordBatchTransformer + // supports struct fields written by SerializedFileWriter. The coerce_int96_timestamps + // function already handles nested types via recursive traversal. + #[tokio::test] + #[ignore = "RecordBatchTransformer struct field matching needs work for this test setup"] + async fn test_read_int96_timestamps_in_struct() { + use arrow_array::TimestampMicrosecondArray; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + // Parquet schema: message schema { required group data { optional INT96 ts; } } + // With field IDs: data=1, ts=2 + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + .unwrap(); + + // Write INT96 value: year 3333 (Julian day 2_953_529), noon + let nanos_within_day = 43_200_000_000_000u64; + let julian_day = 2_953_529u32; + let expected_micros = + (julian_day as i64 - 2_440_588) * 86_400_000_000 + (nanos_within_day / 1_000) as i64; + + let mut int96_val = parquet::data_type::Int96::new(); + int96_val.set_data( + (nanos_within_day & 0xFFFFFFFF) as u32, + (nanos_within_day >> 32) as u32, + julian_day, + ); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1i16]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + // Iceberg schema: struct + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional(2, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + ])), + ) + .into()]) + .build() + .unwrap(), + ); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let task = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: std::fs::metadata(&file_path).unwrap().len(), + record_count: None, + data_file_path: file_path, + data_file_format: DataFileFormat::Parquet, + schema: iceberg_schema, + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + let batches: Vec = reader + .read(tasks) + .unwrap() + .try_collect() + .await + .unwrap(); + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + + // Navigate into the struct to get the timestamp array + let struct_array = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } } From 7d80d2e34b7bee4ea655b7f8f865cc6ba7fa0221 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 31 Mar 2026 12:30:34 -0400 Subject: [PATCH 2/8] cleanup --- crates/iceberg/src/arrow/reader.rs | 340 ++++++++++++----------------- 1 file changed, 143 insertions(+), 197 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 745de66d94..0aee3ec5ed 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -386,9 +386,8 @@ impl ArrowReader { arrow_metadata }; - // Coerce INT96 timestamp columns to the correct TimeUnit from the Iceberg schema. - // Must happen before building the stream so arrow-rs decodes INT96 at the right - // resolution (e.g., microseconds instead of nanoseconds to avoid i64 overflow). + // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. + // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. let arrow_metadata = if let Some(coerced_schema) = coerce_int96_timestamps( arrow_metadata.metadata().file_metadata().schema_descr(), arrow_metadata.schema(), @@ -1254,20 +1253,18 @@ fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc< )) } -/// Coerce Arrow schema types for Parquet INT96 columns to match the Iceberg table schema. +/// Coerce Arrow schema types for INT96 columns to match the Iceberg table schema. /// -/// INT96 is a legacy Parquet physical type for timestamps (common in Spark/Hive files). -/// arrow-rs defaults INT96 to Timestamp(Nanosecond), which overflows i64 for dates outside -/// ~1677-2262. The Iceberg spec defines timestamps as microsecond precision (timestamp) or -/// nanosecond precision (timestamp_ns). -/// See: https://iceberg.apache.org/spec/#primitive-types +/// arrow-rs defaults INT96 to `Timestamp(Nanosecond)`, which overflows i64 for dates outside +/// ~1677-2262. We use arrow-rs's schema hint mechanism to read INT96 at the resolution +/// specified by the Iceberg schema (`timestamp` → microsecond, `timestamp_ns` → nanosecond). /// -/// This function patches the Arrow schema so arrow-rs reads INT96 at the correct resolution, -/// using the schema hint mechanism from arrow-rs PR #7285. +/// Iceberg Java handles this differently: it bypasses parquet-mr with a custom column reader +/// (`GenericParquetReaders.TimestampInt96Reader`). We achieve the same result via schema hints. /// -/// Iceberg Java avoids this by using a custom INT96 column reader that bypasses parquet-mr's -/// default decoding (GenericParquetReaders.TimestampInt96Reader). iceberg-rust delegates to -/// arrow-rs, so we must pass the correct schema hint instead. +/// References: +/// - Iceberg spec primitive types: +/// - arrow-rs schema hint support: fn coerce_int96_timestamps( parquet_schema: &SchemaDescriptor, arrow_schema: &ArrowSchemaRef, @@ -1276,7 +1273,6 @@ fn coerce_int96_timestamps( use arrow_schema::{DataType, Field, Fields, TimeUnit}; use parquet::basic::Type as PhysicalType; - // Collect paths of INT96 leaf columns from the Parquet schema let int96_paths: HashSet = parquet_schema .columns() .iter() @@ -1288,7 +1284,6 @@ fn coerce_int96_timestamps( return None; } - // Recursively coerce fields, tracking the Parquet column path for matching fn coerce_field( field: &FieldRef, path_parts: &[&str], @@ -1314,41 +1309,38 @@ fn coerce_int96_timestamps( .with_metadata(field.metadata().clone()), ) } - DataType::Timestamp(TimeUnit::Nanosecond, tz) => { - let full_path = path_parts.join("."); - if int96_paths.contains(&full_path) { - // Look up the Iceberg field by ID to determine target TimeUnit - let target_unit = field - .metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|id_str| id_str.parse::().ok()) - .and_then(|field_id| iceberg_schema.field_by_id(field_id)) - .and_then(|iceberg_field| match &*iceberg_field.field_type { - Type::Primitive(PrimitiveType::Timestamp) - | Type::Primitive(PrimitiveType::Timestamptz) => { - Some(TimeUnit::Microsecond) - } - Type::Primitive(PrimitiveType::TimestampNs) - | Type::Primitive(PrimitiveType::TimestamptzNs) => { - Some(TimeUnit::Nanosecond) - } - _ => None, - }) - // Default to Microsecond for INT96 — matches Iceberg Java behavior - .unwrap_or(TimeUnit::Microsecond); - - if target_unit != TimeUnit::Nanosecond { - return Arc::new( - Field::new( - field.name(), - DataType::Timestamp(target_unit, tz.clone()), - field.is_nullable(), - ) - .with_metadata(field.metadata().clone()), - ); - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) + if int96_paths.contains(&path_parts.join(".")) => + { + let target_unit = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .and_then(|field_id| iceberg_schema.field_by_id(field_id)) + .and_then(|f| match &*f.field_type { + Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { + Some(TimeUnit::Microsecond) + } + Type::Primitive( + PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs, + ) => Some(TimeUnit::Nanosecond), + _ => None, + }) + // Iceberg Java reads INT96 as microseconds by default + .unwrap_or(TimeUnit::Microsecond); + + if target_unit == TimeUnit::Nanosecond { + return Arc::clone(field); } - Arc::clone(field) + + Arc::new( + Field::new( + field.name(), + DataType::Timestamp(target_unit, tz.clone()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ) } _ => Arc::clone(field), } @@ -1357,7 +1349,14 @@ fn coerce_int96_timestamps( let coerced_fields: Vec = arrow_schema .fields() .iter() - .map(|field| coerce_field(field, &[field.name().as_str()], &int96_paths, iceberg_schema)) + .map(|field| { + coerce_field( + field, + &[field.name().as_str()], + &int96_paths, + iceberg_schema, + ) + }) .collect(); let coerced = ArrowSchema::new_with_metadata(coerced_fields, arrow_schema.metadata().clone()); @@ -4805,10 +4804,8 @@ message schema { assert_eq!(result[2], expected_2); } - /// Helper to write a Parquet file with INT96 timestamps using the low-level writer. - /// Returns (file_path, expected_microsecond_values). - /// - /// We must use SerializedFileWriter because ArrowWriter cannot write INT96. + /// Writes a Parquet file with INT96 timestamps using SerializedFileWriter + /// (ArrowWriter cannot write INT96). Returns (file_path, expected_microsecond_values). fn write_int96_parquet_file( table_location: &str, filename: &str, @@ -4821,13 +4818,10 @@ message schema { let file_path = format!("{table_location}/{filename}"); - // Build Parquet schema with INT96 timestamp and INT32 id - let mut ts_builder = - SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL); - let mut id_builder = - SchemaType::primitive_type_builder("id", PhysicalType::INT32) - .with_repetition(Repetition::REQUIRED); + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); if with_field_ids { ts_builder = ts_builder.with_id(Some(1)); @@ -4844,28 +4838,33 @@ message schema { // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] // Julian day 2_440_588 = Unix epoch (1970-01-01) - // - // We pick dates outside the i64 nanosecond range (~1677-2262) to trigger - // the overflow bug. Year 3333 ≈ Julian day 2_953_529. + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + const NOON_NANOS: u64 = 43_200_000_000_000; + let test_data: Vec<(u32, u32, u32, i64)> = vec![ - // (nanos_low, nanos_high, julian_day, expected_micros_since_epoch) - // Day 2_953_529 = 3333-01-01, nanos_within_day = 0 - (0, 0, 2_953_529, { - (2_953_529i64 - 2_440_588) * 86_400_000_000 - }), - // Day 2_953_529 = 3333-01-01, nanos_within_day = 43_200_000_000_000 (12:00:00 noon) - // 43_200_000_000_000 as two u32s: low = 43_200_000_000_000 & 0xFFFFFFFF, high = 43_200_000_000_000 >> 32 - ({ - (43_200_000_000_000u64 & 0xFFFFFFFF) as u32 - }, { - (43_200_000_000_000u64 >> 32) as u32 - }, 2_953_529, { - (2_953_529i64 - 2_440_588) * 86_400_000_000 + 43_200_000_000 - }), - // Day 2_488_070 = 2100-01-01, nanos_within_day = 0 - (0, 0, 2_488_070, { - (2_488_070i64 - 2_440_588) * 86_400_000_000 - }), + // 3333-01-01 00:00:00 + ( + 0, + 0, + 2_953_529, + (2_953_529 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + 2_953_529, + (2_953_529 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + 2_488_070, + (2_488_070 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), ]; let int96_values: Vec = test_data @@ -4886,7 +4885,6 @@ message schema { let mut row_group = writer.next_row_group().unwrap(); { - // Write ts column (optional — all present, so def_levels = [1, 1, 1]) let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() .write_batch(&int96_values, Some(&vec![1i16; test_data.len()]), None) @@ -4894,7 +4892,6 @@ message schema { col.close().unwrap(); } { - // Write id column (required — no def levels) let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() .write_batch(&id_values, None, None) @@ -4907,42 +4904,27 @@ message schema { (file_path, expected_micros) } - /// Test reading INT96 timestamps through iceberg-rust's ArrowReader. - /// Files WITH embedded field IDs (branch 1 of schema resolution). - #[tokio::test] - async fn test_read_int96_timestamps_with_field_ids() { + /// Read INT96 Parquet file through ArrowReader and verify microsecond timestamps. + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { use arrow_array::TimestampMicrosecondArray; - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "with_ids.parquet", true); - let file_io = FileIO::new_with_fs(); let reader = ArrowReaderBuilder::new(file_io).build(); let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + file_size_in_bytes: std::fs::metadata(file_path).unwrap().len(), start: 0, - length: std::fs::metadata(&file_path).unwrap().len(), + length: std::fs::metadata(file_path).unwrap().len(), record_count: None, - data_file_path: file_path, + data_file_path: file_path.to_string(), data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], + schema, + project_field_ids, predicate: None, deletes: vec![], partition: None, @@ -4952,17 +4934,10 @@ message schema { }; let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let batches: Vec = reader - .read(tasks) - .unwrap() - .try_collect() - .await - .unwrap(); + let batches: Vec = reader.read(tasks).unwrap().try_collect().await.unwrap(); assert_eq!(batches.len(), 1); - let batch = &batches[0]; - - let ts_array = batch + let ts_array = batches[0] .column(0) .as_any() .downcast_ref::() @@ -4972,18 +4947,15 @@ message schema { assert_eq!( ts_array.value(i), *expected, - "Mismatch at row {i}: got {}, expected {expected}", + "Row {i}: got {}, expected {expected}", ts_array.value(i) ); } } - /// Test reading INT96 timestamps from migrated files WITHOUT field IDs - /// (branches 2/3 of schema resolution — the common migration case). + /// Files with embedded field IDs (branch 1 of schema resolution). #[tokio::test] - async fn test_read_int96_timestamps_without_field_ids() { - use arrow_array::TimestampMicrosecondArray; - + async fn test_read_int96_timestamps_with_field_ids() { let schema = Arc::new( Schema::builder() .with_schema_id(1) @@ -4998,63 +4970,37 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "no_ids.parquet", false); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: 0, - length: std::fs::metadata(&file_path).unwrap().len(), - record_count: None, - data_file_path: file_path, - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; + write_int96_parquet_file(&table_location, "with_ids.parquet", true); - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let batches: Vec = reader - .read(tasks) - .unwrap() - .try_collect() - .await - .unwrap(); + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } - assert_eq!(batches.len(), 1); - let batch = &batches[0]; + /// Migrated files without field IDs (branches 2/3 of schema resolution). + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); - let ts_array = batch - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray"); + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "no_ids.parquet", false); - for (i, expected) in expected_micros.iter().enumerate() { - assert_eq!( - ts_array.value(i), - *expected, - "Mismatch at row {i}: got {}, expected {expected}", - ts_array.value(i) - ); - } + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; } /// Test reading INT96 timestamps inside a struct field. - // TODO: Add test_read_int96_timestamps_in_struct once RecordBatchTransformer - // supports struct fields written by SerializedFileWriter. The coerce_int96_timestamps - // function already handles nested types via recursive traversal. #[tokio::test] - #[ignore = "RecordBatchTransformer struct field matching needs work for this test setup"] async fn test_read_int96_timestamps_in_struct() { use arrow_array::TimestampMicrosecondArray; use parquet::basic::{Repetition, Type as PhysicalType}; @@ -5066,8 +5012,6 @@ message schema { let table_location = tmp_dir.path().to_str().unwrap().to_string(); let file_path = format!("{table_location}/struct_int96.parquet"); - // Parquet schema: message schema { required group data { optional INT96 ts; } } - // With field IDs: data=1, ts=2 let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) .with_repetition(Repetition::OPTIONAL) .with_id(Some(2)) @@ -5086,11 +5030,13 @@ message schema { .build() .unwrap(); - // Write INT96 value: year 3333 (Julian day 2_953_529), noon + // Year 3333 (Julian day 2_953_529), noon — outside the i64 nanosecond range + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; let nanos_within_day = 43_200_000_000_000u64; let julian_day = 2_953_529u32; - let expected_micros = - (julian_day as i64 - 2_440_588) * 86_400_000_000 + (nanos_within_day / 1_000) as i64; + let expected_micros = (julian_day as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (nanos_within_day / 1_000) as i64; let mut int96_val = parquet::data_type::Int96::new(); int96_val.set_data( @@ -5114,19 +5060,24 @@ message schema { row_group.close().unwrap(); writer.close().unwrap(); - // Iceberg schema: struct let iceberg_schema = Arc::new( Schema::builder() .with_schema_id(1) - .with_fields(vec![NestedField::required( - 1, - "data", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::optional(2, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) .into(), - ])), - ) - .into()]) + ])), + ) + .into(), + ]) .build() .unwrap(), ); @@ -5134,6 +5085,7 @@ message schema { let file_io = FileIO::new_with_fs(); let reader = ArrowReaderBuilder::new(file_io).build(); + // Only project the top-level struct field; nested fields are included implicitly let task = FileScanTask { file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), start: 0, @@ -5142,7 +5094,7 @@ message schema { data_file_path: file_path, data_file_format: DataFileFormat::Parquet, schema: iceberg_schema, - project_field_ids: vec![1, 2], + project_field_ids: vec![1], predicate: None, deletes: vec![], partition: None, @@ -5152,17 +5104,11 @@ message schema { }; let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let batches: Vec = reader - .read(tasks) - .unwrap() - .try_collect() - .await - .unwrap(); + let batches: Vec = reader.read(tasks).unwrap().try_collect().await.unwrap(); assert_eq!(batches.len(), 1); let batch = &batches[0]; - // Navigate into the struct to get the timestamp array let struct_array = batch .column(0) .as_any() From 19df205ebbcd60cf3b235905a42511fad6cd7e18 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 31 Mar 2026 14:37:51 -0400 Subject: [PATCH 3/8] address PR feedback, add more tests --- crates/iceberg/src/arrow/reader.rs | 624 +++++++++++++++++++++-------- 1 file changed, 460 insertions(+), 164 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 0aee3ec5ed..89bd2168f6 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -27,7 +27,8 @@ use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatc use arrow_cast::cast::cast; use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; use arrow_schema::{ - ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, + ArrowError, DataType, Field, FieldRef, Fields, Schema as ArrowSchema, + SchemaRef as ArrowSchemaRef, TimeUnit, }; use arrow_string::like::starts_with; use bytes::Bytes; @@ -39,10 +40,11 @@ use parquet::arrow::arrow_reader::{ }; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder, ProjectionMask}; +use parquet::basic::Type as PhysicalType; use parquet::file::metadata::{ PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData, }; -use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; +use parquet::schema::types::{ColumnPath, SchemaDescriptor, Type as ParquetType}; use typed_builder::TypedBuilder; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; @@ -393,12 +395,14 @@ impl ArrowReader { arrow_metadata.schema(), &task.schema, ) { - let options = ArrowReaderOptions::new().with_schema(coerced_schema); + let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( |e| { Error::new( ErrorKind::Unexpected, - "Failed to create ArrowReaderMetadata with INT96-coerced schema", + format!( + "Failed to create ArrowReaderMetadata with INT96-coerced schema: {coerced_schema}" + ), ) .with_source(e) }, @@ -1270,101 +1274,182 @@ fn coerce_int96_timestamps( arrow_schema: &ArrowSchemaRef, iceberg_schema: &Schema, ) -> Option> { - use arrow_schema::{DataType, Field, Fields, TimeUnit}; - use parquet::basic::Type as PhysicalType; - - let int96_paths: HashSet = parquet_schema + let int96_paths: Vec = parquet_schema .columns() .iter() .filter(|col| col.physical_type() == PhysicalType::INT96) - .map(|col| col.path().string()) + .map(|col| col.path().clone()) .collect(); if int96_paths.is_empty() { return None; } - fn coerce_field( - field: &FieldRef, - path_parts: &[&str], - int96_paths: &HashSet, - iceberg_schema: &Schema, - ) -> FieldRef { - match field.data_type() { - DataType::Struct(fields) => { - let new_fields: Vec = fields - .iter() - .map(|child| { - let mut child_path = path_parts.to_vec(); - child_path.push(child.name().as_str()); - coerce_field(child, &child_path, int96_paths, iceberg_schema) - }) - .collect(); - Arc::new( - Field::new( - field.name(), - DataType::Struct(Fields::from(new_fields)), - field.is_nullable(), - ) - .with_metadata(field.metadata().clone()), - ) + let mut fields: Vec = arrow_schema.fields().iter().cloned().collect(); + let mut any_changed = false; + + for path in &int96_paths { + let parts = path.parts(); + if let Some(idx) = fields.iter().position(|f| f.name() == &parts[0]) { + let (new_field, changed) = coerce_field_at_path(&fields[idx], parts, 0, iceberg_schema); + if changed { + fields[idx] = new_field; + any_changed = true; } - DataType::Timestamp(TimeUnit::Nanosecond, tz) - if int96_paths.contains(&path_parts.join(".")) => - { - let target_unit = field - .metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|id_str| id_str.parse::().ok()) - .and_then(|field_id| iceberg_schema.field_by_id(field_id)) - .and_then(|f| match &*f.field_type { - Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { - Some(TimeUnit::Microsecond) - } - Type::Primitive( - PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs, - ) => Some(TimeUnit::Nanosecond), - _ => None, - }) - // Iceberg Java reads INT96 as microseconds by default - .unwrap_or(TimeUnit::Microsecond); + } + } - if target_unit == TimeUnit::Nanosecond { - return Arc::clone(field); + if any_changed { + Some(Arc::new(ArrowSchema::new_with_metadata( + fields, + arrow_schema.metadata().clone(), + ))) + } else { + None + } +} + +/// Navigate an Arrow field tree using a Parquet column path and coerce the leaf +/// INT96 `Timestamp(Nanosecond)` to the resolution specified by the Iceberg schema. +/// +/// Parquet column paths include intermediate group names for nested types (e.g. the +/// repeated group in LIST encoding). This function accounts for those extra levels +/// when descending through List, LargeList, and Map Arrow types. +fn coerce_field_at_path( + field: &FieldRef, + parquet_path: &[String], + depth: usize, + iceberg_schema: &Schema, +) -> (FieldRef, bool) { + if depth == parquet_path.len() - 1 { + return coerce_timestamp_field(field, iceberg_schema); + } + + match field.data_type() { + DataType::Struct(fields) => { + let child_name = &parquet_path[depth + 1]; + let mut new_fields: Vec = fields.iter().cloned().collect(); + let mut changed = false; + + if let Some(idx) = new_fields.iter().position(|f| f.name() == child_name) { + let (new_child, child_changed) = + coerce_field_at_path(&new_fields[idx], parquet_path, depth + 1, iceberg_schema); + if child_changed { + new_fields[idx] = new_child; + changed = true; } + } - Arc::new( + if changed { + let new_field = Arc::new( Field::new( field.name(), - DataType::Timestamp(target_unit, tz.clone()), + DataType::Struct(Fields::from(new_fields)), field.is_nullable(), ) .with_metadata(field.metadata().clone()), - ) + ); + (new_field, true) + } else { + (Arc::clone(field), false) } - _ => Arc::clone(field), } + DataType::List(element_field) | DataType::LargeList(element_field) => { + // Parquet 3-level LIST encoding inserts a repeated group between the list + // and its element, e.g. `my_list.list.element` where `list` is the + // intermediate group. The group name varies across writers — the spec says + // "list" but legacy data uses "element", "array", `_tuple`, etc. + // We skip it (depth + 1) since we navigate by Parquet ColumnPath parts + // which contain the actual name from the file. See the Parquet LogicalTypes + // spec for the backward-compatibility rules: + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + if depth + 2 < parquet_path.len() { + debug_assert_eq!( + element_field.name(), + &parquet_path[depth + 2], + "Arrow list element name '{}' does not match Parquet path segment '{}'", + element_field.name(), + &parquet_path[depth + 2] + ); + let (new_element, changed) = + coerce_field_at_path(element_field, parquet_path, depth + 2, iceberg_schema); + if changed { + let new_type = match field.data_type() { + DataType::List(_) => DataType::List(new_element), + DataType::LargeList(_) => DataType::LargeList(new_element), + _ => unreachable!(), + }; + let new_field = Arc::new( + Field::new(field.name(), new_type, field.is_nullable()) + .with_metadata(field.metadata().clone()), + ); + return (new_field, true); + } + } + (Arc::clone(field), false) + } + DataType::Map(entries_field, sorted) => { + // The entries field is a struct containing key/value fields. The Parquet + // path segment at depth + 1 is the entries group name (e.g. "key_value"), + // which corresponds to the Arrow entries struct field. + if depth + 1 < parquet_path.len() { + let (new_entries, changed) = + coerce_field_at_path(entries_field, parquet_path, depth + 1, iceberg_schema); + if changed { + let new_field = Arc::new( + Field::new( + field.name(), + DataType::Map(new_entries, *sorted), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + return (new_field, true); + } + } + (Arc::clone(field), false) + } + _ => (Arc::clone(field), false), } +} - let coerced_fields: Vec = arrow_schema - .fields() - .iter() - .map(|field| { - coerce_field( - field, - &[field.name().as_str()], - &int96_paths, - iceberg_schema, - ) - }) - .collect(); +/// Coerce a single `Timestamp(Nanosecond)` field to the resolution indicated by the +/// Iceberg schema. Falls back to microsecond when field IDs are unavailable, matching +/// Iceberg Java behavior. +fn coerce_timestamp_field(field: &FieldRef, iceberg_schema: &Schema) -> (FieldRef, bool) { + if let DataType::Timestamp(TimeUnit::Nanosecond, tz) = field.data_type() { + let target_unit = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .and_then(|field_id| iceberg_schema.field_by_id(field_id)) + .and_then(|f| match &*f.field_type { + Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { + Some(TimeUnit::Microsecond) + } + Type::Primitive(PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs) => { + Some(TimeUnit::Nanosecond) + } + _ => None, + }) + // Iceberg Java reads INT96 as microseconds by default + .unwrap_or(TimeUnit::Microsecond); - let coerced = ArrowSchema::new_with_metadata(coerced_fields, arrow_schema.metadata().clone()); + if target_unit == TimeUnit::Nanosecond { + return (Arc::clone(field), false); + } - if &coerced != arrow_schema.as_ref() { - Some(Arc::new(coerced)) + let new_field = Arc::new( + Field::new( + field.name(), + DataType::Timestamp(target_unit, tz.clone()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + (new_field, true) } else { - None + (Arc::clone(field), false) } } @@ -2094,16 +2179,20 @@ mod tests { use std::sync::Arc; use arrow_array::cast::AsArray; - use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; + use arrow_array::{ + ArrayRef, LargeStringArray, RecordBatch, StringArray, TimestampMicrosecondArray, + }; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::{ArrowWriter, ProjectionMask}; - use parquet::basic::Compression; + use parquet::basic::{Compression, Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int32Type, Int96, Int96Type}; use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; use parquet::file::properties::WriterProperties; + use parquet::file::writer::SerializedFileWriter; use parquet::schema::parser::parse_message_type; - use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; + use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor, Type as SchemaType}; use roaring::RoaringTreemap; use tempfile::TempDir; @@ -4804,6 +4893,58 @@ message schema { assert_eq!(result[2], expected_2); } + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + /// Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). + const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + /// Build an INT96 value and its expected microsecond interpretation. + fn make_int96_test_value() -> (Int96, i64) { + let mut val = Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + /// Read a Parquet file through ArrowReader and return the resulting batches. + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader.read(tasks).unwrap().try_collect().await.unwrap() + } + /// Writes a Parquet file with INT96 timestamps using SerializedFileWriter /// (ArrowWriter cannot write INT96). Returns (file_path, expected_microsecond_values). fn write_int96_parquet_file( @@ -4811,11 +4952,6 @@ message schema { filename: &str, with_field_ids: bool, ) -> (String, Vec) { - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::{Int32Type, Int96, Int96Type}; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - let file_path = format!("{table_location}/{filename}"); let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) @@ -4836,34 +4972,33 @@ message schema { .build() .unwrap(); - // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] - // Julian day 2_440_588 = Unix epoch (1970-01-01) // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. - const UNIX_EPOCH_JULIAN: i64 = 2_440_588; - const MICROS_PER_DAY: i64 = 86_400_000_000; - const NOON_NANOS: u64 = 43_200_000_000_000; + const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; let test_data: Vec<(u32, u32, u32, i64)> = vec![ // 3333-01-01 00:00:00 ( 0, 0, - 2_953_529, - (2_953_529 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, ), // 3333-01-01 12:00:00 ( (NOON_NANOS & 0xFFFFFFFF) as u32, (NOON_NANOS >> 32) as u32, - 2_953_529, - (2_953_529 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + (NOON_NANOS / 1_000) as i64, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, ), // 2100-01-01 00:00:00 ( 0, 0, - 2_488_070, - (2_488_070 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, ), ]; @@ -4904,37 +5039,14 @@ message schema { (file_path, expected_micros) } - /// Read INT96 Parquet file through ArrowReader and verify microsecond timestamps. + /// Read INT96 Parquet file through ArrowReader and verify top-level microsecond timestamps. async fn assert_int96_read_matches( file_path: &str, schema: SchemaRef, project_field_ids: Vec, expected_micros: &[i64], ) { - use arrow_array::TimestampMicrosecondArray; - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(file_path).unwrap().len(), - start: 0, - length: std::fs::metadata(file_path).unwrap().len(), - record_count: None, - data_file_path: file_path.to_string(), - data_file_format: DataFileFormat::Parquet, - schema, - project_field_ids, - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let batches: Vec = reader.read(tasks).unwrap().try_collect().await.unwrap(); + let batches = read_int96_batches(file_path, schema, project_field_ids).await; assert_eq!(batches.len(), 1); let ts_array = batches[0] @@ -5002,12 +5114,6 @@ message schema { /// Test reading INT96 timestamps inside a struct field. #[tokio::test] async fn test_read_int96_timestamps_in_struct() { - use arrow_array::TimestampMicrosecondArray; - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::Int96Type; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); let file_path = format!("{table_location}/struct_int96.parquet"); @@ -5030,20 +5136,7 @@ message schema { .build() .unwrap(); - // Year 3333 (Julian day 2_953_529), noon — outside the i64 nanosecond range - const UNIX_EPOCH_JULIAN: i64 = 2_440_588; - const MICROS_PER_DAY: i64 = 86_400_000_000; - let nanos_within_day = 43_200_000_000_000u64; - let julian_day = 2_953_529u32; - let expected_micros = (julian_day as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (nanos_within_day / 1_000) as i64; - - let mut int96_val = parquet::data_type::Int96::new(); - int96_val.set_data( - (nanos_within_day & 0xFFFFFFFF) as u32, - (nanos_within_day >> 32) as u32, - julian_day, - ); + let (int96_val, expected_micros) = make_int96_test_value(); let file = File::create(&file_path).unwrap(); let mut writer = @@ -5082,34 +5175,10 @@ message schema { .unwrap(), ); - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Only project the top-level struct field; nested fields are included implicitly - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: 0, - length: std::fs::metadata(&file_path).unwrap().len(), - record_count: None, - data_file_path: file_path, - data_file_format: DataFileFormat::Parquet, - schema: iceberg_schema, - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let batches: Vec = reader.read(tasks).unwrap().try_collect().await.unwrap(); + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; assert_eq!(batches.len(), 1); - let batch = &batches[0]; - - let struct_array = batch + let struct_array = batches[0] .column(0) .as_any() .downcast_ref::() @@ -5127,4 +5196,231 @@ message schema { ts_array.value(0) ); } + + /// Test reading INT96 timestamps inside a list field (3-level Parquet LIST encoding). + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3i16]), Some(&[0i16])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + /// Test reading INT96 timestamps as map values. + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map containing one key-value pair + let mut row_group = writer.next_row_group().unwrap(); + { + // key column + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2i16]), + Some(&[0i16]), + ) + .unwrap(); + col.close().unwrap(); + } + { + // value column (INT96) + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3i16]), Some(&[0i16])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } } From 63b2260bb8b01ed41f7e2a1a9c0867dbe4aa9571 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 31 Mar 2026 14:42:07 -0400 Subject: [PATCH 4/8] clean up comments --- crates/iceberg/src/arrow/reader.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 89bd2168f6..a4be7090e2 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -5020,9 +5020,10 @@ message schema { let mut row_group = writer.next_row_group().unwrap(); { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() - .write_batch(&int96_values, Some(&vec![1i16; test_data.len()]), None) + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) .unwrap(); col.close().unwrap(); } @@ -5142,11 +5143,13 @@ message schema { let mut writer = SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). let mut row_group = writer.next_row_group().unwrap(); { let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() - .write_batch(&[int96_val], Some(&[1i16]), None) + .write_batch(&[int96_val], Some(&[1]), None) .unwrap(); col.close().unwrap(); } @@ -5241,12 +5244,14 @@ message schema { let mut writer = SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - // Write a single row with a list containing one INT96 element + // Write a single row with a list containing one INT96 element. + // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list let mut row_group = writer.next_row_group().unwrap(); { let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() - .write_batch(&[int96_val], Some(&[3i16]), Some(&[0i16])) + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) .unwrap(); col.close().unwrap(); } @@ -5349,25 +5354,26 @@ message schema { let mut writer = SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - // Write a single row with a map containing one key-value pair + // Write a single row with a map containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). let mut row_group = writer.next_row_group().unwrap(); { - // key column let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() .write_batch( &[parquet::data_type::ByteArray::from("event_time")], - Some(&[2i16]), - Some(&[0i16]), + Some(&[2]), + Some(&[0]), ) .unwrap(); col.close().unwrap(); } { - // value column (INT96) let mut col = row_group.next_column().unwrap().unwrap(); col.typed::() - .write_batch(&[int96_val], Some(&[3i16]), Some(&[0i16])) + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) .unwrap(); col.close().unwrap(); } From 516312289b534a731f6ffc3451c771fd09be832f Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 31 Mar 2026 16:33:25 -0400 Subject: [PATCH 5/8] remove error-prone debug_assert --- crates/iceberg/src/arrow/reader.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index a4be7090e2..43b51c9f99 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -1364,13 +1364,6 @@ fn coerce_field_at_path( // spec for the backward-compatibility rules: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists if depth + 2 < parquet_path.len() { - debug_assert_eq!( - element_field.name(), - &parquet_path[depth + 2], - "Arrow list element name '{}' does not match Parquet path segment '{}'", - element_field.name(), - &parquet_path[depth + 2] - ); let (new_element, changed) = coerce_field_at_path(element_field, parquet_path, depth + 2, iceberg_schema); if changed { From 7abe9e839b3619fd0ec6684a0a3311572b539b10 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Wed, 1 Apr 2026 11:11:40 -0400 Subject: [PATCH 6/8] address PR feedback, add comments. --- crates/iceberg/src/arrow/reader.rs | 306 +++++++++++++++-------------- crates/iceberg/src/arrow/schema.rs | 5 +- 2 files changed, 160 insertions(+), 151 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 43b51c9f99..c42460df1a 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -40,15 +40,15 @@ use parquet::arrow::arrow_reader::{ }; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::basic::Type as PhysicalType; use parquet::file::metadata::{ PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData, }; -use parquet::schema::types::{ColumnPath, SchemaDescriptor, Type as ParquetType}; +use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; use typed_builder::TypedBuilder; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; +use crate::arrow::schema::{ArrowSchemaVisitor, DEFAULT_MAP_FIELD_NAME, visit_schema}; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; use crate::error::Result; @@ -390,11 +390,9 @@ impl ArrowReader { // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. - let arrow_metadata = if let Some(coerced_schema) = coerce_int96_timestamps( - arrow_metadata.metadata().file_metadata().schema_descr(), - arrow_metadata.schema(), - &task.schema, - ) { + let arrow_metadata = if let Some(coerced_schema) = + coerce_int96_timestamps(arrow_metadata.schema(), &task.schema) + { let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( |e| { @@ -1270,152 +1268,51 @@ fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc< /// - Iceberg spec primitive types: /// - arrow-rs schema hint support: fn coerce_int96_timestamps( - parquet_schema: &SchemaDescriptor, arrow_schema: &ArrowSchemaRef, iceberg_schema: &Schema, ) -> Option> { - let int96_paths: Vec = parquet_schema - .columns() - .iter() - .filter(|col| col.physical_type() == PhysicalType::INT96) - .map(|col| col.path().clone()) - .collect(); - - if int96_paths.is_empty() { - return None; - } - - let mut fields: Vec = arrow_schema.fields().iter().cloned().collect(); - let mut any_changed = false; - - for path in &int96_paths { - let parts = path.parts(); - if let Some(idx) = fields.iter().position(|f| f.name() == &parts[0]) { - let (new_field, changed) = coerce_field_at_path(&fields[idx], parts, 0, iceberg_schema); - if changed { - fields[idx] = new_field; - any_changed = true; - } - } - } - - if any_changed { - Some(Arc::new(ArrowSchema::new_with_metadata( - fields, - arrow_schema.metadata().clone(), - ))) + let mut visitor = Int96CoercionVisitor::new(iceberg_schema); + let coerced = visit_schema(arrow_schema, &mut visitor).ok()?; + if visitor.changed { + Some(Arc::new(coerced)) } else { None } } -/// Navigate an Arrow field tree using a Parquet column path and coerce the leaf -/// INT96 `Timestamp(Nanosecond)` to the resolution specified by the Iceberg schema. -/// -/// Parquet column paths include intermediate group names for nested types (e.g. the -/// repeated group in LIST encoding). This function accounts for those extra levels -/// when descending through List, LargeList, and Map Arrow types. -fn coerce_field_at_path( - field: &FieldRef, - parquet_path: &[String], - depth: usize, - iceberg_schema: &Schema, -) -> (FieldRef, bool) { - if depth == parquet_path.len() - 1 { - return coerce_timestamp_field(field, iceberg_schema); - } - - match field.data_type() { - DataType::Struct(fields) => { - let child_name = &parquet_path[depth + 1]; - let mut new_fields: Vec = fields.iter().cloned().collect(); - let mut changed = false; - - if let Some(idx) = new_fields.iter().position(|f| f.name() == child_name) { - let (new_child, child_changed) = - coerce_field_at_path(&new_fields[idx], parquet_path, depth + 1, iceberg_schema); - if child_changed { - new_fields[idx] = new_child; - changed = true; - } - } +/// Visitor that coerces `Timestamp(Nanosecond)` Arrow fields to the resolution +/// indicated by the Iceberg schema. Follows the same pattern as `MetadataStripVisitor`. +struct Int96CoercionVisitor<'a> { + iceberg_schema: &'a Schema, + field_stack: Vec, + changed: bool, +} - if changed { - let new_field = Arc::new( - Field::new( - field.name(), - DataType::Struct(Fields::from(new_fields)), - field.is_nullable(), - ) - .with_metadata(field.metadata().clone()), - ); - (new_field, true) - } else { - (Arc::clone(field), false) - } - } - DataType::List(element_field) | DataType::LargeList(element_field) => { - // Parquet 3-level LIST encoding inserts a repeated group between the list - // and its element, e.g. `my_list.list.element` where `list` is the - // intermediate group. The group name varies across writers — the spec says - // "list" but legacy data uses "element", "array", `_tuple`, etc. - // We skip it (depth + 1) since we navigate by Parquet ColumnPath parts - // which contain the actual name from the file. See the Parquet LogicalTypes - // spec for the backward-compatibility rules: - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists - if depth + 2 < parquet_path.len() { - let (new_element, changed) = - coerce_field_at_path(element_field, parquet_path, depth + 2, iceberg_schema); - if changed { - let new_type = match field.data_type() { - DataType::List(_) => DataType::List(new_element), - DataType::LargeList(_) => DataType::LargeList(new_element), - _ => unreachable!(), - }; - let new_field = Arc::new( - Field::new(field.name(), new_type, field.is_nullable()) - .with_metadata(field.metadata().clone()), - ); - return (new_field, true); - } - } - (Arc::clone(field), false) - } - DataType::Map(entries_field, sorted) => { - // The entries field is a struct containing key/value fields. The Parquet - // path segment at depth + 1 is the entries group name (e.g. "key_value"), - // which corresponds to the Arrow entries struct field. - if depth + 1 < parquet_path.len() { - let (new_entries, changed) = - coerce_field_at_path(entries_field, parquet_path, depth + 1, iceberg_schema); - if changed { - let new_field = Arc::new( - Field::new( - field.name(), - DataType::Map(new_entries, *sorted), - field.is_nullable(), - ) - .with_metadata(field.metadata().clone()), - ); - return (new_field, true); - } - } - (Arc::clone(field), false) +impl<'a> Int96CoercionVisitor<'a> { + fn new(iceberg_schema: &'a Schema) -> Self { + Self { + iceberg_schema, + field_stack: Vec::new(), + changed: false, } - _ => (Arc::clone(field), false), } -} -/// Coerce a single `Timestamp(Nanosecond)` field to the resolution indicated by the -/// Iceberg schema. Falls back to microsecond when field IDs are unavailable, matching -/// Iceberg Java behavior. -fn coerce_timestamp_field(field: &FieldRef, iceberg_schema: &Schema) -> (FieldRef, bool) { - if let DataType::Timestamp(TimeUnit::Nanosecond, tz) = field.data_type() { - let target_unit = field + /// Determine the target TimeUnit for a Timestamp(Nanosecond) field based on the + /// Iceberg schema. Falls back to microsecond when field IDs are unavailable, + /// matching Iceberg Java behavior. + fn target_unit(&self, field: &Field) -> Option { + if !matches!( + field.data_type(), + DataType::Timestamp(TimeUnit::Nanosecond, _) + ) { + return None; + } + + let target = field .metadata() .get(PARQUET_FIELD_ID_META_KEY) .and_then(|id_str| id_str.parse::().ok()) - .and_then(|field_id| iceberg_schema.field_by_id(field_id)) + .and_then(|field_id| self.iceberg_schema.field_by_id(field_id)) .and_then(|f| match &*f.field_type { Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { Some(TimeUnit::Microsecond) @@ -1428,21 +1325,130 @@ fn coerce_timestamp_field(field: &FieldRef, iceberg_schema: &Schema) -> (FieldRe // Iceberg Java reads INT96 as microseconds by default .unwrap_or(TimeUnit::Microsecond); - if target_unit == TimeUnit::Nanosecond { - return (Arc::clone(field), false); + if target == TimeUnit::Nanosecond { + None + } else { + Some(target) } + } +} - let new_field = Arc::new( - Field::new( - field.name(), - DataType::Timestamp(target_unit, tz.clone()), - field.is_nullable(), - ) - .with_metadata(field.metadata().clone()), +impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> { + type T = Field; + type U = ArrowSchema; + + fn before_field(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn before_list_element(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn before_map_key(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn before_map_value(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn schema(&mut self, schema: &ArrowSchema, values: Vec) -> Result { + Ok(ArrowSchema::new_with_metadata( + values, + schema.metadata().clone(), + )) + } + + fn r#struct(&mut self, _fields: &Fields, results: Vec) -> Result { + let field_info = self + .field_stack + .pop() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in struct"))?; + Ok(Field::new( + field_info.name(), + DataType::Struct(Fields::from(results)), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn list(&mut self, list: &DataType, value: Self::T) -> Result { + let field_info = self + .field_stack + .pop() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in list"))?; + let list_type = match list { + DataType::List(_) => DataType::List(Arc::new(value)), + DataType::LargeList(_) => DataType::LargeList(Arc::new(value)), + DataType::FixedSizeList(_, size) => DataType::FixedSizeList(Arc::new(value), *size), + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected list type, got {list}"), + )); + } + }; + Ok( + Field::new(field_info.name(), list_type, field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + + fn map(&mut self, map: &DataType, key_value: Self::T, value: Self::T) -> Result { + let field_info = self + .field_stack + .pop() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in map"))?; + let sorted = match map { + DataType::Map(_, sorted) => *sorted, + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected map type, got {map}"), + )); + } + }; + let struct_field = Field::new( + DEFAULT_MAP_FIELD_NAME, + DataType::Struct(Fields::from(vec![key_value, value])), + false, ); - (new_field, true) - } else { - (Arc::clone(field), false) + Ok(Field::new( + field_info.name(), + DataType::Map(Arc::new(struct_field), sorted), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn primitive(&mut self, p: &DataType) -> Result { + let field_info = self.field_stack.pop().ok_or_else(|| { + Error::new(ErrorKind::Unexpected, "Field stack underflow in primitive") + })?; + + if let Some(target_unit) = self.target_unit(&field_info) { + let tz = match field_info.data_type() { + DataType::Timestamp(_, tz) => tz.clone(), + _ => None, + }; + self.changed = true; + Ok(Field::new( + field_info.name(), + DataType::Timestamp(target_unit, tz), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } else { + Ok( + Field::new(field_info.name(), p.clone(), field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } } } diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index bd9e249f48..f96c29ab4a 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -199,7 +199,10 @@ fn visit_struct(fields: &Fields, visitor: &mut V) -> Resu } /// Visit schema in post order. -fn visit_schema(schema: &ArrowSchema, visitor: &mut V) -> Result { +pub(crate) fn visit_schema( + schema: &ArrowSchema, + visitor: &mut V, +) -> Result { let mut results = Vec::with_capacity(schema.fields().len()); for field in schema.fields() { visitor.before_field(field)?; From 11f532057b5a15234f7bedcd7531103fa3222c6e Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Thu, 2 Apr 2026 13:47:23 -0400 Subject: [PATCH 7/8] address PR feedback. --- crates/iceberg/src/arrow/int96.rs | 804 +++++++++++++++++++++++++++++ crates/iceberg/src/arrow/mod.rs | 1 + crates/iceberg/src/arrow/reader.rs | 749 +-------------------------- 3 files changed, 810 insertions(+), 744 deletions(-) create mode 100644 crates/iceberg/src/arrow/int96.rs diff --git a/crates/iceberg/src/arrow/int96.rs b/crates/iceberg/src/arrow/int96.rs new file mode 100644 index 0000000000..57516a63d9 --- /dev/null +++ b/crates/iceberg/src/arrow/int96.rs @@ -0,0 +1,804 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! INT96 timestamp coercion for Parquet files. + +use std::sync::Arc; + +use arrow_schema::{ + DataType, Field, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + +use crate::arrow::schema::{ArrowSchemaVisitor, DEFAULT_MAP_FIELD_NAME, visit_schema}; +use crate::error::Result; +use crate::spec::{PrimitiveType, Schema, Type}; +use crate::{Error, ErrorKind}; + +/// Coerce Arrow schema types for INT96 columns to match the Iceberg table schema. +/// +/// arrow-rs defaults INT96 to `Timestamp(Nanosecond)`, which overflows i64 for dates outside +/// ~1677-2262. We use arrow-rs's schema hint mechanism to read INT96 at the resolution +/// specified by the Iceberg schema (`timestamp` → microsecond, `timestamp_ns` → nanosecond). +/// +/// Iceberg Java handles this differently: it bypasses parquet-mr with a custom column reader +/// (`GenericParquetReaders.TimestampInt96Reader`). We achieve the same result via schema hints. +/// +/// References: +/// - Iceberg spec primitive types: +/// - arrow-rs schema hint support: +pub(crate) fn coerce_int96_timestamps( + arrow_schema: &ArrowSchemaRef, + iceberg_schema: &Schema, +) -> Option> { + let mut visitor = Int96CoercionVisitor::new(iceberg_schema); + let coerced = visit_schema(arrow_schema, &mut visitor).ok()?; + if visitor.changed { + Some(Arc::new(coerced)) + } else { + None + } +} + +/// Visitor that coerces `Timestamp(Nanosecond)` Arrow fields to the resolution +/// indicated by the Iceberg schema. +struct Int96CoercionVisitor<'a> { + iceberg_schema: &'a Schema, + // TODO(#2310): use FieldRef (Arc) once ArrowSchemaVisitor passes FieldRef. + field_stack: Vec, + changed: bool, +} + +impl<'a> Int96CoercionVisitor<'a> { + fn new(iceberg_schema: &'a Schema) -> Self { + Self { + iceberg_schema, + field_stack: Vec::new(), + changed: false, + } + } + + /// Determine the target TimeUnit for a Timestamp(Nanosecond) field based on the + /// Iceberg schema. Falls back to microsecond when field IDs are unavailable, + /// matching Iceberg Java behavior. + fn target_unit(&self, field: &Field) -> Option { + if !matches!( + field.data_type(), + DataType::Timestamp(TimeUnit::Nanosecond, _) + ) { + return None; + } + + let target = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .and_then(|field_id| self.iceberg_schema.field_by_id(field_id)) + .and_then(|f| match &*f.field_type { + Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { + Some(TimeUnit::Microsecond) + } + Type::Primitive(PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs) => { + Some(TimeUnit::Nanosecond) + } + _ => None, + }) + // Iceberg Java reads INT96 as microseconds by default + .unwrap_or(TimeUnit::Microsecond); + + if target == TimeUnit::Nanosecond { + None + } else { + Some(target) + } + } +} + +impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> { + type T = Field; + type U = ArrowSchema; + + fn before_field(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_field(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_list_element(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_list_element(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_map_key(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_map_key(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_map_value(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_map_value(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn schema(&mut self, schema: &ArrowSchema, values: Vec) -> Result { + Ok(ArrowSchema::new_with_metadata( + values, + schema.metadata().clone(), + )) + } + + fn r#struct(&mut self, _fields: &Fields, results: Vec) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in struct"))?; + Ok(Field::new( + field_info.name(), + DataType::Struct(Fields::from(results)), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn list(&mut self, list: &DataType, value: Field) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in list"))?; + let list_type = match list { + DataType::List(_) => DataType::List(Arc::new(value)), + DataType::LargeList(_) => DataType::LargeList(Arc::new(value)), + DataType::FixedSizeList(_, size) => DataType::FixedSizeList(Arc::new(value), *size), + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected list type, got {list}"), + )); + } + }; + Ok( + Field::new(field_info.name(), list_type, field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + + fn map(&mut self, map: &DataType, key_value: Field, value: Field) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in map"))?; + let sorted = match map { + DataType::Map(_, sorted) => *sorted, + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected map type, got {map}"), + )); + } + }; + let struct_field = Field::new( + DEFAULT_MAP_FIELD_NAME, + DataType::Struct(Fields::from(vec![key_value, value])), + false, + ); + Ok(Field::new( + field_info.name(), + DataType::Map(Arc::new(struct_field), sorted), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn primitive(&mut self, p: &DataType) -> Result { + let field_info = self.field_stack.last().ok_or_else(|| { + Error::new(ErrorKind::Unexpected, "Field stack underflow in primitive") + })?; + + if let Some(target_unit) = self.target_unit(field_info) { + let tz = match field_info.data_type() { + DataType::Timestamp(_, tz) => tz.clone(), + _ => None, + }; + self.changed = true; + Ok(Field::new( + field_info.name(), + DataType::Timestamp(target_unit, tz), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } else { + Ok( + Field::new(field_info.name(), p.clone(), field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + } +} + +#[cfg(test)] +mod tests { + use std::fs::File; + use std::sync::Arc; + + use arrow_array::TimestampMicrosecondArray; + use futures::TryStreamExt; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + use tempfile::TempDir; + + use crate::arrow::ArrowReaderBuilder; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + /// Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). + const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + /// Build an INT96 value and its expected microsecond interpretation. + fn make_int96_test_value() -> (Int96, i64) { + let mut val = Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + /// Read a Parquet file through ArrowReader and return the resulting batches. + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader.read(tasks).unwrap().try_collect().await.unwrap() + } + + /// Writes a Parquet file with INT96 timestamps using SerializedFileWriter + /// (ArrowWriter cannot write INT96). Returns (file_path, expected_microsecond_values). + fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + let file_path = format!("{table_location}/{filename}"); + + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. + const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; + + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // 3333-01-01 00:00:00 + ( + 0, + 0, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + /// Read INT96 Parquet file through ArrowReader and verify top-level microsecond timestamps. + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { + let batches = read_int96_batches(file_path, schema, project_field_ids).await; + + assert_eq!(batches.len(), 1); + let ts_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + /// Files with embedded field IDs (branch 1 of schema resolution). + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + /// Migrated files without field IDs (branches 2/3 of schema resolution). + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + /// Test reading INT96 timestamps inside a struct field. + #[tokio::test] + async fn test_read_int96_timestamps_in_struct() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let struct_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + /// Test reading INT96 timestamps inside a list field (3-level Parquet LIST encoding). + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element. + // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + /// Test reading INT96 timestamps as map values. + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2]), + Some(&[0]), + ) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } +} diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index c091c45177..7823320452 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -27,6 +27,7 @@ pub(crate) mod caching_delete_file_loader; pub mod delete_file_loader; pub(crate) mod delete_filter; +mod int96; mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 167ffd96c5..8eba4ec6c9 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -27,8 +27,7 @@ use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatc use arrow_cast::cast::cast; use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; use arrow_schema::{ - ArrowError, DataType, Field, FieldRef, Fields, Schema as ArrowSchema, - SchemaRef as ArrowSchemaRef, TimeUnit, + ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use arrow_string::like::starts_with; use bytes::Bytes; @@ -47,8 +46,8 @@ use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; use typed_builder::TypedBuilder; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::arrow::int96::coerce_int96_timestamps; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; -use crate::arrow::schema::{ArrowSchemaVisitor, DEFAULT_MAP_FIELD_NAME, visit_schema}; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; use crate::error::Result; @@ -1255,203 +1254,6 @@ fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc< )) } -/// Coerce Arrow schema types for INT96 columns to match the Iceberg table schema. -/// -/// arrow-rs defaults INT96 to `Timestamp(Nanosecond)`, which overflows i64 for dates outside -/// ~1677-2262. We use arrow-rs's schema hint mechanism to read INT96 at the resolution -/// specified by the Iceberg schema (`timestamp` → microsecond, `timestamp_ns` → nanosecond). -/// -/// Iceberg Java handles this differently: it bypasses parquet-mr with a custom column reader -/// (`GenericParquetReaders.TimestampInt96Reader`). We achieve the same result via schema hints. -/// -/// References: -/// - Iceberg spec primitive types: -/// - arrow-rs schema hint support: -fn coerce_int96_timestamps( - arrow_schema: &ArrowSchemaRef, - iceberg_schema: &Schema, -) -> Option> { - let mut visitor = Int96CoercionVisitor::new(iceberg_schema); - let coerced = visit_schema(arrow_schema, &mut visitor).ok()?; - if visitor.changed { - Some(Arc::new(coerced)) - } else { - None - } -} - -/// Visitor that coerces `Timestamp(Nanosecond)` Arrow fields to the resolution -/// indicated by the Iceberg schema. Follows the same pattern as `MetadataStripVisitor`. -struct Int96CoercionVisitor<'a> { - iceberg_schema: &'a Schema, - field_stack: Vec, - changed: bool, -} - -impl<'a> Int96CoercionVisitor<'a> { - fn new(iceberg_schema: &'a Schema) -> Self { - Self { - iceberg_schema, - field_stack: Vec::new(), - changed: false, - } - } - - /// Determine the target TimeUnit for a Timestamp(Nanosecond) field based on the - /// Iceberg schema. Falls back to microsecond when field IDs are unavailable, - /// matching Iceberg Java behavior. - fn target_unit(&self, field: &Field) -> Option { - if !matches!( - field.data_type(), - DataType::Timestamp(TimeUnit::Nanosecond, _) - ) { - return None; - } - - let target = field - .metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|id_str| id_str.parse::().ok()) - .and_then(|field_id| self.iceberg_schema.field_by_id(field_id)) - .and_then(|f| match &*f.field_type { - Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { - Some(TimeUnit::Microsecond) - } - Type::Primitive(PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs) => { - Some(TimeUnit::Nanosecond) - } - _ => None, - }) - // Iceberg Java reads INT96 as microseconds by default - .unwrap_or(TimeUnit::Microsecond); - - if target == TimeUnit::Nanosecond { - None - } else { - Some(target) - } - } -} - -impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> { - type T = Field; - type U = ArrowSchema; - - fn before_field(&mut self, field: &Field) -> Result<()> { - self.field_stack.push(field.as_ref().clone()); - Ok(()) - } - - fn before_list_element(&mut self, field: &Field) -> Result<()> { - self.field_stack.push(field.as_ref().clone()); - Ok(()) - } - - fn before_map_key(&mut self, field: &Field) -> Result<()> { - self.field_stack.push(field.as_ref().clone()); - Ok(()) - } - - fn before_map_value(&mut self, field: &Field) -> Result<()> { - self.field_stack.push(field.as_ref().clone()); - Ok(()) - } - - fn schema(&mut self, schema: &ArrowSchema, values: Vec) -> Result { - Ok(ArrowSchema::new_with_metadata( - values, - schema.metadata().clone(), - )) - } - - fn r#struct(&mut self, _fields: &Fields, results: Vec) -> Result { - let field_info = self - .field_stack - .pop() - .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in struct"))?; - Ok(Field::new( - field_info.name(), - DataType::Struct(Fields::from(results)), - field_info.is_nullable(), - ) - .with_metadata(field_info.metadata().clone())) - } - - fn list(&mut self, list: &DataType, value: Self::T) -> Result { - let field_info = self - .field_stack - .pop() - .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in list"))?; - let list_type = match list { - DataType::List(_) => DataType::List(Arc::new(value)), - DataType::LargeList(_) => DataType::LargeList(Arc::new(value)), - DataType::FixedSizeList(_, size) => DataType::FixedSizeList(Arc::new(value), *size), - _ => { - return Err(Error::new( - ErrorKind::Unexpected, - format!("Expected list type, got {list}"), - )); - } - }; - Ok( - Field::new(field_info.name(), list_type, field_info.is_nullable()) - .with_metadata(field_info.metadata().clone()), - ) - } - - fn map(&mut self, map: &DataType, key_value: Self::T, value: Self::T) -> Result { - let field_info = self - .field_stack - .pop() - .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in map"))?; - let sorted = match map { - DataType::Map(_, sorted) => *sorted, - _ => { - return Err(Error::new( - ErrorKind::Unexpected, - format!("Expected map type, got {map}"), - )); - } - }; - let struct_field = Field::new( - DEFAULT_MAP_FIELD_NAME, - DataType::Struct(Fields::from(vec![key_value, value])), - false, - ); - Ok(Field::new( - field_info.name(), - DataType::Map(Arc::new(struct_field), sorted), - field_info.is_nullable(), - ) - .with_metadata(field_info.metadata().clone())) - } - - fn primitive(&mut self, p: &DataType) -> Result { - let field_info = self.field_stack.pop().ok_or_else(|| { - Error::new(ErrorKind::Unexpected, "Field stack underflow in primitive") - })?; - - if let Some(target_unit) = self.target_unit(&field_info) { - let tz = match field_info.data_type() { - DataType::Timestamp(_, tz) => tz.clone(), - _ => None, - }; - self.changed = true; - Ok(Field::new( - field_info.name(), - DataType::Timestamp(target_unit, tz), - field_info.is_nullable(), - ) - .with_metadata(field_info.metadata().clone())) - } else { - Ok( - Field::new(field_info.name(), p.clone(), field_info.is_nullable()) - .with_metadata(field_info.metadata().clone()), - ) - } - } -} - /// A visitor to collect field ids from bound predicates. struct CollectFieldIdVisitor { field_ids: HashSet, @@ -2178,20 +1980,16 @@ mod tests { use std::sync::Arc; use arrow_array::cast::AsArray; - use arrow_array::{ - ArrayRef, LargeStringArray, RecordBatch, StringArray, TimestampMicrosecondArray, - }; + use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::{ArrowWriter, ProjectionMask}; - use parquet::basic::{Compression, Repetition, Type as PhysicalType}; - use parquet::data_type::{ByteArrayType, Int32Type, Int96, Int96Type}; + use parquet::basic::Compression; use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; use parquet::file::properties::WriterProperties; - use parquet::file::writer::SerializedFileWriter; use parquet::schema::parser::parse_message_type; - use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor, Type as SchemaType}; + use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; use roaring::RoaringTreemap; use tempfile::TempDir; @@ -4891,541 +4689,4 @@ message schema { assert_eq!(result[1], expected_1); assert_eq!(result[2], expected_2); } - - // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] - // Julian day 2_440_588 = Unix epoch (1970-01-01) - const UNIX_EPOCH_JULIAN: i64 = 2_440_588; - const MICROS_PER_DAY: i64 = 86_400_000_000; - /// Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). - const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; - const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; - - /// Build an INT96 value and its expected microsecond interpretation. - fn make_int96_test_value() -> (Int96, i64) { - let mut val = Int96::new(); - val.set_data( - (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, - (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, - INT96_TEST_JULIAN_DAY, - ); - let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; - (val, expected_micros) - } - - /// Read a Parquet file through ArrowReader and return the resulting batches. - async fn read_int96_batches( - file_path: &str, - schema: SchemaRef, - project_field_ids: Vec, - ) -> Vec { - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let file_size = std::fs::metadata(file_path).unwrap().len(); - let task = FileScanTask { - file_size_in_bytes: file_size, - start: 0, - length: file_size, - record_count: None, - data_file_path: file_path.to_string(), - data_file_format: DataFileFormat::Parquet, - schema, - project_field_ids, - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - reader.read(tasks).unwrap().try_collect().await.unwrap() - } - - /// Writes a Parquet file with INT96 timestamps using SerializedFileWriter - /// (ArrowWriter cannot write INT96). Returns (file_path, expected_microsecond_values). - fn write_int96_parquet_file( - table_location: &str, - filename: &str, - with_field_ids: bool, - ) -> (String, Vec) { - let file_path = format!("{table_location}/{filename}"); - - let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL); - let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) - .with_repetition(Repetition::REQUIRED); - - if with_field_ids { - ts_builder = ts_builder.with_id(Some(1)); - id_builder = id_builder.with_id(Some(2)); - } - - let schema = SchemaType::group_type_builder("schema") - .with_fields(vec![ - Arc::new(ts_builder.build().unwrap()), - Arc::new(id_builder.build().unwrap()), - ]) - .build() - .unwrap(); - - // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. - const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; - const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; - const JULIAN_2100: u32 = 2_488_070; - - let test_data: Vec<(u32, u32, u32, i64)> = vec![ - // 3333-01-01 00:00:00 - ( - 0, - 0, - JULIAN_3333, - (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, - ), - // 3333-01-01 12:00:00 - ( - (NOON_NANOS & 0xFFFFFFFF) as u32, - (NOON_NANOS >> 32) as u32, - JULIAN_3333, - (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (NOON_NANOS / 1_000) as i64, - ), - // 2100-01-01 00:00:00 - ( - 0, - 0, - JULIAN_2100, - (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, - ), - ]; - - let int96_values: Vec = test_data - .iter() - .map(|(lo, hi, day, _)| { - let mut v = Int96::new(); - v.set_data(*lo, *hi, *day); - v - }) - .collect(); - - let id_values: Vec = (0..test_data.len() as i32).collect(); - let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); - - let mut row_group = writer.next_row_group().unwrap(); - { - // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) - .unwrap(); - col.close().unwrap(); - } - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&id_values, None, None) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - (file_path, expected_micros) - } - - /// Read INT96 Parquet file through ArrowReader and verify top-level microsecond timestamps. - async fn assert_int96_read_matches( - file_path: &str, - schema: SchemaRef, - project_field_ids: Vec, - expected_micros: &[i64], - ) { - let batches = read_int96_batches(file_path, schema, project_field_ids).await; - - assert_eq!(batches.len(), 1); - let ts_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray"); - - for (i, expected) in expected_micros.iter().enumerate() { - assert_eq!( - ts_array.value(i), - *expected, - "Row {i}: got {}, expected {expected}", - ts_array.value(i) - ); - } - } - - /// Files with embedded field IDs (branch 1 of schema resolution). - #[tokio::test] - async fn test_read_int96_timestamps_with_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "with_ids.parquet", true); - - assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; - } - - /// Migrated files without field IDs (branches 2/3 of schema resolution). - #[tokio::test] - async fn test_read_int96_timestamps_without_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "no_ids.parquet", false); - - assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; - } - - /// Test reading INT96 timestamps inside a struct field. - #[tokio::test] - async fn test_read_int96_timestamps_in_struct() { - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/struct_int96.parquet"); - - let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(2)) - .build() - .unwrap(); - - let struct_type = SchemaType::group_type_builder("data") - .with_repetition(Repetition::REQUIRED) - .with_id(Some(1)) - .with_fields(vec![Arc::new(ts_type)]) - .build() - .unwrap(); - - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(struct_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). - // No repetition levels needed (no repeated groups). - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[1]), None) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required( - 1, - "data", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::optional( - 2, - "ts", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - ])), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let struct_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected StructArray"); - let ts_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray inside struct"); - - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in struct: got {}, expected {expected_micros}", - ts_array.value(0) - ); - } - - /// Test reading INT96 timestamps inside a list field (3-level Parquet LIST encoding). - #[tokio::test] - async fn test_read_int96_timestamps_in_list() { - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/list_int96.parquet"); - - // 3-level LIST encoding: - // optional group timestamps (LIST) { - // repeated group list { - // optional int96 element; - // } - // } - let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(2)) - .build() - .unwrap(); - - let list_group = SchemaType::group_type_builder("list") - .with_repetition(Repetition::REPEATED) - .with_fields(vec![Arc::new(element_type)]) - .build() - .unwrap(); - - let list_type = SchemaType::group_type_builder("timestamps") - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(1)) - .with_logical_type(Some(parquet::basic::LogicalType::List)) - .with_fields(vec![Arc::new(list_group)]) - .build() - .unwrap(); - - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(list_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // Write a single row with a list containing one INT96 element. - // def=3: list present (1) + repeated group (2) + element present (3) - // rep=0: start of a new list - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[3]), Some(&[0])) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional( - 1, - "timestamps", - Type::List(crate::spec::ListType { - element_field: NestedField::optional( - 2, - "element", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let list_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected ListArray"); - let ts_array = list_array - .values() - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray inside list"); - - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in list: got {}, expected {expected_micros}", - ts_array.value(0) - ); - } - - /// Test reading INT96 timestamps as map values. - #[tokio::test] - async fn test_read_int96_timestamps_in_map() { - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/map_int96.parquet"); - - // MAP encoding: - // optional group ts_map (MAP) { - // repeated group key_value { - // required binary key (UTF8); - // optional int96 value; - // } - // } - let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) - .with_repetition(Repetition::REQUIRED) - .with_logical_type(Some(parquet::basic::LogicalType::String)) - .with_id(Some(2)) - .build() - .unwrap(); - - let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(3)) - .build() - .unwrap(); - - let key_value_group = SchemaType::group_type_builder("key_value") - .with_repetition(Repetition::REPEATED) - .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) - .build() - .unwrap(); - - let map_type = SchemaType::group_type_builder("ts_map") - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(1)) - .with_logical_type(Some(parquet::basic::LogicalType::Map)) - .with_fields(vec![Arc::new(key_value_group)]) - .build() - .unwrap(); - - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(map_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // Write a single row with a map containing one key-value pair. - // rep=0 for both columns: start of a new map. - // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. - // value def=3: map present (1) + key_value entry present (2) + value present (3). - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch( - &[parquet::data_type::ByteArray::from("event_time")], - Some(&[2]), - Some(&[0]), - ) - .unwrap(); - col.close().unwrap(); - } - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[3]), Some(&[0])) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional( - 1, - "ts_map", - Type::Map(crate::spec::MapType { - key_field: NestedField::required( - 2, - "key", - Type::Primitive(PrimitiveType::String), - ) - .into(), - value_field: NestedField::optional( - 3, - "value", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let map_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected MapArray"); - let ts_array = map_array - .values() - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray as map values"); - - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in map: got {}, expected {expected_micros}", - ts_array.value(0) - ); - } } From 38eee0339f275f5563d360cc2643364235a241d0 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Wed, 15 Apr 2026 10:24:02 -0400 Subject: [PATCH 8/8] Address PR feedback. --- crates/iceberg/src/arrow/int96.rs | 768 ++++++++++------------------- crates/iceberg/src/arrow/reader.rs | 553 +++++++++++++++++++++ 2 files changed, 824 insertions(+), 497 deletions(-) diff --git a/crates/iceberg/src/arrow/int96.rs b/crates/iceberg/src/arrow/int96.rs index 57516a63d9..63a7a30f1a 100644 --- a/crates/iceberg/src/arrow/int96.rs +++ b/crates/iceberg/src/arrow/int96.rs @@ -249,556 +249,330 @@ impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> { #[cfg(test)] mod tests { - use std::fs::File; + use std::collections::HashMap; use std::sync::Arc; - use arrow_array::TimestampMicrosecondArray; - use futures::TryStreamExt; - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::{ByteArrayType, Int32Type, Int96, Int96Type}; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - use tempfile::TempDir; - - use crate::arrow::ArrowReaderBuilder; - use crate::io::FileIO; - use crate::scan::{FileScanTask, FileScanTaskStream}; - use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, SchemaRef, Type}; - - // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] - // Julian day 2_440_588 = Unix epoch (1970-01-01) - const UNIX_EPOCH_JULIAN: i64 = 2_440_588; - const MICROS_PER_DAY: i64 = 86_400_000_000; - /// Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). - const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; - const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; - - /// Build an INT96 value and its expected microsecond interpretation. - fn make_int96_test_value() -> (Int96, i64) { - let mut val = Int96::new(); - val.set_data( - (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, - (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, - INT96_TEST_JULIAN_DAY, - ); - let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; - (val, expected_micros) - } - - /// Read a Parquet file through ArrowReader and return the resulting batches. - async fn read_int96_batches( - file_path: &str, - schema: SchemaRef, - project_field_ids: Vec, - ) -> Vec { - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let file_size = std::fs::metadata(file_path).unwrap().len(); - let task = FileScanTask { - file_size_in_bytes: file_size, - start: 0, - length: file_size, - record_count: None, - data_file_path: file_path.to_string(), - data_file_format: DataFileFormat::Parquet, - schema, - project_field_ids, - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use parquet::arrow::PARQUET_FIELD_ID_META_KEY; - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - reader.read(tasks).unwrap().try_collect().await.unwrap() - } - - /// Writes a Parquet file with INT96 timestamps using SerializedFileWriter - /// (ArrowWriter cannot write INT96). Returns (file_path, expected_microsecond_values). - fn write_int96_parquet_file( - table_location: &str, - filename: &str, - with_field_ids: bool, - ) -> (String, Vec) { - let file_path = format!("{table_location}/{filename}"); - - let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL); - let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) - .with_repetition(Repetition::REQUIRED); - - if with_field_ids { - ts_builder = ts_builder.with_id(Some(1)); - id_builder = id_builder.with_id(Some(2)); - } + use super::coerce_int96_timestamps; + use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type}; - let schema = SchemaType::group_type_builder("schema") + fn iceberg_schema_with_timestamp() -> Schema { + Schema::builder() + .with_schema_id(1) .with_fields(vec![ - Arc::new(ts_builder.build().unwrap()), - Arc::new(id_builder.build().unwrap()), + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)).into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), ]) .build() - .unwrap(); - - // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. - const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; - const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; - const JULIAN_2100: u32 = 2_488_070; - - let test_data: Vec<(u32, u32, u32, i64)> = vec![ - // 3333-01-01 00:00:00 - ( - 0, - 0, - JULIAN_3333, - (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, - ), - // 3333-01-01 12:00:00 - ( - (NOON_NANOS & 0xFFFFFFFF) as u32, - (NOON_NANOS >> 32) as u32, - JULIAN_3333, - (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (NOON_NANOS / 1_000) as i64, - ), - // 2100-01-01 00:00:00 - ( - 0, - 0, - JULIAN_2100, - (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, - ), - ]; - - let int96_values: Vec = test_data - .iter() - .map(|(lo, hi, day, _)| { - let mut v = Int96::new(); - v.set_data(*lo, *hi, *day); - v - }) - .collect(); - - let id_values: Vec = (0..test_data.len() as i32).collect(); - let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); - - let mut row_group = writer.next_row_group().unwrap(); - { - // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) - .unwrap(); - col.close().unwrap(); - } - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&id_values, None, None) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - (file_path, expected_micros) + .unwrap() } - /// Read INT96 Parquet file through ArrowReader and verify top-level microsecond timestamps. - async fn assert_int96_read_matches( - file_path: &str, - schema: SchemaRef, - project_field_ids: Vec, - expected_micros: &[i64], - ) { - let batches = read_int96_batches(file_path, schema, project_field_ids).await; - - assert_eq!(batches.len(), 1); - let ts_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray"); - - for (i, expected) in expected_micros.iter().enumerate() { - assert_eq!( - ts_array.value(i), - *expected, - "Row {i}: got {}, expected {expected}", - ts_array.value(i) - ); - } + fn field_id_meta(id: i32) -> HashMap { + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), id.to_string())]) } - /// Files with embedded field IDs (branch 1 of schema resolution). - #[tokio::test] - async fn test_read_int96_timestamps_with_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); + #[test] + fn test_coerce_timestamp_ns_to_us() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "with_ids.parquet", true); - - assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + // Non-timestamp field unchanged + assert_eq!(coerced.field(1).data_type(), &DataType::Int32); } - /// Migrated files without field IDs (branches 2/3 of schema resolution). - #[tokio::test] - async fn test_read_int96_timestamps_without_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); + #[test] + fn test_coerce_timestamptz_ns_to_us() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamptz)).into(), + ]) + .build() + .unwrap(); - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "no_ids.parquet", false); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); - assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); } - /// Test reading INT96 timestamps inside a struct field. - #[tokio::test] - async fn test_read_int96_timestamps_in_struct() { - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/struct_int96.parquet"); - - let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(2)) + #[test] + fn test_no_coercion_when_iceberg_is_timestamp_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestampNs)).into(), + ]) .build() .unwrap(); - let struct_type = SchemaType::group_type_builder("data") - .with_repetition(Repetition::REQUIRED) - .with_id(Some(1)) - .with_fields(vec![Arc::new(ts_type)]) - .build() - .unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(struct_type)]) + #[test] + fn test_no_coercion_when_iceberg_is_timestamptz_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestamptzNs)) + .into(), + ]) .build() .unwrap(); - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). - // No repetition levels needed (no repeated groups). - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[1]), None) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required( - 1, - "data", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::optional( - 2, - "ts", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - ])), - ) - .into(), - ]) - .build() - .unwrap(), - ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } - assert_eq!(batches.len(), 1); - let struct_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected StructArray"); - let ts_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray inside struct"); + #[test] + fn test_no_coercion_when_already_microsecond() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Microsecond, None), true) + .with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + // Without field IDs, the visitor can't look up the Iceberg type and falls back + // to microsecond to match Iceberg Java behavior. + #[test] + fn test_defaults_to_us_without_field_ids() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in struct: got {}, expected {expected_micros}", - ts_array.value(0) + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) ); } - /// Test reading INT96 timestamps inside a list field (3-level Parquet LIST encoding). - #[tokio::test] - async fn test_read_int96_timestamps_in_list() { - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/list_int96.parquet"); - - // 3-level LIST encoding: - // optional group timestamps (LIST) { - // repeated group list { - // optional int96 element; - // } - // } - let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(2)) - .build() - .unwrap(); - - let list_group = SchemaType::group_type_builder("list") - .with_repetition(Repetition::REPEATED) - .with_fields(vec![Arc::new(element_type)]) + // Field ID exists but points to a non-timestamp Iceberg type. The field_by_id + // lookup succeeds but the match arm returns None, so unwrap_or falls back to + // microsecond. + #[test] + fn test_defaults_to_us_when_iceberg_type_is_not_timestamp() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::String)).into(), + ]) .build() .unwrap(); - let list_type = SchemaType::group_type_builder("timestamps") - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(1)) - .with_logical_type(Some(parquet::basic::LogicalType::List)) - .with_fields(vec![Arc::new(list_group)]) - .build() - .unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(list_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // Write a single row with a list containing one INT96 element. - // def=3: list present (1) + repeated group (2) + element present (3) - // rep=0: start of a new list - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[3]), Some(&[0])) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional( - 1, - "timestamps", - Type::List(crate::spec::ListType { - element_field: NestedField::optional( - 2, - "element", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - }), - ) - .into(), - ]) - .build() - .unwrap(), + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) ); + } - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + #[test] + fn test_coerce_preserves_field_metadata() { + let mut meta = field_id_meta(1); + meta.insert("custom_key".to_string(), "custom_value".to_string()); - assert_eq!(batches.len(), 1); - let list_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected ListArray"); - let ts_array = list_array - .values() - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray inside list"); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(meta.clone()), + ])); + let iceberg = iceberg_schema_with_timestamp(); - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in list: got {}, expected {expected_micros}", - ts_array.value(0) - ); + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!(coerced.field(0).metadata(), &meta); } - /// Test reading INT96 timestamps as map values. - #[tokio::test] - async fn test_read_int96_timestamps_in_map() { - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/map_int96.parquet"); - - // MAP encoding: - // optional group ts_map (MAP) { - // repeated group key_value { - // required binary key (UTF8); - // optional int96 value; - // } - // } - let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) - .with_repetition(Repetition::REQUIRED) - .with_logical_type(Some(parquet::basic::LogicalType::String)) - .with_id(Some(2)) - .build() - .unwrap(); - - let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(3)) + #[test] + fn test_coerce_timestamp_in_struct() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(StructType::new(vec![ + NestedField::optional(2, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + ])), + ) + .into(), + ]) .build() .unwrap(); - let key_value_group = SchemaType::group_type_builder("key_value") - .with_repetition(Repetition::REPEATED) - .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) - .build() - .unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "data", + DataType::Struct( + vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(2)), + ] + .into(), + ), + false, + ) + .with_metadata(field_id_meta(1)), + ])); - let map_type = SchemaType::group_type_builder("ts_map") - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(1)) - .with_logical_type(Some(parquet::basic::LogicalType::Map)) - .with_fields(vec![Arc::new(key_value_group)]) - .build() - .unwrap(); + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let inner = match coerced.field(0).data_type() { + DataType::Struct(fields) => fields, + other => panic!("Expected Struct, got {other}"), + }; + assert_eq!( + inner[0].data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(map_type)]) + #[test] + fn test_coerce_timestamp_in_list() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) .build() .unwrap(); - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // Write a single row with a map containing one key-value pair. - // rep=0 for both columns: start of a new map. - // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. - // value def=3: map present (1) + key_value entry present (2) + value present (3). - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch( - &[parquet::data_type::ByteArray::from("event_time")], - Some(&[2]), - Some(&[0]), - ) - .unwrap(); - col.close().unwrap(); - } - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[3]), Some(&[0])) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional( - 1, - "ts_map", - Type::Map(crate::spec::MapType { - key_field: NestedField::required( - 2, - "key", - Type::Primitive(PrimitiveType::String), - ) - .into(), - value_field: NestedField::optional( - 3, - "value", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - }), - ) - .into(), - ]) - .build() - .unwrap(), + let element_field = Field::new( + "element", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(2)); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("timestamps", DataType::List(Arc::new(element_field)), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let element_dt = match coerced.field(0).data_type() { + DataType::List(f) => f.data_type(), + other => panic!("Expected List, got {other}"), + }; + assert_eq!( + element_dt, + &DataType::Timestamp(TimeUnit::Microsecond, None) ); + } - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let map_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected MapArray"); - let ts_array = map_array - .values() - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray as map values"); + #[test] + fn test_coerce_timestamp_in_map_value() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in map: got {}, expected {expected_micros}", - ts_array.value(0) + let key_field = Field::new("key", DataType::Utf8, false).with_metadata(field_id_meta(2)); + let value_field = Field::new( + "value", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(3)); + let entries_field = Field::new( + "key_value", + DataType::Struct(vec![key_field, value_field].into()), + false, ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts_map", + DataType::Map(Arc::new(entries_field), false), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let value_dt = match coerced.field(0).data_type() { + DataType::Map(entries, _) => match entries.data_type() { + DataType::Struct(fields) => fields[1].data_type().clone(), + other => panic!("Expected Struct inside Map, got {other}"), + }, + other => panic!("Expected Map, got {other}"), + }; + assert_eq!(value_dt, DataType::Timestamp(TimeUnit::Microsecond, None)); } } diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 8eba4ec6c9..8cbd4a96c1 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -4689,4 +4689,557 @@ message schema { assert_eq!(result[1], expected_1); assert_eq!(result[2], expected_2); } + + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + // Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). + const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + fn make_int96_test_value() -> (parquet::data_type::Int96, i64) { + let mut val = parquet::data_type::Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader.read(tasks).unwrap().try_collect().await.unwrap() + } + + // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. + fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let file_path = format!("{table_location}/{filename}"); + + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. + const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; + + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // 3333-01-01 00:00:00 + ( + 0, + 0, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { + use arrow_array::TimestampMicrosecondArray; + + let batches = read_int96_batches(file_path, schema, project_field_ids).await; + + assert_eq!(batches.len(), 1); + let ts_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_struct() { + use arrow_array::{StructArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let struct_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + use arrow_array::{ListArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element. + // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + use arrow_array::{MapArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2]), + Some(&[0]), + ) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } }