diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 9afae78b0665..411c1f14c26d 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -2098,6 +2098,133 @@ mod tests { } } + /// Test that the reader can read legacy files where empty list arrays were written with a 0-byte offsets buffer. + #[test] + fn test_read_legacy_empty_list_without_offsets_buffer() { + use crate::r#gen::Message::*; + use flatbuffers::FlatBufferBuilder; + + let schema = Arc::new(Schema::new(vec![Field::new_list( + "items", + Field::new_list_field(DataType::Int32, true), + true, + )])); + + // Legacy arrow-rs versions wrote empty offsets buffers for empty list arrays. + // Keep reader compatibility with such files by accepting a 0-byte offsets buffer. + let mut fbb = FlatBufferBuilder::new(); + let nodes = fbb.create_vector(&[ + FieldNode::new(0, 0), // list node + FieldNode::new(0, 0), // child int32 node + ]); + let buffers = fbb.create_vector(&[ + crate::Buffer::new(0, 0), // list validity + crate::Buffer::new(0, 0), // list offsets (legacy empty buffer) + crate::Buffer::new(0, 0), // child validity + crate::Buffer::new(0, 0), // child values + ]); + let batch_offset = RecordBatch::create( + &mut fbb, + &RecordBatchArgs { + length: 0, + nodes: Some(nodes), + buffers: Some(buffers), + compression: None, + variadicBufferCounts: None, + }, + ); + fbb.finish_minimal(batch_offset); + let batch_bytes = fbb.finished_data().to_vec(); + let batch = flatbuffers::root::(&batch_bytes).unwrap(); + + let body = Buffer::from(Vec::::new()); + let dictionaries: HashMap = HashMap::new(); + let metadata = MetadataVersion::V5; + + let decoder = + RecordBatchDecoder::try_new(&body, batch, schema.clone(), &dictionaries, &metadata) + .unwrap(); + + let read_batch = decoder.read_record_batch().unwrap(); + assert_eq!(read_batch.num_rows(), 0); + + let list = read_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(list.len(), 0); + assert_eq!(list.values().len(), 0); + } + + /// Test that the reader can read legacy files where empty Utf8/Binary arrays were written with a 0-byte offsets buffer. + #[test] + fn test_read_legacy_empty_utf8_and_binary_without_offsets_buffer() { + use crate::r#gen::Message::*; + use flatbuffers::FlatBufferBuilder; + + let schema = Arc::new(Schema::new(vec![ + Field::new("name", DataType::Utf8, true), + Field::new("payload", DataType::Binary, true), + ])); + + // Legacy arrow-rs versions wrote empty offsets buffers for empty Utf8/Binary arrays. + // Keep reader compatibility with such files by accepting 0-byte offsets buffers. + let mut fbb = FlatBufferBuilder::new(); + let nodes = fbb.create_vector(&[ + FieldNode::new(0, 0), // utf8 node + FieldNode::new(0, 0), // binary node + ]); + let buffers = fbb.create_vector(&[ + crate::Buffer::new(0, 0), // utf8 validity + crate::Buffer::new(0, 0), // utf8 offsets (legacy empty buffer) + crate::Buffer::new(0, 0), // utf8 values + crate::Buffer::new(0, 0), // binary validity + crate::Buffer::new(0, 0), // binary offsets (legacy empty buffer) + crate::Buffer::new(0, 0), // binary values + ]); + let batch_offset = RecordBatch::create( + &mut fbb, + &RecordBatchArgs { + length: 0, + nodes: Some(nodes), + buffers: Some(buffers), + compression: None, + variadicBufferCounts: None, + }, + ); + fbb.finish_minimal(batch_offset); + let batch_bytes = fbb.finished_data().to_vec(); + let batch = flatbuffers::root::(&batch_bytes).unwrap(); + + let body = Buffer::from(Vec::::new()); + let dictionaries: HashMap = HashMap::new(); + let metadata = MetadataVersion::V5; + + let decoder = + RecordBatchDecoder::try_new(&body, batch, schema.clone(), &dictionaries, &metadata) + .unwrap(); + + let read_batch = decoder.read_record_batch().unwrap(); + assert_eq!(read_batch.num_rows(), 0); + + let utf8 = read_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(utf8.len(), 0); + assert_eq!(utf8.value_offsets(), [0]); + + let binary = read_batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(binary.len(), 0); + assert_eq!(binary.value_offsets(), [0]); + } + #[test] fn test_projection_array_values() { // define schema diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 9d3fca293874..4194bbf2ee39 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -37,7 +37,7 @@ use arrow_array::cast::*; use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; use arrow_array::*; use arrow_buffer::bit_util; -use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder, BufferSpec, layout}; use arrow_schema::*; @@ -1724,7 +1724,11 @@ fn reencode_offsets( /// size of sliced arrays, as values that have been sliced away are not encoded fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buffer) { if data.is_empty() { - return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); + // As per specification, offsets buffer has N+1 elements. + // So an empty array should still be encoded with a single 0 offset. + let mut offsets = MutableBuffer::new(size_of::()); + offsets.extend_from_slice(O::usize_as(0).to_byte_slice()); + return (offsets.into(), MutableBuffer::new(0).into()); } let (offsets, original_start_offset, len) = reencode_offsets::(&data.buffers()[0], data); @@ -1736,10 +1740,11 @@ fn get_byte_array_buffers(data: &ArrayData) -> (Buffer, Buff /// of a values buffer. fn get_list_array_buffers(data: &ArrayData) -> (Buffer, ArrayData) { if data.is_empty() { - return ( - MutableBuffer::new(0).into(), - data.child_data()[0].slice(0, 0), - ); + // As per specification, offsets buffer has N+1 elements. + // So an empty array should still be encoded with a single 0 offset. + let mut offsets = MutableBuffer::new(size_of::()); + offsets.extend_from_slice(O::usize_as(0).to_byte_slice()); + return (offsets.into(), data.child_data()[0].slice(0, 0)); } let (offsets, original_start_offset, len) = reencode_offsets::(&data.buffers()[0], data); @@ -2372,6 +2377,62 @@ mod tests { } } + #[test] + fn test_empty_utf8_ipc_writes_nonempty_offsets_buffer() { + let name = StringArray::from(Vec::::new()); + let (offsets, values) = get_byte_array_buffers::(&name.to_data()); + + assert_eq!(name.len(), 0); + assert_eq!( + offsets.len(), + std::mem::size_of::(), + "offsets buffer should contain one zero i32 offset" + ); + assert_eq!(values.len(), 0, "values buffer should remain empty"); + } + + #[test] + fn test_empty_large_utf8_ipc_writes_nonempty_offsets_buffer() { + let name = LargeStringArray::from(Vec::::new()); + let (offsets, values) = get_byte_array_buffers::(&name.to_data()); + + assert_eq!(name.len(), 0); + assert_eq!( + offsets.len(), + std::mem::size_of::(), + "offsets buffer should contain one zero i64 offset" + ); + assert_eq!(values.len(), 0, "values buffer should remain empty"); + } + + #[test] + fn test_empty_list_ipc_writes_nonempty_offsets_buffer() { + let list = GenericListBuilder::::new(UInt32Builder::new()).finish(); + let (offsets, child_data) = get_list_array_buffers::(&list.to_data()); + + assert_eq!(list.len(), 0); + assert_eq!( + offsets.len(), + std::mem::size_of::(), + "offsets buffer should contain one zero i32 offset" + ); + assert_eq!(child_data.len(), 0, "child data should remain empty"); + } + + #[test] + fn test_empty_large_list_ipc_writes_nonempty_offsets_buffer() { + let list = GenericListBuilder::::new(UInt32Builder::new()).finish(); + let (offsets, child_data) = get_list_array_buffers::(&list.to_data()); + + assert_eq!(list.len(), 0); + assert_eq!( + offsets.len(), + std::mem::size_of::(), + "offsets buffer should contain one zero i64 offset" + ); + assert_eq!(child_data.len(), 0, "child data should remain empty"); + } + fn write_null_file(options: IpcWriteOptions) { let schema = Schema::new(vec![ Field::new("nulls", DataType::Null, true),