127 changes: 127 additions & 0 deletions arrow-ipc/src/reader.rs
@@ -2098,6 +2098,133 @@ mod tests {
}
}

/// Test that the reader can read legacy files where empty list arrays were written with a 0-byte offsets buffer.
#[test]
fn test_read_legacy_empty_list_without_offsets_buffer() {
use crate::r#gen::Message::*;
use flatbuffers::FlatBufferBuilder;

let schema = Arc::new(Schema::new(vec![Field::new_list(
"items",
Field::new_list_field(DataType::Int32, true),
true,
)]));

// Legacy arrow-rs versions wrote empty offsets buffers for empty list arrays.
// Keep reader compatibility with such files by accepting a 0-byte offsets buffer.
let mut fbb = FlatBufferBuilder::new();
let nodes = fbb.create_vector(&[
FieldNode::new(0, 0), // list node
FieldNode::new(0, 0), // child int32 node
]);
let buffers = fbb.create_vector(&[
crate::Buffer::new(0, 0), // list validity
crate::Buffer::new(0, 0), // list offsets (legacy empty buffer)
crate::Buffer::new(0, 0), // child validity
crate::Buffer::new(0, 0), // child values
]);
let batch_offset = RecordBatch::create(
&mut fbb,
&RecordBatchArgs {
length: 0,
nodes: Some(nodes),
buffers: Some(buffers),
compression: None,
variadicBufferCounts: None,
},
);
fbb.finish_minimal(batch_offset);
let batch_bytes = fbb.finished_data().to_vec();
let batch = flatbuffers::root::<RecordBatch>(&batch_bytes).unwrap();

let body = Buffer::from(Vec::<u8>::new());
let dictionaries: HashMap<i64, ArrayRef> = HashMap::new();
let metadata = MetadataVersion::V5;

let decoder =
RecordBatchDecoder::try_new(&body, batch, schema.clone(), &dictionaries, &metadata)
.unwrap();

let read_batch = decoder.read_record_batch().unwrap();
assert_eq!(read_batch.num_rows(), 0);

let list = read_batch
.column(0)
.as_any()
.downcast_ref::<ListArray>()
.unwrap();
assert_eq!(list.len(), 0);
assert_eq!(list.values().len(), 0);
}

/// Test that the reader can read legacy files where empty Utf8/Binary arrays were written with a 0-byte offsets buffer.
#[test]
fn test_read_legacy_empty_utf8_and_binary_without_offsets_buffer() {
use crate::r#gen::Message::*;
use flatbuffers::FlatBufferBuilder;

let schema = Arc::new(Schema::new(vec![
Field::new("name", DataType::Utf8, true),
Field::new("payload", DataType::Binary, true),
]));

// Legacy arrow-rs versions wrote empty offsets buffers for empty Utf8/Binary arrays.
// Keep reader compatibility with such files by accepting 0-byte offsets buffers.
let mut fbb = FlatBufferBuilder::new();
let nodes = fbb.create_vector(&[
FieldNode::new(0, 0), // utf8 node
FieldNode::new(0, 0), // binary node
]);
let buffers = fbb.create_vector(&[
crate::Buffer::new(0, 0), // utf8 validity
crate::Buffer::new(0, 0), // utf8 offsets (legacy empty buffer)
crate::Buffer::new(0, 0), // utf8 values
crate::Buffer::new(0, 0), // binary validity
crate::Buffer::new(0, 0), // binary offsets (legacy empty buffer)
crate::Buffer::new(0, 0), // binary values
]);
let batch_offset = RecordBatch::create(
&mut fbb,
&RecordBatchArgs {
length: 0,
nodes: Some(nodes),
buffers: Some(buffers),
compression: None,
variadicBufferCounts: None,
},
);
fbb.finish_minimal(batch_offset);
let batch_bytes = fbb.finished_data().to_vec();
let batch = flatbuffers::root::<RecordBatch>(&batch_bytes).unwrap();

let body = Buffer::from(Vec::<u8>::new());
let dictionaries: HashMap<i64, ArrayRef> = HashMap::new();
let metadata = MetadataVersion::V5;

let decoder =
RecordBatchDecoder::try_new(&body, batch, schema.clone(), &dictionaries, &metadata)
.unwrap();

let read_batch = decoder.read_record_batch().unwrap();
assert_eq!(read_batch.num_rows(), 0);

let utf8 = read_batch
.column(0)
.as_any()
.downcast_ref::<StringArray>()
.unwrap();
assert_eq!(utf8.len(), 0);
assert_eq!(utf8.value_offsets(), [0]);

let binary = read_batch
.column(1)
.as_any()
.downcast_ref::<BinaryArray>()
.unwrap();
assert_eq!(binary.len(), 0);
assert_eq!(binary.value_offsets(), [0]);
}

#[test]
fn test_projection_array_values() {
// define schema
73 changes: 67 additions & 6 deletions arrow-ipc/src/writer.rs
@@ -37,7 +37,7 @@ use arrow_array::cast::*;
use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType};
use arrow_array::*;
use arrow_buffer::bit_util;
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, ToByteSlice};
use arrow_data::{ArrayData, ArrayDataBuilder, BufferSpec, layout};
use arrow_schema::*;

@@ -1724,7 +1724,11 @@ fn reencode_offsets<O: OffsetSizeTrait>(
/// size of sliced arrays, as values that have been sliced away are not encoded
fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buffer) {
if data.is_empty() {
return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
// As per specification, offsets buffer has N+1 elements.
// So an empty array should still be encoded with a single 0 offset.
let mut offsets = MutableBuffer::new(size_of::<O>());
offsets.extend_from_slice(O::usize_as(0).to_byte_slice());
return (offsets.into(), MutableBuffer::new(0).into());
}

let (offsets, original_start_offset, len) = reencode_offsets::<O>(&data.buffers()[0], data);
@@ -1736,10 +1740,11 @@
/// of a values buffer.
fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, ArrayData) {
if data.is_empty() {
return (
MutableBuffer::new(0).into(),
data.child_data()[0].slice(0, 0),
);
// As per specification, offsets buffer has N+1 elements.
// So an empty array should still be encoded with a single 0 offset.
let mut offsets = MutableBuffer::new(size_of::<O>());
offsets.extend_from_slice(O::usize_as(0).to_byte_slice());
return (offsets.into(), data.child_data()[0].slice(0, 0));
}

let (offsets, original_start_offset, len) = reencode_offsets::<O>(&data.buffers()[0], data);
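Aside (not part of the diff): the "offsets buffer has N+1 elements" rule cited in the comments above can be checked directly against the public `arrow_array` API. A minimal standalone sketch, assuming only the `arrow_array` crate:

```rust
use arrow_array::StringArray;

fn main() {
    // Two values, "ab" and "c": the offsets buffer has N + 1 = 3 entries,
    // each pair [offsets[i], offsets[i + 1]) delimiting one value in the
    // values buffer.
    let arr = StringArray::from(vec!["ab", "c"]);
    assert_eq!(arr.value_offsets(), [0, 2, 3]);

    // An empty array still has N + 1 = 1 offset: a single 0. This is the
    // buffer the writer change above now emits instead of a 0-byte buffer.
    let empty = StringArray::from(Vec::<String>::new());
    assert_eq!(empty.value_offsets(), [0]);
}
```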
@@ -2372,6 +2377,62 @@ mod tests {
}
}

#[test]
fn test_empty_utf8_ipc_writes_nonempty_offsets_buffer() {
let name = StringArray::from(Vec::<String>::new());
let (offsets, values) = get_byte_array_buffers::<i32>(&name.to_data());

assert_eq!(name.len(), 0);
assert_eq!(
offsets.len(),
std::mem::size_of::<i32>(),
"offsets buffer should contain one zero i32 offset"
);
assert_eq!(values.len(), 0, "values buffer should remain empty");
}

#[test]
fn test_empty_large_utf8_ipc_writes_nonempty_offsets_buffer() {
let name = LargeStringArray::from(Vec::<String>::new());
let (offsets, values) = get_byte_array_buffers::<i64>(&name.to_data());

assert_eq!(name.len(), 0);
assert_eq!(
offsets.len(),
std::mem::size_of::<i64>(),
"offsets buffer should contain one zero i64 offset"
);
assert_eq!(values.len(), 0, "values buffer should remain empty");
}

#[test]
fn test_empty_list_ipc_writes_nonempty_offsets_buffer() {
let list = GenericListBuilder::<i32, _>::new(UInt32Builder::new()).finish();
let (offsets, child_data) = get_list_array_buffers::<i32>(&list.to_data());

assert_eq!(list.len(), 0);
assert_eq!(
offsets.len(),
std::mem::size_of::<i32>(),
"offsets buffer should contain one zero i32 offset"
);
assert_eq!(child_data.len(), 0, "child data should remain empty");
}

Review thread:

Contributor: Can we please also add a test that shows IPC files without any offsets are still accepted?

Contributor Author (@atwam, Apr 21, 2026): Added two tests for legacy files.

Creating files with the legacy format means I could not rely on the writer (unless I added options to enable some legacy format, which was ugly), so I just went back to raw flatbuffers for the purpose of these tests.

Another alternative would be to generate such files once with the current version and embed some raw files, but that amounts to embedding small binary files, which I found equally repulsive.

I hope my approach will do.

Contributor: I think what you have is fine -- if it becomes too annoying to maintain we can make a checked in file

#[test]
fn test_empty_large_list_ipc_writes_nonempty_offsets_buffer() {
let list = GenericListBuilder::<i64, _>::new(UInt32Builder::new()).finish();
let (offsets, child_data) = get_list_array_buffers::<i64>(&list.to_data());

assert_eq!(list.len(), 0);
assert_eq!(
offsets.len(),
std::mem::size_of::<i64>(),
"offsets buffer should contain one zero i64 offset"
);
assert_eq!(child_data.len(), 0, "child data should remain empty");
}
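For completeness, a hedged standalone round-trip sketch (not part of this diff) of what these writer tests guard: an empty Utf8 column written through the public IPC stream writer should read back as a zero-row batch whose offsets are `[0]`. The column name and schema here are illustrative only.

```rust
use std::sync::Arc;

use arrow_array::{Array, ArrayRef, RecordBatch, StringArray};
use arrow_ipc::reader::StreamReader;
use arrow_ipc::writer::StreamWriter;
use arrow_schema::{DataType, Field, Schema};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8, true)]));
    let column: ArrayRef = Arc::new(StringArray::from(Vec::<String>::new()));
    let batch = RecordBatch::try_new(schema.clone(), vec![column])?;

    // Write a zero-row batch through the IPC stream writer.
    let mut buf = Vec::new();
    {
        let mut writer = StreamWriter::try_new(&mut buf, &schema)?;
        writer.write(&batch)?;
        writer.finish()?;
    }

    // Read it back: the empty Utf8 column round-trips with a single 0 offset.
    let mut reader = StreamReader::try_new(buf.as_slice(), None)?;
    let read = reader.next().unwrap()?;
    assert_eq!(read.num_rows(), 0);

    let utf8 = read
        .column(0)
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap();
    assert_eq!(utf8.value_offsets(), [0]);
    Ok(())
}
```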

fn write_null_file(options: IpcWriteOptions) {
let schema = Schema::new(vec![
Field::new("nulls", DataType::Null, true),