-
Notifications
You must be signed in to change notification settings - Fork 1.2k
arrow-ipc: Write 0 offset buffer for length-0 variable-size arrays #9717
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,7 +37,7 @@ use arrow_array::cast::*; | |
| use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType}; | ||
| use arrow_array::*; | ||
| use arrow_buffer::bit_util; | ||
| use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; | ||
| use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, ToByteSlice}; | ||
| use arrow_data::{ArrayData, ArrayDataBuilder, BufferSpec, layout}; | ||
| use arrow_schema::*; | ||
|
|
||
|
|
@@ -1724,7 +1724,11 @@ fn reencode_offsets<O: OffsetSizeTrait>( | |
| /// size of sliced arrays, as values that have been sliced away are not encoded | ||
| fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buffer) { | ||
| if data.is_empty() { | ||
| return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into()); | ||
| // As per specification, offsets buffer has N+1 elements. | ||
| // So an empty array should still be encoded with a single 0 offset. | ||
| let mut offsets = MutableBuffer::new(size_of::<O>()); | ||
| offsets.extend_from_slice(O::usize_as(0).to_byte_slice()); | ||
| return (offsets.into(), MutableBuffer::new(0).into()); | ||
| } | ||
|
|
||
| let (offsets, original_start_offset, len) = reencode_offsets::<O>(&data.buffers()[0], data); | ||
|
|
@@ -1736,10 +1740,11 @@ fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buff | |
| /// of a values buffer. | ||
| fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, ArrayData) { | ||
| if data.is_empty() { | ||
| return ( | ||
| MutableBuffer::new(0).into(), | ||
| data.child_data()[0].slice(0, 0), | ||
| ); | ||
| // As per specification, offsets buffer has N+1 elements. | ||
| // So an empty array should still be encoded with a single 0 offset. | ||
| let mut offsets = MutableBuffer::new(size_of::<O>()); | ||
|
atwam marked this conversation as resolved.
|
||
| offsets.extend_from_slice(O::usize_as(0).to_byte_slice()); | ||
| return (offsets.into(), data.child_data()[0].slice(0, 0)); | ||
| } | ||
|
|
||
| let (offsets, original_start_offset, len) = reencode_offsets::<O>(&data.buffers()[0], data); | ||
|
|
@@ -2372,6 +2377,62 @@ mod tests { | |
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_empty_utf8_ipc_writes_nonempty_offsets_buffer() { | ||
| let name = StringArray::from(Vec::<String>::new()); | ||
| let (offsets, values) = get_byte_array_buffers::<i32>(&name.to_data()); | ||
|
|
||
| assert_eq!(name.len(), 0); | ||
| assert_eq!( | ||
| offsets.len(), | ||
| std::mem::size_of::<i32>(), | ||
| "offsets buffer should contain one zero i32 offset" | ||
| ); | ||
| assert_eq!(values.len(), 0, "values buffer should remain empty"); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_empty_large_utf8_ipc_writes_nonempty_offsets_buffer() { | ||
| let name = LargeStringArray::from(Vec::<String>::new()); | ||
| let (offsets, values) = get_byte_array_buffers::<i64>(&name.to_data()); | ||
|
|
||
| assert_eq!(name.len(), 0); | ||
| assert_eq!( | ||
| offsets.len(), | ||
| std::mem::size_of::<i64>(), | ||
| "offsets buffer should contain one zero i64 offset" | ||
| ); | ||
| assert_eq!(values.len(), 0, "values buffer should remain empty"); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_empty_list_ipc_writes_nonempty_offsets_buffer() { | ||
| let list = GenericListBuilder::<i32, _>::new(UInt32Builder::new()).finish(); | ||
| let (offsets, child_data) = get_list_array_buffers::<i32>(&list.to_data()); | ||
|
|
||
| assert_eq!(list.len(), 0); | ||
| assert_eq!( | ||
| offsets.len(), | ||
| std::mem::size_of::<i32>(), | ||
| "offsets buffer should contain one zero i32 offset" | ||
| ); | ||
| assert_eq!(child_data.len(), 0, "child data should remain empty"); | ||
| } | ||
|
|
||
| #[test] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we please also add a test that shows IPC files without any offets are still accepted?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added two tests for legacy files. Creating files with the legacy format means I could not rely on the writer (unless I added options to enable some legacy format, which was ugly), so I just went back to raw flatbuffers for the purpose of these tests. Another alternative would be to generate such files once with the current version and embed some raw files, but that amounts to embedding small binary files, which I found equally repulsive. I hope my approach will do.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think what you have is fine -- if it becomes too annoying to maintain we can make a checked in file |
||
| fn test_empty_large_list_ipc_writes_nonempty_offsets_buffer() { | ||
| let list = GenericListBuilder::<i64, _>::new(UInt32Builder::new()).finish(); | ||
| let (offsets, child_data) = get_list_array_buffers::<i64>(&list.to_data()); | ||
|
|
||
| assert_eq!(list.len(), 0); | ||
| assert_eq!( | ||
| offsets.len(), | ||
| std::mem::size_of::<i64>(), | ||
| "offsets buffer should contain one zero i64 offset" | ||
| ); | ||
| assert_eq!(child_data.len(), 0, "child data should remain empty"); | ||
| } | ||
|
|
||
| fn write_null_file(options: IpcWriteOptions) { | ||
| let schema = Schema::new(vec![ | ||
| Field::new("nulls", DataType::Null, true), | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.