Skip to content

Commit 37ab9bd

Browse files
committed
[Experiment] Self-defined arrow-export
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent 43ae2dc commit 37ab9bd

28 files changed

Lines changed: 1124 additions & 545 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

encodings/runend/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ version = { workspace = true }
1515

1616
[dependencies]
1717
arbitrary = { workspace = true, optional = true }
18-
arrow-array = { workspace = true, optional = true }
18+
arrow-array = { workspace = true }
19+
arrow-buffer = { workspace = true }
20+
arrow-schema = { workspace = true }
1921
itertools = { workspace = true }
2022
num-traits = { workspace = true }
2123
prost = { workspace = true }
@@ -39,7 +41,6 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
3941

4042
[features]
4143
arbitrary = ["dep:arbitrary", "vortex-array/arbitrary"]
42-
arrow = ["dep:arrow-array"]
4344

4445
[[bench]]
4546
name = "run_end_null_count"

encodings/runend/public-api.lock

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,16 @@ pub fn vortex_runend::RunEnd::nbuffers(_array: &vortex_runend::RunEndArray) -> u
8888

8989
pub fn vortex_runend::RunEnd::nchildren(_array: &vortex_runend::RunEndArray) -> usize
9090

91+
pub fn vortex_runend::RunEnd::preferred_arrow_data_type(array: &vortex_runend::RunEndArray) -> core::option::Option<arrow_schema::datatype::DataType>
92+
9193
pub fn vortex_runend::RunEnd::reduce_parent(array: &Self::Array, parent: &vortex_array::array::ArrayRef, child_idx: usize) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
9294

9395
pub fn vortex_runend::RunEnd::serialize(metadata: Self::Metadata) -> vortex_error::VortexResult<core::option::Option<alloc::vec::Vec<u8>>>
9496

9597
pub fn vortex_runend::RunEnd::stats(array: &vortex_runend::RunEndArray) -> vortex_array::stats::array::StatsSetRef<'_>
9698

99+
pub fn vortex_runend::RunEnd::to_arrow_array(array: &vortex_runend::RunEndArray, data_type: &arrow_schema::datatype::DataType, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<arrow_array::array::ArrayRef>>
100+
97101
pub fn vortex_runend::RunEnd::with_children(array: &mut Self::Array, children: alloc::vec::Vec<vortex_array::array::ArrayRef>) -> vortex_error::VortexResult<()>
98102

99103
impl vortex_array::vtable::operations::OperationsVTable<vortex_runend::RunEnd> for vortex_runend::RunEnd

encodings/runend/src/array.rs

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,15 @@
33

44
use std::fmt::Debug;
55
use std::hash::Hash;
6-
6+
use std::sync::Arc;
7+
8+
use arrow_array::ArrayRef as ArrowArrayRef;
9+
use arrow_array::RunArray;
10+
use arrow_array::cast::AsArray;
11+
use arrow_array::types::*;
12+
use arrow_buffer::ArrowNativeType;
13+
use arrow_schema::DataType;
14+
use arrow_schema::Field;
715
use vortex_array::ArrayEq;
816
use vortex_array::ArrayHash;
917
use vortex_array::ArrayRef;
@@ -17,6 +25,7 @@ use vortex_array::ProstMetadata;
1725
use vortex_array::SerializeMetadata;
1826
use vortex_array::arrays::Primitive;
1927
use vortex_array::arrays::VarBinViewArray;
28+
use vortex_array::arrow::ArrowArrayExecutor;
2029
use vortex_array::buffer::BufferHandle;
2130
use vortex_array::dtype::DType;
2231
use vortex_array::dtype::Nullability;
@@ -203,6 +212,41 @@ impl VTable for RunEnd {
203212
PARENT_KERNELS.execute(array, parent, child_idx, ctx)
204213
}
205214

215+
fn preferred_arrow_data_type(array: &RunEndArray) -> Option<DataType> {
216+
let ends_dt = array.ends().dtype().to_arrow_dtype().ok()?;
217+
let values_dt = array.values().dtype().to_arrow_dtype().ok()?;
218+
let values_nullable = array.values().dtype().is_nullable();
219+
Some(DataType::RunEndEncoded(
220+
Arc::new(Field::new("run_ends", ends_dt, false)),
221+
Arc::new(Field::new("values", values_dt, values_nullable)),
222+
))
223+
}
224+
225+
fn to_arrow_array(
226+
array: &RunEndArray,
227+
data_type: &DataType,
228+
ctx: &mut ExecutionCtx,
229+
) -> VortexResult<Option<ArrowArrayRef>> {
230+
let DataType::RunEndEncoded(ends_field, values_field) = data_type else {
231+
return Ok(None);
232+
};
233+
let arrow_ends = array
234+
.ends()
235+
.clone()
236+
.execute_arrow(Some(ends_field.data_type()), ctx)?;
237+
let arrow_values = array
238+
.values()
239+
.clone()
240+
.execute_arrow(Some(values_field.data_type()), ctx)?;
241+
Ok(Some(build_run_array(
242+
&arrow_ends,
243+
&arrow_values,
244+
ends_field.data_type(),
245+
array.offset(),
246+
array.len(),
247+
)?))
248+
}
249+
206250
fn execute(array: &Self::Array, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionStep> {
207251
run_end_canonicalize(array, ctx).map(ExecutionStep::Done)
208252
}
@@ -503,6 +547,59 @@ pub(super) fn run_end_canonicalize(
503547
})
504548
}
505549

550+
fn build_run_array(
551+
ends: &ArrowArrayRef,
552+
values: &ArrowArrayRef,
553+
ends_type: &DataType,
554+
offset: usize,
555+
length: usize,
556+
) -> VortexResult<ArrowArrayRef> {
557+
match ends_type {
558+
DataType::Int16 => build_run_array_typed::<Int16Type>(ends, values, offset, length),
559+
DataType::Int32 => build_run_array_typed::<Int32Type>(ends, values, offset, length),
560+
DataType::Int64 => build_run_array_typed::<Int64Type>(ends, values, offset, length),
561+
_ => vortex_bail!("Unsupported run-end index type: {:?}", ends_type),
562+
}
563+
}
564+
565+
fn build_run_array_typed<R: RunEndIndexType>(
566+
ends: &ArrowArrayRef,
567+
values: &ArrowArrayRef,
568+
offset: usize,
569+
length: usize,
570+
) -> VortexResult<ArrowArrayRef>
571+
where
572+
R::Native: std::ops::Sub<Output = R::Native> + Ord,
573+
{
574+
let offset_native = R::Native::from_usize(offset).ok_or_else(|| {
575+
vortex_error::vortex_err!("Offset {offset} exceeds run-end index capacity")
576+
})?;
577+
let length_native = R::Native::from_usize(length).ok_or_else(|| {
578+
vortex_error::vortex_err!("Length {length} exceeds run-end index capacity")
579+
})?;
580+
581+
let ends_prim = ends.as_primitive::<R>();
582+
if offset == 0 && ends_prim.values().last() == Some(&length_native) {
583+
return Ok(Arc::new(RunArray::<R>::try_new(ends_prim, values)?) as ArrowArrayRef);
584+
}
585+
586+
// Trim to only include runs covering the [offset, offset+length) range.
587+
let num_runs = (ends_prim
588+
.values()
589+
.partition_point(|&e| e - offset_native < length_native)
590+
+ 1)
591+
.min(ends_prim.len());
592+
593+
let trimmed_ends = ends.slice(0, num_runs);
594+
let trimmed_values = values.slice(0, num_runs);
595+
596+
let adjusted = trimmed_ends
597+
.as_primitive::<R>()
598+
.unary(|end| (end - offset_native).min(length_native));
599+
600+
Ok(Arc::new(RunArray::<R>::try_new(&adjusted, &trimmed_values)?) as ArrowArrayRef)
601+
}
602+
506603
#[cfg(test)]
507604
mod tests {
508605
use vortex_array::IntoArray;

encodings/runend/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ pub use array::*;
99
pub use iter::trimmed_ends_iter;
1010

1111
mod array;
12-
#[cfg(feature = "arrow")]
1312
mod arrow;
1413
pub mod compress;
1514
mod compute;

0 commit comments

Comments
 (0)