Skip to content

Commit 7679cdc

Browse files
committed
feat[vortex-array]: support executing UUID to arrow
Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent 38ab5af commit 7679cdc

File tree

7 files changed

+227
-35
lines changed

7 files changed

+227
-35
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ arrow-cast = "58"
9797
arrow-data = "58"
9898
arrow-ipc = "58"
9999
arrow-ord = "58"
100-
arrow-schema = "58"
100+
arrow-schema = { version = "58", features = ["canonical_extension_types"] }
101101
arrow-select = "58"
102102
arrow-string = "58"
103103
async-fs = "2.2.0"

vortex-array/public-api.lock

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14858,6 +14858,14 @@ pub mod vortex_array::extension::uuid
1485814858

1485914859
pub struct vortex_array::extension::uuid::Uuid
1486014860

14861+
impl vortex_array::extension::uuid::Uuid
14862+
14863+
pub fn vortex_array::extension::uuid::Uuid::default(nullability: vortex_array::dtype::Nullability) -> vortex_array::dtype::extension::ExtDType<Self>
14864+
14865+
pub fn vortex_array::extension::uuid::Uuid::new(metadata: vortex_array::extension::uuid::UuidMetadata, nullability: vortex_array::dtype::Nullability) -> vortex_array::dtype::extension::ExtDType<Self>
14866+
14867+
pub fn vortex_array::extension::uuid::Uuid::storage_dtype(nullability: vortex_array::dtype::Nullability) -> vortex_array::dtype::DType
14868+
1486114869
impl core::clone::Clone for vortex_array::extension::uuid::Uuid
1486214870

1486314871
pub fn vortex_array::extension::uuid::Uuid::clone(&self) -> vortex_array::extension::uuid::Uuid
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use std::sync::Arc;
5+
6+
use arrow_array::ArrayRef as ArrowArrayRef;
7+
use arrow_array::FixedSizeBinaryArray;
8+
use vortex_error::VortexResult;
9+
use vortex_error::vortex_bail;
10+
11+
use crate::ArrayRef;
12+
use crate::ExecutionCtx;
13+
use crate::arrays::ExtensionArray;
14+
use crate::arrays::FixedSizeListArray;
15+
use crate::arrays::PrimitiveArray;
16+
use crate::arrow::executor::validity::to_arrow_null_buffer;
17+
use crate::dtype::DType;
18+
use crate::dtype::PType;
19+
use crate::vtable::ValidityHelper;
20+
21+
/// Convert a Vortex array to an Arrow `FixedSizeBinaryArray`.
22+
///
23+
/// Accepts either an extension array (e.g. UUID) or a plain `FixedSizeList(Primitive(U8), size)`.
24+
pub(super) fn to_arrow_fixed_size_binary(
25+
array: ArrayRef,
26+
size: i32,
27+
ctx: &mut ExecutionCtx,
28+
) -> VortexResult<ArrowArrayRef> {
29+
let storage = if array.dtype().is_extension() {
30+
array
31+
.execute::<ExtensionArray>(ctx)?
32+
.storage_array()
33+
.clone()
34+
} else {
35+
array
36+
};
37+
38+
let fsl = storage.execute::<FixedSizeListArray>(ctx)?;
39+
40+
match fsl.dtype() {
41+
DType::FixedSizeList(elem, list_size, _)
42+
if *list_size == size as u32
43+
&& matches!(elem.as_ref(), DType::Primitive(PType::U8, _)) => {}
44+
other => {
45+
vortex_bail!("FixedSizeBinary({size}) requires FixedSizeList(U8, {size}), got {other}");
46+
}
47+
}
48+
49+
let elements = fsl.elements().clone().execute::<PrimitiveArray>(ctx)?;
50+
let values = elements.into_buffer::<u8>().into_arrow_buffer();
51+
let null_buffer = to_arrow_null_buffer(fsl.validity(), fsl.len(), ctx)?;
52+
53+
Ok(Arc::new(FixedSizeBinaryArray::new(
54+
size,
55+
values,
56+
null_buffer,
57+
)))
58+
}
59+
60+
#[cfg(test)]
mod tests {
    use arrow_array::FixedSizeBinaryArray;
    use arrow_schema::DataType;
    use vortex_buffer::BitBuffer;
    use vortex_buffer::Buffer;

    use crate::IntoArray;
    use crate::LEGACY_SESSION;
    use crate::VortexSessionExecute;
    use crate::arrays::ExtensionArray;
    use crate::arrays::FixedSizeListArray;
    use crate::arrays::PrimitiveArray;
    use crate::arrow::ArrowArrayExecutor;
    use crate::dtype::Nullability;
    use crate::extension::uuid::Uuid;
    use crate::extension::uuid::vtable::UUID_BYTE_LEN;
    use crate::validity::Validity;

    /// Executes a nullable UUID extension array to Arrow `FixedSizeBinary` and compares it with
    /// an Arrow array built directly from the same values, including the null middle row.
    #[expect(
        clippy::cast_possible_truncation,
        reason = "UUID_BYTE_LEN always fits u32/i32"
    )]
    #[test]
    fn test_uuid_to_fixed_size_binary() {
        let first = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
        let second = uuid::Uuid::parse_str("f47ac10b-58cc-4372-a567-0e02b2c3d479").unwrap();

        // Three rows of bytes laid out back to back; the middle row is a zeroed placeholder
        // that the validity below marks as null.
        let placeholder = [0u8; 16];
        let bytes: Vec<u8> = [
            first.as_bytes().as_slice(),
            placeholder.as_slice(),
            second.as_bytes().as_slice(),
        ]
        .concat();

        let byte_elements =
            PrimitiveArray::new(Buffer::from(bytes), Validity::NonNullable).into_array();
        let row_validity = Validity::from(BitBuffer::from_iter([true, false, true]));
        let storage =
            FixedSizeListArray::try_new(byte_elements, UUID_BYTE_LEN as u32, row_validity, 3)
                .unwrap()
                .into_array();
        let uuid_array =
            ExtensionArray::new(Uuid::default(Nullability::Nullable).erased(), storage);

        let mut ctx = LEGACY_SESSION.create_execution_ctx();
        let target_type = DataType::FixedSizeBinary(UUID_BYTE_LEN as i32);
        let arrow = uuid_array
            .into_array()
            .execute_arrow(Some(&target_type), &mut ctx)
            .unwrap();

        let expected_rows = [
            Some(first.as_bytes().as_slice()),
            None,
            Some(second.as_bytes().as_slice()),
        ];
        let expected = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
            expected_rows.into_iter(),
            UUID_BYTE_LEN as i32,
        )
        .unwrap();
        assert_eq!(arrow.as_ref(), &expected as &dyn arrow_array::Array);
    }
}

vortex-array/src/arrow/executor/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod byte;
66
pub mod byte_view;
77
mod decimal;
88
mod dictionary;
9+
mod fixed_size_binary;
910
mod fixed_size_list;
1011
mod list;
1112
mod list_view;
@@ -38,6 +39,7 @@ use crate::arrow::executor::byte::to_arrow_byte_array;
3839
use crate::arrow::executor::byte_view::to_arrow_byte_view;
3940
use crate::arrow::executor::decimal::to_arrow_decimal;
4041
use crate::arrow::executor::dictionary::to_arrow_dictionary;
42+
use crate::arrow::executor::fixed_size_binary::to_arrow_fixed_size_binary;
4143
use crate::arrow::executor::fixed_size_list::to_arrow_fixed_list;
4244
use crate::arrow::executor::list::to_arrow_list;
4345
use crate::arrow::executor::list_view::to_arrow_list_view;
@@ -156,8 +158,8 @@ impl ArrowArrayExecutor for ArrayRef {
156158
DataType::RunEndEncoded(ends_type, values_type) => {
157159
to_arrow_run_end(self, ends_type.data_type(), values_type, ctx)
158160
}
159-
DataType::FixedSizeBinary(_)
160-
| DataType::Map(..)
161+
DataType::FixedSizeBinary(size) => to_arrow_fixed_size_binary(self, *size, ctx),
162+
DataType::Map(..)
161163
| DataType::Duration(_)
162164
| DataType::Interval(_)
163165
| DataType::Union(..) => {

vortex-array/src/dtype/arrow.rs

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use arrow_schema::Schema;
2323
use arrow_schema::SchemaBuilder;
2424
use arrow_schema::SchemaRef;
2525
use arrow_schema::TimeUnit as ArrowTimeUnit;
26+
use arrow_schema::extension::ExtensionType as _;
2627
use vortex_error::VortexError;
2728
use vortex_error::VortexExpect;
2829
use vortex_error::VortexResult;
@@ -42,6 +43,8 @@ use crate::extension::datetime::TemporalMetadata;
4243
use crate::extension::datetime::Time;
4344
use crate::extension::datetime::TimeUnit;
4445
use crate::extension::datetime::Timestamp;
46+
use crate::extension::uuid::Uuid;
47+
use crate::extension::uuid::vtable::UUID_BYTE_LEN;
4548

4649
/// Trait for converting Arrow types to Vortex types.
4750
pub trait FromArrowType<T>: Sized {
@@ -210,15 +213,22 @@ impl FromArrowType<(&DataType, Nullability)> for DType {
210213

211214
impl FromArrowType<&Field> for DType {
212215
fn from_arrow(field: &Field) -> Self {
216+
let nullability = Nullability::from(field.is_nullable());
217+
213218
if field
214219
.metadata()
215220
.get("ARROW:extension:name")
216221
.map(|s| s.as_str())
217222
== Some("arrow.parquet.variant")
218223
{
219-
return DType::Variant(field.is_nullable().into());
224+
return DType::Variant(nullability);
225+
}
226+
227+
if field.extension_type_name() == Some(arrow_schema::extension::Uuid::NAME) {
228+
return DType::Extension(Uuid::default(nullability).erased());
220229
}
221-
Self::from_arrow((field.data_type(), field.is_nullable().into()))
230+
231+
Self::from_arrow((field.data_type(), nullability))
222232
}
223233
}
224234

@@ -245,11 +255,17 @@ impl DType {
245255
.into(),
246256
)
247257
} else {
248-
Field::new(
258+
let mut field = Field::new(
249259
field_name.as_ref(),
250260
field_dtype.to_arrow_dtype()?,
251261
field_dtype.is_nullable(),
252-
)
262+
);
263+
if let DType::Extension(ext) = field_dtype
264+
&& ext.is::<Uuid>()
265+
{
266+
field = field.with_extension_type(arrow_schema::extension::Uuid);
267+
}
268+
field
253269
};
254270
builder.push(field);
255271
}
@@ -349,6 +365,14 @@ impl DType {
349365
});
350366
};
351367

368+
if ext_dtype.is::<Uuid>() {
369+
#[expect(
370+
clippy::cast_possible_truncation,
371+
reason = "UUID_BYTE_LEN always fits i32"
372+
)]
373+
return Ok(DataType::FixedSizeBinary(UUID_BYTE_LEN as i32));
374+
}
375+
352376
vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id())
353377
}
354378
})
@@ -561,4 +585,25 @@ mod test {
561585

562586
assert_eq!(original_dtype, roundtripped_dtype);
563587
}
588+
589+
#[test]
590+
fn test_uuid_schema_roundtrip() {
591+
let original = DType::struct_(
592+
[(
593+
"id",
594+
DType::Extension(Uuid::default(Nullability::Nullable).erased()),
595+
)],
596+
Nullability::NonNullable,
597+
);
598+
let schema = original.to_arrow_schema().unwrap();
599+
600+
let field = schema.field(0);
601+
assert_eq!(field.data_type(), &DataType::FixedSizeBinary(16));
602+
assert_eq!(
603+
field.extension_type_name(),
604+
Some(arrow_schema::extension::Uuid::NAME)
605+
);
606+
607+
assert_eq!(DType::from_arrow(&schema), original);
608+
}
564609
}

vortex-array/src/extension/uuid/mod.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,42 @@ pub use metadata::UuidMetadata;
1515

1616
pub(crate) mod vtable;
1717

18+
use std::sync::Arc;
19+
20+
use vortex_error::VortexExpect;
21+
22+
use crate::dtype::DType;
23+
use crate::dtype::Nullability;
24+
use crate::dtype::PType;
25+
use crate::dtype::extension::ExtDType;
26+
1827
/// The VTable for the UUID extension type.
1928
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
2029
pub struct Uuid;
30+
31+
#[expect(
32+
clippy::cast_possible_truncation,
33+
reason = "UUID_BYTE_LEN always fits u32"
34+
)]
35+
#[allow(clippy::same_name_method)]
36+
impl Uuid {
37+
/// Returns the canonical UUID storage dtype: `FixedSizeList(Primitive(U8, NonNullable), 16)`.
38+
pub fn storage_dtype(nullability: Nullability) -> DType {
39+
DType::FixedSizeList(
40+
Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
41+
vtable::UUID_BYTE_LEN as u32,
42+
nullability,
43+
)
44+
}
45+
46+
/// Creates a new UUID extension dtype with the given metadata and nullability.
47+
pub fn new(metadata: UuidMetadata, nullability: Nullability) -> ExtDType<Self> {
48+
ExtDType::try_new(metadata, Self::storage_dtype(nullability))
49+
.vortex_expect("valid UUID storage dtype")
50+
}
51+
52+
/// Creates a new UUID extension dtype with default metadata.
53+
pub fn default(nullability: Nullability) -> ExtDType<Self> {
54+
Self::new(UuidMetadata::default(), nullability)
55+
}
56+
}

vortex-array/src/extension/uuid/vtable.rs

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -182,9 +182,11 @@ mod tests {
182182
#[case::non_nullable(Nullability::NonNullable)]
183183
#[case::nullable(Nullability::Nullable)]
184184
fn validate_correct_storage_dtype(#[case] nullability: Nullability) -> VortexResult<()> {
185-
let metadata = UuidMetadata::default();
186-
let storage_dtype = uuid_storage_dtype(nullability);
187-
ExtDType::try_with_vtable(Uuid, metadata, storage_dtype)?;
185+
ExtDType::try_with_vtable(
186+
Uuid,
187+
UuidMetadata::default(),
188+
Uuid::storage_dtype(nullability),
189+
)?;
188190
Ok(())
189191
}
190192

@@ -229,10 +231,7 @@ mod tests {
229231
let expected = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
230232
.map_err(|e| vortex_error::vortex_err!("{e}"))?;
231233

232-
let ext_dtype = ExtDType::try_new(
233-
UuidMetadata::default(),
234-
uuid_storage_dtype(Nullability::NonNullable),
235-
)?;
234+
let ext_dtype = Uuid::default(Nullability::NonNullable);
236235
let children: Vec<Scalar> = expected
237236
.as_bytes()
238237
.iter()
@@ -261,13 +260,12 @@ mod tests {
261260
assert_eq!(v4_uuid.get_version(), Some(Version::Random));
262261

263262
// Metadata says v7, but the UUID is v4.
264-
let ext_dtype = ExtDType::try_with_vtable(
265-
Uuid,
263+
let ext_dtype = Uuid::new(
266264
UuidMetadata {
267265
version: Some(Version::SortRand),
268266
},
269-
uuid_storage_dtype(Nullability::NonNullable),
270-
)?;
267+
Nullability::NonNullable,
268+
);
271269
let children: Vec<Scalar> = v4_uuid
272270
.as_bytes()
273271
.iter()
@@ -307,13 +305,12 @@ mod tests {
307305
let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
308306
.map_err(|e| vortex_error::vortex_err!("{e}"))?;
309307

310-
let ext_dtype = ExtDType::try_new(
308+
let ext_dtype = Uuid::new(
311309
UuidMetadata {
312310
version: Some(Version::Random),
313311
},
314-
uuid_storage_dtype(Nullability::NonNullable),
315-
)
316-
.unwrap();
312+
Nullability::NonNullable,
313+
);
317314
let storage_value = uuid_storage_scalar(&v4_uuid);
318315

319316
let result = Uuid::unpack_native(&ext_dtype, &storage_value)?;
@@ -327,23 +324,11 @@ mod tests {
327324
let v4_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")
328325
.map_err(|e| vortex_error::vortex_err!("{e}"))?;
329326

330-
let ext_dtype = ExtDType::try_new(
331-
UuidMetadata::default(),
332-
uuid_storage_dtype(Nullability::NonNullable),
333-
)
334-
.unwrap();
327+
let ext_dtype = Uuid::default(Nullability::NonNullable);
335328
let storage_value = uuid_storage_scalar(&v4_uuid);
336329

337330
let result = Uuid::unpack_native(&ext_dtype, &storage_value)?;
338331
assert_eq!(result, v4_uuid);
339332
Ok(())
340333
}
341-
342-
fn uuid_storage_dtype(nullability: Nullability) -> DType {
343-
DType::FixedSizeList(
344-
Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
345-
UUID_BYTE_LEN as u32,
346-
nullability,
347-
)
348-
}
349334
}

0 commit comments

Comments
 (0)