Skip to content

Commit f0785b4

Browse files
committed
fix
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent ae906c7 commit f0785b4

10 files changed

Lines changed: 217 additions & 14 deletions

File tree

vortex-duckdb/cpp/include/duckdb_vx/vector.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ void duckdb_vx_vector_set_vector_data_buffer(duckdb_vector ffi_vector, duckdb_vx
5050
// Set the data pointer for the vector. This is the start of the values array in the vector.
5151
void duckdb_vx_vector_set_data_ptr(duckdb_vector ffi_vector, void *ptr);
5252

53+
// Set the validity pointer for the vector to external data, and store the buffer in auxiliary
54+
// to keep it alive. This enables zero-copy export of validity masks.
55+
void duckdb_vx_vector_set_validity_data(duckdb_vector ffi_vector, void *validity_ptr, idx_t capacity,
56+
duckdb_vx_vector_buffer buffer);
57+
5358
// Converts a duckdb flat vector into a Sequence vector.
5459
void duckdb_vx_sequence_vector(duckdb_vector c_vector, int64_t start, int64_t step, idx_t capacity);
5560

vortex-duckdb/cpp/vector.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,21 @@ class DataVector : public Vector {
5656
inline void SetDataPtr(data_ptr_t ptr) {
5757
data = ptr;
5858
};
59+
60+
inline ValidityMask &GetValidity() {
61+
return validity;
62+
};
63+
};
64+
65+
// Same hack for ValidityMask: access protected fields via inheritance.
66+
class ExternalValidityMask : public ValidityMask {
67+
public:
68+
inline void SetExternal(validity_t *ptr, idx_t cap,
69+
buffer_ptr<ValidityBuffer> keeper) {
70+
validity_mask = ptr;
71+
capacity = cap;
72+
validity_data = std::move(keeper);
73+
};
5974
};
6075

6176
} // namespace vortex
@@ -81,6 +96,26 @@ extern "C" void duckdb_vx_vector_set_data_ptr(duckdb_vector ffi_vector, void *pt
8196
dvector->SetDataPtr((data_ptr_t)ptr);
8297
}
8398

99+
extern "C" void duckdb_vx_vector_set_validity_data(duckdb_vector ffi_vector,
100+
void *validity_ptr,
101+
idx_t capacity,
102+
duckdb_vx_vector_buffer buffer) {
103+
auto dvector = reinterpret_cast<vortex::DataVector *>(ffi_vector);
104+
auto &validity = dvector->GetValidity();
105+
auto ext_validity = reinterpret_cast<vortex::ExternalValidityMask *>(&validity);
106+
107+
// Use the shared_ptr aliasing constructor: the control block ref-counts the
108+
// ExternalVectorBuffer (preventing the Rust buffer from being freed),
109+
// while the stored pointer satisfies ValidityMask's buffer_ptr<ValidityBuffer> type.
110+
auto ext_buf = reinterpret_cast<shared_ptr<vortex::ExternalVectorBuffer> *>(buffer);
111+
auto keeper = shared_ptr<TemplatedValidityData<validity_t>>(
112+
*ext_buf, reinterpret_cast<TemplatedValidityData<validity_t> *>(ext_buf->get()));
113+
114+
// Set validity_mask, capacity, and validity_data (which keeps the buffer alive).
115+
ext_validity->SetExternal(reinterpret_cast<validity_t *>(validity_ptr), capacity,
116+
std::move(keeper));
117+
}
118+
84119
extern "C" duckdb_value duckdb_vx_vector_get_value(duckdb_vector ffi_vector, idx_t index) {
85120
auto vector = reinterpret_cast<Vector *>(ffi_vector);
86121
auto value = duckdb::make_uniq<Value>(vector->GetValue(index));

vortex-duckdb/src/duckdb/vector.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,29 @@ impl VectorRef {
151151
unsafe { cpp::duckdb_vx_vector_set_data_ptr(self.as_ptr(), ptr as *mut c_void) }
152152
}
153153

154+
/// Sets the validity pointer for the vector to external data, and stores the buffer in
155+
/// auxiliary to keep it alive. This enables zero-copy export of validity masks.
156+
///
157+
/// # Safety
158+
///
159+
/// The `validity_ptr` must point to a valid `u64` array with at least
160+
/// `capacity.div_ceil(64)` elements. The buffer must keep this memory alive.
161+
pub unsafe fn set_validity_data(
162+
&self,
163+
validity_ptr: *mut u64,
164+
capacity: usize,
165+
buffer: &VectorBufferRef,
166+
) {
167+
unsafe {
168+
cpp::duckdb_vx_vector_set_validity_data(
169+
self.as_ptr(),
170+
validity_ptr as *mut c_void,
171+
capacity as idx_t,
172+
buffer.as_ptr(),
173+
)
174+
}
175+
}
176+
154177
/// Assigns the element at the specified index with a string value.
155178
/// FIXME(ngates): remove this.
156179
pub fn assign_string_element(&self, idx: usize, value: &CStr) {

vortex-duckdb/src/exporter/constant.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ impl ColumnExporter for ConstantExporter {
7272
match self.value.as_ref() {
7373
None => {
7474
// TODO(ngates): would be good if DuckDB supported constant null vectors.
75-
unsafe { vector.set_validity(&Mask::AllFalse(len), 0, len) };
75+
unsafe { vector.set_validity(&Mask::AllFalse(len), 0, len, None) };
7676
}
7777
Some(value) => {
7878
vector.reference_value(value);

vortex-duckdb/src/exporter/list.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ impl<O: IntegerPType> ColumnExporter for ListExporter<O> {
121121
);
122122

123123
// Set validity if necessary.
124-
if unsafe { vector.set_validity(&self.validity, offset, len) } {
124+
if unsafe { vector.set_validity(&self.validity, offset, len, None) } {
125125
// All values are null, so no point copying the data.
126126
return Ok(());
127127
}

vortex-duckdb/src/exporter/list_view.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ impl<O: IntegerPType, S: IntegerPType> ColumnExporter for ListViewExporter<O, S>
131131
);
132132

133133
// Set validity if necessary.
134-
if unsafe { vector.set_validity(&self.validity, offset, len) } {
134+
if unsafe { vector.set_validity(&self.validity, offset, len, None) } {
135135
// All values are null, so no point copying the data.
136136
return Ok(());
137137
}

vortex-duckdb/src/exporter/mod.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ mod tests {
216216
let mut vector = Vector::with_capacity(&logical_type, 100);
217217

218218
let mask = Mask::AllTrue(10);
219-
let all_null = unsafe { vector.set_validity(&mask, 0, 10) };
219+
let all_null = unsafe { vector.set_validity(&mask, 0, 10, None) };
220220

221221
assert!(!all_null);
222222
}
@@ -228,7 +228,7 @@ mod tests {
228228
let len = 10;
229229

230230
let mask = Mask::AllFalse(len);
231-
let all_null = unsafe { vector.set_validity(&mask, 0, len) };
231+
let all_null = unsafe { vector.set_validity(&mask, 0, len, None) };
232232

233233
assert!(all_null);
234234

@@ -246,7 +246,7 @@ mod tests {
246246

247247
let mask = Mask::from(BitBuffer::from(vec![true; 10]));
248248

249-
let all_null = unsafe { vector.set_validity(&mask, 0, 10) };
249+
let all_null = unsafe { vector.set_validity(&mask, 0, 10, None) };
250250

251251
assert!(!all_null);
252252

@@ -266,7 +266,7 @@ mod tests {
266266
let bits = vec![false; LEN];
267267
let mask = Mask::from(BitBuffer::from(bits));
268268

269-
let all_null = unsafe { vector.set_validity(&mask, 0, LEN) };
269+
let all_null = unsafe { vector.set_validity(&mask, 0, LEN, None) };
270270

271271
assert!(all_null);
272272

@@ -286,7 +286,7 @@ mod tests {
286286
];
287287
let mask = Mask::from(BitBuffer::from(bits.as_slice()));
288288

289-
let all_null = unsafe { vector.set_validity(&mask, 0, 10) };
289+
let all_null = unsafe { vector.set_validity(&mask, 0, 10, None) };
290290

291291
assert!(!all_null);
292292

@@ -306,7 +306,7 @@ mod tests {
306306
];
307307
let mask = Mask::from(BitBuffer::from(bits.as_slice()));
308308

309-
let all_null = unsafe { vector.set_validity(&mask, 2, 8) };
309+
let all_null = unsafe { vector.set_validity(&mask, 2, 8, None) };
310310

311311
assert!(!all_null);
312312

@@ -327,7 +327,7 @@ mod tests {
327327
];
328328
let mask = Mask::from(BitBuffer::from(bits.as_slice()));
329329

330-
let all_null = unsafe { vector.set_validity(&mask, 3, 5) };
330+
let all_null = unsafe { vector.set_validity(&mask, 3, 5, None) };
331331

332332
assert!(!all_null);
333333

@@ -345,7 +345,7 @@ mod tests {
345345
let bits = (0..70).map(|i| i % 3 == 0).collect::<Vec<_>>();
346346
let mask = Mask::from(BitBuffer::from(bits.as_slice()));
347347

348-
let all_null = unsafe { vector.set_validity(&mask, 5, 60) };
348+
let all_null = unsafe { vector.set_validity(&mask, 5, 60, None) };
349349

350350
assert!(!all_null);
351351

vortex-duckdb/src/exporter/primitive.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,78 @@ mod tests {
101101
);
102102
}
103103

104+
#[test]
105+
fn test_primitive_exporter_with_nulls() {
106+
let arr = PrimitiveArray::from_option_iter([Some(10i32), None, Some(30), None, Some(50)]);
107+
108+
let mut chunk = DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]);
109+
let mut ctx = SESSION.create_execution_ctx();
110+
111+
new_exporter(arr, &mut ctx)
112+
.unwrap()
113+
.export(0, 5, chunk.get_vector_mut(0), &mut ctx)
114+
.unwrap();
115+
chunk.set_len(5);
116+
117+
assert_eq!(
118+
format!("{}", String::try_from(&*chunk).unwrap()),
119+
r#"Chunk - [1 Columns]
120+
- FLAT INTEGER: 5 = [ 10, NULL, 30, NULL, 50]
121+
"#
122+
);
123+
}
124+
125+
/// Export a large nullable primitive array over many chunks to exercise the
126+
/// zero-copy validity path. The non-zero-copy fallback currently panics,
127+
/// so this test proves every chunk goes through the zero-copy branch.
128+
#[test]
129+
fn test_primitive_exporter_with_nulls_zero_copy() {
130+
let vector_size = duckdb_vector_size();
131+
const NUM_CHUNKS: usize = 8;
132+
let len = vector_size * NUM_CHUNKS;
133+
134+
// Every 3rd element is null — guarantees mixed validity in every chunk.
135+
#[expect(clippy::cast_possible_truncation, reason = "test data fits in i32")]
136+
let arr = PrimitiveArray::from_option_iter(
137+
(0..len).map(|i| if i % 3 == 1 { None } else { Some(i as i32) }),
138+
);
139+
140+
let mut ctx = SESSION.create_execution_ctx();
141+
let exporter = new_exporter(arr, &mut ctx).unwrap();
142+
143+
for chunk_idx in 0..NUM_CHUNKS {
144+
let mut chunk =
145+
DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)]);
146+
147+
// This will panic if the non-zero-copy path is hit.
148+
exporter
149+
.export(
150+
chunk_idx * vector_size,
151+
vector_size,
152+
chunk.get_vector_mut(0),
153+
&mut ctx,
154+
)
155+
.unwrap();
156+
chunk.set_len(vector_size);
157+
158+
let vec = chunk.get_vector(0);
159+
for i in 0..vector_size {
160+
let global_idx = chunk_idx * vector_size + i;
161+
if global_idx % 3 == 1 {
162+
assert!(
163+
vec.row_is_null(i as u64),
164+
"expected null at global index {global_idx}"
165+
);
166+
} else {
167+
assert!(
168+
!vec.row_is_null(i as u64),
169+
"expected non-null at global index {global_idx}"
170+
);
171+
}
172+
}
173+
}
174+
}
175+
104176
#[test]
105177
fn test_long_primitive_exporter() {
106178
let vector_size = duckdb_vector_size();

vortex-duckdb/src/exporter/validity.rs

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,69 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use vortex::array::ExecutionCtx;
5+
use vortex::buffer::ByteBuffer;
56
use vortex::error::VortexResult;
67
use vortex::mask::Mask;
78

9+
use crate::duckdb::VectorBuffer;
810
use crate::duckdb::VectorRef;
911
use crate::exporter::ColumnExporter;
1012

1113
struct ValidityExporter {
1214
mask: Mask,
15+
/// If the mask's bit buffer is u64-aligned with no sub-byte offset,
16+
/// we can zero-copy it into DuckDB. We hold the VectorBuffer to keep
17+
/// the underlying memory alive via DuckDB's ref-counting.
18+
zero_copy: Option<ZeroCopyValidity>,
1319
exporter: Box<dyn ColumnExporter>,
1420
}
1521

22+
pub(super) struct ZeroCopyValidity {
23+
/// The underlying byte buffer backing the validity bits.
24+
pub(super) buffer: ByteBuffer,
25+
pub(super) shared_buffer: VectorBuffer,
26+
}
27+
28+
/// Returns true if the bit buffer can be zero-copied as a DuckDB validity mask.
29+
///
30+
/// Requirements:
31+
/// - No sub-byte bit offset (offset == 0)
32+
/// - The underlying byte buffer is u64-aligned
33+
fn can_zero_copy_validity(mask: &Mask) -> bool {
34+
let Mask::Values(values) = mask else {
35+
return false;
36+
};
37+
let bit_buf = values.bit_buffer();
38+
if bit_buf.offset() != 0 {
39+
return false;
40+
}
41+
let inner = bit_buf.inner();
42+
// Check u64 alignment of the underlying data pointer
43+
(inner.as_slice().as_ptr() as usize).is_multiple_of(size_of::<u64>())
44+
}
45+
1646
pub(crate) fn new_exporter(
1747
mask: Mask,
1848
exporter: Box<dyn ColumnExporter>,
1949
) -> Box<dyn ColumnExporter> {
2050
if mask.all_true() {
2151
exporter
2252
} else {
23-
Box::new(ValidityExporter { mask, exporter })
53+
let zero_copy = can_zero_copy_validity(&mask).then(|| {
54+
let Mask::Values(values) = &mask else {
55+
unreachable!()
56+
};
57+
let buffer = values.bit_buffer().inner().clone();
58+
ZeroCopyValidity {
59+
shared_buffer: VectorBuffer::new(buffer.clone()),
60+
buffer,
61+
}
62+
});
63+
Box::new(ValidityExporter {
64+
mask,
65+
zero_copy,
66+
exporter,
67+
})
2468
}
2569
}
2670

@@ -36,7 +80,7 @@ impl ColumnExporter for ValidityExporter {
3680
offset + len <= self.mask.len(),
3781
"cannot access outside of array"
3882
);
39-
if unsafe { vector.set_validity(&self.mask, offset, len) } {
83+
if unsafe { vector.set_validity(&self.mask, offset, len, self.zero_copy.as_ref()) } {
4084
// All values are null, so no point copying the data.
4185
return Ok(());
4286
}

vortex-duckdb/src/exporter/vector.rs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,17 @@ use vortex::mask::Mask;
66
use crate::duckdb::Value;
77
use crate::duckdb::VectorRef;
88
use crate::exporter::copy_from_slice;
9+
use crate::exporter::validity::ZeroCopyValidity;
910

1011
impl VectorRef {
11-
pub(super) unsafe fn set_validity(&mut self, mask: &Mask, offset: usize, len: usize) -> bool {
12+
/// Returns true if all values are null (caller can skip data export).
13+
pub(super) unsafe fn set_validity(
14+
&mut self,
15+
mask: &Mask,
16+
offset: usize,
17+
len: usize,
18+
zero_copy: Option<&ZeroCopyValidity>,
19+
) -> bool {
1220
match mask {
1321
Mask::AllTrue(_) => {
1422
// We only need to blank out validity if there is already a slice allocated.
@@ -27,7 +35,23 @@ impl VectorRef {
2735
unsafe { self.set_all_true_validity(len) }
2836
} else if true_count == 0 {
2937
self.set_all_false_validity()
38+
} else if let Some(zc) = zero_copy.filter(|_| offset.is_multiple_of(64)) {
39+
let u64_offset = offset / 64;
40+
// SAFETY: the underlying buffer is u64-aligned (checked in
41+
// can_zero_copy_validity) and we only read through this pointer.
42+
// The cast to *mut is an artifact of the DuckDB C API.
43+
let ptr = zc.buffer.as_slice().as_ptr().cast_mut().cast::<u64>();
44+
// SAFETY: we verified alignment in can_zero_copy_validity
45+
// and the VectorBuffer keeps the data alive.
46+
unsafe { self.set_validity_data(ptr.add(u64_offset), len, &zc.shared_buffer) };
3047
} else {
48+
// If zero_copy is available and offset is aligned, we should
49+
// have taken the branch above. Assert this invariant.
50+
assert!(
51+
zero_copy.is_none() || !offset.is_multiple_of(64),
52+
"zero-copy validity available and offset {offset} is aligned \
53+
but copy path was taken"
54+
);
3155
let source = arr.bit_buffer().inner().as_slice();
3256
copy_from_slice(
3357
unsafe { self.ensure_validity_slice(len) },

0 commit comments

Comments
 (0)