diff --git a/vortex-duckdb/cpp/include/duckdb_vx/vector.h b/vortex-duckdb/cpp/include/duckdb_vx/vector.h index ea3e49184f7..906937f1629 100644 --- a/vortex-duckdb/cpp/include/duckdb_vx/vector.h +++ b/vortex-duckdb/cpp/include/duckdb_vx/vector.h @@ -47,6 +47,10 @@ void duckdb_vx_string_vector_add_vector_data_buffer(duckdb_vector ffi_vector, du // valid. void duckdb_vx_vector_set_vector_data_buffer(duckdb_vector ffi_vector, duckdb_vx_vector_buffer buffer); +// Reset vector's validity mask to nullptr, making all vector's elements valid. +// vector must not be a DictionaryVector or a SequenceVector +void duckdb_vx_vector_set_all_valid(duckdb_vector ffi_vector); + // Set the data pointer for the vector. This is the start of the values array in the vector. void duckdb_vx_vector_set_data_ptr(duckdb_vector ffi_vector, void *ptr); diff --git a/vortex-duckdb/cpp/vector.cpp b/vortex-duckdb/cpp/vector.cpp index 3b6439f4c95..0328b7aff3e 100644 --- a/vortex-duckdb/cpp/vector.cpp +++ b/vortex-duckdb/cpp/vector.cpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +#include "include/duckdb_vx/vector.h" #include "duckdb_vx/duckdb_diagnostics.h" DUCKDB_INCLUDES_BEGIN @@ -106,3 +107,20 @@ const char *duckdb_vector_to_string(duckdb_vector vector, unsigned long len, duc return nullptr; } } + +void duckdb_vx_vector_set_all_valid(duckdb_vector ffi_vector) { + using enum VectorType; + Vector &vector = *reinterpret_cast(ffi_vector); + const VectorType type = vector.GetVectorType(); + D_ASSERT(type != DICTIONARY_VECTOR && type != SEQUENCE_VECTOR); + switch (type) { + case CONSTANT_VECTOR: + return ConstantVector::Validity(vector).Reset(); + case FLAT_VECTOR: + return FlatVector::Validity(vector).Reset(); + case FSST_VECTOR: + return FSSTVector::Validity(vector).Reset(); + default: + __builtin_unreachable(); + } +} diff --git a/vortex-duckdb/src/duckdb/vector.rs b/vortex-duckdb/src/duckdb/vector.rs index 4a822629980..ef2d35b2803 100644 --- a/vortex-duckdb/src/duckdb/vector.rs +++ b/vortex-duckdb/src/duckdb/vector.rs @@ -207,7 +207,11 @@ impl VectorRef { /// /// The provided capacity *must* be the actual capacity of this vector. pub unsafe fn validity_bitslice_mut(&mut self, capacity: usize) -> Option<&mut BitSlice> { - unsafe { self.validity_slice_mut(capacity) }.map(|slice| slice.view_bits_mut()) + // capacity is always less than BitSlice::MAX_ELTS + unsafe { + self.validity_slice_mut(capacity) + .map(|slice| BitSlice::from_slice_unchecked_mut(slice)) + } } pub fn validity_ref(&self, len: usize) -> ValidityRef<'_> { diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs index 502038b858c..b5b26340efe 100644 --- a/vortex-duckdb/src/exporter/mod.rs +++ b/vortex-duckdb/src/exporter/mod.rs @@ -19,8 +19,6 @@ mod validity; mod varbinview; mod vector; -use bitvec::prelude::Lsb0; -use bitvec::view::BitView; pub use cache::ConversionCache; pub use decimal::precision_to_duckdb_storage_size; use vortex::array::ArrayRef; @@ -32,6 +30,7 @@ use vortex::array::arrays::List; use vortex::array::arrays::StructArray; use vortex::array::arrays::TemporalArray; use vortex::array::arrays::struct_::StructArrayExt; +use vortex::buffer::BitChunks; use vortex::encodings::runend::RunEnd; use vortex::encodings::sequence::Sequence; use vortex::error::VortexResult; @@ -187,17 +186,33 @@ fn new_array_exporter_with_flatten( } } -/// Copy the sliced bits from source into target. +/// Copy the sliced bits from source into target, returning whether all copied bits are zero, +/// all are one, or mixed. /// -/// Offset and length are a _bit_ offset and a _bit_ length into source. +/// offset and len are a _bit_ offset and a _bit_ length into `source`. /// -/// `target.len()` must equal `len`. -fn copy_from_slice(target: &mut [u64], source: &[u8], offset: usize, len: usize) { - let (start, middle, end) = unsafe { target.align_to_mut::() }; - assert!(start.is_empty()); - assert!(end.is_empty()); - let target = &mut middle.view_bits_mut::()[..len]; - target.copy_from_bitslice(&source.view_bits()[offset..][..len]); +/// target must have at least len.div_ceil(64) elements. +/// Returns the number of ones in copied slice. +fn copy_from_slice(target: &mut [u64], source: &[u8], offset: usize, len: usize) -> usize { + if len == 0 { + return 0; + } + + let mut ones = 0usize; + let chunks = BitChunks::new(source, offset, len); + let chunk_len = chunks.chunk_len(); + let remainder_len = chunks.remainder_len(); + let remainder = chunks.remainder_bits(); + for (slot, chunk) in target.iter_mut().zip(chunks) { + *slot = chunk; + ones += chunk.count_ones() as usize; + } + + if remainder_len > 0 { + target[chunk_len] = remainder; + ones += remainder.count_ones() as usize; + } + ones } #[cfg(test)] @@ -359,23 +374,23 @@ mod tests { fn test_copy_from_slice_empty_to_empty() { let target = &mut []; let source = Vec::::new(); - copy_from_slice(target, &source, 0, 0); + assert_eq!(copy_from_slice(target, &source, 0, 0), 0); } #[test] fn test_copy_from_slice_64_to_empty() { let target = &mut []; let source = [1u8, 2, 3, 50, 51, 52, 100, 101]; - copy_from_slice(target, &source, 0, 0); - copy_from_slice(target, &source, 5, 0); - copy_from_slice(target, &source, 8, 0); + assert_eq!(copy_from_slice(target, &source, 0, 0), 0); + assert_eq!(copy_from_slice(target, &source, 5, 0), 0); + assert_eq!(copy_from_slice(target, &source, 8, 0), 0); } #[test] fn test_copy_from_slice_64_to_64() { let mut target = vec![0u64]; let source = [1u8, 2, 3, 50, 51, 52, 100, 101]; - copy_from_slice(&mut target, &source, 0, 64); + assert_eq!(copy_from_slice(&mut target, &source, 0, 64), 21); assert_eq!( target[0], 0x65_64_34_33_32_03_02_01_u64, "{:#08x} == {:#08x}", @@ -383,20 +398,27 @@ mod tests { ); } + #[test] + fn test_copy_from_slice_64_ones() { + let mut target = [0u64]; + let source = [u8::MAX; 8]; + assert_eq!(copy_from_slice(&mut target, &source, 0, 64), 64); + } + #[test] fn test_copy_from_slice_80_to_0() { let target = &mut []; let source = [1u8, 2, 3, 50, 51, 52, 100, 101, 254, 255]; - copy_from_slice(target, &source, 0, 0); - copy_from_slice(target, &source, 8, 0); - copy_from_slice(target, &source, 10, 0); + assert_eq!(copy_from_slice(target, &source, 0, 0), 0); + assert_eq!(copy_from_slice(target, &source, 8, 0), 0); + assert_eq!(copy_from_slice(target, &source, 10, 0), 0,); } #[test] fn test_copy_from_slice_80_to_64_case_1() { let mut target = [0u64]; let source = [1u8, 2, 3, 50, 51, 52, 100, 101, 254, 255]; - copy_from_slice(&mut target, &source, 16, 64); + assert_eq!(copy_from_slice(&mut target, &source, 16, 64), 34); assert_eq!( target[0], 0xff_fe_65_64_34_33_32_03_u64, "{:#08x} == {:#08x}", @@ -408,7 +430,7 @@ mod tests { fn test_copy_from_slice_80_to_64_case_2() { let mut target = [0u64]; let source = [1u8, 2, 3, 50, 51, 52, 100, 101, 254, 255]; - copy_from_slice(&mut target, &source, 8, 64); + assert_eq!(copy_from_slice(&mut target, &source, 8, 64), 27); assert_eq!( target[0], 0xfe_65_64_34_33_32_03_02_u64, "{:#08x} == {:#08x}", @@ -420,7 +442,7 @@ mod tests { fn test_copy_from_slice_80_to_64_case_3() { let mut target = [0u64]; let source = [1u8, 2, 3, 50, 51, 52, 100, 101, 254, 255]; - copy_from_slice(&mut target, &source, 0, 64); + assert_eq!(copy_from_slice(&mut target, &source, 0, 64), 21,); assert_eq!( target[0], 0x65_64_34_33_32_03_02_01_u64, "{:#08x} == {:#08x}", @@ -432,7 +454,7 @@ mod tests { fn test_copy_from_slice_80_to_64_case_4() { let mut target = [0u64]; let source = [1u8, 2, 3, 50, 51, 52, 100, 101, 254, 255]; - copy_from_slice(&mut target, &source, 10, 64); + assert_eq!(copy_from_slice(&mut target, &source, 10, 64), 28,); assert_eq!( target[0], 0xff_99_59_0d_0c_cc_80_c0_u64, // Python: hex(0xff_fe_65_64_34_33_32_03_02 >> 2), then remove the high two hexits @@ -454,7 +476,7 @@ mod tests { let (_, middle, _) = unsafe { source.align_to::() }; assert!(!middle.is_empty()); - copy_from_slice(&mut target, &source, 0, 128); + assert_eq!(copy_from_slice(&mut target, &source, 0, 128), 66,); assert_eq!( target[0], 0xfc_fd_fe_ff_04_03_02_01_u64, "{:#08x} == {:#08x}", @@ -466,7 +488,7 @@ mod tests { target[1], 0xf9_fa_fb_fc_08_07_06_05_u64, ); - copy_from_slice(&mut target, &source, 8, 128); + assert_eq!(copy_from_slice(&mut target, &source, 8, 128), 66); assert_eq!( target[0], 0x05_fc_fd_fe_ff_04_03_02_u64, "{:#08x} == {:#08x}", @@ -478,7 +500,7 @@ mod tests { target[1], 0x01_f9_fa_fb_fc_08_07_06_u64, ); - copy_from_slice(&mut target, &source, 8 * 8, 128); + assert_eq!(copy_from_slice(&mut target, &source, 8 * 8, 128), 66,); assert_eq!( target[0], 0xf9_fa_fb_fc_08_07_06_05_u64, "{:#08x} == {:#08x}", @@ -490,7 +512,7 @@ mod tests { target[1], 0xfc_fd_fe_ff_04_03_02_01_u64, ); - copy_from_slice(&mut target, &source, 8 * 12, 128); + assert_eq!(copy_from_slice(&mut target, &source, 8 * 12, 128), 66,); assert_eq!( target[0], 0x04_03_02_01_f9_fa_fb_fc_u64, "{:#08x} == {:#08x}", @@ -502,7 +524,7 @@ mod tests { target[1], 0x08_07_06_05_fc_fd_fe_ff_u64, ); - copy_from_slice(&mut target, &source, 8 * 12 + 4, 128); + assert_eq!(copy_from_slice(&mut target, &source, 8 * 12 + 4, 128), 66,); // Find the 12th byte, skip the first hexit, take the next 32 hexits (i.e. 16 bytesor 128 // bits). assert_eq!( @@ -517,7 +539,10 @@ mod tests { ); // Take the above and shift one bit towards the right-hand-side. - copy_from_slice(&mut target, &source, 8 * 12 + 4 + 1, 128); + assert_eq!( + copy_from_slice(&mut target, &source, 8 * 12 + 4 + 1, 128), + 66, + ); assert_eq!( target[0], 0xf8_20_18_10_0f_cf_d7_df_u64, "{:#08x} == {:#08x}", diff --git a/vortex-duckdb/src/exporter/vector.rs b/vortex-duckdb/src/exporter/vector.rs index 5e4cef97d11..c8b9e36f2eb 100644 --- a/vortex-duckdb/src/exporter/vector.rs +++ b/vortex-duckdb/src/exporter/vector.rs @@ -2,7 +2,9 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use vortex::mask::Mask; +use vortex::mask::MaskValues; +use crate::cpp::duckdb_vx_vector_set_all_valid; use crate::duckdb::Value; use crate::duckdb::VectorRef; use crate::exporter::copy_from_slice; @@ -11,44 +13,46 @@ impl VectorRef { pub(super) unsafe fn set_validity(&mut self, mask: &Mask, offset: usize, len: usize) -> bool { match mask { Mask::AllTrue(_) => { - // We only need to blank out validity if there is already a slice allocated. - // SAFETY: Caller guarantees this. - unsafe { self.set_all_true_validity(len) } + self.set_all_true_validity(); false } Mask::AllFalse(_) => { - // SAFETY: Caller guarantees this. self.set_all_false_validity(); true } - Mask::Values(arr) => { - let true_count = arr.bit_buffer().slice(offset..(offset + len)).true_count(); - if true_count == len { - unsafe { self.set_all_true_validity(len) } - } else if true_count == 0 { - self.set_all_false_validity() - } else { - let source = arr.bit_buffer().inner().as_slice(); - copy_from_slice( - unsafe { self.ensure_validity_slice(len) }, - source, - offset, - len, - ); - } - - true_count == 0 - } + Mask::Values(arr) => self.set_validity_with_array(arr, len, offset), } } - pub(super) unsafe fn set_all_true_validity(&mut self, len: usize) { - if let Some(validity) = unsafe { self.validity_bitslice_mut(len) } { - validity.fill(true); + fn set_validity_with_array(&mut self, arr: &MaskValues, len: usize, offset: usize) -> bool { + let true_count = arr.true_count(); + if true_count == arr.len() { + self.set_all_true_validity(); + return false; + } else if true_count == 0 { + self.set_all_false_validity(); + return true; + } + + let dest = unsafe { self.ensure_validity_slice(len) }; + let source = arr.bit_buffer().inner().as_slice(); + let ones = copy_from_slice(dest, source, offset, len); + if ones == 0 { + self.set_all_false_validity(); + true + } else if ones == len { + self.set_all_true_validity(); + false + } else { + false } } - pub(super) fn set_all_false_validity(&mut self) { + fn set_all_true_validity(&mut self) { + unsafe { duckdb_vx_vector_set_all_valid(self.as_ptr()) }; + } + + fn set_all_false_validity(&mut self) { self.reference_value(&Value::null(&self.logical_type())); } }