Skip to content

Commit 1aebccf

Browse files
committed
fix(cuda): propagate validity through dynamic dispatch and fix standalone kernel panics
The dynamic dispatch plan builder silently dropped validity (null bitmaps) from nullable arrays, hardcoding Validity::NonNullable on the output. This caused silent data corruption when nullable arrays were fused, and especially when the output fed into downstream kernels like CUB filter. Dynamic dispatch changes: - Thread root array validity through FusedPlan -> MaterializedPlan -> execute_typed, replacing the hardcoded Validity::NonNullable. - Guard against Dict with nullable codes (garbage code values could cause OOB shared memory reads in the DICT gather scalar op). - Guard against RunEnd with nullable ends (garbage end values could cause unpredictable binary search / forward-scan behavior). - Skip kernel launch entirely for Validity::AllInvalid arrays. - Respect nullability in the len==0 early-return path. Standalone kernel fixes: - RunEnd: replace unreachable!() with vortex_bail!() when values have per-element validity (Validity::Array), allowing graceful CPU fallback instead of a panic. - Zstd: replace unimplemented!() with vortex_bail!() when decompressed data contains nulls, allowing graceful CPU fallback instead of a panic. - ALP: clarify that patch validity does not need scattering (the encoder strips null positions from the exception list), remove stale TODO. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent ae906c7 commit 1aebccf

6 files changed

Lines changed: 395 additions & 9 deletions

File tree

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ impl BenchRunner {
135135
dispatch_plan,
136136
device_buffers,
137137
shared_mem_bytes,
138+
..
138139
} = plan.materialize(cuda_ctx).vortex_expect("materialize plan");
139140

140141
let device_plan = Arc::new(

vortex-cuda/src/dynamic_dispatch/mod.rs

Lines changed: 225 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,18 @@ use cudarc::driver::DevicePtr;
2424
use cudarc::driver::LaunchConfig;
2525
use cudarc::driver::PushKernelArg;
2626
use vortex::array::Canonical;
27+
use vortex::array::IntoArray;
28+
use vortex::array::arrays::ConstantArray;
2729
use vortex::array::arrays::PrimitiveArray;
2830
use vortex::array::buffer::BufferHandle;
2931
use vortex::array::buffer::DeviceBufferExt;
3032
use vortex::array::match_each_unsigned_integer_ptype;
33+
use vortex::array::scalar::Scalar;
3134
use vortex::array::validity::Validity;
3235
use vortex::buffer::Alignment;
3336
use vortex::buffer::ByteBuffer;
3437
use vortex::buffer::ByteBufferMut;
38+
use vortex::dtype::DType;
3539
use vortex::dtype::Nullability;
3640
use vortex::dtype::PType;
3741
use vortex::error::VortexResult;
@@ -408,6 +412,15 @@ impl ScalarOp {
408412
impl MaterializedPlan {
409413
pub fn execute(self, len: usize, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical> {
410414
let output_ptype = self.dispatch_plan.output_ptype();
415+
416+
// All values are null — no need to touch the GPU.
417+
if matches!(self.validity, Validity::AllInvalid) {
418+
let dtype = DType::Primitive(output_ptype, Nullability::Nullable);
419+
return ConstantArray::new(Scalar::null(dtype), len)
420+
.into_array()
421+
.to_canonical();
422+
}
423+
411424
// The CUDA kernels are instantiated for unsigned integer types only;
412425
// map signed/float ptypes to their same-width unsigned counterpart.
413426
let unsigned_ptype = match output_ptype {
@@ -431,9 +444,11 @@ impl MaterializedPlan {
431444
where
432445
T: cudarc::driver::DeviceRepr + vortex::dtype::NativePType,
433446
{
447+
let nullability = self.validity.nullability();
448+
434449
if len == 0 {
435450
return Ok(Canonical::Primitive(PrimitiveArray::empty::<T>(
436-
Nullability::NonNullable,
451+
nullability,
437452
)));
438453
}
439454

@@ -467,7 +482,7 @@ impl MaterializedPlan {
467482
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
468483
BufferHandle::new_device(output_buf.slice_typed::<T>(0..len)),
469484
output_ptype,
470-
Validity::NonNullable,
485+
self.validity,
471486
)))
472487
}
473488
}
@@ -485,6 +500,7 @@ mod tests {
485500
use vortex::array::arrays::DictArray;
486501
use vortex::array::arrays::PrimitiveArray;
487502
use vortex::array::scalar::Scalar;
503+
use vortex::array::validity::Validity;
488504
use vortex::array::validity::Validity::NonNullable;
489505
use vortex::buffer::Buffer;
490506
use vortex::dtype::PType;
@@ -509,6 +525,7 @@ mod tests {
509525
use crate::CudaBufferExt;
510526
use crate::CudaDeviceBuffer;
511527
use crate::CudaExecutionCtx;
528+
use crate::executor::CudaArrayExt;
512529
use crate::hybrid_dispatch::try_gpu_dispatch;
513530
use crate::session::CudaSession;
514531

@@ -1817,4 +1834,210 @@ mod tests {
18171834

18181835
Ok(())
18191836
}
1837+
1838+
// ═══════════════════════════════════════════════════════════════════
1839+
// Validity propagation tests
1840+
// ═══════════════════════════════════════════════════════════════════
1841+
1842+
/// Nullable Primitive array — LOAD source with validity propagated.
1843+
#[crate::test]
1844+
async fn test_nullable_primitive() -> VortexResult<()> {
1845+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1846+
1847+
let array = PrimitiveArray::from_option_iter(
1848+
(0..2048u32).map(|i| if i % 3 == 0 { None } else { Some(i) }),
1849+
);
1850+
let cpu = array.to_canonical()?.into_array();
1851+
1852+
let gpu = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx)
1853+
.await?
1854+
.into_host()
1855+
.await?
1856+
.into_array();
1857+
1858+
vortex::array::assert_arrays_eq!(cpu, gpu);
1859+
Ok(())
1860+
}
1861+
1862+
/// Nullable FoR(BitPacked) — validity from the root propagated through
1863+
/// the fused plan. The standard encoding flow is: subtract FoR reference
1864+
/// to get residuals, then bitpack. BitPacked::encode preserves input
1865+
/// validity, so this produces a real nullable FoR(BitPacked) tree.
1866+
#[crate::test]
1867+
async fn test_nullable_for_bitpacked() -> VortexResult<()> {
1868+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1869+
1870+
let len = 2048;
1871+
let reference = 1000u32;
1872+
1873+
// Original values in [reference, reference+63], every 5th null.
1874+
let values: Vec<Option<u32>> = (0..len)
1875+
.map(|i| {
1876+
if i % 5 == 0 {
1877+
None
1878+
} else {
1879+
Some((i as u32 % 64) + reference)
1880+
}
1881+
})
1882+
.collect();
1883+
let prim = PrimitiveArray::from_option_iter(values.iter().copied());
1884+
let cpu = prim.to_canonical()?.into_array();
1885+
1886+
// FoR encoding: subtract reference to get residuals [0..63].
1887+
// Null positions get 0 (from from_option_iter), which is fine —
1888+
// after subtracting reference it wraps, but validity masks it.
1889+
let residuals =
1890+
PrimitiveArray::from_option_iter(values.iter().map(|v| v.map(|x| x - reference)));
1891+
1892+
// BitPacked::encode preserves nullable validity from the input.
1893+
let bp = BitPacked::encode(&residuals.into_array(), 6)?;
1894+
let for_arr = FoR::try_new(bp.into_array(), reference.into())?;
1895+
1896+
// Verify the plan actually fuses (not just a LOAD).
1897+
assert!(
1898+
matches!(
1899+
DispatchPlan::new(&for_arr.clone().into_array())?,
1900+
DispatchPlan::Fused(_)
1901+
),
1902+
"FoR(BitPacked) with nullable validity should produce a Fused plan"
1903+
);
1904+
1905+
let gpu = try_gpu_dispatch(&for_arr.into_array(), &mut cuda_ctx)
1906+
.await?
1907+
.into_host()
1908+
.await?
1909+
.into_array();
1910+
1911+
vortex::array::assert_arrays_eq!(cpu, gpu);
1912+
Ok(())
1913+
}
1914+
1915+
/// AllInvalid array — kernel should be skipped entirely.
1916+
#[crate::test]
1917+
async fn test_all_invalid_skips_kernel() -> VortexResult<()> {
1918+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1919+
1920+
let array = PrimitiveArray::new(Buffer::from(vec![0u32; 2048]), Validity::AllInvalid);
1921+
1922+
let result = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx)
1923+
.await?
1924+
.into_host()
1925+
.await?;
1926+
1927+
let prim = result.into_primitive();
1928+
assert_eq!(prim.len(), 2048);
1929+
assert!(matches!(prim.validity()?, Validity::AllInvalid));
1930+
Ok(())
1931+
}
1932+
1933+
/// AllValid nullable array — should fuse and produce AllValid output.
1934+
#[crate::test]
1935+
async fn test_all_valid_nullable() -> VortexResult<()> {
1936+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1937+
1938+
let values: Vec<u32> = (0..2048).collect();
1939+
let array = PrimitiveArray::new(Buffer::from(values.clone()), Validity::AllValid);
1940+
1941+
let cpu = array.to_canonical()?.into_array();
1942+
let gpu = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx)
1943+
.await?
1944+
.into_host()
1945+
.await?
1946+
.into_array();
1947+
1948+
vortex::array::assert_arrays_eq!(cpu, gpu);
1949+
Ok(())
1950+
}
1951+
1952+
/// Dict with nullable codes must fall back to Unfused (not fused).
1953+
#[crate::test]
1954+
fn test_dict_nullable_codes_rejected() -> VortexResult<()> {
1955+
use vortex::buffer::buffer;
1956+
1957+
let codes = PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), None, Some(2)]);
1958+
let values = PrimitiveArray::new(buffer![10u32, 20, 30], NonNullable);
1959+
let dict = DictArray::try_new(codes.into_array(), values.into_array())?;
1960+
1961+
let plan = DispatchPlan::new(&dict.into_array())?;
1962+
assert!(
1963+
matches!(plan, DispatchPlan::Unfused),
1964+
"Dict with nullable codes should fall back to Unfused"
1965+
);
1966+
Ok(())
1967+
}
1968+
1969+
/// Dict with non-nullable codes but nullable values should still fuse.
1970+
#[crate::test]
1971+
async fn test_dict_nullable_values_fuses() -> VortexResult<()> {
1972+
use vortex::buffer::buffer;
1973+
1974+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1975+
1976+
let codes = PrimitiveArray::new(buffer![0u32, 1, 2, 2, 1, 0], NonNullable);
1977+
let values = PrimitiveArray::from_option_iter([Some(10u32), None, Some(30)]);
1978+
let dict = DictArray::try_new(codes.into_array(), values.into_array())?;
1979+
1980+
let cpu = dict.to_canonical()?.into_array();
1981+
let gpu = dict
1982+
.into_array()
1983+
.execute_cuda(&mut cuda_ctx)
1984+
.await?
1985+
.into_host()
1986+
.await?
1987+
.into_array();
1988+
1989+
vortex::array::assert_arrays_eq!(cpu, gpu);
1990+
Ok(())
1991+
}
1992+
1993+
/// Nullable FoR(BitPacked) through CUB filter — the original bug scenario.
1994+
/// Validity must survive through fused dispatch and into the filter.
1995+
#[crate::test]
1996+
async fn test_nullable_fused_then_filter() -> VortexResult<()> {
1997+
use vortex::array::arrays::FilterArray;
1998+
use vortex::mask::Mask;
1999+
2000+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
2001+
2002+
let len = 2048usize;
2003+
let values: Vec<Option<u32>> = (0..len)
2004+
.map(|i| {
2005+
if i % 7 == 0 {
2006+
None
2007+
} else {
2008+
Some((i % 64) as u32)
2009+
}
2010+
})
2011+
.collect();
2012+
let prim = PrimitiveArray::from_option_iter(values.iter().copied());
2013+
2014+
// Keep every other element.
2015+
let mask = Mask::from_iter((0..len).map(|i| i % 2 == 0));
2016+
let filter_array = FilterArray::try_new(prim.into_array(), mask)?;
2017+
2018+
let cpu = filter_array.to_canonical()?.into_array();
2019+
let gpu = filter_array
2020+
.into_array()
2021+
.execute_cuda(&mut cuda_ctx)
2022+
.await?
2023+
.into_host()
2024+
.await?
2025+
.into_array();
2026+
2027+
vortex::array::assert_arrays_eq!(cpu, gpu);
2028+
Ok(())
2029+
}
2030+
2031+
/// Empty nullable array should preserve nullability.
2032+
#[crate::test]
2033+
async fn test_empty_nullable_array() -> VortexResult<()> {
2034+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
2035+
2036+
let array = PrimitiveArray::new(Buffer::<u32>::empty(), Validity::AllValid);
2037+
let result = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx).await?;
2038+
let prim = result.into_primitive();
2039+
assert_eq!(prim.len(), 0);
2040+
assert_eq!(prim.validity()?.nullability(), Nullability::Nullable);
2041+
Ok(())
2042+
}
18202043
}

vortex-cuda/src/dynamic_dispatch/plan_builder.rs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use vortex::array::arrays::Slice;
1515
use vortex::array::arrays::dict::DictArraySlotsExt;
1616
use vortex::array::arrays::slice::SliceArrayExt;
1717
use vortex::array::buffer::BufferHandle;
18+
use vortex::array::validity::Validity;
1819
use vortex::dtype::PType;
1920
use vortex::encodings::alp::ALP;
2021
use vortex::encodings::alp::ALPArrayExt;
@@ -52,6 +53,8 @@ pub struct MaterializedPlan {
5253
pub device_buffers: Vec<BufferHandle>,
5354
/// Dynamic shared memory bytes needed to launch this plan.
5455
pub shared_mem_bytes: u32,
56+
/// Validity of the root array, propagated to the output.
57+
pub validity: Validity,
5558
}
5659

5760
/// Checks whether the encoding of an array can be fused into a dynamic-dispatch plan.
@@ -72,6 +75,11 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool {
7275
}
7376
if id == Dict::ID {
7477
let arr = array.as_::<Dict>();
78+
// Nullable codes could hold garbage values at null positions, causing
79+
// out-of-bounds shared memory reads in the DICT gather scalar op.
80+
if arr.codes().dtype().is_nullable() {
81+
return false;
82+
}
7583
// Dict codes and values may have different byte widths.
7684
// The kernel handles mixed widths via widening input stages,
7785
// but only when codes are no wider than values (the output type).
@@ -85,6 +93,12 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool {
8593
}
8694
if id == RunEnd::ID {
8795
let arr = array.as_::<RunEnd>();
96+
// Nullable ends could hold garbage values at null positions, causing
97+
// unpredictable binary search / forward-scan behavior in the RUNEND
98+
// source op.
99+
if arr.ends().dtype().is_nullable() {
100+
return false;
101+
}
88102
// RunEnd ends and values may have different byte widths.
89103
// The kernel handles mixed widths via widening input stages,
90104
// but only when ends are no wider than values (the output type).
@@ -213,14 +227,18 @@ pub struct FusedPlan {
213227
output_elem_bytes: u32,
214228
/// PType of the root (output) array, as a C ABI tag.
215229
output_ptype: PTypeTag,
230+
/// Validity of the root array, propagated to the output.
231+
validity: Validity,
216232
}
217233

218234
impl DispatchPlan {
219235
/// Construct a plan by inspecting the encoding tree in a single pass.
220236
///
221237
/// # Limitations
222238
///
223-
/// - Validity bitmaps are ignored; only `NonNullable`/`AllValid` is supported.
239+
/// - Validity is propagated from the root array to the output. Nullable
240+
/// arrays are supported, but Dict with nullable codes and RunEnd with
241+
/// nullable ends are rejected to guard against out-of-bounds access.
224242
/// - `BitPackedArray` and `ALPArray` with patches are not supported.
225243
/// - Only f32 ALP is supported (kernel stores multipliers as `float`).
226244
pub fn new(array: &ArrayRef) -> VortexResult<Self> {
@@ -276,6 +294,7 @@ impl FusedPlan {
276294
}
277295
let output_elem_bytes = output_ptype_rust.byte_width() as u32;
278296
let output_ptype = ptype_to_tag(output_ptype_rust);
297+
let validity = array.validity()?;
279298

280299
let mut pending_subtrees: Vec<ArrayRef> = Vec::new();
281300
let mut plan = Self {
@@ -284,6 +303,7 @@ impl FusedPlan {
284303
source_buffers: Vec::new(),
285304
output_elem_bytes,
286305
output_ptype,
306+
validity,
287307
};
288308

289309
let len = array.len() as u32;
@@ -365,6 +385,7 @@ impl FusedPlan {
365385
dispatch_plan: CudaDispatchPlan::new(stages, self.output_ptype),
366386
device_buffers,
367387
shared_mem_bytes,
388+
validity: self.validity,
368389
})
369390
}
370391

0 commit comments

Comments
 (0)