Skip to content

Commit f75e977

Browse files
committed
fix(cuda): propagate validity through dynamic dispatch and fix standalone kernel panics
The dynamic dispatch plan builder silently dropped validity (null bitmaps) from nullable arrays, hardcoding Validity::NonNullable on the output. This caused silent data corruption when nullable arrays were fused, and especially when the output fed into downstream kernels like CUB filter. Dynamic dispatch changes: - Thread root array validity through FusedPlan -> MaterializedPlan -> execute_typed, replacing the hardcoded Validity::NonNullable. - Guard against Dict with nullable codes (garbage code values could cause OOB shared memory reads in the DICT gather scalar op). - Guard against RunEnd with nullable ends (garbage end values could cause unpredictable binary search / forward-scan behavior). - Skip kernel launch entirely for Validity::AllInvalid arrays. - Respect nullability in the len==0 early-return path. Standalone kernel fixes: - RunEnd: replace unreachable!() with vortex_bail!() when values have per-element validity (Validity::Array), allowing graceful CPU fallback instead of a panic. - Zstd: replace unimplemented!() with vortex_bail!() when decompressed data contains nulls, allowing graceful CPU fallback instead of a panic. - ALP: clarify that patch validity does not need scattering (the encoder strips null positions from the exception list), remove stale TODO. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent ae906c7 commit f75e977

6 files changed

Lines changed: 390 additions & 9 deletions

File tree

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ impl BenchRunner {
135135
dispatch_plan,
136136
device_buffers,
137137
shared_mem_bytes,
138+
..
138139
} = plan.materialize(cuda_ctx).vortex_expect("materialize plan");
139140

140141
let device_plan = Arc::new(

vortex-cuda/src/dynamic_dispatch/mod.rs

Lines changed: 220 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,18 @@ use cudarc::driver::DevicePtr;
2424
use cudarc::driver::LaunchConfig;
2525
use cudarc::driver::PushKernelArg;
2626
use vortex::array::Canonical;
27+
use vortex::array::IntoArray;
28+
use vortex::array::arrays::ConstantArray;
2729
use vortex::array::arrays::PrimitiveArray;
2830
use vortex::array::buffer::BufferHandle;
2931
use vortex::array::buffer::DeviceBufferExt;
3032
use vortex::array::match_each_unsigned_integer_ptype;
33+
use vortex::array::scalar::Scalar;
3134
use vortex::array::validity::Validity;
3235
use vortex::buffer::Alignment;
3336
use vortex::buffer::ByteBuffer;
3437
use vortex::buffer::ByteBufferMut;
38+
use vortex::dtype::DType;
3539
use vortex::dtype::Nullability;
3640
use vortex::dtype::PType;
3741
use vortex::error::VortexResult;
@@ -408,6 +412,15 @@ impl ScalarOp {
408412
impl MaterializedPlan {
409413
pub fn execute(self, len: usize, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical> {
410414
let output_ptype = self.dispatch_plan.output_ptype();
415+
416+
// All values are null — no need to touch the GPU.
417+
if matches!(self.validity, Validity::AllInvalid) {
418+
let dtype = DType::Primitive(output_ptype, Nullability::Nullable);
419+
return ConstantArray::new(Scalar::null(dtype), len)
420+
.into_array()
421+
.to_canonical();
422+
}
423+
411424
// The CUDA kernels are instantiated for unsigned integer types only;
412425
// map signed/float ptypes to their same-width unsigned counterpart.
413426
let unsigned_ptype = match output_ptype {
@@ -431,9 +444,11 @@ impl MaterializedPlan {
431444
where
432445
T: cudarc::driver::DeviceRepr + vortex::dtype::NativePType,
433446
{
447+
let nullability = self.validity.nullability();
448+
434449
if len == 0 {
435450
return Ok(Canonical::Primitive(PrimitiveArray::empty::<T>(
436-
Nullability::NonNullable,
451+
nullability,
437452
)));
438453
}
439454

@@ -467,7 +482,7 @@ impl MaterializedPlan {
467482
Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
468483
BufferHandle::new_device(output_buf.slice_typed::<T>(0..len)),
469484
output_ptype,
470-
Validity::NonNullable,
485+
self.validity,
471486
)))
472487
}
473488
}
@@ -485,8 +500,10 @@ mod tests {
485500
use vortex::array::arrays::DictArray;
486501
use vortex::array::arrays::PrimitiveArray;
487502
use vortex::array::scalar::Scalar;
503+
use vortex::array::validity::Validity;
488504
use vortex::array::validity::Validity::NonNullable;
489505
use vortex::buffer::Buffer;
506+
use vortex::dtype::Nullability;
490507
use vortex::dtype::PType;
491508
use vortex::encodings::alp::ALP;
492509
use vortex::encodings::alp::ALPArrayExt;
@@ -509,6 +526,7 @@ mod tests {
509526
use crate::CudaBufferExt;
510527
use crate::CudaDeviceBuffer;
511528
use crate::CudaExecutionCtx;
529+
use crate::executor::CudaArrayExt;
512530
use crate::hybrid_dispatch::try_gpu_dispatch;
513531
use crate::session::CudaSession;
514532

@@ -1817,4 +1835,204 @@ mod tests {
18171835

18181836
Ok(())
18191837
}
1838+
1839+
// ═══════════════════════════════════════════════════════════════════
1840+
// Validity propagation tests
1841+
// ═══════════════════════════════════════════════════════════════════
1842+
1843+
/// Nullable Primitive array — LOAD source with validity propagated.
1844+
#[crate::test]
1845+
async fn test_nullable_primitive() -> VortexResult<()> {
1846+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1847+
1848+
let array = PrimitiveArray::from_option_iter(
1849+
(0..2048u32).map(|i| if i % 3 == 0 { None } else { Some(i) }),
1850+
);
1851+
let cpu = array.to_canonical()?.into_array();
1852+
1853+
let gpu = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx)
1854+
.await?
1855+
.into_host()
1856+
.await?
1857+
.into_array();
1858+
1859+
vortex::array::assert_arrays_eq!(cpu, gpu);
1860+
Ok(())
1861+
}
1862+
1863+
/// Nullable FoR(BitPacked) — validity from the root FoR propagated
1864+
/// through the fused plan.
1865+
#[crate::test]
1866+
async fn test_nullable_for_bitpacked() -> VortexResult<()> {
1867+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1868+
1869+
let len = 2048;
1870+
let reference = 1000u32;
1871+
let residuals: Vec<u32> = (0..len).map(|i| (i % 64) as u32).collect();
1872+
1873+
// Build a nullable source — encode as FoR(BitPacked), then replace
1874+
// validity on the root FoR array.
1875+
let prim = PrimitiveArray::from_option_iter(residuals.iter().enumerate().map(|(i, &v)| {
1876+
if i % 5 == 0 {
1877+
None
1878+
} else {
1879+
Some(v + reference)
1880+
}
1881+
}));
1882+
1883+
// Build expected from the nullable primitive.
1884+
let cpu = prim.to_canonical()?.into_array();
1885+
1886+
// Encode the non-null residuals into FoR(BitPacked). The validity
1887+
// lives on the outermost array; inner children are non-nullable.
1888+
let inner_prim = PrimitiveArray::new(Buffer::from(residuals), NonNullable);
1889+
let bp = BitPacked::encode(&inner_prim.into_array(), 6)?;
1890+
let for_arr = FoR::try_new(bp.into_array(), reference.into())?;
1891+
1892+
// Re-wrap with the nullable validity from the original primitive.
1893+
let validity = prim.validity()?;
1894+
let nullable_for = PrimitiveArray::from_buffer_handle(
1895+
for_arr.into_array().to_primitive().into_data_parts().buffer,
1896+
PType::U32,
1897+
validity,
1898+
);
1899+
1900+
let gpu = try_gpu_dispatch(&nullable_for.into_array(), &mut cuda_ctx)
1901+
.await?
1902+
.into_host()
1903+
.await?
1904+
.into_array();
1905+
1906+
vortex::array::assert_arrays_eq!(cpu, gpu);
1907+
Ok(())
1908+
}
1909+
1910+
/// AllInvalid array — kernel should be skipped entirely.
1911+
#[crate::test]
1912+
async fn test_all_invalid_skips_kernel() -> VortexResult<()> {
1913+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1914+
1915+
let array = PrimitiveArray::new(Buffer::from(vec![0u32; 2048]), Validity::AllInvalid);
1916+
1917+
let result = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx)
1918+
.await?
1919+
.into_host()
1920+
.await?;
1921+
1922+
let prim = result.into_primitive();
1923+
assert_eq!(prim.len(), 2048);
1924+
assert!(matches!(prim.validity()?, Validity::AllInvalid));
1925+
Ok(())
1926+
}
1927+
1928+
/// AllValid nullable array — should fuse and produce AllValid output.
1929+
#[crate::test]
1930+
async fn test_all_valid_nullable() -> VortexResult<()> {
1931+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1932+
1933+
let values: Vec<u32> = (0..2048).collect();
1934+
let array = PrimitiveArray::new(Buffer::from(values.clone()), Validity::AllValid);
1935+
1936+
let cpu = array.to_canonical()?.into_array();
1937+
let gpu = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx)
1938+
.await?
1939+
.into_host()
1940+
.await?
1941+
.into_array();
1942+
1943+
vortex::array::assert_arrays_eq!(cpu, gpu);
1944+
Ok(())
1945+
}
1946+
1947+
/// Dict with nullable codes must fall back to Unfused (not fused).
1948+
#[crate::test]
1949+
fn test_dict_nullable_codes_rejected() -> VortexResult<()> {
1950+
use vortex::buffer::buffer;
1951+
1952+
let codes = PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), None, Some(2)]);
1953+
let values = PrimitiveArray::new(buffer![10u32, 20, 30], NonNullable);
1954+
let dict = DictArray::try_new(codes.into_array(), values.into_array())?;
1955+
1956+
let plan = DispatchPlan::new(&dict.into_array())?;
1957+
assert!(
1958+
matches!(plan, DispatchPlan::Unfused),
1959+
"Dict with nullable codes should fall back to Unfused"
1960+
);
1961+
Ok(())
1962+
}
1963+
1964+
/// Dict with non-nullable codes but nullable values should still fuse.
1965+
#[crate::test]
1966+
async fn test_dict_nullable_values_fuses() -> VortexResult<()> {
1967+
use vortex::buffer::buffer;
1968+
1969+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1970+
1971+
let codes = PrimitiveArray::new(buffer![0u32, 1, 2, 2, 1, 0], NonNullable);
1972+
let values = PrimitiveArray::from_option_iter([Some(10u32), None, Some(30)]);
1973+
let dict = DictArray::try_new(codes.into_array(), values.into_array())?;
1974+
1975+
let cpu = dict.to_canonical()?.into_array();
1976+
let gpu = dict
1977+
.into_array()
1978+
.execute_cuda(&mut cuda_ctx)
1979+
.await?
1980+
.into_host()
1981+
.await?
1982+
.into_array();
1983+
1984+
vortex::array::assert_arrays_eq!(cpu, gpu);
1985+
Ok(())
1986+
}
1987+
1988+
/// Nullable FoR(BitPacked) through CUB filter — the original bug scenario.
1989+
/// Validity must survive through fused dispatch and into the filter.
1990+
#[crate::test]
1991+
async fn test_nullable_fused_then_filter() -> VortexResult<()> {
1992+
use vortex::array::arrays::FilterArray;
1993+
use vortex::mask::Mask;
1994+
1995+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
1996+
1997+
let len = 2048usize;
1998+
let values: Vec<Option<u32>> = (0..len)
1999+
.map(|i| {
2000+
if i % 7 == 0 {
2001+
None
2002+
} else {
2003+
Some((i % 64) as u32)
2004+
}
2005+
})
2006+
.collect();
2007+
let prim = PrimitiveArray::from_option_iter(values.iter().copied());
2008+
2009+
// Keep every other element.
2010+
let mask = Mask::from_iter((0..len).map(|i| i % 2 == 0));
2011+
let filter_array = FilterArray::try_new(prim.into_array(), mask)?;
2012+
2013+
let cpu = filter_array.to_canonical()?.into_array();
2014+
let gpu = filter_array
2015+
.into_array()
2016+
.execute_cuda(&mut cuda_ctx)
2017+
.await?
2018+
.into_host()
2019+
.await?
2020+
.into_array();
2021+
2022+
vortex::array::assert_arrays_eq!(cpu, gpu);
2023+
Ok(())
2024+
}
2025+
2026+
/// Empty nullable array should preserve nullability.
2027+
#[crate::test]
2028+
async fn test_empty_nullable_array() -> VortexResult<()> {
2029+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
2030+
2031+
let array = PrimitiveArray::new(Buffer::<u32>::empty(), Validity::AllValid);
2032+
let result = try_gpu_dispatch(&array.into_array(), &mut cuda_ctx).await?;
2033+
let prim = result.into_primitive();
2034+
assert_eq!(prim.len(), 0);
2035+
assert_eq!(prim.validity()?.nullability(), Nullability::Nullable);
2036+
Ok(())
2037+
}
18202038
}

vortex-cuda/src/dynamic_dispatch/plan_builder.rs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use vortex::array::arrays::Slice;
1515
use vortex::array::arrays::dict::DictArraySlotsExt;
1616
use vortex::array::arrays::slice::SliceArrayExt;
1717
use vortex::array::buffer::BufferHandle;
18+
use vortex::array::validity::Validity;
1819
use vortex::dtype::PType;
1920
use vortex::encodings::alp::ALP;
2021
use vortex::encodings::alp::ALPArrayExt;
@@ -52,6 +53,8 @@ pub struct MaterializedPlan {
5253
pub device_buffers: Vec<BufferHandle>,
5354
/// Dynamic shared memory bytes needed to launch this plan.
5455
pub shared_mem_bytes: u32,
56+
/// Validity of the root array, propagated to the output.
57+
pub validity: Validity,
5558
}
5659

5760
/// Checks whether the encoding of an array can be fused into a dynamic-dispatch plan.
@@ -72,6 +75,11 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool {
7275
}
7376
if id == Dict::ID {
7477
let arr = array.as_::<Dict>();
78+
// Nullable codes could hold garbage values at null positions, causing
79+
// out-of-bounds shared memory reads in the DICT gather scalar op.
80+
if arr.codes().dtype().is_nullable() {
81+
return false;
82+
}
7583
// Dict codes and values may have different byte widths.
7684
// The kernel handles mixed widths via widening input stages,
7785
// but only when codes are no wider than values (the output type).
@@ -85,6 +93,12 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool {
8593
}
8694
if id == RunEnd::ID {
8795
let arr = array.as_::<RunEnd>();
96+
// Nullable ends could hold garbage values at null positions, causing
97+
// unpredictable binary search / forward-scan behavior in the RUNEND
98+
// source op.
99+
if arr.ends().dtype().is_nullable() {
100+
return false;
101+
}
88102
// RunEnd ends and values may have different byte widths.
89103
// The kernel handles mixed widths via widening input stages,
90104
// but only when ends are no wider than values (the output type).
@@ -213,14 +227,18 @@ pub struct FusedPlan {
213227
output_elem_bytes: u32,
214228
/// PType of the root (output) array, as a C ABI tag.
215229
output_ptype: PTypeTag,
230+
/// Validity of the root array, propagated to the output.
231+
validity: Validity,
216232
}
217233

218234
impl DispatchPlan {
219235
/// Construct a plan by inspecting the encoding tree in a single pass.
220236
///
221237
/// # Limitations
222238
///
223-
/// - Validity bitmaps are ignored; only `NonNullable`/`AllValid` is supported.
239+
/// - Validity is propagated from the root array to the output. Nullable
240+
/// arrays are supported, but Dict with nullable codes and RunEnd with
241+
/// nullable ends are rejected to guard against out-of bounds access.
224242
/// - `BitPackedArray` and `ALPArray` with patches are not supported.
225243
/// - Only f32 ALP is supported (kernel stores multipliers as `float`).
226244
pub fn new(array: &ArrayRef) -> VortexResult<Self> {
@@ -276,6 +294,7 @@ impl FusedPlan {
276294
}
277295
let output_elem_bytes = output_ptype_rust.byte_width() as u32;
278296
let output_ptype = ptype_to_tag(output_ptype_rust);
297+
let validity = array.validity()?;
279298

280299
let mut pending_subtrees: Vec<ArrayRef> = Vec::new();
281300
let mut plan = Self {
@@ -284,6 +303,7 @@ impl FusedPlan {
284303
source_buffers: Vec::new(),
285304
output_elem_bytes,
286305
output_ptype,
306+
validity,
287307
};
288308

289309
let len = array.len() as u32;
@@ -365,6 +385,7 @@ impl FusedPlan {
365385
dispatch_plan: CudaDispatchPlan::new(stages, self.output_ptype),
366386
device_buffers,
367387
shared_mem_bytes,
388+
validity: self.validity,
368389
})
369390
}
370391

0 commit comments

Comments
 (0)