@@ -24,14 +24,18 @@ use cudarc::driver::DevicePtr;
2424use cudarc:: driver:: LaunchConfig ;
2525use cudarc:: driver:: PushKernelArg ;
2626use vortex:: array:: Canonical ;
27+ use vortex:: array:: IntoArray ;
28+ use vortex:: array:: arrays:: ConstantArray ;
2729use vortex:: array:: arrays:: PrimitiveArray ;
2830use vortex:: array:: buffer:: BufferHandle ;
2931use vortex:: array:: buffer:: DeviceBufferExt ;
3032use vortex:: array:: match_each_unsigned_integer_ptype;
33+ use vortex:: array:: scalar:: Scalar ;
3134use vortex:: array:: validity:: Validity ;
3235use vortex:: buffer:: Alignment ;
3336use vortex:: buffer:: ByteBuffer ;
3437use vortex:: buffer:: ByteBufferMut ;
38+ use vortex:: dtype:: DType ;
3539use vortex:: dtype:: Nullability ;
3640use vortex:: dtype:: PType ;
3741use vortex:: error:: VortexResult ;
@@ -408,6 +412,15 @@ impl ScalarOp {
408412impl MaterializedPlan {
409413 pub fn execute ( self , len : usize , ctx : & mut CudaExecutionCtx ) -> VortexResult < Canonical > {
410414 let output_ptype = self . dispatch_plan . output_ptype ( ) ;
415+
416+ // All values are null — no need to touch the GPU.
417+ if matches ! ( self . validity, Validity :: AllInvalid ) {
418+ let dtype = DType :: Primitive ( output_ptype, Nullability :: Nullable ) ;
419+ return ConstantArray :: new ( Scalar :: null ( dtype) , len)
420+ . into_array ( )
421+ . to_canonical ( ) ;
422+ }
423+
411424 // The CUDA kernels are instantiated for unsigned integer types only;
412425 // map signed/float ptypes to their same-width unsigned counterpart.
413426 let unsigned_ptype = match output_ptype {
@@ -431,9 +444,11 @@ impl MaterializedPlan {
431444 where
432445 T : cudarc:: driver:: DeviceRepr + vortex:: dtype:: NativePType ,
433446 {
447+ let nullability = self . validity . nullability ( ) ;
448+
434449 if len == 0 {
435450 return Ok ( Canonical :: Primitive ( PrimitiveArray :: empty :: < T > (
436- Nullability :: NonNullable ,
451+ nullability ,
437452 ) ) ) ;
438453 }
439454
@@ -467,7 +482,7 @@ impl MaterializedPlan {
467482 Ok ( Canonical :: Primitive ( PrimitiveArray :: from_buffer_handle (
468483 BufferHandle :: new_device ( output_buf. slice_typed :: < T > ( 0 ..len) ) ,
469484 output_ptype,
470- Validity :: NonNullable ,
485+ self . validity ,
471486 ) ) )
472487 }
473488}
@@ -485,6 +500,7 @@ mod tests {
485500 use vortex:: array:: arrays:: DictArray ;
486501 use vortex:: array:: arrays:: PrimitiveArray ;
487502 use vortex:: array:: scalar:: Scalar ;
503+ use vortex:: array:: validity:: Validity ;
488504 use vortex:: array:: validity:: Validity :: NonNullable ;
489505 use vortex:: buffer:: Buffer ;
490506 use vortex:: dtype:: PType ;
@@ -509,6 +525,7 @@ mod tests {
509525 use crate :: CudaBufferExt ;
510526 use crate :: CudaDeviceBuffer ;
511527 use crate :: CudaExecutionCtx ;
528+ use crate :: executor:: CudaArrayExt ;
512529 use crate :: hybrid_dispatch:: try_gpu_dispatch;
513530 use crate :: session:: CudaSession ;
514531
@@ -1817,4 +1834,210 @@ mod tests {
18171834
18181835 Ok ( ( ) )
18191836 }
1837+
1838+ // ═══════════════════════════════════════════════════════════════════
1839+ // Validity propagation tests
1840+ // ═══════════════════════════════════════════════════════════════════
1841+
1842+ /// Nullable Primitive array — LOAD source with validity propagated.
1843+ #[ crate :: test]
1844+ async fn test_nullable_primitive ( ) -> VortexResult < ( ) > {
1845+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
1846+
1847+ let array = PrimitiveArray :: from_option_iter (
1848+ ( 0 ..2048u32 ) . map ( |i| if i % 3 == 0 { None } else { Some ( i) } ) ,
1849+ ) ;
1850+ let cpu = array. to_canonical ( ) ?. into_array ( ) ;
1851+
1852+ let gpu = try_gpu_dispatch ( & array. into_array ( ) , & mut cuda_ctx)
1853+ . await ?
1854+ . into_host ( )
1855+ . await ?
1856+ . into_array ( ) ;
1857+
1858+ vortex:: array:: assert_arrays_eq!( cpu, gpu) ;
1859+ Ok ( ( ) )
1860+ }
1861+
1862+ /// Nullable FoR(BitPacked) — validity from the root propagated through
1863+ /// the fused plan. The standard encoding flow is: subtract FoR reference
1864+ /// to get residuals, then bitpack. BitPacked::encode preserves input
1865+ /// validity, so this produces a real nullable FoR(BitPacked) tree.
1866+ #[ crate :: test]
1867+ async fn test_nullable_for_bitpacked ( ) -> VortexResult < ( ) > {
1868+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
1869+
1870+ let len = 2048 ;
1871+ let reference = 1000u32 ;
1872+
1873+ // Original values in [reference, reference+63], every 5th null.
1874+ let values: Vec < Option < u32 > > = ( 0 ..len)
1875+ . map ( |i| {
1876+ if i % 5 == 0 {
1877+ None
1878+ } else {
1879+ Some ( ( i as u32 % 64 ) + reference)
1880+ }
1881+ } )
1882+ . collect ( ) ;
1883+ let prim = PrimitiveArray :: from_option_iter ( values. iter ( ) . copied ( ) ) ;
1884+ let cpu = prim. to_canonical ( ) ?. into_array ( ) ;
1885+
1886+ // FoR encoding: subtract reference to get residuals [0..63].
1887+ // Null positions get 0 (from from_option_iter), which is fine —
1888+ // after subtracting reference it wraps, but validity masks it.
1889+ let residuals =
1890+ PrimitiveArray :: from_option_iter ( values. iter ( ) . map ( |v| v. map ( |x| x - reference) ) ) ;
1891+
1892+ // BitPacked::encode preserves nullable validity from the input.
1893+ let bp = BitPacked :: encode ( & residuals. into_array ( ) , 6 ) ?;
1894+ let for_arr = FoR :: try_new ( bp. into_array ( ) , reference. into ( ) ) ?;
1895+
1896+ // Verify the plan actually fuses (not just a LOAD).
1897+ assert ! (
1898+ matches!(
1899+ DispatchPlan :: new( & for_arr. clone( ) . into_array( ) ) ?,
1900+ DispatchPlan :: Fused ( _)
1901+ ) ,
1902+ "FoR(BitPacked) with nullable validity should produce a Fused plan"
1903+ ) ;
1904+
1905+ let gpu = try_gpu_dispatch ( & for_arr. into_array ( ) , & mut cuda_ctx)
1906+ . await ?
1907+ . into_host ( )
1908+ . await ?
1909+ . into_array ( ) ;
1910+
1911+ vortex:: array:: assert_arrays_eq!( cpu, gpu) ;
1912+ Ok ( ( ) )
1913+ }
1914+
1915+ /// AllInvalid array — kernel should be skipped entirely.
1916+ #[ crate :: test]
1917+ async fn test_all_invalid_skips_kernel ( ) -> VortexResult < ( ) > {
1918+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
1919+
1920+ let array = PrimitiveArray :: new ( Buffer :: from ( vec ! [ 0u32 ; 2048 ] ) , Validity :: AllInvalid ) ;
1921+
1922+ let result = try_gpu_dispatch ( & array. into_array ( ) , & mut cuda_ctx)
1923+ . await ?
1924+ . into_host ( )
1925+ . await ?;
1926+
1927+ let prim = result. into_primitive ( ) ;
1928+ assert_eq ! ( prim. len( ) , 2048 ) ;
1929+ assert ! ( matches!( prim. validity( ) ?, Validity :: AllInvalid ) ) ;
1930+ Ok ( ( ) )
1931+ }
1932+
1933+ /// AllValid nullable array — should fuse and produce AllValid output.
1934+ #[ crate :: test]
1935+ async fn test_all_valid_nullable ( ) -> VortexResult < ( ) > {
1936+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
1937+
1938+ let values: Vec < u32 > = ( 0 ..2048 ) . collect ( ) ;
1939+ let array = PrimitiveArray :: new ( Buffer :: from ( values. clone ( ) ) , Validity :: AllValid ) ;
1940+
1941+ let cpu = array. to_canonical ( ) ?. into_array ( ) ;
1942+ let gpu = try_gpu_dispatch ( & array. into_array ( ) , & mut cuda_ctx)
1943+ . await ?
1944+ . into_host ( )
1945+ . await ?
1946+ . into_array ( ) ;
1947+
1948+ vortex:: array:: assert_arrays_eq!( cpu, gpu) ;
1949+ Ok ( ( ) )
1950+ }
1951+
1952+ /// Dict with nullable codes must fall back to Unfused (not fused).
1953+ #[ crate :: test]
1954+ fn test_dict_nullable_codes_rejected ( ) -> VortexResult < ( ) > {
1955+ use vortex:: buffer:: buffer;
1956+
1957+ let codes = PrimitiveArray :: from_option_iter ( [ Some ( 0u32 ) , None , Some ( 1 ) , None , Some ( 2 ) ] ) ;
1958+ let values = PrimitiveArray :: new ( buffer ! [ 10u32 , 20 , 30 ] , NonNullable ) ;
1959+ let dict = DictArray :: try_new ( codes. into_array ( ) , values. into_array ( ) ) ?;
1960+
1961+ let plan = DispatchPlan :: new ( & dict. into_array ( ) ) ?;
1962+ assert ! (
1963+ matches!( plan, DispatchPlan :: Unfused ) ,
1964+ "Dict with nullable codes should fall back to Unfused"
1965+ ) ;
1966+ Ok ( ( ) )
1967+ }
1968+
1969+ /// Dict with non-nullable codes but nullable values should still fuse.
1970+ #[ crate :: test]
1971+ async fn test_dict_nullable_values_fuses ( ) -> VortexResult < ( ) > {
1972+ use vortex:: buffer:: buffer;
1973+
1974+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
1975+
1976+ let codes = PrimitiveArray :: new ( buffer ! [ 0u32 , 1 , 2 , 2 , 1 , 0 ] , NonNullable ) ;
1977+ let values = PrimitiveArray :: from_option_iter ( [ Some ( 10u32 ) , None , Some ( 30 ) ] ) ;
1978+ let dict = DictArray :: try_new ( codes. into_array ( ) , values. into_array ( ) ) ?;
1979+
1980+ let cpu = dict. to_canonical ( ) ?. into_array ( ) ;
1981+ let gpu = dict
1982+ . into_array ( )
1983+ . execute_cuda ( & mut cuda_ctx)
1984+ . await ?
1985+ . into_host ( )
1986+ . await ?
1987+ . into_array ( ) ;
1988+
1989+ vortex:: array:: assert_arrays_eq!( cpu, gpu) ;
1990+ Ok ( ( ) )
1991+ }
1992+
1993+ /// Nullable FoR(BitPacked) through CUB filter — the original bug scenario.
1994+ /// Validity must survive through fused dispatch and into the filter.
1995+ #[ crate :: test]
1996+ async fn test_nullable_fused_then_filter ( ) -> VortexResult < ( ) > {
1997+ use vortex:: array:: arrays:: FilterArray ;
1998+ use vortex:: mask:: Mask ;
1999+
2000+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
2001+
2002+ let len = 2048usize ;
2003+ let values: Vec < Option < u32 > > = ( 0 ..len)
2004+ . map ( |i| {
2005+ if i % 7 == 0 {
2006+ None
2007+ } else {
2008+ Some ( ( i % 64 ) as u32 )
2009+ }
2010+ } )
2011+ . collect ( ) ;
2012+ let prim = PrimitiveArray :: from_option_iter ( values. iter ( ) . copied ( ) ) ;
2013+
2014+ // Keep every other element.
2015+ let mask = Mask :: from_iter ( ( 0 ..len) . map ( |i| i % 2 == 0 ) ) ;
2016+ let filter_array = FilterArray :: try_new ( prim. into_array ( ) , mask) ?;
2017+
2018+ let cpu = filter_array. to_canonical ( ) ?. into_array ( ) ;
2019+ let gpu = filter_array
2020+ . into_array ( )
2021+ . execute_cuda ( & mut cuda_ctx)
2022+ . await ?
2023+ . into_host ( )
2024+ . await ?
2025+ . into_array ( ) ;
2026+
2027+ vortex:: array:: assert_arrays_eq!( cpu, gpu) ;
2028+ Ok ( ( ) )
2029+ }
2030+
2031+ /// Empty nullable array should preserve nullability.
2032+ #[ crate :: test]
2033+ async fn test_empty_nullable_array ( ) -> VortexResult < ( ) > {
2034+ let mut cuda_ctx = CudaSession :: create_execution_ctx ( & VortexSession :: empty ( ) ) ?;
2035+
2036+ let array = PrimitiveArray :: new ( Buffer :: < u32 > :: empty ( ) , Validity :: AllValid ) ;
2037+ let result = try_gpu_dispatch ( & array. into_array ( ) , & mut cuda_ctx) . await ?;
2038+ let prim = result. into_primitive ( ) ;
2039+ assert_eq ! ( prim. len( ) , 0 ) ;
2040+ assert_eq ! ( prim. validity( ) ?. nullability( ) , Nullability :: Nullable ) ;
2041+ Ok ( ( ) )
2042+ }
18202043}
0 commit comments