apply offset_within_chunk

a10y · a10y · commit f90367a247f4 · 2026-04-16T11:58:54.000-04:00
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
diff --git a/vortex-cuda/kernels/src/patches.cuh b/vortex-cuda/kernels/src/patches.cuh
@@ -56,11 +56,27 @@ public:
         }
 
         // Get patch range for this chunk.
-        // chunk_offsets has n_chunks elements; the final offset is implicit (num_patches).
-        uint32_t chunk_start = load_chunk_offset(patches, chunk);
-        uint32_t chunk_end =
-            (chunk + 1 < patches.n_chunks) ? load_chunk_offset(patches, chunk + 1) : patches.num_patches;
-        uint32_t num_patches = chunk_end - chunk_start;
+        // chunk_offsets stores absolute offsets into the original (unsliced) patches array.
+        // We need to subtract the base offset (chunk 0) and offset_within_chunk to get
+        // the actual range in the sliced indices/values arrays.
+        uint32_t base_offset = load_chunk_offset(patches, 0);
+        uint32_t chunk_start_raw = load_chunk_offset(patches, chunk) - base_offset;
+        uint32_t chunk_end_raw =
+            (chunk + 1 < patches.n_chunks)
+                ? load_chunk_offset(patches, chunk + 1) - base_offset
+                : patches.num_patches + patches.offset_within_chunk;
+
+        // Apply offset_within_chunk adjustment (saturating subtraction)
+        uint32_t chunk_start = (chunk_start_raw > patches.offset_within_chunk)
+                                   ? chunk_start_raw - patches.offset_within_chunk
+                                   : 0;
+        uint32_t chunk_end = (chunk_end_raw > patches.offset_within_chunk)
+                                 ? chunk_end_raw - patches.offset_within_chunk
+                                 : 0;
+        // Clamp to num_patches
+        chunk_end = min(chunk_end, patches.num_patches);
+
+        uint32_t num_patches = (chunk_end > chunk_start) ? chunk_end - chunk_start : 0;
 
         // Divide patches among threads (ceil division)
         uint32_t patches_per_thread = (num_patches + n_threads - 1) / n_threads;
diff --git a/vortex-cuda/kernels/src/patches.h b/vortex-cuda/kernels/src/patches.h
@@ -20,6 +20,11 @@ typedef enum { CO_U8 = 0, CO_U16 = 1, CO_U32 = 2, CO_U64 = 3 } ChunkOffsetType;
 /// and equals num_patches.
 ///
 /// A NULL chunk_offsets pointer indicates no patches are present.
+///
+/// When patches are sliced, offset_within_chunk tracks how many patches from
+/// the first chunk were sliced off. This is necessary because chunk_offsets
+/// is only sliced at chunk boundaries, while indices/values are sliced at
+/// element level.
 typedef struct {
     void *chunk_offsets;
     ChunkOffsetType chunk_offset_type;
@@ -28,6 +33,7 @@ typedef struct {
     uint32_t offset;
     uint32_t num_patches;
     uint32_t n_chunks;
+    uint32_t offset_within_chunk;
 } GPUPatches;
 
 #ifdef __cplusplus
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -156,6 +156,7 @@ where
             offset: p.offset as u32,
             num_patches: p.num_patches as u32,
             n_chunks: p.n_chunks as u32,
+            offset_within_chunk: p.offset_within_chunk as u32,
         }
     } else {
         // NULL chunk_offsets signals no patches to the kernel
@@ -167,6 +168,7 @@ where
             offset: 0,
             num_patches: 0,
             n_chunks: 0,
+            offset_within_chunk: 0,
         }
     };
 
@@ -561,4 +563,151 @@ mod tests {
 
         Ok(())
     }
+
+    /// Slices a patched bitpacked array so that the slice start lands inside the
+    /// first chunk's patch range (expected to yield a non-zero offset_within_chunk),
+    /// then checks that GPU decompression matches the CPU canonical result.
+    #[crate::test]
+    fn test_cuda_bitunpack_sliced_patches_offset_within_chunk() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // Build 3072 values: `i % 512` everywhere (always fits in 9 bits), except
+        // at six fixed indices where the value is 600, which does not fit in
+        // 9 bits and is therefore expected to be stored as a patch.
+        let mut values: Vec<u16> = Vec::with_capacity(3072);
+        for i in 0u16..3072 {
+            if i == 100 || i == 200 || i == 300 || i == 1100 || i == 1200 || i == 2100 {
+                // Exceeds the 9-bit maximum of 511, so this becomes a patch.
+                values.push(600);
+            } else {
+                values.push(i % 512);
+            }
+        }
+
+        let primitive_array =
+            PrimitiveArray::new(Buffer::from_iter(values.iter().copied()), NonNullable);
+
+        // Encode with bit width 9 (max representable value 511).
+        let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), 9)?;
+        assert!(
+            bitpacked_array.patches().is_some(),
+            "Expected patches to be present"
+        );
+
+        // Slice 150..2500. The first chunk (rows 0-1023) has patches at 100, 200, 300;
+        // starting at 150 drops only the patch at 100, so one patch of the first
+        // chunk is sliced off (offset_within_chunk is expected to be 1).
+        let sliced_array = bitpacked_array.into_array().slice(150..2500)?;
+        assert!(sliced_array.is::<BitPacked>());
+
+        let cpu_result = sliced_array.to_canonical()?;
+        let gpu_result = block_on(async {
+            BitPackedExecutor
+                .execute(sliced_array, &mut cuda_ctx)
+                .await
+                .vortex_expect("GPU decompression failed")
+                .into_host()
+                .await
+                .map(|a| a.into_array())
+        })?;
+
+        assert_arrays_eq!(cpu_result.into_array(), gpu_result);
+
+        Ok(())
+    }
+
+    /// Slices a patched bitpacked array twice and checks GPU output against the CPU result.
+    #[crate::test]
+    fn test_cuda_bitunpack_double_sliced_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // 3072 values of `i % 512`, except seven indices set to 600 (expected to become patches).
+        let mut values: Vec<u16> = Vec::with_capacity(3072);
+        for i in 0u16..3072 {
+            if i == 50 || i == 100 || i == 200 || i == 300 || i == 400 || i == 1100 || i == 2100 {
+                values.push(600);
+            } else {
+                values.push(i % 512);
+            }
+        }
+
+        let primitive_array =
+            PrimitiveArray::new(Buffer::from_iter(values.iter().copied()), NonNullable);
+
+        let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), 9)?;
+        assert!(
+            bitpacked_array.patches().is_some(),
+            "Expected patches to be present"
+        );
+
+        // First slice (original rows 75..2500): drops the patch at index 50.
+        let first_slice = bitpacked_array.into_array().slice(75..2500)?;
+        // Second slice (original rows 125..2075): also drops the patches at 100 and 2100.
+        let second_slice = first_slice.slice(50..2000)?;
+        assert!(second_slice.is::<BitPacked>());
+
+        let cpu_result = second_slice.to_canonical()?;
+        let gpu_result = block_on(async {
+            BitPackedExecutor
+                .execute(second_slice, &mut cuda_ctx)
+                .await
+                .vortex_expect("GPU decompression failed")
+                .into_host()
+                .await
+                .map(|a| a.into_array())
+        })?;
+
+        assert_arrays_eq!(cpu_result.into_array(), gpu_result);
+
+        Ok(())
+    }
+
+    /// Slices away the whole first 1024-row chunk (and its patches); GPU must match CPU.
+    #[crate::test]
+    fn test_cuda_bitunpack_sliced_skip_first_chunk_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // Oversized values at 100, 200, 300 (first chunk) and at 1500 and 2500 (later rows).
+        let mut values: Vec<u16> = Vec::with_capacity(3072);
+        for i in 0u16..3072 {
+            if i == 100 || i == 200 || i == 300 {
+                values.push(600);
+            } else if i == 1500 || i == 2500 {
+                values.push(700);
+            } else {
+                values.push(i % 512);
+            }
+        }
+
+        let primitive_array =
+            PrimitiveArray::new(Buffer::from_iter(values.iter().copied()), NonNullable);
+
+        let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), 9)?;
+        assert!(
+            bitpacked_array.patches().is_some(),
+            "Expected patches to be present"
+        );
+
+        // Start at row 1024 so all first-chunk patches are dropped; 1500 and 2500 remain.
+        let sliced_array = bitpacked_array.into_array().slice(1024..3072)?;
+        assert!(sliced_array.is::<BitPacked>());
+
+        let cpu_result = sliced_array.to_canonical()?;
+        let gpu_result = block_on(async {
+            BitPackedExecutor
+                .execute(sliced_array, &mut cuda_ctx)
+                .await
+                .vortex_expect("GPU decompression failed")
+                .into_host()
+                .await
+                .map(|a| a.into_array())
+        })?;
+
+        assert_arrays_eq!(cpu_result.into_array(), gpu_result);
+
+        Ok(())
+    }
 }
diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs
@@ -25,6 +25,12 @@ pub struct DevicePatches {
     pub(crate) offset: usize,
     pub(crate) num_patches: usize,
     pub(crate) n_chunks: usize,
+    /// Number of patches sliced off from the first chunk.
+    ///
+    /// When patches are sliced, the chunk_offsets array is only sliced at chunk
+    /// boundaries, while indices/values are sliced at element level. This offset
+    /// tracks how many patches from the first chunk were sliced off.
+    pub(crate) offset_within_chunk: usize,
 }
 
 /// Load patches for GPU use.
@@ -93,6 +99,7 @@ pub async fn load_patches(
 
     let num_patches = patches.num_patches();
     let n_chunks = array_len.div_ceil(1024);
+    let offset_within_chunk = patches.offset_within_chunk().unwrap_or(0);
 
     Ok(DevicePatches {
         chunk_offsets,
@@ -102,6 +109,7 @@ pub async fn load_patches(
         offset,
         num_patches,
         n_chunks,
+        offset_within_chunk,
     })
 }