Skip to content

Commit f90367a

Browse files
committed
apply offset_within_chunk
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
1 parent 1f51e18 commit f90367a

4 files changed

Lines changed: 184 additions & 5 deletions

File tree

vortex-cuda/kernels/src/patches.cuh

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,27 @@ public:
5656
}
5757

5858
// Get patch range for this chunk.
59-
// chunk_offsets has n_chunks elements; the final offset is implicit (num_patches).
60-
uint32_t chunk_start = load_chunk_offset(patches, chunk);
61-
uint32_t chunk_end =
62-
(chunk + 1 < patches.n_chunks) ? load_chunk_offset(patches, chunk + 1) : patches.num_patches;
63-
uint32_t num_patches = chunk_end - chunk_start;
59+
// chunk_offsets stores absolute offsets into the original (unsliced) patches array.
60+
// We need to subtract the base offset (chunk 0) and offset_within_chunk to get
61+
// the actual range in the sliced indices/values arrays.
62+
uint32_t base_offset = load_chunk_offset(patches, 0);
63+
uint32_t chunk_start_raw = load_chunk_offset(patches, chunk) - base_offset;
64+
uint32_t chunk_end_raw =
65+
(chunk + 1 < patches.n_chunks)
66+
? load_chunk_offset(patches, chunk + 1) - base_offset
67+
: patches.num_patches + patches.offset_within_chunk;
68+
69+
// Apply offset_within_chunk adjustment (saturating subtraction)
70+
uint32_t chunk_start = (chunk_start_raw > patches.offset_within_chunk)
71+
? chunk_start_raw - patches.offset_within_chunk
72+
: 0;
73+
uint32_t chunk_end = (chunk_end_raw > patches.offset_within_chunk)
74+
? chunk_end_raw - patches.offset_within_chunk
75+
: 0;
76+
// Clamp to num_patches
77+
chunk_end = min(chunk_end, patches.num_patches);
78+
79+
uint32_t num_patches = (chunk_end > chunk_start) ? chunk_end - chunk_start : 0;
6480

6581
// Divide patches among threads (ceil division)
6682
uint32_t patches_per_thread = (num_patches + n_threads - 1) / n_threads;

vortex-cuda/kernels/src/patches.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ typedef enum { CO_U8 = 0, CO_U16 = 1, CO_U32 = 2, CO_U64 = 3 } ChunkOffsetType;
2020
/// and equals num_patches.
2121
///
2222
/// A NULL chunk_offsets pointer indicates no patches are present.
23+
///
24+
/// When patches are sliced, offset_within_chunk tracks how many patches from
25+
/// the first chunk were sliced off. This is necessary because chunk_offsets
26+
/// is only sliced at chunk boundaries, while indices/values are sliced at
27+
/// element level.
2328
typedef struct {
2429
void *chunk_offsets;
2530
ChunkOffsetType chunk_offset_type;
@@ -28,6 +33,7 @@ typedef struct {
2833
uint32_t offset;
2934
uint32_t num_patches;
3035
uint32_t n_chunks;
36+
uint32_t offset_within_chunk;
3137
} GPUPatches;
3238

3339
#ifdef __cplusplus

vortex-cuda/src/kernel/encodings/bitpacked.rs

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ where
156156
offset: p.offset as u32,
157157
num_patches: p.num_patches as u32,
158158
n_chunks: p.n_chunks as u32,
159+
offset_within_chunk: p.offset_within_chunk as u32,
159160
}
160161
} else {
161162
// NULL chunk_offsets signals no patches to the kernel
@@ -167,6 +168,7 @@ where
167168
offset: 0,
168169
num_patches: 0,
169170
n_chunks: 0,
171+
offset_within_chunk: 0,
170172
}
171173
};
172174

@@ -561,4 +563,151 @@ mod tests {
561563

562564
Ok(())
563565
}
566+
567+
/// Test slicing a bitpacked array with patches where the slice boundary
568+
/// falls in the middle of a chunk's patch range, creating a non-zero
569+
/// offset_within_chunk.
570+
#[crate::test]
571+
fn test_cuda_bitunpack_sliced_patches_offset_within_chunk() -> VortexResult<()> {
572+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
573+
.vortex_expect("failed to create execution context");
574+
575+
// Create an array with values that will generate patches.
576+
// We use values 0-511 (fits in 9 bits) but include some larger values
577+
// that will become patches.
578+
let mut values: Vec<u16> = Vec::with_capacity(3072);
579+
for i in 0u16..3072 {
580+
if i == 100 || i == 200 || i == 300 || i == 1100 || i == 1200 || i == 2100 {
581+
// These will be patches (values > 511)
582+
values.push(600);
583+
} else {
584+
values.push(i % 512);
585+
}
586+
}
587+
588+
let primitive_array =
589+
PrimitiveArray::new(Buffer::from_iter(values.iter().copied()), NonNullable);
590+
591+
// Encode with bit width 9 (max value 511)
592+
let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), 9)?;
593+
assert!(
594+
bitpacked_array.patches().is_some(),
595+
"Expected patches to be present"
596+
);
597+
598+
// Slice to create non-zero offset_within_chunk.
599+
// The first chunk (0-1023) has patches at indices 100, 200, 300.
600+
// Slicing from 150 should skip the patch at 100, creating offset_within_chunk=1.
601+
let sliced_array = bitpacked_array.into_array().slice(150..2500)?;
602+
assert!(sliced_array.is::<BitPacked>());
603+
604+
let cpu_result = sliced_array.to_canonical()?;
605+
let gpu_result = block_on(async {
606+
BitPackedExecutor
607+
.execute(sliced_array, &mut cuda_ctx)
608+
.await
609+
.vortex_expect("GPU decompression failed")
610+
.into_host()
611+
.await
612+
.map(|a| a.into_array())
613+
})?;
614+
615+
assert_arrays_eq!(cpu_result.into_array(), gpu_result);
616+
617+
Ok(())
618+
}
619+
620+
/// Test slicing a bitpacked array multiple times, accumulating offset_within_chunk.
621+
#[crate::test]
622+
fn test_cuda_bitunpack_double_sliced_patches() -> VortexResult<()> {
623+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
624+
.vortex_expect("failed to create execution context");
625+
626+
// Create an array with values that will generate patches.
627+
let mut values: Vec<u16> = Vec::with_capacity(3072);
628+
for i in 0u16..3072 {
629+
if i == 50 || i == 100 || i == 200 || i == 300 || i == 400 || i == 1100 || i == 2100 {
630+
values.push(600);
631+
} else {
632+
values.push(i % 512);
633+
}
634+
}
635+
636+
let primitive_array =
637+
PrimitiveArray::new(Buffer::from_iter(values.iter().copied()), NonNullable);
638+
639+
let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), 9)?;
640+
assert!(
641+
bitpacked_array.patches().is_some(),
642+
"Expected patches to be present"
643+
);
644+
645+
// First slice: starts at 75, which drops the single patch at index 50.
646+
let first_slice = bitpacked_array.into_array().slice(75..2500)?;
647+
// Second slice: relative to the first slice, i.e. absolute 125..2075, which also drops the patches at 100 and 2100.
648+
let second_slice = first_slice.slice(50..2000)?;
649+
assert!(second_slice.is::<BitPacked>());
650+
651+
let cpu_result = second_slice.to_canonical()?;
652+
let gpu_result = block_on(async {
653+
BitPackedExecutor
654+
.execute(second_slice, &mut cuda_ctx)
655+
.await
656+
.vortex_expect("GPU decompression failed")
657+
.into_host()
658+
.await
659+
.map(|a| a.into_array())
660+
})?;
661+
662+
assert_arrays_eq!(cpu_result.into_array(), gpu_result);
663+
664+
Ok(())
665+
}
666+
667+
/// Test slicing at a chunk boundary so that every patch in the first chunk is skipped.
668+
#[crate::test]
669+
fn test_cuda_bitunpack_sliced_skip_first_chunk_patches() -> VortexResult<()> {
670+
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
671+
.vortex_expect("failed to create execution context");
672+
673+
// Create patches in all three chunks (100/200/300, 1500, 2500), then slice past every patch in the first chunk.
674+
let mut values: Vec<u16> = Vec::with_capacity(3072);
675+
for i in 0u16..3072 {
676+
if i == 100 || i == 200 || i == 300 {
677+
values.push(600);
678+
} else if i == 1500 || i == 2500 {
679+
values.push(700);
680+
} else {
681+
values.push(i % 512);
682+
}
683+
}
684+
685+
let primitive_array =
686+
PrimitiveArray::new(Buffer::from_iter(values.iter().copied()), NonNullable);
687+
688+
let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), 9)?;
689+
assert!(
690+
bitpacked_array.patches().is_some(),
691+
"Expected patches to be present"
692+
);
693+
694+
// Slice to skip past all first chunk patches
695+
let sliced_array = bitpacked_array.into_array().slice(1024..3072)?;
696+
assert!(sliced_array.is::<BitPacked>());
697+
698+
let cpu_result = sliced_array.to_canonical()?;
699+
let gpu_result = block_on(async {
700+
BitPackedExecutor
701+
.execute(sliced_array, &mut cuda_ctx)
702+
.await
703+
.vortex_expect("GPU decompression failed")
704+
.into_host()
705+
.await
706+
.map(|a| a.into_array())
707+
})?;
708+
709+
assert_arrays_eq!(cpu_result.into_array(), gpu_result);
710+
711+
Ok(())
712+
}
564713
}

vortex-cuda/src/kernel/patches/types.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ pub struct DevicePatches {
2525
pub(crate) offset: usize,
2626
pub(crate) num_patches: usize,
2727
pub(crate) n_chunks: usize,
28+
/// Number of patches sliced off from the first chunk.
29+
///
30+
/// When patches are sliced, the chunk_offsets array is only sliced at chunk
31+
/// boundaries, while indices/values are sliced at element level. This offset
32+
/// tracks how many patches from the first chunk were sliced off.
33+
pub(crate) offset_within_chunk: usize,
2834
}
2935

3036
/// Load patches for GPU use.
@@ -93,6 +99,7 @@ pub async fn load_patches(
9399

94100
let num_patches = patches.num_patches();
95101
let n_chunks = array_len.div_ceil(1024);
102+
let offset_within_chunk = patches.offset_within_chunk().unwrap_or(0);
96103

97104
Ok(DevicePatches {
98105
chunk_offsets,
@@ -102,6 +109,7 @@ pub async fn load_patches(
102109
offset,
103110
num_patches,
104111
n_chunks,
112+
offset_within_chunk,
105113
})
106114
}
107115

0 commit comments

Comments
 (0)