Skip to content

Commit af8e9cc

Browse files
committed
chore: add patches_ptr to BitunpackParams and AlpParams
Structural plumbing for per-op exception patches in the fused dynamic dispatch kernel. Adds PackedPatchesHeader and kernel helpers (patch_fl_chunk, patch_all_fl_chunks) but does not yet populate patches_ptr - all constructors initialize it to 0. Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 4135209 commit af8e9cc

5 files changed

Lines changed: 105 additions & 7 deletions

File tree

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,8 @@ __device__ inline void bitunpack(const T *__restrict packed,
162162
uint64_t chunk_start,
163163
uint32_t chunk_len,
164164
const struct SourceOp &src) {
165-
constexpr uint32_t T_BITS = sizeof(T) * 8;
166-
constexpr uint32_t FL_CHUNK = 1024;
167-
constexpr uint32_t LANES = FL_CHUNK / T_BITS;
168165
const uint32_t bw = src.params.bitunpack.bit_width;
169-
const uint32_t words_per_block = LANES * bw;
166+
const uint32_t words_per_block = FL_LANES<T> * bw;
170167
const uint32_t elem_off = src.params.bitunpack.element_offset;
171168
const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK;
172169
const uint64_t first_block = (chunk_start + elem_off) / FL_CHUNK;
@@ -177,12 +174,77 @@ __device__ inline void bitunpack(const T *__restrict packed,
177174
for (uint32_t c = 0; c < n_chunks; ++c) {
178175
const T *src_chunk = packed + (first_block + c) * words_per_block;
179176
T *chunk_dst = dst + c * FL_CHUNK;
180-
for (uint32_t lane = threadIdx.x; lane < LANES; lane += blockDim.x) {
177+
for (uint32_t lane = threadIdx.x; lane < FL_LANES<T>; lane += blockDim.x) {
181178
bit_unpack_lane<T>(src_chunk, chunk_dst, 0, lane, bw);
182179
}
183180
}
184181
}
185182

183+
// ═══════════════════════════════════════════════════════════════════════════
184+
// Patches
185+
// ═══════════════════════════════════════════════════════════════════════════
186+
187+
/// Parsed view into a packed patches buffer (the fused-dispatch counterpart
188+
/// of GPUPatches, which is used by the standalone per-bitwidth kernels).
189+
/// Each op with patches gets its own contiguous device allocation holding
190+
/// chunk_offsets, indices, and values, referenced by a single uint64_t
191+
/// pointer (patches_ptr in BitunpackParams); see PackedPatchesHeader in
192+
/// patches.h for the layout.
193+
template <typename T>
194+
struct PackedPatchesView {
195+
const uint32_t *chunk_offsets; // n_chunks+1 entries (sentinel)
196+
uint32_t n_chunks;
197+
const uint16_t *indices; // within-chunk positions (0–1023)
198+
const T *values;
199+
};
200+
201+
/// Parse a packed patches buffer into its component arrays.
202+
template <typename T>
203+
__device__ inline PackedPatchesView<T> parse_patches(uint64_t patches_ptr) {
204+
const uint8_t *base = reinterpret_cast<const uint8_t *>(patches_ptr);
205+
const auto *header = reinterpret_cast<const PackedPatchesHeader *>(base);
206+
return {
207+
reinterpret_cast<const uint32_t *>(base + sizeof(PackedPatchesHeader)),
208+
header->n_chunks,
209+
reinterpret_cast<const uint16_t *>(base + header->indices_byte_offset),
210+
reinterpret_cast<const T *>(base + header->values_byte_offset),
211+
};
212+
}
213+
214+
/// Overwrite exception positions in `out` for a single chunk.
215+
/// All threads in the block cooperate. Caller must issue __syncthreads()
216+
/// afterward if other threads read from `out`.
217+
template <typename T>
218+
__device__ __noinline__ void apply_patches(uint64_t patches_ptr, T *__restrict out, uint32_t chunk) {
219+
const auto patches = parse_patches<T>(patches_ptr);
220+
assert(chunk + 1 <= patches.n_chunks);
221+
uint32_t start = patches.chunk_offsets[chunk];
222+
uint32_t end = patches.chunk_offsets[chunk + 1];
223+
for (uint32_t i = start + threadIdx.x; i < end; i += blockDim.x) {
224+
out[patches.indices[i]] = patches.values[i];
225+
}
226+
}
227+
228+
/// Overwrite exception positions in `out` for a range of chunks.
229+
/// All threads in the block cooperate. Caller must issue __syncthreads()
230+
/// afterward if other threads read from `out`.
231+
template <typename T>
232+
__device__ __noinline__ void
233+
apply_patches_range(uint64_t patches_ptr, T *__restrict out, uint32_t stage_len, uint32_t element_offset) {
234+
const auto patches = parse_patches<T>(patches_ptr);
235+
const uint32_t first_chunk = element_offset / FL_CHUNK;
236+
const uint32_t n_chunks = (stage_len + (element_offset % FL_CHUNK) + FL_CHUNK - 1) / FL_CHUNK;
237+
assert(first_chunk + n_chunks <= patches.n_chunks);
238+
for (uint32_t c = 0; c < n_chunks; ++c) {
239+
T *chunk_base = out + c * FL_CHUNK;
240+
uint32_t start = patches.chunk_offsets[first_chunk + c];
241+
uint32_t end = patches.chunk_offsets[first_chunk + c + 1];
242+
for (uint32_t i = start + threadIdx.x; i < end; i += blockDim.x) {
243+
chunk_base[patches.indices[i]] = patches.values[i];
244+
}
245+
}
246+
}
247+
186248
/// Read N values from a source op into `out`.
187249
///
188250
/// Dispatches on `src.op_code` to handle each encoding:

vortex-cuda/kernels/src/dynamic_dispatch.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#pragma once
3131

3232
#include <stdint.h>
33+
#include "patches.h"
3334

3435
/// Compact tag identifying a Vortex PType for GPU dispatch.
3536
///
@@ -108,6 +109,7 @@ union SourceParams {
108109
struct BitunpackParams {
109110
uint8_t bit_width;
110111
uint32_t element_offset; // Sub-byte offset
112+
uint64_t patches_ptr; // device pointer to packed patches buffer (0 = none)
111113
} bitunpack;
112114

113115
/// Copy from global to shared memory.
@@ -157,6 +159,7 @@ union ScalarParams {
157159
struct AlpParams {
158160
float f;
159161
float e;
162+
uint64_t patches_ptr; // device pointer to packed patches buffer (0 = none)
160163
} alp;
161164

162165
/// Dictionary gather: use current value as index into decoded values in smem.

vortex-cuda/kernels/src/fastlanes_common.cuh

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,25 @@
88
// FastLanes ordering array
99
__constant__ int FL_ORDER[] = {0, 4, 2, 6, 1, 5, 3, 7};
1010

11+
// FastLanes organises every 1024-element vector into a transposed layout
12+
// of FL_LANES columns × (1024 / FL_LANES) rows. Each column is a "lane"
13+
// that can be processed independently of every other lane, which is what
14+
// makes all FastLanes encodings (FFOR, DELTA, RLE, ALP, …) fully
15+
// data-parallel. One CUDA thread or one CPU SIMD lane handles one
16+
// FastLanes lane.
17+
//
18+
// Paper: https://ir.cwi.nl/pub/35881/35881.pdf
19+
// Repo: https://github.com/cwida/FastLanes
20+
21+
/// FastLanes chunk size in elements.
22+
constexpr uint32_t FL_CHUNK = 1024;
23+
24+
/// Number of FastLanes lanes for element type T (1024 / bit-width).
25+
template <typename T>
26+
constexpr uint32_t FL_LANES = FL_CHUNK / (sizeof(T) * 8);
27+
1128
// Compute the index in the FastLanes layout
1229
#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)
1330

1431
// Create a mask with 'width' bits set
15-
#define MASK(T, width) (((T)1 << width) - 1)
32+
#define MASK(T, width) (((T)1 << width) - 1)

vortex-cuda/kernels/src/patches.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,17 @@
99
extern "C" {
1010
#endif
1111

12+
/// Header at the start of a packed patches buffer.
13+
///
14+
/// Layout: [PackedPatchesHeader | chunk_offsets (u32, n_chunks+1 sentinel) | indices (u16) | padding | values (V)]
15+
///
16+
/// A `patches_ptr` of 0 signals no patches.
17+
struct PackedPatchesHeader {
18+
uint32_t n_chunks; // number of FL chunks covered
19+
uint32_t indices_byte_offset; // absolute byte offset from buffer start to indices
20+
uint32_t values_byte_offset; // absolute byte offset from buffer start to values
21+
};
22+
1223
/// Type tag for chunk_offsets pointer.
1324
typedef enum { CO_U8 = 0, CO_U16 = 1, CO_U32 = 2, CO_U64 = 3 } ChunkOffsetType;
1425

vortex-cuda/src/dynamic_dispatch/mod.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ impl SourceOp {
313313
bitunpack: SourceParams_BitunpackParams {
314314
bit_width,
315315
element_offset: u32::from(element_offset),
316+
patches_ptr: 0,
316317
},
317318
},
318319
}
@@ -393,7 +394,11 @@ impl ScalarOp {
393394
op_code: ScalarOp_ScalarOpCode_ALP,
394395
output_ptype: PTypeTag_PTYPE_F32,
395396
params: ScalarParams {
396-
alp: ScalarParams_AlpParams { f, e },
397+
alp: ScalarParams_AlpParams {
398+
f,
399+
e,
400+
patches_ptr: 0,
401+
},
397402
},
398403
}
399404
}

0 commit comments

Comments
 (0)