Skip to content

Commit 296b541

Browse files
committed
style(cuda): replace bare 1024 with FL_CHUNK in patches.cuh, dynamic_dispatch.cu, and bit_unpack_gen.rs
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 163154c commit 296b541

7 files changed

Lines changed: 22 additions & 21 deletions

File tree

vortex-cuda/kernels/src/bit_unpack_16.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
template <int BW>
66
__device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
7-
__shared__ uint16_t shared_out[1024];
7+
__shared__ uint16_t shared_out[FL_CHUNK];
88

99
// Step 1: Unpack into shared memory
1010
#pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
1616
// Step 2: Apply patches to shared memory in parallel
1717
PatchesCursor<uint16_t> cursor(patches, blockIdx.x, thread_idx, 32);
1818
auto patch = cursor.next();
19-
while (patch.index != 1024) {
19+
while (patch.index != FL_CHUNK) {
2020
shared_out[patch.index] = patch.value;
2121
patch = cursor.next();
2222
}

vortex-cuda/kernels/src/bit_unpack_32.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
template <int BW>
66
__device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx, GPUPatches& patches) {
7-
__shared__ uint32_t shared_out[1024];
7+
__shared__ uint32_t shared_out[FL_CHUNK];
88

99
// Step 1: Unpack into shared memory
1010
#pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *_
1616
// Step 2: Apply patches to shared memory in parallel
1717
PatchesCursor<uint32_t> cursor(patches, blockIdx.x, thread_idx, 32);
1818
auto patch = cursor.next();
19-
while (patch.index != 1024) {
19+
while (patch.index != FL_CHUNK) {
2020
shared_out[patch.index] = patch.value;
2121
patch = cursor.next();
2222
}

vortex-cuda/kernels/src/bit_unpack_64.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
template <int BW>
66
__device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx, GPUPatches& patches) {
7-
__shared__ uint64_t shared_out[1024];
7+
__shared__ uint64_t shared_out[FL_CHUNK];
88

99
// Step 1: Unpack into shared memory
1010
#pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *_
1616
// Step 2: Apply patches to shared memory in parallel
1717
PatchesCursor<uint64_t> cursor(patches, blockIdx.x, thread_idx, 16);
1818
auto patch = cursor.next();
19-
while (patch.index != 1024) {
19+
while (patch.index != FL_CHUNK) {
2020
shared_out[patch.index] = patch.value;
2121
patch = cursor.next();
2222
}

vortex-cuda/kernels/src/bit_unpack_8.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
template <int BW>
66
__device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx, GPUPatches& patches) {
7-
__shared__ uint8_t shared_out[1024];
7+
__shared__ uint8_t shared_out[FL_CHUNK];
88

99
// Step 1: Unpack into shared memory
1010
#pragma unroll
@@ -16,7 +16,7 @@ __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__re
1616
// Step 2: Apply patches to shared memory in parallel
1717
PatchesCursor<uint8_t> cursor(patches, blockIdx.x, thread_idx, 32);
1818
auto patch = cursor.next();
19-
while (patch.index != 1024) {
19+
while (patch.index != FL_CHUNK) {
2020
shared_out[patch.index] = patch.value;
2121
patch = cursor.next();
2222
}

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -147,16 +147,16 @@ scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t
147147
// chunk_start is the first original chunk covered by the sliced
148148
// chunk_offsets array. PatchesCursor indexes from 0 into that
149149
// array, so we subtract chunk_start from the absolute chunk.
150-
const uint32_t chunk_start = patches.offset / 1024;
150+
const uint32_t chunk_start = patches.offset / FL_CHUNK;
151151
#pragma unroll
152152
for (uint32_t i = 0; i < N; ++i) {
153153
uint64_t my_pos = (N > 1) ? abs_pos + i * blockDim.x + threadIdx.x : abs_pos;
154154
uint64_t orig = my_pos + patches.offset;
155-
uint32_t chunk = static_cast<uint32_t>(orig / 1024) - chunk_start;
156-
uint32_t within = static_cast<uint32_t>(orig % 1024);
155+
uint32_t chunk = static_cast<uint32_t>(orig / FL_CHUNK) - chunk_start;
156+
uint32_t within = static_cast<uint32_t>(orig % FL_CHUNK);
157157
PatchesCursor<T> cursor(patches, chunk, 0, 1);
158158
auto patch = cursor.next();
159-
while (patch.index != 1024) {
159+
while (patch.index != FL_CHUNK) {
160160
if (patch.index == within) {
161161
values[i] = patch.value;
162162
break;
@@ -192,7 +192,7 @@ __device__ __forceinline__ void
192192
scatter_patches_chunk(const GPUPatches &patches, T *__restrict out, uint32_t chunk) {
193193
PatchesCursor<T> cursor(patches, chunk, threadIdx.x, blockDim.x);
194194
auto patch = cursor.next();
195-
while (patch.index != 1024) {
195+
while (patch.index != FL_CHUNK) {
196196
out[patch.index] = patch.value;
197197
patch = cursor.next();
198198
}

vortex-cuda/kernels/src/patches.cuh

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#pragma once
55

6+
#include "fastlanes_common.cuh"
67
#include "patches.h"
78

89
/// Load a chunk offset value, dispatching on the runtime type.
@@ -21,8 +22,8 @@ __device__ inline uint32_t load_chunk_offset(const GPUPatches &patches, uint32_t
2122
}
2223

2324
/// A single patch: a within-chunk index and its replacement value.
24-
/// A sentinel patch has index == 1024, which can never match a valid
25-
/// within-chunk position (0–1023).
25+
/// A sentinel patch has index == FL_CHUNK, which can never match a valid
26+
/// within-chunk position (0–FL_CHUNK-1).
2627
template <typename T>
2728
struct Patch {
2829
uint16_t index;
@@ -38,7 +39,7 @@ struct Patch {
3839
///
3940
/// PatchesCursor<uint32_t> cursor(patches, blockIdx.x, thread_idx, 32);
4041
/// auto patch = cursor.next();
41-
/// while (patch.index != 1024) {
42+
/// while (patch.index != FL_CHUNK) {
4243
/// shared_out[patch.index] = patch.value;
4344
/// patch = cursor.next();
4445
/// }
@@ -89,15 +90,15 @@ public:
8990
// The iterator returns indices relative to the start of the chunk.
9091
// `chunk_base` is the index of the first element within a chunk, accounting
9192
// for the slice offset.
92-
chunk_base = chunk * 1024 + patches.offset;
93-
chunk_base -= min(chunk_base, patches.offset % 1024);
93+
chunk_base = chunk * FL_CHUNK + patches.offset;
94+
chunk_base -= min(chunk_base, patches.offset % FL_CHUNK);
9495
}
9596

9697
/// Return the current patch (with within-chunk index) and advance,
9798
/// or a sentinel {1024, 0} if exhausted.
9899
__device__ Patch<T> next() {
99100
if (remaining == 0) {
100-
return {1024, T {}};
101+
return {FL_CHUNK, T {}};
101102
}
102103
uint16_t within_chunk = static_cast<uint16_t>(*indices - chunk_base);
103104
Patch<T> patch = {within_chunk, *values};

vortex-cuda/src/bit_unpack_gen.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ fn generate_device_kernel_template(
143143
output,
144144
r#"template <int BW>
145145
__device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx, GPUPatches& patches) {{
146-
__shared__ uint{bits}_t shared_out[1024];
146+
__shared__ uint{bits}_t shared_out[FL_CHUNK];
147147
148148
// Step 1: Unpack into shared memory
149149
#pragma unroll
@@ -155,7 +155,7 @@ __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uin
155155
// Step 2: Apply patches to shared memory in parallel
156156
PatchesCursor<uint{bits}_t> cursor(patches, blockIdx.x, thread_idx, {thread_count});
157157
auto patch = cursor.next();
158-
while (patch.index != 1024) {{
158+
while (patch.index != FL_CHUNK) {{
159159
shared_out[patch.index] = patch.value;
160160
patch = cursor.next();
161161
}}

0 commit comments

Comments
 (0)