Skip to content

Commit dc42095

Browse files
committed
style(cuda): rename loop variable i to elem_idx in execute_input_stage
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 19e1256 commit dc42095

1 file changed

Lines changed: 13 additions & 12 deletions

File tree

vortex-cuda/kernels/src/dynamic_dispatch.cu

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,13 @@ scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t
141141
}
142142
// Apply ALP patches: override positions whose float value couldn't
143143
// be reconstructed through the ALP encode/decode cycle.
144-
// Per-value cursor — tiles can span chunk boundaries for sliced arrays.
144+
// Per-value cursor — with a slice offset, a tile's N values can
145+
// straddle two FL chunks, so each value needs its own lookup.
145146
if (op.params.alp.patches_ptr != 0) {
146147
const auto &patches = *reinterpret_cast<const GPUPatches *>(op.params.alp.patches_ptr);
147-
// chunk_start is the first original chunk covered by the sliced
148-
// chunk_offsets array. PatchesCursor indexes from 0 into that
149-
// array, so we subtract chunk_start from the absolute chunk.
148+
// The sliced chunk_offsets array starts at original chunk
149+
// (offset / FL_CHUNK). PatchesCursor indexes from 0, so
150+
// subtract that base to get the index into chunk_offsets.
150151
const uint32_t chunk_start = patches.offset / FL_CHUNK;
151152
#pragma unroll
152153
for (uint32_t i = 0; i < N; ++i) {
@@ -448,12 +449,12 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
448449
smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE;
449450

450451
if (stage.num_scalar_ops > 0) {
451-
for (uint32_t i = threadIdx.x; i < stage.len; i += blockDim.x) {
452-
T val = smem_out[i];
452+
for (uint32_t elem_idx = threadIdx.x; elem_idx < stage.len; elem_idx += blockDim.x) {
453+
T val = smem_out[elem_idx];
453454
for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) {
454-
scalar_op<T, 1>(&val, stage.scalar_ops[op], smem, i);
455+
scalar_op<T, 1>(&val, stage.scalar_ops[op], smem, elem_idx);
455456
}
456-
smem_out[i] = val;
457+
smem_out[elem_idx] = val;
457458
}
458459
// Write barrier: scalar ops applied in-place, smem region is
459460
// now fully populated for subsequent stages to read.
@@ -471,13 +472,13 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
471472
upper_bound(ends, src.params.runend.num_runs, threadIdx.x + src.params.runend.offset);
472473
}
473474
const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr);
474-
for (uint32_t i = threadIdx.x; i < stage.len; i += blockDim.x) {
475+
for (uint32_t elem_idx = threadIdx.x; elem_idx < stage.len; elem_idx += blockDim.x) {
475476
T val;
476-
source_op<T, 1>(&val, src, raw_input, stage.source_ptype, nullptr, 0, i, smem);
477+
source_op<T, 1>(&val, src, raw_input, stage.source_ptype, nullptr, 0, elem_idx, smem);
477478
for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) {
478-
scalar_op<T, 1>(&val, stage.scalar_ops[op], smem, i);
479+
scalar_op<T, 1>(&val, stage.scalar_ops[op], smem, elem_idx);
479480
}
480-
smem_out[i] = val;
481+
smem_out[elem_idx] = val;
481482
}
482483
// Write barrier: smem region is fully populated for subsequent
483484
// stages to read.

0 commit comments

Comments
 (0)