@@ -141,12 +141,13 @@ scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t
141141 }
142142 // Apply ALP patches: override positions whose float value couldn't
143143 // be reconstructed through the ALP encode/decode cycle.
144- // Per-value cursor — tiles can span chunk boundaries for sliced arrays.
144+ // Per-value cursor — with a slice offset, a tile's N values can
145+ // straddle two FL chunks, so each value needs its own lookup.
145146 if (op.params .alp .patches_ptr != 0 ) {
146147 const auto &patches = *reinterpret_cast <const GPUPatches *>(op.params .alp .patches_ptr );
147- // chunk_start is the first original chunk covered by the sliced
148- // chunk_offsets array . PatchesCursor indexes from 0 into that
149- // array, so we subtract chunk_start from the absolute chunk .
148+ // The sliced chunk_offsets array starts at original chunk
149+ // (offset / FL_CHUNK) . PatchesCursor indexes from 0, so
150+ // subtract that base to get the index into chunk_offsets .
150151 const uint32_t chunk_start = patches.offset / FL_CHUNK;
151152#pragma unroll
152153 for (uint32_t i = 0 ; i < N; ++i) {
@@ -448,12 +449,12 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
448449 smem_out += src.params .bitunpack .element_offset % SMEM_TILE_SIZE;
449450
450451 if (stage.num_scalar_ops > 0 ) {
451- for (uint32_t i = threadIdx .x ; i < stage.len ; i += blockDim .x ) {
452- T val = smem_out[i ];
452+ for (uint32_t elem_idx = threadIdx .x ; elem_idx < stage.len ; elem_idx += blockDim .x ) {
453+ T val = smem_out[elem_idx ];
453454 for (uint8_t op = 0 ; op < stage.num_scalar_ops ; ++op) {
454- scalar_op<T, 1 >(&val, stage.scalar_ops [op], smem, i );
455+ scalar_op<T, 1 >(&val, stage.scalar_ops [op], smem, elem_idx );
455456 }
456- smem_out[i ] = val;
457+ smem_out[elem_idx ] = val;
457458 }
458459 // Write barrier: scalar ops applied in-place, smem region is
459460 // now fully populated for subsequent stages to read.
@@ -471,13 +472,13 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) {
471472 upper_bound (ends, src.params .runend .num_runs , threadIdx .x + src.params .runend .offset );
472473 }
473474 const void *raw_input = reinterpret_cast <const void *>(stage.input_ptr );
474- for (uint32_t i = threadIdx .x ; i < stage.len ; i += blockDim .x ) {
475+ for (uint32_t elem_idx = threadIdx .x ; elem_idx < stage.len ; elem_idx += blockDim .x ) {
475476 T val;
476- source_op<T, 1 >(&val, src, raw_input, stage.source_ptype , nullptr , 0 , i , smem);
477+ source_op<T, 1 >(&val, src, raw_input, stage.source_ptype , nullptr , 0 , elem_idx , smem);
477478 for (uint8_t op = 0 ; op < stage.num_scalar_ops ; ++op) {
478- scalar_op<T, 1 >(&val, stage.scalar_ops [op], smem, i );
479+ scalar_op<T, 1 >(&val, stage.scalar_ops [op], smem, elem_idx );
479480 }
480- smem_out[i ] = val;
481+ smem_out[elem_idx ] = val;
481482 }
482483 // Write barrier: smem region is fully populated for subsequent
483484 // stages to read.
0 commit comments