-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01_sequential.cu
More file actions
41 lines (33 loc) · 1.42 KB
/
01_sequential.cu
File metadata and controls
41 lines (33 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
// Layer 2 — Reduction sequential addressing: active threads stay contiguous
// as the stride halves, so no intra-warp divergence on the fold branch.
// Each thread loads 2 elements on entry (grid-stride x2), halving block count.
// ptxas sm_86: regs=10 smem=1024(dynamic) spill=0, 1 barrier
// ncu anchor: smsp__thread_inst_executed_pred_on_per_inst_executed.ratio (warp efficiency rises)
#include "reduction_common.h"
namespace {
__global__ __launch_bounds__(256)
void sequential_reduction_kernel(const float* __restrict__ input,
float* __restrict__ output,
int N) {
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
float val = 0.0f;
if (i < N) val = input[i];
if (i + blockDim.x < N) val += input[i + blockDim.x];
sdata[tid] = val;
__syncthreads();
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) atomicAdd(output, sdata[0]);
}
} // namespace
void sequential_reduction_launch(const ReductionParams& p) {
constexpr int THREADS = 256;
int blocks = (p.N + THREADS * 2 - 1) / (THREADS * 2);
int smem = THREADS * sizeof(float);
sequential_reduction_kernel<<<blocks, THREADS, smem>>>(p.dInput, p.dOutput, p.N);
CUDA_CHECK_LAST();
}