-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel_histogram_256bin.zig
More file actions
37 lines (32 loc) · 1.15 KB
/
kernel_histogram_256bin.zig
File metadata and controls
37 lines (32 loc) · 1.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
// examples/kernel/3_Atomics/kernel_histogram_256bin.zig — 256-bin histogram using shared- and global-memory atomics
//
// Reference: cuda-samples/2_Concepts_and_Techniques/histogram
// API exercised: SharedArray, atomicAdd (shared+global), __syncthreads, clearShared
const cuda = @import("zcuda_kernel");
const smem = cuda.shared_mem;
/// Privatized 256-bin histogram of a byte buffer.
///
/// Each block accumulates counts into its own shared-memory bins and then
/// merges them into `global_bins` with at most one global atomic per bin,
/// which keeps the per-element atomic traffic out of global memory.
///
/// Parameters:
///   data        - input bytes; each byte value (0..255) selects its bin.
///   global_bins - 256 counters that this kernel adds into. They are never
///                 cleared here, so the caller must zero them beforehand
///                 when a fresh histogram is wanted.
///   n           - element count handed to `gridStrideLoop`; `data` is
///                 presumably read only at indices below `n`.
///
/// Launch: assumes a 1D block (only the `.x` thread index is used). The merge
/// step strides over the bins by `blockDim().x`, so blocks with fewer than
/// 256 threads still flush every bin.
export fn histogram256Privatized(
    data: [*]const u8,
    global_bins: [*]u32,
    n: u32,
) callconv(.kernel) void {
    // One bin per possible byte value.
    const num_bins = 256;

    const local_bins = smem.SharedArray(u32, num_bins);
    const lb = local_bins.ptr();
    const tid = cuda.threadIdx().x;

    // Zero the shared bins; the barrier keeps any thread from counting
    // before every bin has been cleared.
    smem.clearShared(u32, lb, num_bins);
    cuda.__syncthreads();

    // Accumulate into the block-private shared bins.
    var iter = cuda.types.gridStrideLoop(n);
    while (iter.next()) |i| {
        const bin = @as(u32, data[i]);
        _ = cuda.atomicAdd(&lb[bin], @as(u32, 1));
    }

    // Every shared-memory increment of this block must be complete before
    // any bin is read back for the merge.
    cuda.__syncthreads();

    // Merge shared → global. Striding by the block size (instead of relying
    // on `tid < 256`) covers all bins even when the block has fewer than
    // 256 threads; with 256 or more threads each bin is still handled by
    // exactly one thread.
    const stride = cuda.blockDim().x;
    var slot = tid;
    while (slot < num_bins) : (slot += stride) {
        const count = lb[slot];
        // Adding zero is a no-op, so skip the global atomic for empty bins.
        if (count != 0) {
            _ = cuda.atomicAdd(&global_bins[slot], count);
        }
    }
}