// examples/kernel/9_Advanced/kernel_cooperative_groups.zig — Cooperative groups reduce
//
// Reference: cuda-samples/3_CUDA_Features/binaryPartitionCG
// API exercised: __syncthreads, __ballot_sync, __shfl_down_sync, __popc,
// atomicAdd, SharedArray, gridStrideLoop
//
// Note: The full cooperative-groups API requires runtime support. This file
// emulates CG patterns using existing warp/block primitives.
const cuda = @import("zcuda_kernel");
const smem = cuda.shared_mem;

const BLOCK_SIZE = 256;
const WARPS_PER_BLOCK = BLOCK_SIZE / 32;

/// Binary partition: split each warp into two subgroups by predicate and
/// reduce within each. Emulates cooperative_groups::binary_partition().
export fn binaryPartitionReduce(
    input: [*]const f32,
    output: [*]f32,
    predicate_val: f32,
    n: u32,
) callconv(.kernel) void {
    const gid = cuda.blockIdx().x * cuda.blockDim().x + cuda.threadIdx().x;
    // Early exit assumes n is a multiple of the warp size; otherwise the
    // FULL_MASK ballot below would name exited threads, which is undefined.
    if (gid >= n) return;

    const val = input[gid];
    const pred = val > predicate_val;

    // Partition the warp: ballot yields the mask of lanes whose predicate is
    // true; its complement within the warp is the false partition.
    const all_mask = cuda.__ballot_sync(cuda.FULL_MASK, pred);
    const true_mask = all_mask;
    const false_mask = ~all_mask;

    // Each thread picks the mask of its own partition.
    const my_mask = if (pred) true_mask else false_mask;
    const partition_size = cuda.__popc(my_mask);

    // Tree-reduce with shuffles over the partition mask. Caveat: unlike a
    // true binary_partition, the shuffle source lane is not remapped, so this
    // is exact only when the predicate is warp-uniform; with mixed predicates
    // values can cross partition boundaries.
    var sum = val;
    var offset: u32 = 1;
    while (offset < 32) : (offset *= 2) {
        const received: f32 = @bitCast(cuda.__shfl_down_sync(my_mask, @bitCast(sum), offset, 32));
        sum += received;
    }

    // Normalize by partition size: the complete sum lands in the partition's
    // lowest lane; higher lanes hold only partial results.
    output[gid] = sum / @as(f32, @floatFromInt(partition_size));
}
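
/// A minimal sketch (not called by the kernels in this file): the unrolled
/// shuffle ladders used below, expressed as a reusable helper. Assumes only
/// the cuda.__shfl_down_sync intrinsic already exercised above.
fn warpReduceSum(mask: u32, value: f32) f32 {
    var sum = value;
    var offset: u32 = 16;
    // Each step folds the upper half of the active lanes onto the lower half;
    // lane 0 ends up holding the full 32-lane sum.
    while (offset > 0) : (offset /= 2) {
        sum += @bitCast(cuda.__shfl_down_sync(mask, @bitCast(sum), offset, 32));
    }
    return sum;
}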
/// Block-level cooperative reduce: all warps cooperate via shared memory.
export fn cooperativeBlockReduce(
    input: [*]const f32,
    output: *f32,
    n: u32,
) callconv(.kernel) void {
    const warp_sums = smem.SharedArray(f32, WARPS_PER_BLOCK);
    const ws = warp_sums.ptr();

    const tid = cuda.threadIdx().x;
    const lane = tid % 32;
    const warp_id = tid / 32;

    // Grid-stride accumulation: every thread sums a strided slice of input.
    var sum: f32 = 0.0;
    var iter = cuda.types.gridStrideLoop(n);
    while (iter.next()) |i| {
        sum += input[i];
    }
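
    // For reference, a sketch of the classic manual loop this iterator stands
    // in for (assuming a cuda.gridDim() accessor analogous to blockDim()):
    //
    //   var i = cuda.blockIdx().x * cuda.blockDim().x + cuda.threadIdx().x;
    //   while (i < n) : (i += cuda.blockDim().x * cuda.gridDim().x) {
    //       sum += input[i];
    //   }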
    // Warp-level reduce: after these five folds, lane 0 of each warp holds
    // that warp's full sum.
    sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 16, 32));
    sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 8, 32));
    sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 4, 32));
    sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 2, 32));
    sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 1, 32));
    if (lane == 0) ws[warp_id] = sum;
    cuda.__syncthreads();

    // Final reduction by the first warp: lanes 0..WARPS_PER_BLOCK-1 load the
    // per-warp sums; the 4/2/1 ladder covers the 8 partials.
    sum = if (tid < WARPS_PER_BLOCK) ws[tid] else 0.0;
    if (warp_id == 0) {
        sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 4, 32));
        sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 2, 32));
        sum += @bitCast(cuda.__shfl_down_sync(cuda.FULL_MASK, @bitCast(sum), 1, 32));
    }

    // One atomic add per block accumulates the grid-wide total.
    if (tid == 0) _ = cuda.atomicAdd(output, sum);
}
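
// A minimal comptime guard sketch: the 4/2/1 ladder in cooperativeBlockReduce
// covers at most 8 per-warp partial sums, so catch an incompatible BLOCK_SIZE
// at compile time rather than with silently wrong results.
comptime {
    if (BLOCK_SIZE % 32 != 0) @compileError("BLOCK_SIZE must be a multiple of the warp size");
    if (WARPS_PER_BLOCK > 8) @compileError("final-warp reduce covers at most 8 warp sums");
}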