-
Notifications
You must be signed in to change notification settings - Fork 270
Expand file tree
/
Copy pathclock_nvrtc.py
More file actions
128 lines (93 loc) · 3.39 KB
/
clock_nvrtc.py
File metadata and controls
128 lines (93 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Copyright 2021-2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
# ################################################################################
#
# This example demonstrates using the device clock for kernel timing via
# NVRTC-compiled CUDA code.
#
# ################################################################################
# /// script
# dependencies = ["cuda_bindings>13.2.1", "numpy"]
# ///
import platform
import numpy as np
from cuda.bindings import driver as cuda
from cuda.bindings._example_helpers import KernelHelper, check_cuda_errors, find_cuda_device, requirement_not_met
# CUDA C source for the timedReduction kernel, compiled at runtime by NVRTC.
# Each block records clock() into timer[] at entry and exit, copies its slice
# of the input into dynamic shared memory, runs a tree reduction that keeps
# the minimum value, and writes the result to output[blockIdx.x].
# NOTE: this is a string literal, not Python code — it must not be reformatted.
clock_nvrtc = """\
extern "C" __global__ void timedReduction(const float *hinput, float *output, clock_t *timer)
{
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock();
// Copy hinput.
shared[tid] = hinput[tid];
shared[tid + blockDim.x] = hinput[tid + blockDim.x];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2)
{
__syncthreads();
if (tid < d)
{
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0)
{
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0) output[bid] = shared[0];
__syncthreads();
if (tid == 0) timer[bid+gridDim.x] = clock();
}
"""
# Launch configuration used by main(): 64 blocks of 256 threads each.
num_blocks = 64
num_threads = 256
def elems_to_bytes(nelems, dt):
    """Return the size in bytes of *nelems* elements of NumPy dtype *dt*."""
    itemsize = np.dtype(dt).itemsize
    return itemsize * nelems
def check_requirements():
    """Bail out via requirement_not_met() on platforms this sample cannot run on."""
    unsupported_machines = {"armv7l"}
    if platform.machine() in unsupported_machines:
        requirement_not_met("clock_nvrtc is not supported on ARMv7")
def main():
    """Launch the NVRTC-compiled timedReduction kernel and print the average
    number of device clock ticks each block spent in the reduction."""
    check_requirements()
    # timer holds one start and one stop clock() value per block (2 * num_blocks).
    timer = np.empty(num_blocks * 2, dtype="int64")
    hinput = np.empty(num_threads * 2, dtype="float32")
    # Fill the input with 0 .. 2*num_threads-1; the kernel reduces it to a minimum.
    for i in range(num_threads * 2):
        hinput[i] = i
    dev_id = find_cuda_device()
    # Compile the CUDA source with NVRTC and look up the kernel entry point.
    kernel_helper = KernelHelper(clock_nvrtc, dev_id)
    kernel_addr = kernel_helper.get_function(b"timedReduction")
    # Device allocations: input, one float output per block, and the clock buffer.
    dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes))
    doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32)))
    dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes))
    check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
    # (values, types) pair passed as cuLaunchKernel's kernelParams; presumably
    # None lets cuda.bindings infer each argument's type — verify against the
    # cuda.bindings cuLaunchKernel documentation.
    args = ((dinput, doutput, dtimer), (None, None, None))
    # Dynamic shared memory: the kernel declares `extern __shared__ float shared[]`
    # and indexes up to 2 * blockDim.x floats.
    shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32)
    grid_dims = (num_blocks, 1, 1)
    block_dims = (num_threads, 1, 1)
    check_cuda_errors(
        cuda.cuLaunchKernel(
            kernel_addr,
            *grid_dims,  # grid dimensions (x, y, z)
            *block_dims,  # block dimensions (x, y, z)
            shared_memory_nbytes,  # dynamic shared memory bytes per block
            0,  # stream (default stream)
            args,  # kernel arguments
            0,  # extra launch options (unused)
        )
    )
    check_cuda_errors(cuda.cuCtxSynchronize())
    check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
    check_cuda_errors(cuda.cuMemFree(dinput))
    check_cuda_errors(cuda.cuMemFree(doutput))
    check_cuda_errors(cuda.cuMemFree(dtimer))
    # timer[i] is block i's start clock; timer[i + num_blocks] is its stop clock.
    avg_elapsed_clocks = 0.0
    for i in range(num_blocks):
        avg_elapsed_clocks += timer[i + num_blocks] - timer[i]
    avg_elapsed_clocks = avg_elapsed_clocks / num_blocks
    print(f"Average clocks/block = {avg_elapsed_clocks}")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()