# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
# ################################################################################
#
# This demo illustrates how to use CUDA graphs to capture and execute
# multiple kernel launches with minimal overhead. The graph performs a
# sequence of vector operations: add, multiply, and subtract.
#
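# Note: this demo assumes a cuda.core build that exposes the graph-builder
# API used below (Stream.create_graph_builder, Graph.upload, Graph.launch)
# and a CuPy installation matching the local CUDA toolkit.
#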
# ################################################################################

import sys
import time

import cupy as cp

from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch


def main():
    # CUDA kernels for vector operations
    code = """
    template<typename T>
    __global__ void vector_add(const T* A, const T* B, T* C, size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
            C[i] = A[i] + B[i];
        }
    }

    template<typename T>
    __global__ void vector_multiply(const T* A, const T* B, T* C, size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
            C[i] = A[i] * B[i];
        }
    }

    template<typename T>
    __global__ void vector_subtract(const T* A, const T* B, T* C, size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
            C[i] = A[i] - B[i];
        }
    }
    """

    # Initialize device and stream
    dev = Device()
    dev.set_current()
    stream = dev.create_stream()
    # Tell CuPy to use our stream as the current stream:
    cp.cuda.ExternalStream(int(stream.handle)).use()
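    # This keeps CuPy-side work (array creation, fill, allclose) ordered on
    # the same stream as the kernel launches below.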

    # Compile the program
    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
    prog = Program(code, code_type="c++", options=program_options)
    mod = prog.compile(
        "cubin", name_expressions=("vector_add<float>", "vector_multiply<float>", "vector_subtract<float>")
    )

    # Get kernel functions
    add_kernel = mod.get_kernel("vector_add<float>")
    multiply_kernel = mod.get_kernel("vector_multiply<float>")
    subtract_kernel = mod.get_kernel("vector_subtract<float>")
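    # name_expressions instantiates the templates at compile time; get_kernel
    # then looks each instantiation up by the same C++ name.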

    # Prepare data
    size = 1000000
    dtype = cp.float32

    # Create input arrays
    rng = cp.random.default_rng(42)  # Fixed seed for reproducibility
    a = rng.random(size, dtype=dtype)
    b = rng.random(size, dtype=dtype)
    c = rng.random(size, dtype=dtype)

    # Create output arrays
    result1 = cp.empty_like(a)
    result2 = cp.empty_like(a)
    result3 = cp.empty_like(a)

    # Prepare launch configuration
    block = 256
    grid = (size + block - 1) // block
    config = LaunchConfig(grid=grid, block=block)

    # Sync before graph capture
    dev.sync()
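    # Finishing all queued work here keeps the RNG initialization out of the
    # timing and ensures the inputs are ready before capture begins.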
print("Building CUDA graph...", file=sys.stderr)
# Build the graph
graph_builder = stream.create_graph_builder()
graph_builder.begin_building()
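    # Between begin_building() and end_building(), launches are recorded into
    # the graph instead of executing immediately.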

    # Add multiple kernel launches to the graph
    # Kernel 1: result1 = a + b
    launch(graph_builder, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size))
    # Kernel 2: result2 = result1 * c
    launch(graph_builder, config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size))
    # Kernel 3: result3 = result2 - a
    launch(graph_builder, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size))

    # Complete the graph
    graph = graph_builder.end_building().complete()

    # Upload the graph to the stream
    graph.upload(stream)
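    # Uploading ahead of time moves instantiation work off the timed launch below.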

    # Execute the entire graph with a single launch
    print("Executing graph...", file=sys.stderr)
    start_time = time.perf_counter()
    graph.launch(stream)
    stream.sync()
    end_time = time.perf_counter()
    graph_execution_time = end_time - start_time
    print(f"Graph execution time: {graph_execution_time:.6f} seconds")

    # Verify results
    expected_result1 = a + b
    expected_result2 = expected_result1 * c
    expected_result3 = expected_result2 - a
    assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch"
    assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch"
    assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch"

    # Demonstrate the performance benefit by running the same operations without a graph
    print("\nRunning the same operations without a graph for comparison...", file=sys.stderr)

    # Reset results
    result1.fill(0)
    result2.fill(0)
    result3.fill(0)

    start_time = time.perf_counter()
    # Individual kernel launches
    launch(stream, config, add_kernel, a.data.ptr, b.data.ptr, result1.data.ptr, cp.uint64(size))
    launch(stream, config, multiply_kernel, result1.data.ptr, c.data.ptr, result2.data.ptr, cp.uint64(size))
    launch(stream, config, subtract_kernel, result2.data.ptr, a.data.ptr, result3.data.ptr, cp.uint64(size))
    stream.sync()
    end_time = time.perf_counter()
    individual_execution_time = end_time - start_time
    print(f"Individual kernel execution time: {individual_execution_time:.6f} seconds")

    # Calculate speedup
    speedup = individual_execution_time / graph_execution_time
    print(f"Graph provides {speedup:.2f}x speedup")

    # Verify results again
    assert cp.allclose(result1, expected_result1, rtol=1e-5, atol=1e-5), "Result 1 mismatch"
    assert cp.allclose(result2, expected_result2, rtol=1e-5, atol=1e-5), "Result 2 mismatch"
    assert cp.allclose(result3, expected_result3, rtol=1e-5, atol=1e-5), "Result 3 mismatch"

    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream


if __name__ == "__main__":
    main()