Skip to content

Commit 7848bc3

Browse files
authored
fix(examples): standardize explicit resource cleanup (NVIDIA#1711)
* fix(examples): standardize explicit resource cleanup paths

  Ensure cuda.core and cuda.bindings examples deterministically release streams, buffers, and loaded modules so examples remain safe when copied into long-running programs.

  Made-with: Cursor

* chore(examples): apply pre-commit fixes

  Accept pre-commit auto-fixes from the nox run to keep the cleanup PR fully hook-clean.

  Made-with: Cursor

* refactor(examples): use KernelHelper context management

  Replace manual KernelHelper close calls with context-managed usage so modules are unloaded consistently, even on early exits, and cleanup stays centralized.

  Made-with: Cursor

* fix(examples): restore NVRTC output retrieval flow

  Move CUBIN/PTX extraction back into the successful compile path in KernelHelper so example tests do not crash with UnboundLocalError when compilation succeeds.

  Made-with: Cursor
1 parent 863f898 commit 7848bc3

21 files changed

Lines changed: 975 additions & 898 deletions

cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -71,37 +71,37 @@ def main():
7171
hinput[i] = i
7272

7373
devID = findCudaDevice()
74-
kernelHelper = common.KernelHelper(clock_nvrtc, devID)
75-
kernel_addr = kernelHelper.getFunction(b"timedReduction")
76-
77-
dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
78-
doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
79-
dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
80-
checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
81-
82-
args = ((dinput, doutput, dtimer), (None, None, None))
83-
shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
84-
85-
grid_dims = (NUM_BLOCKS, 1, 1)
86-
block_dims = (NUM_THREADS, 1, 1)
87-
88-
checkCudaErrors(
89-
cuda.cuLaunchKernel(
90-
kernel_addr,
91-
*grid_dims, # grid dim
92-
*block_dims, # block dim
93-
shared_memory_nbytes,
94-
0, # shared mem, stream
95-
args,
96-
0,
97-
)
98-
) # arguments
99-
100-
checkCudaErrors(cuda.cuCtxSynchronize())
101-
checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
102-
checkCudaErrors(cuda.cuMemFree(dinput))
103-
checkCudaErrors(cuda.cuMemFree(doutput))
104-
checkCudaErrors(cuda.cuMemFree(dtimer))
74+
with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
75+
kernel_addr = kernelHelper.getFunction(b"timedReduction")
76+
77+
dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
78+
doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
79+
dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
80+
checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
81+
82+
args = ((dinput, doutput, dtimer), (None, None, None))
83+
shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
84+
85+
grid_dims = (NUM_BLOCKS, 1, 1)
86+
block_dims = (NUM_THREADS, 1, 1)
87+
88+
checkCudaErrors(
89+
cuda.cuLaunchKernel(
90+
kernel_addr,
91+
*grid_dims, # grid dim
92+
*block_dims, # block dim
93+
shared_memory_nbytes,
94+
0, # shared mem, stream
95+
args,
96+
0,
97+
)
98+
) # arguments
99+
100+
checkCudaErrors(cuda.cuCtxSynchronize())
101+
checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
102+
checkCudaErrors(cuda.cuMemFree(dinput))
103+
checkCudaErrors(cuda.cuMemFree(doutput))
104+
checkCudaErrors(cuda.cuMemFree(dtimer))
105105

106106
avgElapsedClocks = 0.0
107107

cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py

Lines changed: 49 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -156,55 +156,55 @@ def main():
156156
f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
157157
)
158158

159-
kernelHelper = common.KernelHelper(simpleCubemapTexture, devID)
160-
_transformKernel = kernelHelper.getFunction(b"transformKernel")
161-
kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
162-
checkCudaErrors(
163-
cuda.cuLaunchKernel(
164-
_transformKernel,
165-
dimGrid.x,
166-
dimGrid.y,
167-
dimGrid.z, # grid dim
168-
dimBlock.x,
169-
dimBlock.y,
170-
dimBlock.z, # block dim
171-
0,
172-
0, # shared mem and stream
173-
kernelArgs,
174-
0,
175-
)
176-
) # arguments
177-
178-
checkCudaErrors(cudart.cudaDeviceSynchronize())
179-
180-
start = time.time()
181-
182-
# Execute the kernel
183-
checkCudaErrors(
184-
cuda.cuLaunchKernel(
185-
_transformKernel,
186-
dimGrid.x,
187-
dimGrid.y,
188-
dimGrid.z, # grid dim
189-
dimBlock.x,
190-
dimBlock.y,
191-
dimBlock.z, # block dim
192-
0,
193-
0, # shared mem and stream
194-
kernelArgs,
195-
0,
196-
)
197-
) # arguments
198-
199-
checkCudaErrors(cudart.cudaDeviceSynchronize())
200-
stop = time.time()
201-
print(f"Processing time: {stop - start:.3f} msec")
202-
print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
203-
204-
# Allocate mem for the result on host side
205-
h_odata = np.empty_like(h_data)
206-
# Copy result from device to host
207-
checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
159+
with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper:
160+
_transformKernel = kernelHelper.getFunction(b"transformKernel")
161+
kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
162+
checkCudaErrors(
163+
cuda.cuLaunchKernel(
164+
_transformKernel,
165+
dimGrid.x,
166+
dimGrid.y,
167+
dimGrid.z, # grid dim
168+
dimBlock.x,
169+
dimBlock.y,
170+
dimBlock.z, # block dim
171+
0,
172+
0, # shared mem and stream
173+
kernelArgs,
174+
0,
175+
)
176+
) # arguments
177+
178+
checkCudaErrors(cudart.cudaDeviceSynchronize())
179+
180+
start = time.time()
181+
182+
# Execute the kernel
183+
checkCudaErrors(
184+
cuda.cuLaunchKernel(
185+
_transformKernel,
186+
dimGrid.x,
187+
dimGrid.y,
188+
dimGrid.z, # grid dim
189+
dimBlock.x,
190+
dimBlock.y,
191+
dimBlock.z, # block dim
192+
0,
193+
0, # shared mem and stream
194+
kernelArgs,
195+
0,
196+
)
197+
) # arguments
198+
199+
checkCudaErrors(cudart.cudaDeviceSynchronize())
200+
stop = time.time()
201+
print(f"Processing time: {stop - start:.3f} msec")
202+
print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
203+
204+
# Allocate mem for the result on host side
205+
h_odata = np.empty_like(h_data)
206+
# Copy result from device to host
207+
checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
208208

209209
checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
210210
checkCudaErrors(cudart.cudaFree(d_data))

cuda_bindings/examples/0_Introduction/simpleP2P_test.py

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -147,53 +147,49 @@ def main():
147147
print(f"Run kernel on GPU{gpuid[1]}, taking source data from GPU{gpuid[0]} and writing to GPU{gpuid[1]}...")
148148
checkCudaErrors(cudart.cudaSetDevice(gpuid[1]))
149149

150-
kernelHelper = [None] * 2
151-
_simpleKernel = [None] * 2
152-
kernelArgs = [None] * 2
153-
154-
kernelHelper[1] = common.KernelHelper(simplep2p, gpuid[1])
155-
_simpleKernel[1] = kernelHelper[1].getFunction(b"SimpleKernel")
156-
kernelArgs[1] = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
157-
checkCudaErrors(
158-
cuda.cuLaunchKernel(
159-
_simpleKernel[1],
160-
blocks.x,
161-
blocks.y,
162-
blocks.z,
163-
threads.x,
164-
threads.y,
165-
threads.z,
166-
0,
167-
0,
168-
kernelArgs[1],
169-
0,
150+
with common.KernelHelper(simplep2p, gpuid[1]) as kernelHelper:
151+
simple_kernel_1 = kernelHelper.getFunction(b"SimpleKernel")
152+
kernel_args_1 = ((g0, g1), (ctypes.c_void_p, ctypes.c_void_p))
153+
checkCudaErrors(
154+
cuda.cuLaunchKernel(
155+
simple_kernel_1,
156+
blocks.x,
157+
blocks.y,
158+
blocks.z,
159+
threads.x,
160+
threads.y,
161+
threads.z,
162+
0,
163+
0,
164+
kernel_args_1,
165+
0,
166+
)
170167
)
171-
)
172168

173169
checkCudaErrors(cudart.cudaDeviceSynchronize())
174170

175171
# Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
176172
# output to the GPU 0 buffer
177173
print(f"Run kernel on GPU{gpuid[0]}, taking source data from GPU{gpuid[1]} and writing to GPU{gpuid[0]}...")
178174
checkCudaErrors(cudart.cudaSetDevice(gpuid[0]))
179-
kernelHelper[0] = common.KernelHelper(simplep2p, gpuid[0])
180-
_simpleKernel[0] = kernelHelper[0].getFunction(b"SimpleKernel")
181-
kernelArgs[0] = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
182-
checkCudaErrors(
183-
cuda.cuLaunchKernel(
184-
_simpleKernel[0],
185-
blocks.x,
186-
blocks.y,
187-
blocks.z,
188-
threads.x,
189-
threads.y,
190-
threads.z,
191-
0,
192-
0,
193-
kernelArgs[0],
194-
0,
175+
with common.KernelHelper(simplep2p, gpuid[0]) as kernelHelper:
176+
simple_kernel_0 = kernelHelper.getFunction(b"SimpleKernel")
177+
kernel_args_0 = ((g1, g0), (ctypes.c_void_p, ctypes.c_void_p))
178+
checkCudaErrors(
179+
cuda.cuLaunchKernel(
180+
simple_kernel_0,
181+
blocks.x,
182+
blocks.y,
183+
blocks.z,
184+
threads.x,
185+
threads.y,
186+
threads.z,
187+
0,
188+
0,
189+
kernel_args_0,
190+
0,
191+
)
195192
)
196-
)
197193

198194
checkCudaErrors(cudart.cudaDeviceSynchronize())
199195

cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -124,27 +124,27 @@ def main():
124124
grid.x = math.ceil(nelem / float(block.x))
125125
grid.y = 1
126126
grid.z = 1
127-
kernelHelper = common.KernelHelper(simpleZeroCopy, idev)
128-
_vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
129-
kernelArgs = (
130-
(d_a, d_b, d_c, nelem),
131-
(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
132-
)
133-
checkCudaErrors(
134-
cuda.cuLaunchKernel(
135-
_vectorAddGPU,
136-
grid.x,
137-
grid.y,
138-
grid.z,
139-
block.x,
140-
block.y,
141-
block.z,
142-
0,
143-
cuda.CU_STREAM_LEGACY,
144-
kernelArgs,
145-
0,
127+
with common.KernelHelper(simpleZeroCopy, idev) as kernelHelper:
128+
_vectorAddGPU = kernelHelper.getFunction(b"vectorAddGPU")
129+
kernelArgs = (
130+
(d_a, d_b, d_c, nelem),
131+
(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int),
132+
)
133+
checkCudaErrors(
134+
cuda.cuLaunchKernel(
135+
_vectorAddGPU,
136+
grid.x,
137+
grid.y,
138+
grid.z,
139+
block.x,
140+
block.y,
141+
block.z,
142+
0,
143+
cuda.CU_STREAM_LEGACY,
144+
kernelArgs,
145+
0,
146+
)
146147
)
147-
)
148148
checkCudaErrors(cudart.cudaDeviceSynchronize())
149149

150150
print("> Checking the results from vectorAddGPU() ...")

cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -206,24 +206,24 @@ def main():
206206
# To make the AND and XOR tests generate something other than 0...
207207
atom_arr_h[7] = atom_arr_h[9] = 0xFF
208208

209-
kernelHelper = common.KernelHelper(systemWideAtomics, dev_id)
210-
_atomicKernel = kernelHelper.getFunction(b"atomicKernel")
211-
kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
212-
checkCudaErrors(
213-
cuda.cuLaunchKernel(
214-
_atomicKernel,
215-
numBlocks,
216-
1,
217-
1, # grid dim
218-
numThreads,
219-
1,
220-
1, # block dim
221-
0,
222-
cuda.CU_STREAM_LEGACY, # shared mem and stream
223-
kernelArgs,
224-
0,
225-
)
226-
) # arguments
209+
with common.KernelHelper(systemWideAtomics, dev_id) as kernelHelper:
210+
_atomicKernel = kernelHelper.getFunction(b"atomicKernel")
211+
kernelArgs = ((atom_arr,), (ctypes.c_void_p,))
212+
checkCudaErrors(
213+
cuda.cuLaunchKernel(
214+
_atomicKernel,
215+
numBlocks,
216+
1,
217+
1, # grid dim
218+
numThreads,
219+
1,
220+
1, # block dim
221+
0,
222+
cuda.CU_STREAM_LEGACY, # shared mem and stream
223+
kernelArgs,
224+
0,
225+
)
226+
) # arguments
227227
# NOTE: Python doesn't have an equivalent system atomic operations
228228
# atomicKernel_CPU(atom_arr_h, numBlocks * numThreads)
229229

0 commit comments

Comments (0)