Skip to content

Commit d818a75

Browse files
authored
Fix cuda_bindings conjugate_gradient_multi_block_cg.py example (#1922)

* fix: re-enable and fix the conjugate gradient multi-block example

  Remove the unconditional NVRTC waiver from the renamed example so CI can
  exercise its real execution path again. While re-enabling it, replace the
  standalone pytest.skip() checks with requirement_not_met() and simplify the
  platform gating.

  The waived code path had been hiding several Python-side runtime bugs that
  required these fixes:
  - replaced the invalid C-style %d f-string formatting
  - fixed gen_tridiag() variable shadowing so the CSR row-offset array is
    actually populated
  - passed the computed dynamic shared-memory size into
    cuLaunchCooperativeKernel() and made that size integer-valued
  - stopped overwriting managed-memory pointer variables with loop indices
    before kernel launch and cleanup
  - cached the residual before freeing dot_result, which removed the teardown
    segfault

  Made-with: Cursor

* fix: check the QNX example gate via platform.system()

  QNX is an operating system rather than a machine architecture, so checking
  platform.machine() can miss the requirement_not_met() path on QNX hosts.
  Use platform.system() so the example is waived consistently on that
  platform.

  Made-with: Cursor
1 parent fd5700d commit d818a75

File tree

2 files changed

+47
-48
lines changed

2 files changed

+47
-48
lines changed

cuda_bindings/examples/3_CUDA_Features/global_to_shmem_async_copy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1142,7 +1142,7 @@ def matrix_multiply(dims_a, dims_b, kernel_number):
11421142
def main():
11431143
check_compute_capability_too_low(find_cuda_device(), (7, 0))
11441144

1145-
if platform.machine() == "qnx":
1145+
if platform.system() == "QNX":
11461146
requirement_not_met("globalToShmemAsyncCopy is not supported on QNX")
11471147

11481148
version = check_cuda_errors(cuda.cuDriverGetVersion())

cuda_bindings/examples/4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py

Lines changed: 46 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
KernelHelper,
2727
check_cuda_errors,
2828
find_cuda_device,
29+
requirement_not_met,
2930
)
3031

3132
conjugate_gradient_multi_block_cg = """\
@@ -177,71 +178,68 @@
177178
"""
178179

179180

180-
def gen_tridiag(i, j, val, n, nz):
181-
i[0] = 0
182-
j[0] = 0
183-
j[1] = 0
181+
def gen_tridiag(row_offsets, col_indices, values, n, nz):
182+
row_offsets[0] = 0
183+
col_indices[0] = 0
184+
col_indices[1] = 0
184185

185-
val[0] = float(random()) + 10.0
186-
val[1] = float(random())
186+
values[0] = float(random()) + 10.0
187+
values[1] = float(random())
187188

188-
for i in range(1, n):
189-
if i > 1:
190-
i[i] = i[i - 1] + 3
189+
for row_idx in range(1, n):
190+
if row_idx > 1:
191+
row_offsets[row_idx] = row_offsets[row_idx - 1] + 3
191192
else:
192-
i[1] = 2
193+
row_offsets[1] = 2
193194

194-
start = (i - 1) * 3 + 2
195-
j[start] = i - 1
196-
j[start + 1] = i
195+
start = (row_idx - 1) * 3 + 2
196+
col_indices[start] = row_idx - 1
197+
col_indices[start + 1] = row_idx
197198

198-
if i < n - 1:
199-
j[start + 2] = i + 1
199+
if row_idx < n - 1:
200+
col_indices[start + 2] = row_idx + 1
200201

201-
val[start] = val[start - 1]
202-
val[start + 1] = float(random()) + 10.0
202+
values[start] = values[start - 1]
203+
values[start + 1] = float(random()) + 10.0
203204

204-
if i < n - 1:
205-
val[start + 2] = float(random())
206-
i[n] = nz
205+
if row_idx < n - 1:
206+
values[start + 2] = float(random())
207+
row_offsets[n] = nz
207208

208209

209210
THREADS_PER_BLOCK = 512
210211
s_sd_kname = "conjugateGradientMultiBlockCG"
212+
UNSUPPORTED_SYSTEMS = {"Darwin", "QNX"}
213+
UNSUPPORTED_MACHINES = {"armv7l"}
211214

212215

213216
def main():
214217
tol = 1e-5
215218

216-
import pytest
219+
system_name = platform.system()
220+
if system_name in UNSUPPORTED_SYSTEMS:
221+
requirement_not_met(f"{s_sd_kname} is not supported on {system_name}")
217222

218-
# WAIVE: Due to bug in NVRTC
219-
return
220-
221-
if platform.system() == "Darwin":
222-
pytest.skip("conjugateGradientMultiBlockCG is not supported on Mac OSX")
223-
224-
if platform.machine() == "armv7l":
225-
pytest.skip("conjugateGradientMultiBlockCG is not supported on ARMv7")
226-
227-
if platform.machine() == "qnx":
228-
pytest.skip("conjugateGradientMultiBlockCG is not supported on QNX")
223+
machine_name = platform.machine()
224+
if machine_name in UNSUPPORTED_MACHINES:
225+
requirement_not_met(f"{s_sd_kname} is not supported on {machine_name}")
229226

230227
# This will pick the best possible CUDA capable device
231228
dev_id = find_cuda_device()
232229
device_prop = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
233230

234231
if not device_prop.managedMemory:
235-
pytest.skip("Unified Memory not supported on this device")
232+
requirement_not_met("Unified Memory not supported on this device")
236233

237234
# This sample requires being run on a device that supports Cooperative Kernel
238235
# Launch
239236
if not device_prop.cooperativeLaunch:
240-
pytest.skip(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch")
237+
requirement_not_met(f"Selected GPU {dev_id} does not support Cooperative Kernel Launch")
241238

242239
# Statistics about the GPU device
243240
print(
244-
f"> GPU device has {device_prop.multiProcessorCount:%d} Multi-Processors, SM {device_prop.major:%d}.{device_prop.minor:%d} compute capabilities\n"
241+
f"> GPU device has {device_prop.multiProcessorCount} Multi-Processors, "
242+
f"SM {device_prop.major}.{device_prop.minor} compute capabilities\n"
245243
)
246244

247245
# Get kernel
@@ -267,7 +265,7 @@ def main():
267265
x_local = (ctypes.c_float * n).from_address(x)
268266
rhs_local = (ctypes.c_float * n).from_address(rhs)
269267
dot_result_local = (ctypes.c_double).from_address(dot_result)
270-
dot_result_local = 0
268+
dot_result_local.value = 0.0
271269

272270
# temp memory for CG
273271
r = check_cuda_errors(cudart.cudaMallocManaged(np.dtype(np.float32).itemsize * n, cudart.cudaMemAttachGlobal))
@@ -280,9 +278,9 @@ def main():
280278
start = check_cuda_errors(cudart.cudaEventCreate())
281279
stop = check_cuda_errors(cudart.cudaEventCreate())
282280

283-
for i in range(n):
284-
r_local[i] = rhs_local[i] = 1.0
285-
x_local[i] = 0.0
281+
for idx in range(n):
282+
r_local[idx] = rhs_local[idx] = 1.0
283+
x_local[idx] = 0.0
286284

287285
kernel_args_value = (i, j, val, x, ax, p, r, dot_result, nz, n, tol)
288286
kernel_args_types = (
@@ -300,7 +298,7 @@ def main():
300298
)
301299
kernel_args = (kernel_args_value, kernel_args_types)
302300

303-
s_mem_size = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK / 32) + 1)
301+
s_mem_size = np.dtype(np.float64).itemsize * ((THREADS_PER_BLOCK // 32) + 1)
304302
num_threads = THREADS_PER_BLOCK
305303
num_blocks_per_sm = check_cuda_errors(
306304
cuda.cuOccupancyMaxActiveBlocksPerMultiprocessor(_gpu_conjugate_gradient, num_threads, s_mem_size)
@@ -325,7 +323,7 @@ def main():
325323
dim_block.x,
326324
dim_block.y,
327325
dim_block.z,
328-
0,
326+
s_mem_size,
329327
0,
330328
kernel_args,
331329
)
@@ -334,16 +332,17 @@ def main():
334332
check_cuda_errors(cudart.cudaDeviceSynchronize())
335333

336334
time = check_cuda_errors(cudart.cudaEventElapsedTime(start, stop))
337-
print(f"GPU Final, residual = {math.sqrt(dot_result_local):e}, kernel execution time = {time:f} ms")
335+
residual = math.sqrt(dot_result_local.value)
336+
print(f"GPU Final, residual = {residual:e}, kernel execution time = {time:f} ms")
338337

339338
err = 0.0
340-
for i in range(n):
339+
for row_idx in range(n):
341340
rsum = 0.0
342341

343-
for j in range(i_local[i], i_local[i + 1]):
344-
rsum += val_local[j] * x_local[j_local[j]]
342+
for elem_idx in range(i_local[row_idx], i_local[row_idx + 1]):
343+
rsum += val_local[elem_idx] * x_local[j_local[elem_idx]]
345344

346-
diff = math.fabs(rsum - rhs_local[i])
345+
diff = math.fabs(rsum - rhs_local[row_idx])
347346

348347
if diff > err:
349348
err = diff
@@ -361,7 +360,7 @@ def main():
361360
check_cuda_errors(cudart.cudaEventDestroy(stop))
362361

363362
print(f"Test Summary: Error amount = {err:f}")
364-
if math.sqrt(dot_result_local) >= tol:
363+
if residual >= tol:
365364
print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr)
366365
sys.exit(1)
367366

0 commit comments

Comments (0)