Skip to content

Commit e46fcac

Browse files
cpcloud and rwgk authored
Standardize naming conventions across cuda.core examples (#1695)
* Standardize error handling and output across all examples (#1678) - Remove success/completion fluff messages ("done!", "passed", etc.) - Use sys.exit(1) instead of sys.exit(0) for unsupported configurations - Send skip/warning/error messages to stderr - Replace user-facing assert with proper checks - Standardize sys.exit(-1) to sys.exit(1) Made-with: Cursor * fix: use pytest.skip() for unsupported configs in test examples The _test.py files in cuda_bindings/examples/ are run under pytest, so sys.exit(1) for unsupported configurations causes SystemExit(1) which pytest treats as a test failure. Use pytest.skip() instead for platform/device/capability checks that indicate unsupported configs. Actual test failures (incorrect results) still use sys.exit(1). Made-with: Cursor * Standardize naming conventions across cuda.core examples Apply consistent variable names for common objects across all cuda.core example files, addressing issue #1675. Conventions applied: - Stream: `stream` (not `s`), numbered `stream0`/`stream1` - Kernel: `kernel` (not `ker`/`gpu_ker`), descriptive `add_kernel`/`sub_kernel` - Program: `prog` (not `gpu_prog`) - Kernel args: `kernel_args` (not `ker_args`) - Program options: `program_options` (not `opts`), using ProgramOptions (not dicts) - Grid/block: `grid`/`block` (not `grid_size`/`block_size`) Made-with: Cursor * Standardize error handling and output across all examples (#1678) - Remove success/completion fluff messages ("done!", "passed", etc.) - Use sys.exit(1) instead of sys.exit(0) for unsupported configurations - Send skip/warning/error messages to stderr - Replace user-facing assert with proper checks - Standardize sys.exit(-1) to sys.exit(1) Made-with: Cursor * fix: use pytest.skip() for unsupported configs in test examples The _test.py files in cuda_bindings/examples/ are run under pytest, so sys.exit(1) for unsupported configurations causes SystemExit(1) which pytest treats as a test failure. 
Use pytest.skip() instead for platform/device/capability checks that indicate unsupported configs. Actual test failures (incorrect results) still use sys.exit(1). Made-with: Cursor * fix: enforce naming conventions in cuda_bindings examples Keep pep8-naming checks active in cuda_bindings examples so local identifiers follow snake_case, while retaining targeted naming-rule exceptions for non-example cuda_bindings paths that mirror CUDA/setuptools APIs. Made-with: Cursor --------- Co-authored-by: Ralf W. Grosse-Kunstleve <rwgkio@gmail.com>
1 parent 2b611c8 commit e46fcac

25 files changed

+1407
-1415
lines changed

cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import numpy as np
77
from common import common
8-
from common.helper_cuda import checkCudaErrors, findCudaDevice
8+
from common.helper_cuda import check_cuda_errors, find_cuda_device
99

1010
from cuda.bindings import driver as cuda
1111

@@ -50,8 +50,8 @@
5050
}
5151
"""
5252

53-
NUM_BLOCKS = 64
54-
NUM_THREADS = 256
53+
num_blocks = 64
54+
num_threads = 256
5555

5656

5757
def elems_to_bytes(nelems, dt):
@@ -64,52 +64,52 @@ def main():
6464
if platform.machine() == "armv7l":
6565
pytest.skip("clock_nvrtc is not supported on ARMv7")
6666

67-
timer = np.empty(NUM_BLOCKS * 2, dtype="int64")
68-
hinput = np.empty(NUM_THREADS * 2, dtype="float32")
67+
timer = np.empty(num_blocks * 2, dtype="int64")
68+
hinput = np.empty(num_threads * 2, dtype="float32")
6969

70-
for i in range(NUM_THREADS * 2):
70+
for i in range(num_threads * 2):
7171
hinput[i] = i
7272

73-
devID = findCudaDevice()
74-
with common.KernelHelper(clock_nvrtc, devID) as kernelHelper:
75-
kernel_addr = kernelHelper.getFunction(b"timedReduction")
76-
77-
dinput = checkCudaErrors(cuda.cuMemAlloc(hinput.nbytes))
78-
doutput = checkCudaErrors(cuda.cuMemAlloc(elems_to_bytes(NUM_BLOCKS, np.float32)))
79-
dtimer = checkCudaErrors(cuda.cuMemAlloc(timer.nbytes))
80-
checkCudaErrors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
81-
82-
args = ((dinput, doutput, dtimer), (None, None, None))
83-
shared_memory_nbytes = elems_to_bytes(2 * NUM_THREADS, np.float32)
84-
85-
grid_dims = (NUM_BLOCKS, 1, 1)
86-
block_dims = (NUM_THREADS, 1, 1)
87-
88-
checkCudaErrors(
89-
cuda.cuLaunchKernel(
90-
kernel_addr,
91-
*grid_dims, # grid dim
92-
*block_dims, # block dim
93-
shared_memory_nbytes,
94-
0, # shared mem, stream
95-
args,
96-
0,
97-
)
98-
) # arguments
99-
100-
checkCudaErrors(cuda.cuCtxSynchronize())
101-
checkCudaErrors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
102-
checkCudaErrors(cuda.cuMemFree(dinput))
103-
checkCudaErrors(cuda.cuMemFree(doutput))
104-
checkCudaErrors(cuda.cuMemFree(dtimer))
105-
106-
avgElapsedClocks = 0.0
107-
108-
for i in range(NUM_BLOCKS):
109-
avgElapsedClocks += timer[i + NUM_BLOCKS] - timer[i]
110-
111-
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS
112-
print(f"Average clocks/block = {avgElapsedClocks}")
73+
dev_id = find_cuda_device()
74+
kernel_helper = common.KernelHelper(clock_nvrtc, dev_id)
75+
kernel_addr = kernel_helper.get_function(b"timedReduction")
76+
77+
dinput = check_cuda_errors(cuda.cuMemAlloc(hinput.nbytes))
78+
doutput = check_cuda_errors(cuda.cuMemAlloc(elems_to_bytes(num_blocks, np.float32)))
79+
dtimer = check_cuda_errors(cuda.cuMemAlloc(timer.nbytes))
80+
check_cuda_errors(cuda.cuMemcpyHtoD(dinput, hinput, hinput.nbytes))
81+
82+
args = ((dinput, doutput, dtimer), (None, None, None))
83+
shared_memory_nbytes = elems_to_bytes(2 * num_threads, np.float32)
84+
85+
grid_dims = (num_blocks, 1, 1)
86+
block_dims = (num_threads, 1, 1)
87+
88+
check_cuda_errors(
89+
cuda.cuLaunchKernel(
90+
kernel_addr,
91+
*grid_dims, # grid dim
92+
*block_dims, # block dim
93+
shared_memory_nbytes,
94+
0, # shared mem, stream
95+
args,
96+
0,
97+
)
98+
) # arguments
99+
100+
check_cuda_errors(cuda.cuCtxSynchronize())
101+
check_cuda_errors(cuda.cuMemcpyDtoH(timer, dtimer, timer.nbytes))
102+
check_cuda_errors(cuda.cuMemFree(dinput))
103+
check_cuda_errors(cuda.cuMemFree(doutput))
104+
check_cuda_errors(cuda.cuMemFree(dtimer))
105+
106+
avg_elapsed_clocks = 0.0
107+
108+
for i in range(num_blocks):
109+
avg_elapsed_clocks += timer[i + num_blocks] - timer[i]
110+
111+
avg_elapsed_clocks = avg_elapsed_clocks / num_blocks
112+
print(f"Average clocks/block = {avg_elapsed_clocks}")
113113

114114

115115
if __name__ == "__main__":

cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py

Lines changed: 90 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77

88
import numpy as np
99
from common import common
10-
from common.helper_cuda import checkCudaErrors, findCudaDevice
10+
from common.helper_cuda import check_cuda_errors, find_cuda_device
1111

1212
from cuda.bindings import driver as cuda
1313
from cuda.bindings import runtime as cudart
1414

15-
simpleCubemapTexture = """\
15+
simple_cubemap_texture = """\
1616
extern "C"
1717
__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
1818
{
@@ -83,14 +83,14 @@
8383

8484
def main():
8585
# Use command-line specified CUDA device, otherwise use device with highest Gflops/s
86-
devID = findCudaDevice()
86+
dev_id = find_cuda_device()
8787

8888
# Get number of SMs on this GPU
89-
deviceProps = checkCudaErrors(cudart.cudaGetDeviceProperties(devID))
89+
device_props = check_cuda_errors(cudart.cudaGetDeviceProperties(dev_id))
9090
print(
91-
f"CUDA device [{deviceProps.name}] has {deviceProps.multiProcessorCount} Multi-Processors SM {deviceProps.major}.{deviceProps.minor}"
91+
f"CUDA device [{device_props.name}] has {device_props.multiProcessorCount} Multi-Processors SM {device_props.major}.{device_props.minor}"
9292
)
93-
if deviceProps.major < 2:
93+
if device_props.major < 2:
9494
import pytest
9595

9696
pytest.skip("Test requires SM 2.0 or higher for support of Texture Arrays.")
@@ -107,15 +107,15 @@ def main():
107107
h_data_ref = np.repeat(np.arange(num_layers, dtype=h_data.dtype), cubemap_size) - h_data
108108

109109
# Allocate device memory for result
110-
d_data = checkCudaErrors(cudart.cudaMalloc(size))
110+
d_data = check_cuda_errors(cudart.cudaMalloc(size))
111111

112112
# Allocate array and copy image data
113-
channelDesc = checkCudaErrors(
113+
channel_desc = check_cuda_errors(
114114
cudart.cudaCreateChannelDesc(32, 0, 0, 0, cudart.cudaChannelFormatKind.cudaChannelFormatKindFloat)
115115
)
116-
cu_3darray = checkCudaErrors(
116+
cu_3darray = check_cuda_errors(
117117
cudart.cudaMalloc3DArray(
118-
channelDesc,
118+
channel_desc,
119119
cudart.make_cudaExtent(width, width, num_faces),
120120
cudart.cudaArrayCubemap,
121121
)
@@ -128,90 +128,90 @@ def main():
128128
myparms.dstArray = cu_3darray
129129
myparms.extent = cudart.make_cudaExtent(width, width, num_faces)
130130
myparms.kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
131-
checkCudaErrors(cudart.cudaMemcpy3D(myparms))
132-
133-
texRes = cudart.cudaResourceDesc()
134-
texRes.resType = cudart.cudaResourceType.cudaResourceTypeArray
135-
texRes.res.array.array = cu_3darray
136-
137-
texDescr = cudart.cudaTextureDesc()
138-
texDescr.normalizedCoords = True
139-
texDescr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
140-
texDescr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
141-
texDescr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
142-
texDescr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
143-
texDescr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType
144-
145-
tex = checkCudaErrors(cudart.cudaCreateTextureObject(texRes, texDescr, None))
146-
dimBlock = cudart.dim3()
147-
dimBlock.x = 8
148-
dimBlock.y = 8
149-
dimBlock.z = 1
150-
dimGrid = cudart.dim3()
151-
dimGrid.x = width / dimBlock.x
152-
dimGrid.y = width / dimBlock.y
153-
dimGrid.z = 1
131+
check_cuda_errors(cudart.cudaMemcpy3D(myparms))
132+
133+
tex_res = cudart.cudaResourceDesc()
134+
tex_res.resType = cudart.cudaResourceType.cudaResourceTypeArray
135+
tex_res.res.array.array = cu_3darray
136+
137+
tex_descr = cudart.cudaTextureDesc()
138+
tex_descr.normalizedCoords = True
139+
tex_descr.filterMode = cudart.cudaTextureFilterMode.cudaFilterModeLinear
140+
tex_descr.addressMode[0] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
141+
tex_descr.addressMode[1] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
142+
tex_descr.addressMode[2] = cudart.cudaTextureAddressMode.cudaAddressModeWrap
143+
tex_descr.readMode = cudart.cudaTextureReadMode.cudaReadModeElementType
144+
145+
tex = check_cuda_errors(cudart.cudaCreateTextureObject(tex_res, tex_descr, None))
146+
dim_block = cudart.dim3()
147+
dim_block.x = 8
148+
dim_block.y = 8
149+
dim_block.z = 1
150+
dim_grid = cudart.dim3()
151+
dim_grid.x = width / dim_block.x
152+
dim_grid.y = width / dim_block.y
153+
dim_grid.z = 1
154154

155155
print(
156-
f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dimGrid.x} x {dimGrid.y}, each block has 8 x 8 threads"
156+
f"Covering Cubemap data array of {width}~3 x {num_layers}: Grid size is {dim_grid.x} x {dim_grid.y}, each block has 8 x 8 threads"
157157
)
158158

159-
with common.KernelHelper(simpleCubemapTexture, devID) as kernelHelper:
160-
_transformKernel = kernelHelper.getFunction(b"transformKernel")
161-
kernelArgs = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
162-
checkCudaErrors(
163-
cuda.cuLaunchKernel(
164-
_transformKernel,
165-
dimGrid.x,
166-
dimGrid.y,
167-
dimGrid.z, # grid dim
168-
dimBlock.x,
169-
dimBlock.y,
170-
dimBlock.z, # block dim
171-
0,
172-
0, # shared mem and stream
173-
kernelArgs,
174-
0,
175-
)
176-
) # arguments
177-
178-
checkCudaErrors(cudart.cudaDeviceSynchronize())
179-
180-
start = time.time()
181-
182-
# Execute the kernel
183-
checkCudaErrors(
184-
cuda.cuLaunchKernel(
185-
_transformKernel,
186-
dimGrid.x,
187-
dimGrid.y,
188-
dimGrid.z, # grid dim
189-
dimBlock.x,
190-
dimBlock.y,
191-
dimBlock.z, # block dim
192-
0,
193-
0, # shared mem and stream
194-
kernelArgs,
195-
0,
196-
)
197-
) # arguments
198-
199-
checkCudaErrors(cudart.cudaDeviceSynchronize())
200-
stop = time.time()
201-
print(f"Processing time: {stop - start:.3f} msec")
202-
print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
203-
204-
# Allocate mem for the result on host side
205-
h_odata = np.empty_like(h_data)
206-
# Copy result from device to host
207-
checkCudaErrors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
208-
209-
checkCudaErrors(cudart.cudaDestroyTextureObject(tex))
210-
checkCudaErrors(cudart.cudaFree(d_data))
211-
checkCudaErrors(cudart.cudaFreeArray(cu_3darray))
212-
213-
MIN_EPSILON_ERROR = 5.0e-3
214-
if np.max(np.abs(h_odata - h_data_ref)) > MIN_EPSILON_ERROR:
159+
kernel_helper = common.KernelHelper(simple_cubemap_texture, dev_id)
160+
_transform_kernel = kernel_helper.get_function(b"transformKernel")
161+
kernel_args = ((d_data, width, tex), (ctypes.c_void_p, ctypes.c_int, None))
162+
check_cuda_errors(
163+
cuda.cuLaunchKernel(
164+
_transform_kernel,
165+
dim_grid.x,
166+
dim_grid.y,
167+
dim_grid.z, # grid dim
168+
dim_block.x,
169+
dim_block.y,
170+
dim_block.z, # block dim
171+
0,
172+
0, # shared mem and stream
173+
kernel_args,
174+
0,
175+
)
176+
) # arguments
177+
178+
check_cuda_errors(cudart.cudaDeviceSynchronize())
179+
180+
start = time.time()
181+
182+
# Execute the kernel
183+
check_cuda_errors(
184+
cuda.cuLaunchKernel(
185+
_transform_kernel,
186+
dim_grid.x,
187+
dim_grid.y,
188+
dim_grid.z, # grid dim
189+
dim_block.x,
190+
dim_block.y,
191+
dim_block.z, # block dim
192+
0,
193+
0, # shared mem and stream
194+
kernel_args,
195+
0,
196+
)
197+
) # arguments
198+
199+
check_cuda_errors(cudart.cudaDeviceSynchronize())
200+
stop = time.time()
201+
print(f"Processing time: {stop - start:.3f} msec")
202+
print(f"{cubemap_size / ((stop - start + 1) / 1000.0) / 1e6:.2f} Mtexlookups/sec")
203+
204+
# Allocate mem for the result on host side
205+
h_odata = np.empty_like(h_data)
206+
# Copy result from device to host
207+
check_cuda_errors(cudart.cudaMemcpy(h_odata, d_data, size, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))
208+
209+
check_cuda_errors(cudart.cudaDestroyTextureObject(tex))
210+
check_cuda_errors(cudart.cudaFree(d_data))
211+
check_cuda_errors(cudart.cudaFreeArray(cu_3darray))
212+
213+
min_epsilon_error = 5.0e-3
214+
if np.max(np.abs(h_odata - h_data_ref)) > min_epsilon_error:
215215
print("Failed", file=sys.stderr)
216216
sys.exit(1)
217217

0 commit comments

Comments (0)