
Commit 71c81fd

Standardize naming conventions across cuda.core examples
Apply consistent variable names for common objects across all cuda.core example files, addressing issue NVIDIA#1675.

Conventions applied:
- Stream: `stream` (not `s`), numbered `stream0`/`stream1`
- Kernel: `kernel` (not `ker`/`gpu_ker`), descriptive `add_kernel`/`sub_kernel`
- Program: `prog` (not `gpu_prog`)
- Kernel args: `kernel_args` (not `ker_args`)
- Program options: `program_options` (not `opts`), using ProgramOptions (not dicts)
- Grid/block: `grid`/`block` (not `grid_size`/`block_size`)

Made-with: Cursor
1 parent 5aa5fd3 commit 71c81fd

8 files changed: 50 additions & 51 deletions
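Taken together, the conventions read like this in one condensed sketch (adapted from the updated vector_add.py below; the kernel string and sizes are filled in here for completeness and are not themselves part of this diff):

import cupy as cp
from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch

code = """
template<typename T>
__global__ void vector_add(const T* A, const T* B, T* C, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) C[tid] = A[tid] + B[tid];
}
"""

dev = Device()
dev.set_current()
stream = dev.create_stream()  # `stream`, not `s`

# `program_options` (a ProgramOptions instance), not an `opts` dict
program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
prog = Program(code, code_type="c++", options=program_options)
mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
kernel = mod.get_kernel("vector_add<float>")  # `kernel`, not `ker`

size = 50000
rng = cp.random.default_rng()
a = rng.random(size, dtype=cp.float32)
b = rng.random(size, dtype=cp.float32)
c = cp.empty_like(a)
dev.sync()  # cupy runs on a different stream, so sync before accessing

block = 256                         # `block`, not `block_size`
grid = (size + block - 1) // block  # `grid`, not `grid_size`
config = LaunchConfig(grid=grid, block=block)
kernel_args = (a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))  # not `ker_args`

launch(stream, config, kernel, *kernel_args)
stream.sync()
assert cp.allclose(c, a + b)
stream.close()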

cuda_core/examples/cuda_graphs.py

Lines changed: 3 additions & 3 deletions
@@ -81,9 +81,9 @@ def main():
     result3 = cp.empty_like(a)
 
     # Prepare launch configuration
-    block_size = 256
-    grid_size = (size + block_size - 1) // block_size
-    config = LaunchConfig(grid=grid_size, block=block_size)
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
 
     # Sync before graph capture
     dev.sync()

cuda_core/examples/gl_interop_plasma.py

Lines changed: 2 additions & 2 deletions
@@ -94,8 +94,8 @@ def setup_cuda(kernel_source):
     dev.set_current()
     stream = dev.create_stream()
 
-    opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(kernel_source, code_type="c++", options=opts)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(kernel_source, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("plasma")
 

cuda_core/examples/pytorch_example.py

Lines changed: 7 additions & 7 deletions
@@ -48,7 +48,7 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python
 
 
-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -60,7 +60,7 @@ def __cuda_stream__(self):
 )
 
 # Run in single precision
-ker = mod.get_kernel("saxpy_kernel<float>")
+kernel = mod.get_kernel("saxpy_kernel<float>")
 dtype = torch.float32
 
 # prepare input/output
@@ -75,16 +75,16 @@ def __cuda_stream__(self):
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on our stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a.item() * x + y)
 
 # let's repeat again with double precision
-ker = mod.get_kernel("saxpy_kernel<double>")
+kernel = mod.get_kernel("saxpy_kernel<double>")
 dtype = torch.float64
 
 # prepare input
@@ -101,10 +101,10 @@ def __cuda_stream__(self):
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
 
 # launch kernel on PyTorch's stream
-launch(s, config, ker, *ker_args)
+launch(stream, config, kernel, *kernel_args)
 
 # check result
 assert torch.allclose(out, a * x + y)
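For reference, the wrapper this file passes to dev.create_stream() is a small protocol shim. A minimal sketch, reconstructed from the hunk above (the method body outside the diff is an assumption, though torch.cuda.Stream does expose its raw handle as cuda_stream):

import torch

class PyTorchStreamWrapper:
    def __init__(self, pt_stream):
        self.pt_stream = pt_stream

    def __cuda_stream__(self):
        stream_id = self.pt_stream.cuda_stream  # raw CUstream handle held by PyTorch
        return (0, stream_id)  # Return format required by CUDA Python

# any object implementing __cuda_stream__ can back a cuda.core stream:
pt_stream = torch.cuda.current_stream()
# stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))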

cuda_core/examples/saxpy.py

Lines changed: 15 additions & 15 deletions
@@ -35,7 +35,7 @@
 
 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
@@ -51,7 +51,7 @@
 )
 
 # run in single precision
-ker = mod.get_kernel("saxpy<float>")
+kernel = mod.get_kernel("saxpy<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -61,24 +61,24 @@
 x = rng.random(size, dtype=dtype)
 y = rng.random(size, dtype=dtype)
 out = cp.empty_like(x)
-dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+dev.sync()  # cupy runs on a different stream from stream, so sync before accessing
 
 # prepare launch
 block = 32
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 assert cp.allclose(out, a * x + y)
 
 # let's repeat again, this time allocates our own out buffer instead of cupy's
 # run in double precision
-ker = mod.get_kernel("saxpy<double>")
+kernel = mod.get_kernel("saxpy<double>")
 dtype = cp.float64
 
 # prepare input
@@ -91,18 +91,18 @@
 # prepare output
 buf = dev.allocate(
     size * 8,  # = dtype.itemsize
-    stream=s,
+    stream=stream,
 )
 
 # prepare launch
 block = 64
 grid = int((size + block - 1) // block)
 config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
 
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, *kernel_args)
+stream.sync()
 
 # check result
 # we wrap output buffer as a cupy array for simplicity
@@ -113,5 +113,5 @@
 
 # clean up resources that we allocate
 # cupy cleans up automatically the rest
-buf.close(s)
-s.close()
+buf.close(stream)
+stream.close()
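The wrapping step mentioned in the "# we wrap output buffer as a cupy array" context line sits outside the hunks shown. One plausible sketch of it, continuing with the file's variables (buf, size, a, x, y) and assuming the cuda.core Buffer exposes its device pointer as handle:

import cupy as cp

# view the raw device allocation as a float64 cupy array (no copy made)
mem = cp.cuda.UnownedMemory(int(buf.handle), size * 8, owner=buf)
memptr = cp.cuda.MemoryPointer(mem, 0)
out = cp.ndarray(size, dtype=cp.float64, memptr=memptr)
assert cp.allclose(out, a * x + y)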

cuda_core/examples/simple_multi_gpu_example.py

Lines changed: 7 additions & 8 deletions
@@ -12,8 +12,7 @@
 import sys
 
 import cupy as cp
-
-from cuda.core import Device, LaunchConfig, Program, launch, system
+from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
 
 if system.get_num_devices() < 2:
     print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -40,9 +39,9 @@
     }
 }
 """
-prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
 mod_add = prog_add.compile("cubin")
-ker_add = mod_add.get_kernel("vector_add")
+add_kernel = mod_add.get_kernel("vector_add")
 
 # Set GPU 1
 dev1 = Device(1)
@@ -62,9 +61,9 @@
     }
 }
 """
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
 mod_sub = prog_sub.compile("cubin")
-ker_sub = mod_sub.get_kernel("vector_sub")
+sub_kernel = mod_sub.get_kernel("vector_sub")
 
 
 # This adaptor ensures that any foreign stream (ex: from CuPy) that have not
@@ -100,7 +99,7 @@ def __cuda_stream__(self):
 stream0.wait(cp_stream0)
 
 # Launch the add kernel on GPU 0 / stream 0
-launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
 
 # Allocate memory on GPU 1
 # Note: This runs on CuPy's current stream for GPU 1.
@@ -115,7 +114,7 @@ def __cuda_stream__(self):
 stream1.wait(cp_stream1)
 
 # Launch the subtract kernel on GPU 1 / stream 1
-launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
 
 # Synchronize both GPUs are validate the results
 dev0.set_current()
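The "adaptor" referenced by the truncated comment above is the same __cuda_stream__ protocol seen in pytorch_example.py, here applied to CuPy streams. A minimal sketch under that assumption (the class name and body are illustrative, not taken from the diff; CuPy streams do expose their raw handle as .ptr):

import cupy as cp

class StreamAdaptor:
    """Expose a foreign (e.g. CuPy) stream via the __cuda_stream__ protocol."""
    def __init__(self, obj):
        self.obj = obj

    def __cuda_stream__(self):
        return (0, self.obj.ptr)  # raw CUstream handle held by CuPy

# order our cuda.core stream after CuPy's current stream on this device:
cp_stream0 = StreamAdaptor(cp.cuda.get_current_stream())
# stream0.wait(cp_stream0)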

cuda_core/examples/strided_memory_view_gpu.py

Lines changed: 8 additions & 8 deletions
@@ -57,7 +57,7 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream, gpu_ker):
+def my_func(arr, work_stream, kernel):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
@@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker):
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
-    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    launch(work_stream, config, kernel, view.ptr, np.uint64(size))
     # Here we're being conservative and synchronize over our work stream,
     # assuming we do not know the data stream; if we know then we could
     # just order the data stream after the work stream here, e.g.
@@ -101,24 +101,24 @@ def run():
     # To know the GPU's compute capability, we need to identify which GPU to use.
     dev = Device(0)
     dev.set_current()
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
+    prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel(func_name)
 
-    s = dev.create_stream()
+    stream = dev.create_stream()
     try:
         # Create input array on GPU
         arr_gpu = cp.ones(1024, dtype=cp.int32)
         print(f"before: {arr_gpu[:10]=}")
 
         # Run the workload
-        my_func(arr_gpu, s, gpu_ker)
+        my_func(arr_gpu, stream, kernel)
 
         # Check the result
         print(f"after: {arr_gpu[:10]=}")
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
-        s.close()
+        stream.close()
 
 
 if __name__ == "__main__":
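For readers unfamiliar with the decorator used here, the pattern the file relies on is roughly the following sketch. The import path and the .view() call are assumptions inferred from the example's usage (check the cuda.core docs for the exact module; in some releases the utilities live under an experimental namespace):

from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory

@args_viewable_as_strided_memory((0,))
def inspect(arr, work_stream):
    # materialize a StridedMemoryView ordered after work_stream (-1 = no stream)
    view: StridedMemoryView = arr.view(work_stream.handle if work_stream else -1)
    print(view.ptr, view.shape, view.dtype, view.is_device_accessible)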

cuda_core/examples/thread_block_cluster.py

Lines changed: 2 additions & 2 deletions
@@ -94,7 +94,7 @@
     options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
 )
 mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
+kernel = mod.get_kernel("check_cluster_info")
 
 # prepare launch config
 grid = 4
@@ -122,7 +122,7 @@
 block_dims[:] = 0
 
 # launch kernel on the default stream
-launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
 dev.sync()
 
 # verify results
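Not visible in these hunks: the launch configuration in this example also carries a cluster dimension for thread block clusters. A hedged sketch of that step (grid = 4 comes from the context line above; the cluster and block values are assumed, and clusters require sm_90 or newer hardware):

from cuda.core import LaunchConfig

grid = 4       # from the context line above
cluster = 2    # thread blocks per cluster (assumed value)
block = 32     # threads per block (assumed value)
config = LaunchConfig(grid=grid, cluster=cluster, block=block)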

cuda_core/examples/vector_add.py

Lines changed: 6 additions & 6 deletions
@@ -30,15 +30,15 @@
 
 dev = Device()
 dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
 
 # prepare program
 program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
 prog = Program(code, code_type="c++", options=program_options)
 mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
 # run in single precision
-ker = mod.get_kernel("vector_add<float>")
+kernel = mod.get_kernel("vector_add<float>")
 dtype = cp.float32
 
 # prepare input/output
@@ -48,17 +48,17 @@
 b = rng.random(size, dtype=dtype)
 c = cp.empty_like(a)
 
-# cupy runs on a different stream from s, so sync before accessing
+# cupy runs on a different stream from stream, so sync before accessing
 dev.sync()
 
 # prepare launch
 block = 256
 grid = (size + block - 1) // block
 config = LaunchConfig(grid=grid, block=block)
 
-# launch kernel on stream s
-launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-s.sync()
+# launch kernel on stream
+launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+stream.sync()
 
 # check result
 assert cp.allclose(c, a + b)
