Commit 1abd97a
Standardize naming conventions across cuda.core examples
Apply consistent variable names for common objects across all cuda.core example files, addressing issue NVIDIA#1675.

Conventions applied:

- Stream: `stream` (not `s`), numbered `stream0`/`stream1`
- Kernel: `kernel` (not `ker`/`gpu_ker`), descriptive `add_kernel`/`sub_kernel`
- Program: `prog` (not `gpu_prog`)
- Kernel args: `kernel_args` (not `ker_args`)
- Program options: `program_options` (not `opts`), using ProgramOptions (not dicts)
- Grid/block: `grid`/`block` (not `grid_size`/`block_size`)

Made-with: Cursor
1 parent: 3dfc2ed

8 files changed

Lines changed: 52 additions & 52 deletions
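At a glance, a minimal sketch (not part of the commit) combining all six conventions, modeled on the vector_add example at the bottom of this diff; the kernel source here is illustrative, not taken from the repository:

import cupy as cp

from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch

# illustrative kernel source (assumed; the shipped examples use templated kernels)
code = """
extern "C" __global__
void vector_add(const float* A, const float* B, float* C, size_t N) {
    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) C[tid] = A[tid] + B[tid];
}
"""

dev = Device()
dev.set_current()
stream = dev.create_stream()  # `stream`, not `s`

program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")  # object, not dict
prog = Program(code, code_type="c++", options=program_options)
kernel = prog.compile("cubin").get_kernel("vector_add")  # `kernel`, not `ker`

size = 50000
rng = cp.random.default_rng()
a = rng.random(size, dtype=cp.float32)
b = rng.random(size, dtype=cp.float32)
c = cp.empty_like(a)
dev.sync()  # cupy runs on its own stream, so sync before accessing

block = 256  # `block`, not `block_size`
grid = (size + block - 1) // block  # `grid`, not `grid_size`
config = LaunchConfig(grid=grid, block=block)
kernel_args = (a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))  # not `ker_args`
launch(stream, config, kernel, *kernel_args)
stream.sync()
assert cp.allclose(c, a + b)
stream.close()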

cuda_core/examples/cuda_graphs.py

Lines changed: 3 additions & 3 deletions
@@ -84,9 +84,9 @@ def main():
     result3 = cp.empty_like(a)

     # Prepare launch configuration
-    block_size = 256
-    grid_size = (size + block_size - 1) // block_size
-    config = LaunchConfig(grid=grid_size, block=block_size)
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)

     # Sync before graph capture
     dev.sync()
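The grid computation above is the standard ceiling-division idiom; a quick worked check with illustrative values:

size, block = 1000, 256
grid = (size + block - 1) // block  # rounds up: equivalent to ceil(1000 / 256)
assert grid == 4  # 4 blocks * 256 threads = 1024 slots, covering all 1000 elements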

cuda_core/examples/gl_interop_plasma.py

Lines changed: 2 additions & 2 deletions
@@ -94,8 +94,8 @@ def setup_cuda(kernel_source):
     dev.set_current()
     stream = dev.create_stream()

-    opts = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(kernel_source, code_type="c++", options=opts)
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(kernel_source, code_type="c++", options=program_options)
     mod = prog.compile("cubin")
     kernel = mod.get_kernel("plasma")


cuda_core/examples/pytorch_example.py

Lines changed: 8 additions & 8 deletions
@@ -48,7 +48,7 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python


-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))

try:
     # prepare program
@@ -61,7 +61,7 @@ def __cuda_stream__(self):
     )

     # Run in single precision
-    ker = mod.get_kernel("saxpy_kernel<float>")
+    kernel = mod.get_kernel("saxpy_kernel<float>")
     dtype = torch.float32

     # prepare input/output
@@ -76,16 +76,16 @@ def __cuda_stream__(self):
     block = 32
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+    kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)

     # launch kernel on our stream
-    launch(s, config, ker, *ker_args)
+    launch(stream, config, kernel, *kernel_args)

     # check result
     assert torch.allclose(out, a.item() * x + y)

     # let's repeat again with double precision
-    ker = mod.get_kernel("saxpy_kernel<double>")
+    kernel = mod.get_kernel("saxpy_kernel<double>")
     dtype = torch.float64

     # prepare input
@@ -102,12 +102,12 @@ def __cuda_stream__(self):
     block = 64
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+    kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)

     # launch kernel on PyTorch's stream
-    launch(s, config, ker, *ker_args)
+    launch(stream, config, kernel, *kernel_args)

     # check result
     assert torch.allclose(out, a * x + y)
 finally:
-    s.close()
+    stream.close()
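The PyTorchStreamWrapper used above adapts a torch.cuda.Stream via the __cuda_stream__ protocol; a minimal self-contained sketch, assuming torch.cuda.Stream exposes its raw handle as `cuda_stream`:

import torch

from cuda.core import Device

class PyTorchStreamWrapper:
    def __init__(self, pt_stream):
        self.pt_stream = pt_stream

    def __cuda_stream__(self):
        stream_id = self.pt_stream.cuda_stream  # raw CUDA stream handle
        return (0, stream_id)  # Return format required by CUDA Python

dev = Device()
dev.set_current()
stream = dev.create_stream(PyTorchStreamWrapper(torch.cuda.current_stream()))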

cuda_core/examples/saxpy.py

Lines changed: 15 additions & 15 deletions
@@ -35,7 +35,7 @@

dev = Device()
dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()
buf = None

try:
@@ -53,7 +53,7 @@
     )

     # run in single precision
-    ker = mod.get_kernel("saxpy<float>")
+    kernel = mod.get_kernel("saxpy<float>")
     dtype = cp.float32

     # prepare input/output
@@ -63,24 +63,24 @@
     x = rng.random(size, dtype=dtype)
     y = rng.random(size, dtype=dtype)
     out = cp.empty_like(x)
-    dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+    dev.sync()  # cupy runs on a different stream from stream, so sync before accessing

     # prepare launch
     block = 32
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+    kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)

-    # launch kernel on stream s
-    launch(s, config, ker, *ker_args)
-    s.sync()
+    # launch kernel on stream
+    launch(stream, config, kernel, *kernel_args)
+    stream.sync()

     # check result
     assert cp.allclose(out, a * x + y)

     # let's repeat again, this time allocates our own out buffer instead of cupy's
     # run in double precision
-    ker = mod.get_kernel("saxpy<double>")
+    kernel = mod.get_kernel("saxpy<double>")
     dtype = cp.float64

     # prepare input
@@ -93,18 +93,18 @@
     # prepare output
     buf = dev.allocate(
         size * 8,  # = dtype.itemsize
-        stream=s,
+        stream=stream,
     )

     # prepare launch
     block = 64
     grid = int((size + block - 1) // block)
     config = LaunchConfig(grid=grid, block=block)
-    ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+    kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)

-    # launch kernel on stream s
-    launch(s, config, ker, *ker_args)
-    s.sync()
+    # launch kernel on stream
+    launch(stream, config, kernel, *kernel_args)
+    stream.sync()

     # check result
     # we wrap output buffer as a cupy array for simplicity
@@ -115,5 +115,5 @@
 finally:
     # cupy cleans up automatically the rest
     if buf is not None:
-        buf.close(s)
-    s.close()
+        buf.close(stream)
+    stream.close()
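The "wrap output buffer as a cupy array" step referenced in the fourth hunk is not shown in this diff; one way to do it, assuming the cuda.core Buffer exposes its device pointer as `handle` (an assumption about the Buffer API):

import cupy as cp

def as_cupy_array(buf, size, dtype):
    # zero-copy view of a cuda.core Buffer as a 1D cupy array;
    # buf.handle as the raw device pointer is an API assumption
    mem = cp.cuda.UnownedMemory(int(buf.handle), buf.size, owner=buf)
    return cp.ndarray(size, dtype=dtype, memptr=cp.cuda.MemoryPointer(mem, 0))

out = as_cupy_array(buf, size, cp.float64)
assert cp.allclose(out, a * x + y)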

cuda_core/examples/simple_multi_gpu_example.py

Lines changed: 7 additions & 7 deletions
@@ -13,7 +13,7 @@

import cupy as cp

-from cuda.core import Device, LaunchConfig, Program, launch, system
+from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system

if system.get_num_devices() < 2:
     print("this example requires at least 2 GPUs", file=sys.stderr)
@@ -56,9 +56,9 @@ def __cuda_stream__(self):
     }
 }
 """
-prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
 mod_add = prog_add.compile("cubin")
-ker_add = mod_add.get_kernel("vector_add")
+add_kernel = mod_add.get_kernel("vector_add")

 # Set GPU 1
 dev1 = Device(1)
@@ -78,9 +78,9 @@ def __cuda_stream__(self):
     }
 }
 """
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
 mod_sub = prog_sub.compile("cubin")
-ker_sub = mod_sub.get_kernel("vector_sub")
+sub_kernel = mod_sub.get_kernel("vector_sub")

 # Create launch configs for each kernel that will be executed on the respective
 # CUDA streams.
@@ -103,7 +103,7 @@ def __cuda_stream__(self):
 stream0.wait(cp_stream0)

 # Launch the add kernel on GPU 0 / stream 0
-launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))

 # Allocate memory on GPU 1
 # Note: This runs on CuPy's current stream for GPU 1.
@@ -118,7 +118,7 @@ def __cuda_stream__(self):
 stream1.wait(cp_stream1)

 # Launch the subtract kernel on GPU 1 / stream 1
-launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))

 # Synchronize both GPUs are validate the results
 dev0.set_current()
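Beyond the renames, this file switches Program options from a plain dict to the typed ProgramOptions object used elsewhere; the two spellings side by side (the dict form is the pre-commit style):

from cuda.core import Device, ProgramOptions

dev0 = Device(0)
dev0.set_current()

# before: plain dict
options = {"std": "c++17", "arch": f"sm_{dev0.arch}"}

# after: typed options object, consistent with the other examples
options = ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")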

cuda_core/examples/strided_memory_view_gpu.py

Lines changed: 8 additions & 8 deletions
@@ -57,7 +57,7 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream, gpu_ker):
+def my_func(arr, work_stream, kernel):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
@@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker):
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)
-    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    launch(work_stream, config, kernel, view.ptr, np.uint64(size))
     # Here we're being conservative and synchronize over our work stream,
     # assuming we do not know the data stream; if we know then we could
     # just order the data stream after the work stream here, e.g.
@@ -101,24 +101,24 @@ def run():
     # To know the GPU's compute capability, we need to identify which GPU to use.
     dev = Device(0)
     dev.set_current()
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
+    prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{dev.arch}", std="c++11"))
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel(func_name)

-    s = dev.create_stream()
+    stream = dev.create_stream()
     try:
         # Create input array on GPU
         arr_gpu = cp.ones(1024, dtype=cp.int32)
         print(f"before: {arr_gpu[:10]=}")

         # Run the workload
-        my_func(arr_gpu, s, gpu_ker)
+        my_func(arr_gpu, stream, kernel)

         # Check the result
         print(f"after: {arr_gpu[:10]=}")
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
-        s.close()
+        stream.close()


 if __name__ == "__main__":
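The conservative work_stream synchronization mentioned in the second hunk could instead order the data stream after the work stream, as the in-code comment suggests; a sketch, where `data_stream` is a hypothetical handle to the stream that prepared `arr`:

# non-blocking alternative to syncing the work stream on the host:
# make the (hypothetical) data_stream wait for work_stream's kernels
data_stream.wait(work_stream)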

cuda_core/examples/thread_block_cluster.py

Lines changed: 2 additions & 2 deletions
@@ -94,7 +94,7 @@
     options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
 )
 mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
+kernel = mod.get_kernel("check_cluster_info")

 # prepare launch config
 grid = 4
@@ -126,7 +126,7 @@
 block_dims[:] = 0

 # launch kernel on the default stream
-launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
 dev.sync()

 # verify results
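For context, this example launches thread block clusters; a minimal sketch of a cluster-enabled configuration, assuming LaunchConfig accepts a `cluster` argument as this example's launch config does:

from cuda.core import LaunchConfig

grid = 4     # total number of blocks
cluster = 2  # blocks per thread block cluster
block = 32   # threads per block
config = LaunchConfig(grid=grid, cluster=cluster, block=block)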

cuda_core/examples/vector_add.py

Lines changed: 7 additions & 7 deletions
@@ -30,7 +30,7 @@

dev = Device()
dev.set_current()
-s = dev.create_stream()
+stream = dev.create_stream()

try:
     # prepare program
@@ -39,7 +39,7 @@
     mod = prog.compile("cubin", name_expressions=("vector_add<float>",))

     # run in single precision
-    ker = mod.get_kernel("vector_add<float>")
+    kernel = mod.get_kernel("vector_add<float>")
     dtype = cp.float32

     # prepare input/output
@@ -49,19 +49,19 @@
     b = rng.random(size, dtype=dtype)
     c = cp.empty_like(a)

-    # cupy runs on a different stream from s, so sync before accessing
+    # cupy runs on a different stream from stream, so sync before accessing
     dev.sync()

     # prepare launch
     block = 256
     grid = (size + block - 1) // block
     config = LaunchConfig(grid=grid, block=block)

-    # launch kernel on stream s
-    launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-    s.sync()
+    # launch kernel on stream
+    launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+    stream.sync()

     # check result
     assert cp.allclose(c, a + b)
 finally:
-    s.close()
+    stream.close()
