5757# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
5858# of which are supported by StridedMemoryView).
5959@args_viewable_as_strided_memory ((0 ,))
60- def my_func (arr , work_stream , gpu_ker ):
60+ def my_func (arr , work_stream , kernel ):
6161 # Create a memory view over arr (assumed to be a 1D array of int32). The stream
6262 # ordering is taken care of, so that arr can be safely accessed on our work
6363 # stream (ordered after a data stream on which arr is potentially prepared).
@@ -73,7 +73,7 @@ def my_func(arr, work_stream, gpu_ker):
7373 block = 256
7474 grid = (size + block - 1 ) // block
7575 config = LaunchConfig (grid = grid , block = block )
76- launch (work_stream , config , gpu_ker , view .ptr , np .uint64 (size ))
76+ launch (work_stream , config , kernel , view .ptr , np .uint64 (size ))
7777 # Here we're being conservative and synchronize over our work stream,
7878 # assuming we do not know the data stream; if we know then we could
7979 # just order the data stream after the work stream here, e.g.
@@ -101,24 +101,24 @@ def run():
101101 # To know the GPU's compute capability, we need to identify which GPU to use.
102102 dev = Device (0 )
103103 dev .set_current ()
104- gpu_prog = Program (gpu_code , code_type = "c++" , options = ProgramOptions (arch = f"sm_{ dev .arch } " , std = "c++11" ))
105- mod = gpu_prog .compile (target_type = "cubin" )
106- gpu_ker = mod .get_kernel (func_name )
104+ prog = Program (gpu_code , code_type = "c++" , options = ProgramOptions (arch = f"sm_{ dev .arch } " , std = "c++11" ))
105+ mod = prog .compile (target_type = "cubin" )
106+ kernel = mod .get_kernel (func_name )
107107
108- s = dev .create_stream ()
108+ stream = dev .create_stream ()
109109 try :
110110 # Create input array on GPU
111111 arr_gpu = cp .ones (1024 , dtype = cp .int32 )
112112 print (f"before: { arr_gpu [:10 ]= } " )
113113
114114 # Run the workload
115- my_func (arr_gpu , s , gpu_ker )
115+ my_func (arr_gpu , stream , kernel )
116116
117117 # Check the result
118118 print (f"after: { arr_gpu [:10 ]= } " )
119119 assert cp .allclose (arr_gpu , 1 + cp .arange (1024 , dtype = cp .int32 ))
120120 finally :
121- s .close ()
121+ stream .close ()
122122
123123
124124if __name__ == "__main__" :