2020#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
2121#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
2222#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)
23+ #define REP32(x, T) REP16(x##0, T) REP16(x##1, T)
24+ #define REP64(x, T) REP32(x##0, T) REP32(x##1, T)
25+ #define REP128(x, T) REP64(x##0, T) REP64(x##1, T)
26+ #define REP256(x, T) REP128(x##0, T) REP128(x##1, T)
27+
28+ template<size_t maxBytes>
29+ struct KernelFunctionParam {
30+ unsigned char p[maxBytes];
31+ };
2332
2433extern "C" __global__
2534void small_kernel_16_args(
2938 REP4(A, int*)
3039 REP8(A, int*))
3140{ *F = 0; }
41+
42+ extern "C" __global__
43+ void small_kernel_256_args(
44+ ITEM_PARAM(F, int*)
45+ REP1(A, int*)
46+ REP2(A, int*)
47+ REP4(A, int*)
48+ REP8(A, int*)
49+ REP16(A, int*)
50+ REP32(A, int*)
51+ REP64(A, int*)
52+ REP128(A, int*))
53+ { *F = 0; }
54+
55+ extern "C" __global__
56+ void small_kernel_512_args(
57+ ITEM_PARAM(F, int*)
58+ REP1(A, int*)
59+ REP2(A, int*)
60+ REP4(A, int*)
61+ REP8(A, int*)
62+ REP16(A, int*)
63+ REP32(A, int*)
64+ REP64(A, int*)
65+ REP128(A, int*)
66+ REP256(A, int*))
67+ { *F = 0; }
68+
69+ extern "C" __global__
70+ void small_kernel_512_bools(
71+ ITEM_PARAM(F, bool)
72+ REP1(A, bool)
73+ REP2(A, bool)
74+ REP4(A, bool)
75+ REP8(A, bool)
76+ REP16(A, bool)
77+ REP32(A, bool)
78+ REP64(A, bool)
79+ REP128(A, bool)
80+ REP256(A, bool))
81+ { return; }
82+
83+ extern "C" __global__
84+ void small_kernel_512_ints(
85+ ITEM_PARAM(F, int)
86+ REP1(A, int)
87+ REP2(A, int)
88+ REP4(A, int)
89+ REP8(A, int)
90+ REP16(A, int)
91+ REP32(A, int)
92+ REP64(A, int)
93+ REP128(A, int)
94+ REP256(A, int))
95+ { return; }
96+
97+ extern "C" __global__
98+ void small_kernel_512_doubles(
99+ ITEM_PARAM(F, double)
100+ REP1(A, double)
101+ REP2(A, double)
102+ REP4(A, double)
103+ REP8(A, double)
104+ REP16(A, double)
105+ REP32(A, double)
106+ REP64(A, double)
107+ REP128(A, double)
108+ REP256(A, double))
109+ { return; }
110+
111+ extern "C" __global__
112+ void small_kernel_512_chars(
113+ ITEM_PARAM(F, char)
114+ REP1(A, char)
115+ REP2(A, char)
116+ REP4(A, char)
117+ REP8(A, char)
118+ REP16(A, char)
119+ REP32(A, char)
120+ REP64(A, char)
121+ REP128(A, char)
122+ REP256(A, char))
123+ { return; }
124+
125+ extern "C" __global__
126+ void small_kernel_512_longlongs(
127+ ITEM_PARAM(F, long long)
128+ REP1(A, long long)
129+ REP2(A, long long)
130+ REP4(A, long long)
131+ REP8(A, long long)
132+ REP16(A, long long)
133+ REP32(A, long long)
134+ REP64(A, long long)
135+ REP128(A, long long)
136+ REP256(A, long long))
137+ { return; }
138+
139+ extern "C" __global__
140+ void small_kernel_2048B(KernelFunctionParam<2048> param) {
141+ // Do not touch param to prevent compiler from copying
142+ // the whole structure from const bank to lmem.
143+ }
32144"""
33145
34146MODULE = None
35147EMPTY_KERNEL = None
36148SMALL_KERNEL = None
37149KERNEL_16_ARGS = None
150+ KERNEL_256_ARGS = None
151+ KERNEL_512_ARGS = None
152+ KERNEL_512_BOOLS = None
153+ KERNEL_512_INTS = None
154+ KERNEL_512_DOUBLES = None
155+ KERNEL_512_CHARS = None
156+ KERNEL_512_LONGLONGS = None
157+ KERNEL_2048B = None
38158STREAM = None
39159FLOAT_PTR = None
40- INT_PTRS = None
41- _VAL_PS = None
160+ INT_PTRS_512 = None
161+ _VAL_PS_16 = None
162+ _VAL_PS_512 = None
42163PACKED_16 = None
164+ PACKED_512 = None
165+
166+
167+ class _Struct2048B (ctypes .Structure ):
168+ _fields_ = [("values" , ctypes .c_uint8 * 2048 )]
169+
170+
171+ STRUCT_2048B = _Struct2048B ()
43172
44173
45174def _ensure_launch_state () -> None :
46- global MODULE , EMPTY_KERNEL , SMALL_KERNEL , KERNEL_16_ARGS , STREAM
47- global FLOAT_PTR , INT_PTRS , _VAL_PS , PACKED_16
175+ global MODULE , EMPTY_KERNEL , SMALL_KERNEL
176+ global KERNEL_16_ARGS , KERNEL_256_ARGS , KERNEL_512_ARGS
177+ global KERNEL_512_BOOLS , KERNEL_512_INTS , KERNEL_512_DOUBLES
178+ global KERNEL_512_CHARS , KERNEL_512_LONGLONGS , KERNEL_2048B
179+ global STREAM , FLOAT_PTR , INT_PTRS_512
180+ global _VAL_PS_16 , _VAL_PS_512 , PACKED_16 , PACKED_512
48181
49182 if EMPTY_KERNEL is not None :
50183 return
51184
52185 module = compile_and_load (KERNEL_SOURCE )
53186
54- err , empty_kernel = cuda .cuModuleGetFunction (module , b"empty_kernel" )
55- assert_drv (err )
56- err , small_kernel = cuda .cuModuleGetFunction (module , b"small_kernel" )
57- assert_drv (err )
58- err , kernel_16_args = cuda .cuModuleGetFunction (module , b"small_kernel_16_args" )
59- assert_drv (err )
187+ def get_func (name ):
188+ err , func = cuda .cuModuleGetFunction (module , name .encode ())
189+ assert_drv (err )
190+ return func
60191
61192 err , stream = cuda .cuStreamCreate (cuda .CUstream_flags .CU_STREAM_NON_BLOCKING .value )
62193 assert_drv (err )
63194
64195 float_ptr = alloc_persistent (ctypes .sizeof (ctypes .c_float ))
65- int_ptrs = tuple (alloc_persistent (ctypes .sizeof (ctypes .c_int )) for _ in range (16 ))
196+ int_ptrs_512 = tuple (alloc_persistent (ctypes .sizeof (ctypes .c_int )) for _ in range (512 ))
66197
67- val_ps = [ctypes .c_void_p (int (ptr )) for ptr in int_ptrs ]
198+ # Pre-pack 16 args
199+ val_ps_16 = [ctypes .c_void_p (int (ptr )) for ptr in int_ptrs_512 [:16 ]]
68200 packed_16 = (ctypes .c_void_p * 16 )()
69- for index , value_ptr in enumerate (val_ps ):
70- packed_16 [index ] = ctypes .addressof (value_ptr )
201+ for i , vp in enumerate (val_ps_16 ):
202+ packed_16 [i ] = ctypes .addressof (vp )
203+
204+ # Pre-pack 512 args
205+ val_ps_512 = [ctypes .c_void_p (int (ptr )) for ptr in int_ptrs_512 ]
206+ packed_512 = (ctypes .c_void_p * 512 )()
207+ for i , vp in enumerate (val_ps_512 ):
208+ packed_512 [i ] = ctypes .addressof (vp )
71209
72210 MODULE = module
73- EMPTY_KERNEL = empty_kernel
74- SMALL_KERNEL = small_kernel
75- KERNEL_16_ARGS = kernel_16_args
211+ EMPTY_KERNEL = get_func ("empty_kernel" )
212+ SMALL_KERNEL = get_func ("small_kernel" )
213+ KERNEL_16_ARGS = get_func ("small_kernel_16_args" )
214+ KERNEL_256_ARGS = get_func ("small_kernel_256_args" )
215+ KERNEL_512_ARGS = get_func ("small_kernel_512_args" )
216+ KERNEL_512_BOOLS = get_func ("small_kernel_512_bools" )
217+ KERNEL_512_INTS = get_func ("small_kernel_512_ints" )
218+ KERNEL_512_DOUBLES = get_func ("small_kernel_512_doubles" )
219+ KERNEL_512_CHARS = get_func ("small_kernel_512_chars" )
220+ KERNEL_512_LONGLONGS = get_func ("small_kernel_512_longlongs" )
221+ KERNEL_2048B = get_func ("small_kernel_2048B" )
76222 STREAM = stream
77223 FLOAT_PTR = float_ptr
78- INT_PTRS = int_ptrs
79- _VAL_PS = val_ps
224+ INT_PTRS_512 = int_ptrs_512
225+ _VAL_PS_16 = val_ps_16
226+ _VAL_PS_512 = val_ps_512
80227 PACKED_16 = packed_16
228+ PACKED_512 = packed_512
81229
82230
83231def bench_launch_empty_kernel (loops : int ) -> float :
@@ -111,7 +259,7 @@ def bench_launch_16_args(loops: int) -> float:
111259 _fn = cuda .cuLaunchKernel
112260 _kernel = KERNEL_16_ARGS
113261 _stream = STREAM
114- _args = INT_PTRS
262+ _args = INT_PTRS_512 [: 16 ]
115263 _arg_types = (None ,) * 16
116264
117265 t0 = time .perf_counter ()
@@ -131,3 +279,128 @@ def bench_launch_16_args_pre_packed(loops: int) -> float:
131279 for _ in range (loops ):
132280 _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , _packed , 0 )
133281 return time .perf_counter () - t0
282+
283+
284+ def bench_launch_256_args (loops : int ) -> float :
285+ _ensure_launch_state ()
286+ _fn = cuda .cuLaunchKernel
287+ _kernel = KERNEL_256_ARGS
288+ _stream = STREAM
289+ _args = INT_PTRS_512 [:256 ]
290+ _arg_types = (None ,) * 256
291+
292+ t0 = time .perf_counter ()
293+ for _ in range (loops ):
294+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
295+ return time .perf_counter () - t0
296+
297+
298+ def bench_launch_512_args (loops : int ) -> float :
299+ _ensure_launch_state ()
300+ _fn = cuda .cuLaunchKernel
301+ _kernel = KERNEL_512_ARGS
302+ _stream = STREAM
303+ _args = INT_PTRS_512
304+ _arg_types = (None ,) * 512
305+
306+ t0 = time .perf_counter ()
307+ for _ in range (loops ):
308+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
309+ return time .perf_counter () - t0
310+
311+
312+ def bench_launch_512_args_pre_packed (loops : int ) -> float :
313+ _ensure_launch_state ()
314+ _fn = cuda .cuLaunchKernel
315+ _kernel = KERNEL_512_ARGS
316+ _stream = STREAM
317+ _packed = PACKED_512
318+
319+ t0 = time .perf_counter ()
320+ for _ in range (loops ):
321+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , _packed , 0 )
322+ return time .perf_counter () - t0
323+
324+
325+ def bench_launch_512_bools (loops : int ) -> float :
326+ _ensure_launch_state ()
327+ _fn = cuda .cuLaunchKernel
328+ _kernel = KERNEL_512_BOOLS
329+ _stream = STREAM
330+ _args = (True ,) * 512
331+ _arg_types = (ctypes .c_bool ,) * 512
332+
333+ t0 = time .perf_counter ()
334+ for _ in range (loops ):
335+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
336+ return time .perf_counter () - t0
337+
338+
339+ def bench_launch_512_ints (loops : int ) -> float :
340+ _ensure_launch_state ()
341+ _fn = cuda .cuLaunchKernel
342+ _kernel = KERNEL_512_INTS
343+ _stream = STREAM
344+ _args = (123 ,) * 512
345+ _arg_types = (ctypes .c_int ,) * 512
346+
347+ t0 = time .perf_counter ()
348+ for _ in range (loops ):
349+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
350+ return time .perf_counter () - t0
351+
352+
353+ def bench_launch_512_doubles (loops : int ) -> float :
354+ _ensure_launch_state ()
355+ _fn = cuda .cuLaunchKernel
356+ _kernel = KERNEL_512_DOUBLES
357+ _stream = STREAM
358+ _args = (1.2345 ,) * 512
359+ _arg_types = (ctypes .c_double ,) * 512
360+
361+ t0 = time .perf_counter ()
362+ for _ in range (loops ):
363+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
364+ return time .perf_counter () - t0
365+
366+
367+ def bench_launch_512_bytes (loops : int ) -> float :
368+ _ensure_launch_state ()
369+ _fn = cuda .cuLaunchKernel
370+ _kernel = KERNEL_512_CHARS
371+ _stream = STREAM
372+ _args = (127 ,) * 512
373+ _arg_types = (ctypes .c_byte ,) * 512
374+
375+ t0 = time .perf_counter ()
376+ for _ in range (loops ):
377+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
378+ return time .perf_counter () - t0
379+
380+
381+ def bench_launch_512_longlongs (loops : int ) -> float :
382+ _ensure_launch_state ()
383+ _fn = cuda .cuLaunchKernel
384+ _kernel = KERNEL_512_LONGLONGS
385+ _stream = STREAM
386+ _args = (9223372036854775806 ,) * 512
387+ _arg_types = (ctypes .c_longlong ,) * 512
388+
389+ t0 = time .perf_counter ()
390+ for _ in range (loops ):
391+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
392+ return time .perf_counter () - t0
393+
394+
395+ def bench_launch_2048b (loops : int ) -> float :
396+ _ensure_launch_state ()
397+ _fn = cuda .cuLaunchKernel
398+ _kernel = KERNEL_2048B
399+ _stream = STREAM
400+ _args = (STRUCT_2048B ,)
401+ _arg_types = (None ,)
402+
403+ t0 = time .perf_counter ()
404+ for _ in range (loops ):
405+ _fn (_kernel , 1 , 1 , 1 , 1 , 1 , 1 , 0 , _stream , (_args , _arg_types ), 0 )
406+ return time .perf_counter () - t0
0 commit comments