Skip to content

Commit 9863ea8

Browse files
authored
cuda.bindings latency benchmarks part 4 (#1959)
1 parent b162f64 commit 9863ea8

9 files changed

Lines changed: 914 additions & 80 deletions

File tree

benchmarks/cuda_bindings/benchmarks/bench_launch.py

Lines changed: 293 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@
2020
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
2121
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
2222
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)
23+
#define REP32(x, T) REP16(x##0, T) REP16(x##1, T)
24+
#define REP64(x, T) REP32(x##0, T) REP32(x##1, T)
25+
#define REP128(x, T) REP64(x##0, T) REP64(x##1, T)
26+
#define REP256(x, T) REP128(x##0, T) REP128(x##1, T)
27+
28+
template<size_t maxBytes>
29+
struct KernelFunctionParam {
30+
unsigned char p[maxBytes];
31+
};
2332
2433
extern "C" __global__
2534
void small_kernel_16_args(
@@ -29,55 +38,194 @@
2938
REP4(A, int*)
3039
REP8(A, int*))
3140
{ *F = 0; }
41+
42+
extern "C" __global__
43+
void small_kernel_256_args(
44+
ITEM_PARAM(F, int*)
45+
REP1(A, int*)
46+
REP2(A, int*)
47+
REP4(A, int*)
48+
REP8(A, int*)
49+
REP16(A, int*)
50+
REP32(A, int*)
51+
REP64(A, int*)
52+
REP128(A, int*))
53+
{ *F = 0; }
54+
55+
extern "C" __global__
56+
void small_kernel_512_args(
57+
ITEM_PARAM(F, int*)
58+
REP1(A, int*)
59+
REP2(A, int*)
60+
REP4(A, int*)
61+
REP8(A, int*)
62+
REP16(A, int*)
63+
REP32(A, int*)
64+
REP64(A, int*)
65+
REP128(A, int*)
66+
REP256(A, int*))
67+
{ *F = 0; }
68+
69+
extern "C" __global__
70+
void small_kernel_512_bools(
71+
ITEM_PARAM(F, bool)
72+
REP1(A, bool)
73+
REP2(A, bool)
74+
REP4(A, bool)
75+
REP8(A, bool)
76+
REP16(A, bool)
77+
REP32(A, bool)
78+
REP64(A, bool)
79+
REP128(A, bool)
80+
REP256(A, bool))
81+
{ return; }
82+
83+
extern "C" __global__
84+
void small_kernel_512_ints(
85+
ITEM_PARAM(F, int)
86+
REP1(A, int)
87+
REP2(A, int)
88+
REP4(A, int)
89+
REP8(A, int)
90+
REP16(A, int)
91+
REP32(A, int)
92+
REP64(A, int)
93+
REP128(A, int)
94+
REP256(A, int))
95+
{ return; }
96+
97+
extern "C" __global__
98+
void small_kernel_512_doubles(
99+
ITEM_PARAM(F, double)
100+
REP1(A, double)
101+
REP2(A, double)
102+
REP4(A, double)
103+
REP8(A, double)
104+
REP16(A, double)
105+
REP32(A, double)
106+
REP64(A, double)
107+
REP128(A, double)
108+
REP256(A, double))
109+
{ return; }
110+
111+
extern "C" __global__
112+
void small_kernel_512_chars(
113+
ITEM_PARAM(F, char)
114+
REP1(A, char)
115+
REP2(A, char)
116+
REP4(A, char)
117+
REP8(A, char)
118+
REP16(A, char)
119+
REP32(A, char)
120+
REP64(A, char)
121+
REP128(A, char)
122+
REP256(A, char))
123+
{ return; }
124+
125+
extern "C" __global__
126+
void small_kernel_512_longlongs(
127+
ITEM_PARAM(F, long long)
128+
REP1(A, long long)
129+
REP2(A, long long)
130+
REP4(A, long long)
131+
REP8(A, long long)
132+
REP16(A, long long)
133+
REP32(A, long long)
134+
REP64(A, long long)
135+
REP128(A, long long)
136+
REP256(A, long long))
137+
{ return; }
138+
139+
extern "C" __global__
140+
void small_kernel_2048B(KernelFunctionParam<2048> param) {
141+
// Do not touch param to prevent compiler from copying
142+
// the whole structure from const bank to lmem.
143+
}
32144
"""
33145

34146
MODULE = None
35147
EMPTY_KERNEL = None
36148
SMALL_KERNEL = None
37149
KERNEL_16_ARGS = None
150+
KERNEL_256_ARGS = None
151+
KERNEL_512_ARGS = None
152+
KERNEL_512_BOOLS = None
153+
KERNEL_512_INTS = None
154+
KERNEL_512_DOUBLES = None
155+
KERNEL_512_CHARS = None
156+
KERNEL_512_LONGLONGS = None
157+
KERNEL_2048B = None
38158
STREAM = None
39159
FLOAT_PTR = None
40-
INT_PTRS = None
41-
_VAL_PS = None
160+
INT_PTRS_512 = None
161+
_VAL_PS_16 = None
162+
_VAL_PS_512 = None
42163
PACKED_16 = None
164+
PACKED_512 = None
165+
166+
167+
class _Struct2048B(ctypes.Structure):
168+
_fields_ = [("values", ctypes.c_uint8 * 2048)]
169+
170+
171+
STRUCT_2048B = _Struct2048B()
43172

44173

45174
def _ensure_launch_state() -> None:
46-
global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM
47-
global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16
175+
global MODULE, EMPTY_KERNEL, SMALL_KERNEL
176+
global KERNEL_16_ARGS, KERNEL_256_ARGS, KERNEL_512_ARGS
177+
global KERNEL_512_BOOLS, KERNEL_512_INTS, KERNEL_512_DOUBLES
178+
global KERNEL_512_CHARS, KERNEL_512_LONGLONGS, KERNEL_2048B
179+
global STREAM, FLOAT_PTR, INT_PTRS_512
180+
global _VAL_PS_16, _VAL_PS_512, PACKED_16, PACKED_512
48181

49182
if EMPTY_KERNEL is not None:
50183
return
51184

52185
module = compile_and_load(KERNEL_SOURCE)
53186

54-
err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel")
55-
assert_drv(err)
56-
err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel")
57-
assert_drv(err)
58-
err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args")
59-
assert_drv(err)
187+
def get_func(name):
188+
err, func = cuda.cuModuleGetFunction(module, name.encode())
189+
assert_drv(err)
190+
return func
60191

61192
err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
62193
assert_drv(err)
63194

64195
float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float))
65-
int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16))
196+
int_ptrs_512 = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(512))
66197

67-
val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs]
198+
# Pre-pack 16 args
199+
val_ps_16 = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs_512[:16]]
68200
packed_16 = (ctypes.c_void_p * 16)()
69-
for index, value_ptr in enumerate(val_ps):
70-
packed_16[index] = ctypes.addressof(value_ptr)
201+
for i, vp in enumerate(val_ps_16):
202+
packed_16[i] = ctypes.addressof(vp)
203+
204+
# Pre-pack 512 args
205+
val_ps_512 = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs_512]
206+
packed_512 = (ctypes.c_void_p * 512)()
207+
for i, vp in enumerate(val_ps_512):
208+
packed_512[i] = ctypes.addressof(vp)
71209

72210
MODULE = module
73-
EMPTY_KERNEL = empty_kernel
74-
SMALL_KERNEL = small_kernel
75-
KERNEL_16_ARGS = kernel_16_args
211+
EMPTY_KERNEL = get_func("empty_kernel")
212+
SMALL_KERNEL = get_func("small_kernel")
213+
KERNEL_16_ARGS = get_func("small_kernel_16_args")
214+
KERNEL_256_ARGS = get_func("small_kernel_256_args")
215+
KERNEL_512_ARGS = get_func("small_kernel_512_args")
216+
KERNEL_512_BOOLS = get_func("small_kernel_512_bools")
217+
KERNEL_512_INTS = get_func("small_kernel_512_ints")
218+
KERNEL_512_DOUBLES = get_func("small_kernel_512_doubles")
219+
KERNEL_512_CHARS = get_func("small_kernel_512_chars")
220+
KERNEL_512_LONGLONGS = get_func("small_kernel_512_longlongs")
221+
KERNEL_2048B = get_func("small_kernel_2048B")
76222
STREAM = stream
77223
FLOAT_PTR = float_ptr
78-
INT_PTRS = int_ptrs
79-
_VAL_PS = val_ps
224+
INT_PTRS_512 = int_ptrs_512
225+
_VAL_PS_16 = val_ps_16
226+
_VAL_PS_512 = val_ps_512
80227
PACKED_16 = packed_16
228+
PACKED_512 = packed_512
81229

82230

83231
def bench_launch_empty_kernel(loops: int) -> float:
@@ -111,7 +259,7 @@ def bench_launch_16_args(loops: int) -> float:
111259
_fn = cuda.cuLaunchKernel
112260
_kernel = KERNEL_16_ARGS
113261
_stream = STREAM
114-
_args = INT_PTRS
262+
_args = INT_PTRS_512[:16]
115263
_arg_types = (None,) * 16
116264

117265
t0 = time.perf_counter()
@@ -131,3 +279,128 @@ def bench_launch_16_args_pre_packed(loops: int) -> float:
131279
for _ in range(loops):
132280
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
133281
return time.perf_counter() - t0
282+
283+
284+
def bench_launch_256_args(loops: int) -> float:
285+
_ensure_launch_state()
286+
_fn = cuda.cuLaunchKernel
287+
_kernel = KERNEL_256_ARGS
288+
_stream = STREAM
289+
_args = INT_PTRS_512[:256]
290+
_arg_types = (None,) * 256
291+
292+
t0 = time.perf_counter()
293+
for _ in range(loops):
294+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
295+
return time.perf_counter() - t0
296+
297+
298+
def bench_launch_512_args(loops: int) -> float:
299+
_ensure_launch_state()
300+
_fn = cuda.cuLaunchKernel
301+
_kernel = KERNEL_512_ARGS
302+
_stream = STREAM
303+
_args = INT_PTRS_512
304+
_arg_types = (None,) * 512
305+
306+
t0 = time.perf_counter()
307+
for _ in range(loops):
308+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
309+
return time.perf_counter() - t0
310+
311+
312+
def bench_launch_512_args_pre_packed(loops: int) -> float:
313+
_ensure_launch_state()
314+
_fn = cuda.cuLaunchKernel
315+
_kernel = KERNEL_512_ARGS
316+
_stream = STREAM
317+
_packed = PACKED_512
318+
319+
t0 = time.perf_counter()
320+
for _ in range(loops):
321+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, _packed, 0)
322+
return time.perf_counter() - t0
323+
324+
325+
def bench_launch_512_bools(loops: int) -> float:
326+
_ensure_launch_state()
327+
_fn = cuda.cuLaunchKernel
328+
_kernel = KERNEL_512_BOOLS
329+
_stream = STREAM
330+
_args = (True,) * 512
331+
_arg_types = (ctypes.c_bool,) * 512
332+
333+
t0 = time.perf_counter()
334+
for _ in range(loops):
335+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
336+
return time.perf_counter() - t0
337+
338+
339+
def bench_launch_512_ints(loops: int) -> float:
340+
_ensure_launch_state()
341+
_fn = cuda.cuLaunchKernel
342+
_kernel = KERNEL_512_INTS
343+
_stream = STREAM
344+
_args = (123,) * 512
345+
_arg_types = (ctypes.c_int,) * 512
346+
347+
t0 = time.perf_counter()
348+
for _ in range(loops):
349+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
350+
return time.perf_counter() - t0
351+
352+
353+
def bench_launch_512_doubles(loops: int) -> float:
354+
_ensure_launch_state()
355+
_fn = cuda.cuLaunchKernel
356+
_kernel = KERNEL_512_DOUBLES
357+
_stream = STREAM
358+
_args = (1.2345,) * 512
359+
_arg_types = (ctypes.c_double,) * 512
360+
361+
t0 = time.perf_counter()
362+
for _ in range(loops):
363+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
364+
return time.perf_counter() - t0
365+
366+
367+
def bench_launch_512_bytes(loops: int) -> float:
368+
_ensure_launch_state()
369+
_fn = cuda.cuLaunchKernel
370+
_kernel = KERNEL_512_CHARS
371+
_stream = STREAM
372+
_args = (127,) * 512
373+
_arg_types = (ctypes.c_byte,) * 512
374+
375+
t0 = time.perf_counter()
376+
for _ in range(loops):
377+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
378+
return time.perf_counter() - t0
379+
380+
381+
def bench_launch_512_longlongs(loops: int) -> float:
382+
_ensure_launch_state()
383+
_fn = cuda.cuLaunchKernel
384+
_kernel = KERNEL_512_LONGLONGS
385+
_stream = STREAM
386+
_args = (9223372036854775806,) * 512
387+
_arg_types = (ctypes.c_longlong,) * 512
388+
389+
t0 = time.perf_counter()
390+
for _ in range(loops):
391+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
392+
return time.perf_counter() - t0
393+
394+
395+
def bench_launch_2048b(loops: int) -> float:
396+
_ensure_launch_state()
397+
_fn = cuda.cuLaunchKernel
398+
_kernel = KERNEL_2048B
399+
_stream = STREAM
400+
_args = (STRUCT_2048B,)
401+
_arg_types = (None,)
402+
403+
t0 = time.perf_counter()
404+
for _ in range(loops):
405+
_fn(_kernel, 1, 1, 1, 1, 1, 1, 0, _stream, (_args, _arg_types), 0)
406+
return time.perf_counter() - t0

0 commit comments

Comments
 (0)