2323import pytest
2424
2525import dpctl
26- import dpctl .memory as dpctl_mem
26+ import dpctl .memory as dpm
2727import dpctl .program as dpctl_prog
28- import dpctl .tensor as dpt
2928from dpctl ._sycl_queue import kernel_arg_type
3029
3130
3231@pytest .mark .parametrize (
3332 "ctype_str,dtype,ctypes_ctor" ,
3433 [
35- ("short" , dpt .dtype ("i2" ), ctypes .c_short ),
36- ("int" , dpt .dtype ("i4" ), ctypes .c_int ),
37- ("unsigned int" , dpt .dtype ("u4" ), ctypes .c_uint ),
38- ("long" , dpt .dtype (np .longlong ), ctypes .c_longlong ),
39- ("unsigned long" , dpt .dtype (np .ulonglong ), ctypes .c_ulonglong ),
40- ("float" , dpt .dtype ("f4" ), ctypes .c_float ),
41- ("double" , dpt .dtype ("f8" ), ctypes .c_double ),
34+ ("short" , np .dtype ("i2" ), ctypes .c_short ),
35+ ("int" , np .dtype ("i4" ), ctypes .c_int ),
36+ ("unsigned int" , np .dtype ("u4" ), ctypes .c_uint ),
37+ ("long" , np .dtype (np .longlong ), ctypes .c_longlong ),
38+ ("unsigned long" , np .dtype (np .ulonglong ), ctypes .c_ulonglong ),
39+ ("float" , np .dtype ("f4" ), ctypes .c_float ),
40+ ("double" , np .dtype ("f8" ), ctypes .c_double ),
4241 ],
4342)
4443def test_create_program_from_source (ctype_str , dtype , ctypes_ctor ):
4544 try :
4645 q = dpctl .SyclQueue ("opencl" , property = "enable_profiling" )
4746 except dpctl .SyclQueueCreationError :
4847 pytest .skip ("OpenCL queue could not be created" )
49- if dtype == dpt .dtype ("f8" ) and q .sycl_device .has_aspect_fp64 is False :
48+ if dtype == np .dtype ("f8" ) and q .sycl_device .has_aspect_fp64 is False :
5049 pytest .skip (
5150 "Device does not support double precision floating point type"
5251 )
@@ -66,35 +65,42 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
6665 n_elems = 1024 * 512
6766 lws = 128
6867 if dtype .kind in "ui" :
69- n_elems = min (n_elems , dpt .iinfo (dtype ).max )
68+ n_elems = min (n_elems , np .iinfo (dtype ).max )
7069 n_elems = (n_elems // lws ) * lws
71- a = dpt .arange (n_elems , dtype = dtype , sycl_queue = q )
72- b = dpt .arange (n_elems , stop = 0 , step = - 1 , dtype = dtype , sycl_queue = q )
73- c = dpt .zeros (n_elems , dtype = dtype , sycl_queue = q )
70+ a = np .arange (n_elems , dtype = dtype )
71+ b = np .arange (n_elems , stop = 0 , step = - 1 , dtype = dtype )
72+ c = np .zeros (n_elems , dtype = dtype )
73+
74+ a_usm = dpm .MemoryUSMDevice (a .nbytes , queue = q )
75+ b_usm = dpm .MemoryUSMDevice (b .nbytes , queue = q )
76+ c_usm = dpm .MemoryUSMDevice (c .nbytes , queue = q )
77+
78+ ev1 = q .memcpy_async (dest = a_usm , src = a , count = a .nbytes )
79+ ev2 = q .memcpy_async (dest = b_usm , src = b , count = b .nbytes )
80+
81+ dpctl .SyclEvent .wait_for ([ev1 , ev2 ])
7482
7583 d = 2
76- args = [a . usm_data , b . usm_data , c . usm_data , ctypes_ctor (d )]
84+ args = [a_usm , b_usm , c_usm , ctypes_ctor (d )]
7785
7886 assert n_elems % lws == 0
7987
80- b_np = dpt .asnumpy (b )
81- a_np = dpt .asnumpy (a )
82-
8388 for r in (
8489 [
8590 n_elems ,
8691 ],
8792 [2 , n_elems ],
8893 [2 , 2 , n_elems ],
8994 ):
90- c [:] = 0
95+ c_usm . memset ()
9196 timer = dpctl .SyclTimer ()
9297 with timer (q ):
9398 q .submit (axpyKernel , args , r ).wait ()
94- ref_c = a_np * np .array (d , dtype = dtype ) + b_np
99+ ref_c = a * np .array (d , dtype = dtype ) + b
95100 host_dt , device_dt = timer .dt
96101 assert type (host_dt ) is float and type (device_dt ) is float
97- assert np .allclose (dpt .asnumpy (c ), ref_c ), "Failed for {}" .format (r )
102+ q .memcpy (c , c_usm , c .nbytes )
103+ assert np .allclose (c , ref_c ), "Failed for {}" .format (r )
98104
99105 for gr , lr in (
100106 (
@@ -106,16 +112,15 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
106112 ([2 , n_elems ], [2 , lws // 2 ]),
107113 ([2 , 2 , n_elems ], [2 , 2 , lws // 4 ]),
108114 ):
109- c [:] = 0
115+ c_usm . memset ()
110116 timer = dpctl .SyclTimer ()
111117 with timer (q ):
112118 q .submit (axpyKernel , args , gr , lr , [dpctl .SyclEvent ()]).wait ()
113- ref_c = a_np * np .array (d , dtype = dtype ) + b_np
119+ ref_c = a * np .array (d , dtype = dtype ) + b
114120 host_dt , device_dt = timer .dt
115121 assert type (host_dt ) is float and type (device_dt ) is float
116- assert np .allclose (dpt .asnumpy (c ), ref_c ), "Failed for {}, {}" .formatg (
117- r , lr
118- )
122+ q .memcpy (c , c_usm , c .nbytes )
123+ assert np .allclose (c , ref_c ), "Failed for {}, {}" .format (gr , lr )
119124
120125
121126def test_submit_async ():
@@ -124,33 +129,41 @@ def test_submit_async():
124129 except dpctl .SyclQueueCreationError :
125130 pytest .skip ("OpenCL queue could not be created" )
126131 oclSrc = (
127- "kernel void kern1(global unsigned int *res, unsigned int mod) {"
132+ "kernel void kern1("
133+ " global unsigned int *res_base, ulong res_off, unsigned int mod) {"
128134 " size_t unused_sum = 0;"
129135 " size_t i = 0; "
130136 " for (i = 0; i < 4000; i++) { "
131137 " unused_sum += i;"
132138 " } "
139+ " global unsigned int *res = res_base + (size_t)res_off;"
133140 " size_t index = get_global_id(0);"
134141 " int ri = (index % mod);"
135142 " res[index] = (ri * ri) % mod;"
136143 "}"
137144 " "
138- "kernel void kern2(global unsigned int *res, unsigned int mod) {"
145+ "kernel void kern2("
146+ " global unsigned int *res_base, ulong res_off, unsigned int mod) {"
139147 " size_t unused_sum = 0;"
140148 " size_t i = 0; "
141149 " for (i = 0; i < 4000; i++) { "
142150 " unused_sum += i;"
143151 " } "
152+ " global unsigned int *res = res_base + (size_t)res_off;"
144153 " size_t index = get_global_id(0);"
145154 " int ri = (index % mod);"
146155 " int ri2 = (ri * ri) % mod;"
147156 " res[index] = (ri2 * ri) % mod;"
148157 "}"
149158 " "
150159 "kernel void kern3("
151- " global unsigned int *res, global unsigned int *arg1, "
152- " global unsigned int *arg2)"
160+ " global unsigned int *res_base, ulong res_off,"
161+ " global unsigned int *arg1_base, ulong arg1_off,"
162+ " global unsigned int *arg2_base, ulong arg2_off)"
153163 "{"
164+ " global unsigned int *res = res_base + (size_t)res_off;"
165+ " global unsigned int *arg1 = arg1_base + (size_t)arg1_off;"
166+ " global unsigned int *arg2 = arg2_base + (size_t)arg2_off;"
154167 " size_t index = get_global_id(0);"
155168 " size_t i = 0; "
156169 " size_t unused_sum = 0;"
@@ -177,10 +190,10 @@ def test_submit_async():
177190 n = f * 1024
178191 n_alloc = 4 * n
179192
180- X = dpt .empty ((3 , n_alloc ), dtype = "u4" , usm_type = "device" , sycl_queue = q )
181- first_row = dpctl_mem . as_usm_memory ( X [ 0 ] )
182- second_row = dpctl_mem . as_usm_memory ( X [ 1 ])
183- third_row = dpctl_mem . as_usm_memory ( X [ 2 ] )
193+ x = np .empty ((3 , n_alloc ), dtype = "u4" )
194+ x_usm = dpm . MemoryUSMDevice ( x . nbytes , queue = q )
195+
196+ e1 = q . memcpy_async ( dest = x_usm , src = x , count = x . nbytes )
184197
185198 p1 , p2 = 17 , 27
186199
@@ -189,26 +202,39 @@ def test_submit_async():
189202 e1 = q .submit_async (
190203 kern1Kernel ,
191204 [
192- first_row ,
205+ x_usm ,
206+ ctypes .c_ulonglong (0 ),
193207 ctypes .c_uint (p1 ),
194208 ],
195209 [
196210 n ,
197211 ],
212+ None ,
213+ [e1 ],
198214 )
199215 e2 = q .submit_async (
200216 kern2Kernel ,
201217 [
202- second_row ,
218+ x_usm ,
219+ ctypes .c_ulonglong (n_alloc ),
203220 ctypes .c_uint (p2 ),
204221 ],
205222 [
206223 n ,
207224 ],
225+ None ,
226+ [e1 ],
208227 )
209228 e3 = q .submit_async (
210229 kern3Kernel ,
211- [third_row , first_row , second_row ],
230+ [
231+ x_usm ,
232+ ctypes .c_ulonglong (2 * n_alloc ),
233+ x_usm ,
234+ ctypes .c_ulonglong (0 ),
235+ x_usm ,
236+ ctypes .c_ulonglong (n_alloc ),
237+ ],
212238 [
213239 n ,
214240 ],
@@ -218,9 +244,7 @@ def test_submit_async():
218244 e3_st = e3 .execution_status
219245 e2_st = e2 .execution_status
220246 e1_st = e1 .execution_status
221- ht_e = q ._submit_keep_args_alive (
222- [first_row , second_row , third_row ], [e1 , e2 , e3 ]
223- )
247+ ht_e = q ._submit_keep_args_alive ([x_usm ], [e1 , e2 , e3 ])
224248 are_complete = [
225249 e == status_complete
226250 for e in (
@@ -240,14 +264,13 @@ def test_submit_async():
240264 break
241265
242266 assert async_detected , "No evidence of async submission detected, unlucky?"
243- Xnp = dpt . asnumpy ( X )
244- Xref = np .empty ((3 , n ), dtype = "u4" )
267+ q . memcpy ( dest = x , src = x_usm , count = x . nbytes )
268+ x_ref = np .empty ((3 , n ), dtype = "u4" )
245269 for i in range (n ):
246- Xref [0 , i ] = (i * i ) % p1
247- Xref [1 , i ] = (i * i * i ) % p2
248- Xref [2 , i ] = min (Xref [0 , i ], Xref [1 , i ])
249-
250- assert np .array_equal (Xnp [:, :n ], Xref [:, :n ])
270+ x_ref [0 , i ] = (i * i ) % p1
271+ x_ref [1 , i ] = (i * i * i ) % p2
272+ x_ref [2 , i ] = min (x_ref [0 , i ], x_ref [1 , i ])
273+ assert np .array_equal (x [:, :n ], x_ref [:, :n ])
251274
252275
253276def _check_kernel_arg_type_instance (kati ):
@@ -303,19 +326,20 @@ def test_submit_local_accessor_arg():
303326 krn = prog .get_sycl_kernel ("_ZTS14SyclKernel_SLMIlE" )
304327 lws = 32
305328 gws = lws * 10
306- x = dpt .ones (gws , dtype = "i8" )
307- x .sycl_queue .wait ()
329+ x = np .ones (gws , dtype = "i8" )
330+ res = np .empty_like (x )
331+ x_usm = dpm .MemoryUSMDevice (x .nbytes , queue = q )
332+ q .memcpy (dest = x_usm , src = x , count = x .nbytes )
308333 try :
309334 e = q .submit (
310335 krn ,
311- [x . usm_data , dpctl .LocalAccessor ("i8" , (lws ,))],
336+ [x_usm , dpctl .LocalAccessor ("i8" , (lws ,))],
312337 [gws ],
313338 [lws ],
314339 )
315340 e .wait ()
316341 except dpctl ._sycl_queue .SyclKernelSubmitError :
317342 pytest .skip (f"Kernel submission failed for device { q .sycl_device } " )
318- expected = dpt .arange (1 , x .size + 1 , dtype = x .dtype , device = x .device ) * (
319- 2 * lws
320- )
321- assert dpt .all (x == expected )
343+ q .memcpy (dest = res , src = x_usm , count = x .nbytes )
344+ expected = np .arange (1 , x .size + 1 , dtype = x .dtype ) * (2 * lws )
345+ assert np .all (res == expected )
0 commit comments