2626 KernelHelper ,
2727 check_cuda_errors ,
2828 find_cuda_device ,
29+ requirement_not_met ,
2930)
3031
3132conjugate_gradient_multi_block_cg = """\
177178"""
178179
179180
def gen_tridiag(row_offsets, col_indices, values, n, nz):
    """Fill CSR arrays with a random symmetric tridiagonal n x n matrix.

    Diagonal entries are drawn as random() + 10.0 and off-diagonals as
    random() in [0, 1), so the matrix is diagonally dominant and hence
    symmetric positive definite -- the precondition conjugate gradient
    needs to converge.

    Parameters
    ----------
    row_offsets : sequence of length n + 1, written in place (CSR row ptr).
    col_indices : sequence of length nz, written in place (CSR columns).
    values      : sequence of length nz, written in place (CSR values).
    n           : matrix dimension.
    nz          : nonzero count; callers must pass (n - 2) * 3 + 4.
    """
    # Row 0 holds two nonzeros: the diagonal (column 0) and the
    # super-diagonal (column 1).
    row_offsets[0] = 0
    col_indices[0] = 0
    # BUGFIX: this was 0, which stored both row-0 entries in column 0 and
    # left M[0][1] unset even though row 1 mirrors values[1] into M[1][0]
    # below -- breaking the symmetry CG relies on.  The super-diagonal of
    # row 0 lives in column 1 (matches the CUDA C sample's genTridiag).
    col_indices[1] = 1

    values[0] = float(random()) + 10.0
    values[1] = float(random())

    for row_idx in range(1, n):
        # Rows 1..n-2 carry three nonzeros each; row 0 carried only two,
        # so row 1's offset is fixed at 2.
        if row_idx > 1:
            row_offsets[row_idx] = row_offsets[row_idx - 1] + 3
        else:
            row_offsets[1] = 2

        start = (row_idx - 1) * 3 + 2
        col_indices[start] = row_idx - 1  # sub-diagonal
        col_indices[start + 1] = row_idx  # diagonal

        # Last row has no super-diagonal.
        if row_idx < n - 1:
            col_indices[start + 2] = row_idx + 1

        # Copy the previous row's super-diagonal value into this row's
        # sub-diagonal slot to keep the matrix symmetric, then draw a
        # fresh dominant diagonal entry.
        values[start] = values[start - 1]
        values[start + 1] = float(random()) + 10.0

        if row_idx < n - 1:
            values[start + 2] = float(random())

    row_offsets[n] = nz
207208
208209
# Block width used for every kernel launch in this sample.
THREADS_PER_BLOCK = 512

# Human-readable sample name, used in skip messages and log output.
s_sd_kname = "conjugateGradientMultiBlockCG"

# Platforms (platform.system() / platform.machine() values) on which this
# cooperative-launch sample is known not to run.
UNSUPPORTED_SYSTEMS = {"Darwin", "QNX"}
UNSUPPORTED_MACHINES = {"armv7l"}
211214
212215
213216def main ():
214217 tol = 1e-5
215218
216- import pytest
219+ system_name = platform .system ()
220+ if system_name in UNSUPPORTED_SYSTEMS :
221+ requirement_not_met (f"{ s_sd_kname } is not supported on { system_name } " )
217222
218- # WAIVE: Due to bug in NVRTC
219- return
220-
221- if platform .system () == "Darwin" :
222- pytest .skip ("conjugateGradientMultiBlockCG is not supported on Mac OSX" )
223-
224- if platform .machine () == "armv7l" :
225- pytest .skip ("conjugateGradientMultiBlockCG is not supported on ARMv7" )
226-
227- if platform .machine () == "qnx" :
228- pytest .skip ("conjugateGradientMultiBlockCG is not supported on QNX" )
223+ machine_name = platform .machine ()
224+ if machine_name in UNSUPPORTED_MACHINES :
225+ requirement_not_met (f"{ s_sd_kname } is not supported on { machine_name } " )
229226
230227 # This will pick the best possible CUDA capable device
231228 dev_id = find_cuda_device ()
232229 device_prop = check_cuda_errors (cudart .cudaGetDeviceProperties (dev_id ))
233230
234231 if not device_prop .managedMemory :
235- pytest . skip ("Unified Memory not supported on this device" )
232+ requirement_not_met ("Unified Memory not supported on this device" )
236233
237234 # This sample requires being run on a device that supports Cooperative Kernel
238235 # Launch
239236 if not device_prop .cooperativeLaunch :
240- pytest . skip (f"Selected GPU { dev_id } does not support Cooperative Kernel Launch" )
237+ requirement_not_met (f"Selected GPU { dev_id } does not support Cooperative Kernel Launch" )
241238
242239 # Statistics about the GPU device
243240 print (
244- f"> GPU device has { device_prop .multiProcessorCount :%d} Multi-Processors, SM { device_prop .major :%d} .{ device_prop .minor :%d} compute capabilities\n "
241+ f"> GPU device has { device_prop .multiProcessorCount } Multi-Processors, "
242+ f"SM { device_prop .major } .{ device_prop .minor } compute capabilities\n "
245243 )
246244
247245 # Get kernel
@@ -267,7 +265,7 @@ def main():
267265 x_local = (ctypes .c_float * n ).from_address (x )
268266 rhs_local = (ctypes .c_float * n ).from_address (rhs )
269267 dot_result_local = (ctypes .c_double ).from_address (dot_result )
270- dot_result_local = 0
268+ dot_result_local . value = 0. 0
271269
272270 # temp memory for CG
273271 r = check_cuda_errors (cudart .cudaMallocManaged (np .dtype (np .float32 ).itemsize * n , cudart .cudaMemAttachGlobal ))
@@ -280,9 +278,9 @@ def main():
280278 start = check_cuda_errors (cudart .cudaEventCreate ())
281279 stop = check_cuda_errors (cudart .cudaEventCreate ())
282280
283- for i in range (n ):
284- r_local [i ] = rhs_local [i ] = 1.0
285- x_local [i ] = 0.0
281+ for idx in range (n ):
282+ r_local [idx ] = rhs_local [idx ] = 1.0
283+ x_local [idx ] = 0.0
286284
287285 kernel_args_value = (i , j , val , x , ax , p , r , dot_result , nz , n , tol )
288286 kernel_args_types = (
@@ -300,7 +298,7 @@ def main():
300298 )
301299 kernel_args = (kernel_args_value , kernel_args_types )
302300
303- s_mem_size = np .dtype (np .float64 ).itemsize * ((THREADS_PER_BLOCK / 32 ) + 1 )
301+ s_mem_size = np .dtype (np .float64 ).itemsize * ((THREADS_PER_BLOCK // 32 ) + 1 )
304302 num_threads = THREADS_PER_BLOCK
305303 num_blocks_per_sm = check_cuda_errors (
306304 cuda .cuOccupancyMaxActiveBlocksPerMultiprocessor (_gpu_conjugate_gradient , num_threads , s_mem_size )
@@ -325,7 +323,7 @@ def main():
325323 dim_block .x ,
326324 dim_block .y ,
327325 dim_block .z ,
328- 0 ,
326+ s_mem_size ,
329327 0 ,
330328 kernel_args ,
331329 )
@@ -334,16 +332,17 @@ def main():
334332 check_cuda_errors (cudart .cudaDeviceSynchronize ())
335333
336334 time = check_cuda_errors (cudart .cudaEventElapsedTime (start , stop ))
337- print (f"GPU Final, residual = { math .sqrt (dot_result_local ):e} , kernel execution time = { time :f} ms" )
335+ residual = math .sqrt (dot_result_local .value )
336+ print (f"GPU Final, residual = { residual :e} , kernel execution time = { time :f} ms" )
338337
339338 err = 0.0
340- for i in range (n ):
339+ for row_idx in range (n ):
341340 rsum = 0.0
342341
343- for j in range (i_local [i ], i_local [i + 1 ]):
344- rsum += val_local [j ] * x_local [j_local [j ]]
342+ for elem_idx in range (i_local [row_idx ], i_local [row_idx + 1 ]):
343+ rsum += val_local [elem_idx ] * x_local [j_local [elem_idx ]]
345344
346- diff = math .fabs (rsum - rhs_local [i ])
345+ diff = math .fabs (rsum - rhs_local [row_idx ])
347346
348347 if diff > err :
349348 err = diff
@@ -361,7 +360,7 @@ def main():
361360 check_cuda_errors (cudart .cudaEventDestroy (stop ))
362361
363362 print (f"Test Summary: Error amount = { err :f} " )
364- if math . sqrt ( dot_result_local ) >= tol :
363+ if residual >= tol :
365364 print ("conjugateGradientMultiBlockCG FAILED" , file = sys .stderr )
366365 sys .exit (1 )
367366
0 commit comments