77
88import numpy as np
99
10- try :
11- import cupy as cp
12- except ImportError :
13- cp = np
def _get_cupy():
    """Return the ``cupy`` module if it can be imported, otherwise ``None``.

    The outcome of the first import attempt is remembered on the function
    object, so later calls do not repeat the import. This matters mostly when
    CuPy is absent: Python does not record failed imports in ``sys.modules``,
    so every uncached call would search the import path again.

    :returns: The imported ``cupy`` module, or ``None`` when importing it
        raises ``ImportError``.
    """
    try:
        # Fast path: reuse the result of an earlier probe (which may be None).
        return _get_cupy._cached_module
    except AttributeError:
        # First call: nothing cached yet, fall through to the real import.
        pass
    try:
        import cupy as _cp
    except ImportError:
        _cp = None
    # Cache both the success and the failure case.
    _get_cupy._cached_module = _cp
    return _cp
1416
1517import kernel_tuner .util as util
1618from kernel_tuner .accuracy import Tunable
17- from kernel_tuner .backends .compiler import CompilerFunctions
18- from kernel_tuner .backends .cupy import CupyFunctions
19- from kernel_tuner .backends .hip import HipFunctions
20- from kernel_tuner .backends .hypertuner import HypertunerFunctions
21- from kernel_tuner .backends .nvcuda import CudaFunctions
22- from kernel_tuner .backends .opencl import OpenCLFunctions
23- from kernel_tuner .backends .pycuda import PyCudaFunctions
24- from kernel_tuner .observers .nvml import NVMLObserver
2519from kernel_tuner .observers .observer import ContinuousObserver , OutputObserver , PrologueObserver
2620from kernel_tuner .observers .tegra import TegraObserver
2721
3529except ImportError :
3630 DeviceArray = Exception # using Exception here as a type that will never be among kernel arguments
3731
32+
3833_KernelInstance = namedtuple (
3934 "_KernelInstance" ,
4035 [
@@ -272,27 +267,31 @@ def __init__(
272267 logging .debug ("DeviceInterface instantiated, lang=%s" , lang )
273268
274269 if lang .upper () == "CUDA" :
270+ from kernel_tuner .backends .pycuda import PyCudaFunctions
275271 dev = PyCudaFunctions (
276272 device ,
277273 compiler_options = compiler_options ,
278274 iterations = iterations ,
279275 observers = observers ,
280276 )
281277 elif lang .upper () == "CUPY" :
278+ from kernel_tuner .backends .cupy import CupyFunctions
282279 dev = CupyFunctions (
283280 device ,
284281 compiler_options = compiler_options ,
285282 iterations = iterations ,
286283 observers = observers ,
287284 )
288285 elif lang .upper () == "NVCUDA" :
286+ from kernel_tuner .backends .nvcuda import CudaFunctions
289287 dev = CudaFunctions (
290288 device ,
291289 compiler_options = compiler_options ,
292290 iterations = iterations ,
293291 observers = observers ,
294292 )
295293 elif lang .upper () == "OPENCL" :
294+ from kernel_tuner .backends .opencl import OpenCLFunctions
296295 dev = OpenCLFunctions (
297296 device ,
298297 platform ,
@@ -301,20 +300,23 @@ def __init__(
301300 observers = observers ,
302301 )
303302 elif lang .upper () in ["C" , "FORTRAN" ]:
303+ from kernel_tuner .backends .compiler import CompilerFunctions
304304 dev = CompilerFunctions (
305305 compiler = compiler ,
306306 compiler_options = compiler_options ,
307307 iterations = iterations ,
308308 observers = observers ,
309309 )
310310 elif lang .upper () == "HIP" :
311+ from kernel_tuner .backends .hip import HipFunctions
311312 dev = HipFunctions (
312313 device ,
313314 compiler_options = compiler_options ,
314315 iterations = iterations ,
315316 observers = observers ,
316317 )
317318 elif lang .upper () == "HYPERTUNER" :
319+ from kernel_tuner .backends .hypertuner import HypertunerFunctions
318320 dev = HypertunerFunctions (
319321 iterations = iterations ,
320322 compiler_options = compiler_options
@@ -333,8 +335,12 @@ def __init__(
333335 self .output_observers = []
334336 self .prologue_observers = []
335337 if observers :
338+ try :
339+ from kernel_tuner .observers .nvml import NVMLObserver as _NVMLObserver
340+ except ImportError :
341+ _NVMLObserver = None
336342 for obs in observers :
337- if isinstance (obs , NVMLObserver ):
343+ if _NVMLObserver is not None and isinstance (obs , _NVMLObserver ):
338344 self .nvml = obs .nvml
339345 self .use_nvml = True
340346 if isinstance (obs , TegraObserver ):
@@ -500,7 +506,12 @@ def check_kernel_output(
500506
501507 should_sync = [answer [i ] is not None for i , arg in enumerate (instance .arguments )]
502508 else :
503- should_sync = [isinstance (arg , (np .ndarray , cp .ndarray , torch .Tensor , DeviceArray )) for arg in instance .arguments ]
509+ cp = _get_cupy ()
510+ cupy_ndarray = (cp .ndarray ,) if cp is not None else ()
511+ should_sync = [
512+ isinstance (arg , (np .ndarray , torch .Tensor , DeviceArray ) + cupy_ndarray )
513+ for arg in instance .arguments
514+ ]
504515
505516 # re-copy original contents of output arguments to GPU memory, to overwrite any changes
506517 # by earlier kernel runs
@@ -516,7 +527,9 @@ def check_kernel_output(
516527 result_host = []
517528 for i , arg in enumerate (instance .arguments ):
518529 if should_sync [i ]:
519- if isinstance (arg , (np .ndarray , cp .ndarray )):
530+ cp = _get_cupy ()
531+ cupy_ndarray = (cp .ndarray ,) if cp is not None else ()
532+ if isinstance (arg , (np .ndarray ,) + cupy_ndarray ):
520533 result_host .append (np .zeros_like (arg ))
521534 self .dev .memcpy_dtoh (result_host [- 1 ], gpu_args [i ])
522535 elif isinstance (arg , torch .Tensor ) and isinstance (answer [i ], torch .Tensor ):
@@ -790,8 +803,10 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
790803 # for each element in the argument list, check if the types match
791804 for i , arg in enumerate (instance .arguments ):
792805 if answer [i ] is not None : # skip None elements in the answer list
793- if isinstance (answer [i ], (np .ndarray , cp .ndarray )) and isinstance (
794- arg , (np .ndarray , cp .ndarray )
806+ cp = _get_cupy ()
807+ cupy_ndarray = (cp .ndarray ,) if cp is not None else ()
808+ if isinstance (answer [i ], (np .ndarray ,) + cupy_ndarray ) and isinstance (
809+ arg , (np .ndarray ,) + cupy_ndarray
795810 ):
796811 if not np .can_cast (arg .dtype , answer [i ].dtype ):
797812 raise TypeError (
@@ -840,7 +855,9 @@ def _default_verify_function(instance, answer, result_host, atol, verbose):
840855 )
841856 else :
842857 # either answer[i] and argument have different types or answer[i] is not a numpy type
843- if not isinstance (answer [i ], (np .ndarray , cp .ndarray , torch .Tensor )) or not isinstance (
858+ cp = _get_cupy ()
859+ cupy_ndarray = (cp .ndarray ,) if cp is not None else ()
860+ if not isinstance (answer [i ], (np .ndarray , torch .Tensor ) + cupy_ndarray ) or not isinstance (
844861 answer [i ], np .number
845862 ):
846863 raise TypeError (
@@ -865,7 +882,8 @@ def _flatten(a):
865882 if expected is not None :
866883 result = _ravel (result_host [i ])
867884 expected = _flatten (expected )
868- if any ([isinstance (array , cp .ndarray ) for array in [expected , result ]]):
885+ cp = _get_cupy ()
886+ if cp is not None and any ([isinstance (array , cp .ndarray ) for array in [expected , result ]]):
869887 output_test = cp .allclose (expected , result , atol = atol )
870888 elif isinstance (expected , torch .Tensor ) and isinstance (result , torch .Tensor ):
871889 output_test = torch .allclose (expected , result , atol = atol )
0 commit comments