KernelTuner
diff --git a/‎doc/requirements.txt‎
Lines changed: 8 additions & 8 deletions b/‎doc/requirements.txt‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎doc/requirements_test.txt‎
Lines changed: 6 additions & 6 deletions b/‎doc/requirements_test.txt‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎kernel_tuner/accuracy.py‎
Lines changed: 1 addition & 1 deletion b/‎kernel_tuner/accuracy.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎kernel_tuner/backends/cupy.py‎
Lines changed: 2 additions & 1 deletion b/‎kernel_tuner/backends/cupy.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎kernel_tuner/backends/nvcuda.py‎
Lines changed: 38 additions & 17 deletions b/‎kernel_tuner/backends/nvcuda.py‎
Lines changed: 38 additions & 17 deletions
diff --git a/‎kernel_tuner/interface.py‎
Lines changed: 7 additions & 1 deletion b/‎kernel_tuner/interface.py‎
Lines changed: 7 additions & 1 deletion
@@ -16,7 +16,7 @@ domdf-python-tools==3.10.0 ; python_version >= "3.9" and python_version < "3.15"
 exceptiongroup==1.2.2 ; python_version >= "3.9" and python_version < "3.11"
 executing==2.2.0 ; python_version >= "3.9" and python_version < "3.15"
 fastjsonschema==2.21.1 ; python_version >= "3.9" and python_version < "3.15"
-idna==3.10 ; python_version >= "3.9" and python_version < "3.15"
+idna==3.15 ; python_version >= "3.9" and python_version < "3.15"
 imagesize==1.4.1 ; python_version >= "3.9" and python_version < "3.15"
 importlib-metadata==8.6.1 ; python_version >= "3.9" and python_version < "3.10"
 iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "3.15"
@@ -31,10 +31,10 @@ jupyter-core==5.7.2 ; python_version >= "3.9" and python_version < "3.15"
 jupyterlab-pygments==0.3.0 ; python_version >= "3.9" and python_version < "3.15"
 markupsafe==2.1.5 ; python_version >= "3.9" and python_version < "3.15"
 matplotlib-inline==0.1.7 ; python_version >= "3.9" and python_version < "3.15"
-mistune==3.1.2 ; python_version >= "3.9" and python_version < "3.15"
+mistune==3.2.1 ; python_version >= "3.9" and python_version < "3.15"
 natsort==8.4.0 ; python_version >= "3.9" and python_version < "3.15"
 nbclient==0.10.2 ; python_version >= "3.9" and python_version < "3.15"
-nbconvert==7.17.0 ; python_version >= "3.9" and python_version < "3.15"
+nbconvert==7.17.1 ; python_version >= "3.9" and python_version < "3.15"
 nbformat==5.10.4 ; python_version >= "3.9" and python_version < "3.15"
 nbsphinx==0.9.7 ; python_version >= "3.9" and python_version < "3.15"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.15"
@@ -49,15 +49,15 @@ prompt-toolkit==3.0.50 ; python_version >= "3.9" and python_version < "3.15"
 ptyprocess==0.7.0 ; python_version >= "3.9" and python_version < "3.15" and sys_platform != "win32"
 pure-eval==0.2.3 ; python_version >= "3.9" and python_version < "3.15"
 pycparser==2.22 ; python_version >= "3.9" and python_version < "3.15" and implementation_name == "pypy"
-pygments==2.19.1 ; python_version >= "3.9" and python_version < "3.15"
-pytest==8.3.5 ; python_version >= "3.9" and python_version < "3.15"
+pygments==2.20.0 ; python_version >= "3.9" and python_version < "3.15"
+pytest==9.0.3 ; python_version >= "3.9" and python_version < "3.15"
 python-constraint2==2.1.0 ; python_version >= "3.9" and python_version < "3.15"
 python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_version < "3.15"
 pytz==2025.1 ; python_version >= "3.9" and python_version < "3.15"
 pywin32==308 ; sys_platform == "win32" and platform_python_implementation != "PyPy" and python_version >= "3.9" and python_version < "3.15"
 pyzmq==26.2.1 ; python_version >= "3.9" and python_version < "3.15"
 referencing==0.36.2 ; python_version >= "3.9" and python_version < "3.15"
-requests==2.32.4 ; python_version >= "3.9" and python_version < "3.15"
+requests==2.33.0 ; python_version >= "3.9" and python_version < "3.15"
 rpds-py==0.23.1 ; python_version >= "3.9" and python_version < "3.15"
 scikit-learn==1.6.1 ; python_version >= "3.9" and python_version < "3.15"
 scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.15"
@@ -78,11 +78,11 @@ stack-data==0.6.3 ; python_version >= "3.9" and python_version < "3.15"
 threadpoolctl==3.5.0 ; python_version >= "3.9" and python_version < "3.15"
 tinycss2==1.4.0 ; python_version >= "3.9" and python_version < "3.15"
 tomli==2.2.1 ; python_version >= "3.9" and python_version < "3.15"
-tornado==6.5.1 ; python_version >= "3.9" and python_version < "3.15"
+tornado==6.5.5 ; python_version >= "3.9" and python_version < "3.15"
 traitlets==5.14.3 ; python_version >= "3.9" and python_version < "3.15"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.15"
 tzdata==2025.1 ; python_version >= "3.9" and python_version < "3.15"
-urllib3==2.6.3 ; python_version >= "3.9" and python_version < "3.15"
+urllib3==2.7.0 ; python_version >= "3.9" and python_version < "3.15"
 wcwidth==0.2.13 ; python_version >= "3.9" and python_version < "3.15"
 webencodings==0.5.1 ; python_version >= "3.9" and python_version < "3.15"
 xmltodict==0.14.2 ; python_version >= "3.9" and python_version < "3.15"
 
@@ -311,9 +311,9 @@ ptyprocess==0.7.0 ; python_version >= "3.10" and python_version < "4" and (os_na
 pure-eval==0.2.3 ; python_version >= "3.10" and python_version < "4" \
     --hash=sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 \
     --hash=sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42
-pygments==2.19.1 ; python_version >= "3.10" and python_version < "4" \
-    --hash=sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f \
-    --hash=sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c
+pygments==2.20.0 ; python_version >= "3.10" and python_version < "4" \
+    --hash=sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f \
+    --hash=sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176
 pyproject-hooks==1.2.0 ; python_version >= "3.10" and python_version < "4" \
     --hash=sha256:1e859bd5c40fae9448642dd871adf459e5e2084186e8d2c2a79a824c970da1f8 \
     --hash=sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913
@@ -323,9 +323,9 @@ pytest-cov==5.0.0 ; python_version >= "3.10" and python_version < "4" \
 pytest-timeout==2.3.1 ; python_version >= "3.10" and python_version < "4" \
     --hash=sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9 \
     --hash=sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e
-pytest==8.3.5 ; python_version >= "3.10" and python_version < "4" \
-    --hash=sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820 \
-    --hash=sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845
+pytest==9.0.3 ; python_version >= "3.10" and python_version < "4" \
+    --hash=sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9 \
+    --hash=sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c
 python-constraint2==2.2.2 ; python_version >= "3.10" and python_version < "4" \
     --hash=sha256:02dcdf6d6f2d403b6304dddb242ef1b3db791600c7b8f8cd895dc3f87509bc6e \
     --hash=sha256:0951ff7ee0d549037ed078ecf828f33003730531a7231f9773c3674553362efa \
 
@@ -96,7 +96,7 @@ def _find_bfloat16_if_available():
             + "please install either the package `ml_dtypes`, `jax`, or `tensorflow`"
         )
 
-    return None
+    return dtype
 
 
 def _to_float_dtype(x: str) -> np.dtype:
 
@@ -10,6 +10,7 @@
 # and run tests without cupy installed
 try:
     import cupy as cp
+    from cupyx import get_runtime_info
 except ImportError:
     cp = None
 
@@ -68,7 +69,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
 
         # collect environment information
         env = dict()
-        cupy_info = str(cp._cupyx.get_runtime_info()).split("\n")[:-1]
+        cupy_info = str(get_runtime_info()).split("\n")[:-1]
         info_dict = {
             s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
         }
 
@@ -2,11 +2,12 @@
 from warnings import warn
 
 import numpy as np
+import os
 
 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
 from kernel_tuner.util import SkippableFailure
-from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc
+from kernel_tuner.utils.nvcuda import cuda_error_check, to_valid_nvrtc_gpu_arch_cc, find_cuda_home
 
 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -74,9 +75,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.current_module = None
         self.func = None
         self.compiler_options = compiler_options or []
-        self.compiler_options_bytes = []
-        for option in self.compiler_options:
-            self.compiler_options_bytes.append(str(option).encode("UTF-8"))
 
         # create a stream and events
         err, self.stream = driver.cuStreamCreate(0)
@@ -154,37 +152,60 @@ def compile(self, kernel_instance):
         """
         kernel_string = kernel_instance.kernel_string
         kernel_name = kernel_instance.name
+        expression_name = str.encode(kernel_name)
+        compiler_options = list(self.compiler_options)
 
-        # mimic pycuda behavior to wrap kernel_string in extern "C" if not in kernel_string already
-        if 'extern "C"' not in kernel_string:
-            kernel_string = 'extern "C" {\n' + kernel_string + "\n}"
+        # Default to --std=c++11 when the user did not pass a -std=/--std= option
+        if not any(opt.startswith(("-std=", "--std=")) for opt in self.compiler_options):
+            compiler_options.append("--std=c++11")
 
-        compiler_options = self.compiler_options_bytes
-        if not any([b"--std=" in opt for opt in compiler_options]):
-            compiler_options.append(b"--std=c++11")
-        if not any(["--std=" in opt for opt in self.compiler_options]):
-            self.compiler_options.append("--std=c++11")
-        if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-            compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
-        if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
-            self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
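+        # Default --gpu-architecture (derived from self.cc) when the user did not pass an -arch/--arch/--gpu-architecture= option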
+        if not any(opt.startswith(("-arch", "--arch", "--gpu-architecture=")) for opt in self.compiler_options):
+            arch_val = to_valid_nvrtc_gpu_arch_cc(self.cc)
+            compiler_options.append(f"--gpu-architecture=compute_{arch_val}")
+
+        # Add CUDA home to include path
+        cuda_home = find_cuda_home()
+        if cuda_home:
+            cuda_include = os.path.join(cuda_home, "include")
+            compiler_options.append(f"-I{cuda_include}")
+
+        # nvrtcCompileProgram requires bytes instead of str
+        compiler_options = [str(opt).encode("UTF-8") for opt in compiler_options]
 
         err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
         try:
+            # Add the kernel as an expression. This is necessary for templated kernels to ensure that the
+            # compiler actually instantiates the kernel that we want to compile.
+            cuda_error_check(err)
+            err = nvrtc.nvrtcAddNameExpression(program, expression_name)
+
+            # Compile the program
             cuda_error_check(err)
             err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
+
+            # Get the PTX
             cuda_error_check(err)
             err, size = nvrtc.nvrtcGetPTXSize(program)
             cuda_error_check(err)
             buff = b" " * size
             err = nvrtc.nvrtcGetPTX(program, buff)
             cuda_error_check(err)
+
+            # Load the module
             err, self.current_module = driver.cuModuleLoadData(np.char.array(buff))
             if err == driver.CUresult.CUDA_ERROR_INVALID_PTX:
                 raise SkippableFailure("uses too much shared data")
             else:
                 cuda_error_check(err)
-            err, self.func = driver.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
+
+            # First, get the "lowered" name of the kernel (i.e., the name inside the PTX).
+            # After, we can use the lowered name to lookup the kernel in the module.
+            err, lowered_name = nvrtc.nvrtcGetLoweredName(program, expression_name)
+            cuda_error_check(err)
+            err, self.func = driver.cuModuleGetFunction(
+                self.current_module, lowered_name
+            )
             cuda_error_check(err)
 
             # get the number of registers per thread used in this kernel
 
@@ -65,7 +65,9 @@
     pyatf_strategies,
     random_sample,
     simulated_annealing,
-    skopt
+    skopt,
+    gen_hybrid_vndx,
+    gen_adaptive_tabu_greywolf,
 )
 from kernel_tuner.strategies.wrapper import OptAlgWrapper
 
@@ -87,6 +89,8 @@
     "firefly_algorithm": firefly_algorithm,
     "bayes_opt": bayes_opt,
     "pyatf_strategies": pyatf_strategies,
+    "hybrid_vndx": gen_hybrid_vndx,
+    "adaptive_tabu_greywolf": gen_adaptive_tabu_greywolf,
 }
 
 
@@ -397,6 +401,8 @@ def __deepcopy__(self, _):
             * "random_sample" takes a random sample of the search space
             * "simulated_annealing" simulated annealing strategy
             * "skopt" uses the minimization methods from `skopt`
+            * "hybrid_vndx" a hybrid variable neighborhood descent strategy
+            * "adaptive_tabu_greywolf" an adaptive tabu-guided grey wolf optimization strategy
 
         Strategy-specific parameters and options are explained under strategy_options.
Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ def _find_bfloat16_if_available():`
`96`	`96`	+ "please install either the package `ml_dtypes`, `jax`, or `tensorflow`"
`97`	`97`	`)`
`98`	`98`
`99`		`- return None`
	`99`	`+ return dtype`
`100`	`100`
`101`	`101`
`102`	`102`	`def _to_float_dtype(x: str) -> np.dtype:`