Skip to content

Commit 1ff3d04

Browse files
authored
Merge branch 'main' into ci-windows-update
2 parents 1eab981 + 6552c77 commit 1ff3d04

8 files changed

Lines changed: 411 additions & 70 deletions

File tree

.github/workflows/build-wheel.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ jobs:
7676
CIBW_BUILD: ${{ env.CIBW_BUILD }}
7777
CIBW_ARCHS_LINUX: "native"
7878
CIBW_BUILD_VERBOSITY: 1
79+
CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel"
80+
CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}"
7981
with:
8082
package-dir: ./cuda_core/
8183
output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
@@ -124,6 +126,8 @@ jobs:
124126
CUDA_HOME="$(cygpath -w ${{ env.CUDA_PATH }})"
125127
LIB="${CUDA_HOME}\\lib\\x64;${LIB}"
126128
CUDA_BINDINGS_PARALLEL_LEVEL=${{ env.CUDA_BINDINGS_PARALLEL_LEVEL }}
129+
CIBW_BEFORE_BUILD_WINDOWS: "pip install delvewheel"
130+
CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair --namespace-pkg cuda -w {dest_dir} {wheel}"
127131
with:
128132
package-dir: ./cuda_bindings/
129133
output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}

cuda_core/cuda/core/experimental/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
from cuda.core.experimental import utils
66
from cuda.core.experimental._device import Device
77
from cuda.core.experimental._event import Event, EventOptions
8-
from cuda.core.experimental._launcher import LaunchConfig, launch
8+
from cuda.core.experimental._launch_config import LaunchConfig
9+
from cuda.core.experimental._launcher import launch
910
from cuda.core.experimental._linker import Linker, LinkerOptions
1011
from cuda.core.experimental._module import ObjectCode
1112
from cuda.core.experimental._program import Program, ProgramOptions
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from dataclasses import dataclass
6+
from typing import Optional, Union
7+
8+
from cuda.core.experimental._device import Device
9+
from cuda.core.experimental._utils.cuda_utils import (
10+
CUDAError,
11+
cast_to_3_tuple,
12+
driver,
13+
get_binding_version,
14+
handle_return,
15+
)
16+
17+
# TODO: revisit this treatment for py313t builds
18+
_inited = False
19+
20+
21+
def _lazy_init():
22+
global _inited
23+
if _inited:
24+
return
25+
26+
global _use_ex
27+
# binding availability depends on cuda-python version
28+
_py_major_minor = get_binding_version()
29+
_driver_ver = handle_return(driver.cuDriverGetVersion())
30+
_use_ex = (_driver_ver >= 11080) and (_py_major_minor >= (11, 8))
31+
_inited = True
32+
33+
34+
@dataclass
35+
class LaunchConfig:
36+
"""Customizable launch options.
37+
38+
Attributes
39+
----------
40+
grid : Union[tuple, int]
41+
Collection of threads that will execute a kernel function.
42+
cluster : Union[tuple, int]
43+
Group of blocks (Thread Block Cluster) that will execute on the same
44+
GPU Processing Cluster (GPC). Blocks within a cluster have access to
45+
distributed shared memory and can be explicitly synchronized.
46+
block : Union[tuple, int]
47+
Group of threads (Thread Block) that will execute on the same
48+
streaming multiprocessor (SM). Threads within a thread block have
49+
access to shared memory and can be explicitly synchronized.
50+
shmem_size : int, optional
51+
Dynamic shared-memory size per thread block in bytes.
52+
(Default to size 0)
53+
54+
"""
55+
56+
# TODO: expand LaunchConfig to include other attributes
57+
grid: Union[tuple, int] = None
58+
cluster: Union[tuple, int] = None
59+
block: Union[tuple, int] = None
60+
shmem_size: Optional[int] = None
61+
62+
def __post_init__(self):
63+
_lazy_init()
64+
self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
65+
self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
66+
# thread block clusters are supported starting H100
67+
if self.cluster is not None:
68+
if not _use_ex:
69+
err, drvers = driver.cuDriverGetVersion()
70+
drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
71+
raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
72+
cc = Device().compute_capability
73+
if cc < (9, 0):
74+
raise CUDAError(
75+
f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
76+
)
77+
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
78+
if self.shmem_size is None:
79+
self.shmem_size = 0
80+
81+
82+
def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
83+
_lazy_init()
84+
drv_cfg = driver.CUlaunchConfig()
85+
drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
86+
drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
87+
drv_cfg.sharedMemBytes = config.shmem_size
88+
attrs = [] # TODO: support more attributes
89+
if config.cluster:
90+
attr = driver.CUlaunchAttribute()
91+
attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
92+
dim = attr.value.clusterDim
93+
dim.x, dim.y, dim.z = config.cluster
94+
attrs.append(attr)
95+
drv_cfg.numAttrs = len(attrs)
96+
drv_cfg.attrs = attrs
97+
return drv_cfg

cuda_core/cuda/core/experimental/_launcher.py

Lines changed: 4 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,13 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
from dataclasses import dataclass
6-
from typing import Optional, Union
75

8-
from cuda.core.experimental._device import Device
96
from cuda.core.experimental._kernel_arg_handler import ParamHolder
7+
from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config
108
from cuda.core.experimental._module import Kernel
119
from cuda.core.experimental._stream import Stream
1210
from cuda.core.experimental._utils.clear_error_support import assert_type
1311
from cuda.core.experimental._utils.cuda_utils import (
14-
CUDAError,
15-
cast_to_3_tuple,
1612
check_or_create_options,
1713
driver,
1814
get_binding_version,
@@ -37,54 +33,6 @@ def _lazy_init():
3733
_inited = True
3834

3935

40-
@dataclass
41-
class LaunchConfig:
42-
"""Customizable launch options.
43-
44-
Attributes
45-
----------
46-
grid : Union[tuple, int]
47-
Collection of threads that will execute a kernel function.
48-
cluster : Union[tuple, int]
49-
Group of blocks (Thread Block Cluster) that will execute on the same
50-
GPU Processing Cluster (GPC). Blocks within a cluster have access to
51-
distributed shared memory and can be explicitly synchronized.
52-
block : Union[tuple, int]
53-
Group of threads (Thread Block) that will execute on the same
54-
streaming multiprocessor (SM). Threads within a thread blocks have
55-
access to shared memory and can be explicitly synchronized.
56-
shmem_size : int, optional
57-
Dynamic shared-memory size per thread block in bytes.
58-
(Default to size 0)
59-
60-
"""
61-
62-
# TODO: expand LaunchConfig to include other attributes
63-
grid: Union[tuple, int] = None
64-
cluster: Union[tuple, int] = None
65-
block: Union[tuple, int] = None
66-
shmem_size: Optional[int] = None
67-
68-
def __post_init__(self):
69-
_lazy_init()
70-
self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
71-
self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
72-
# thread block clusters are supported starting H100
73-
if self.cluster is not None:
74-
if not _use_ex:
75-
err, drvers = driver.cuDriverGetVersion()
76-
drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
77-
raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
78-
cc = Device().compute_capability
79-
if cc < (9, 0):
80-
raise CUDAError(
81-
f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
82-
)
83-
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
84-
if self.shmem_size is None:
85-
self.shmem_size = 0
86-
87-
8836
def launch(stream, config, kernel, *kernel_args):
8937
"""Launches a :obj:`~_module.Kernel`
9038
object with launch-time configuration.
@@ -114,6 +62,7 @@ def launch(stream, config, kernel, *kernel_args):
11462
f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
11563
) from e
11664
assert_type(kernel, Kernel)
65+
_lazy_init()
11766
config = check_or_create_options(LaunchConfig, config, "launch config")
11867

11968
# TODO: can we ensure kernel_args is valid/safe to use here?
@@ -127,25 +76,13 @@ def launch(stream, config, kernel, *kernel_args):
12776
# mainly to see if the "Ex" API is available and if so we use it, as it's more feature
12877
# rich.
12978
if _use_ex:
130-
drv_cfg = driver.CUlaunchConfig()
131-
drv_cfg.gridDimX, drv_cfg.gridDimY, drv_cfg.gridDimZ = config.grid
132-
drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
79+
drv_cfg = _to_native_launch_config(config)
13380
drv_cfg.hStream = stream.handle
134-
drv_cfg.sharedMemBytes = config.shmem_size
135-
attrs = [] # TODO: support more attributes
136-
if config.cluster:
137-
attr = driver.CUlaunchAttribute()
138-
attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
139-
dim = attr.value.clusterDim
140-
dim.x, dim.y, dim.z = config.cluster
141-
attrs.append(attr)
142-
drv_cfg.numAttrs = len(attrs)
143-
drv_cfg.attrs = attrs
14481
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
14582
else:
14683
# TODO: check if config has any unsupported attrs
14784
handle_return(
14885
driver.cuLaunchKernel(
149-
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream._handle, args_ptr, 0
86+
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
15087
)
15188
)

0 commit comments

Comments
 (0)