Skip to content

Commit fa905d2

Browse files
authored
[Compile] accelerate compilation speed using NVRTC (#18519)
This PR adds support for NVRTC as an alternative to NVCC for faster, device-side JIT compilation of CUDA kernels, in line with the PR [https://github.com/apache/tvm-ffi/pull/283](https://github.com/apache/tvm-ffi/pull/283). It enhances the CUDA compilation backend by:

- Adding Python NVRTC support using cuda-python bindings
- Removing the legacy C++ NVRTC fallback in favor of a Python-first approach
- Keeping nvcc as the default compiler with fatbin output (no behavior change for existing users)

Users can choose the compilation backend with the environment variable `TVM_CUDA_COMPILE_MODE`, which accepts "nvcc" or "nvrtc". For example: `TVM_CUDA_COMPILE_MODE=nvrtc python3 your_program.py`

Here is a short benchmark of the compilation speed of kernels in `test_target_codegen_cuda.py`.

### NVCC vs NVRTC Compilation Time Comparison (Python-side Call)

| Test Case | Code Size | NVCC Time (ms) | NVRTC Time (ms) | Speedup |
| :--- | :--- | :--- | :--- | :--- |
| `test_crossthread_reduction1` | 1945 B | 241.27 | 51.23 | **4.7x** |
| `test_cuda_bf16_vectorize_add` | 3760 B | 342.72 | 44.50 | **7.7x** |
| `test_cuda_const_float_to_half` | 12394 B | 272.85 | 31.99 | **8.5x** |
| `test_cuda_device_func_call` | 975 B | 215.58 | 21.47 | **10.0x** |
| `test_cuda_float_const_hex_format` | 685 B | 217.39 | 20.52 | **10.6x** |
| `test_cuda_floordiv_with_vectorization` | 1050 B | 213.88 | 23.32 | **9.2x** |
| `test_cuda_inf_nan` | 673 B | 214.33 | 24.94 | **8.6x** |
| `test_cuda_tensormap` | 755 B | 213.91 | 20.74 | **10.3x** |
| `test_cuda_thread_sync_inside_condition` | 1007 B | 213.43 | 28.29 | **7.5x** |
| `test_cuda_vectorize_add` | 908 B | 226.81 | 40.39 | **5.6x** |
| `test_cuda_vectorize_load` | 734 B | 217.25 | 24.02 | **9.0x** |
| `test_device_host_call_same_func` | 924 B | 216.03 | 21.21 | **10.2x** |
| `test_vectorized_intrin1` | 847 B | 226.15 | 26.34 | **8.6x** |

### NVSHMEM Support

Currently, NVSHMEM is **not** supported via NVRTC.
- Fallback Behavior: When NVSHMEM is required, the compilation pipeline automatically falls back to NVCC, even if `TVM_CUDA_COMPILE_MODE` is set to `nvrtc`.
- Future Roadmap: Support for NVRTC with NVSHMEM is planned for follow-up PRs.
1 parent b3b6024 commit fa905d2

File tree

13 files changed

+465
-150
lines changed

13 files changed

+465
-150
lines changed

cmake/modules/CUDA.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,8 @@ if(USE_CUDA)
5454
list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_SRCS})
5555
list(APPEND COMPILER_SRCS src/target/opt/build_cuda_on.cc)
5656

57-
list(APPEND TVM_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
5857
list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDART_LIBRARY})
5958
list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
60-
list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
6159

6260
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
6361
if(CMAKE_VERSION VERSION_LESS "3.24")

cmake/utils/FindCUDA.cmake

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
# - CUDA_TOOLKIT_ROOT_DIR
3434
# - CUDA_CUDA_LIBRARY
3535
# - CUDA_CUDART_LIBRARY
36-
# - CUDA_NVRTC_LIBRARY
3736
# - CUDA_CUDNN_INCLUDE_DIRS
3837
# - CUDA_CUDNN_LIBRARY
3938
# - CUDA_CUBLAS_LIBRARY
@@ -64,9 +63,6 @@ macro(find_cuda use_cuda use_cudnn)
6463
find_library(CUDA_CUDA_LIBRARY cuda
6564
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
6665
${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
67-
find_library(CUDA_NVRTC_LIBRARY nvrtc
68-
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
69-
${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
7066
find_library(CUDA_CUBLAS_LIBRARY cublas
7167
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
7268
${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32)
@@ -81,10 +77,6 @@ macro(find_cuda use_cuda use_cudnn)
8177
if(_CUDA_CUDA_LIBRARY)
8278
set(CUDA_CUDA_LIBRARY ${_CUDA_CUDA_LIBRARY})
8379
endif()
84-
find_library(CUDA_NVRTC_LIBRARY nvrtc
85-
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
86-
PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
87-
NO_DEFAULT_PATH)
8880
find_library(CUDA_CURAND_LIBRARY curand
8981
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
9082
PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
@@ -140,7 +132,6 @@ macro(find_cuda use_cuda use_cudnn)
140132
message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR})
141133
message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY})
142134
message(STATUS "Found CUDA_CUDART_LIBRARY=" ${CUDA_CUDART_LIBRARY})
143-
message(STATUS "Found CUDA_NVRTC_LIBRARY=" ${CUDA_NVRTC_LIBRARY})
144135
message(STATUS "Found CUDA_CUDNN_INCLUDE_DIRS=" ${CUDA_CUDNN_INCLUDE_DIRS})
145136
message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
146137
message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})

docker/Dockerfile.ci_gpu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ RUN bash /install/ubuntu_install_opencl.sh
6060
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
6161
RUN bash /install/ubuntu_install_python_package.sh
6262

63+
COPY install/ubuntu_install_cuda_python.sh /install/ubuntu_install_cuda_python.sh
64+
RUN bash /install/ubuntu_install_cuda_python.sh
65+
6366
COPY install/ubuntu_install_sphinx.sh /install/ubuntu_install_sphinx.sh
6467
RUN bash /install/ubuntu_install_sphinx.sh
6568

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
19+
set -e
20+
set -u
21+
set -o pipefail
22+
23+
pip3 install cuda-python

0 commit comments

Comments
 (0)