Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/linux_cuda_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
docker_image_repo: onnxruntimecuda12manylinuxbuild
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
run_tests: false # <<< Do not run tests in this job
upload_build_output: true # <<< Upload the build/Release directory
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/linux_cuda_plugin_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@ jobs:
--use_binskim_compliant_compile_flags
--build_wheel
--parallel
--nvcc_threads 1
--nvcc_threads 4 --flash_nvcc_threads 4
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
--cuda_version=12.8
--cuda_home=/usr/local/cuda-12.8
--cudnn_home=/usr/local/cuda-12.8
--enable_cuda_profiling
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
--cmake_extra_defines onnxruntime_QUICK_BUILD=ON
--cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH'
run_tests: false
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/linux_tensorrt_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host'
docker_image_repo: onnxruntimetensorrt86gpubuild
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
run_tests: false # <<< Do not run tests in this job
upload_build_output: true # <<< Upload the build/Release directory
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/windows_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:
exit $lastExitCode
}
# Execute the build process
python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
if ($lastExitCode -ne 0) {
exit $lastExitCode
}
Expand Down Expand Up @@ -235,7 +235,7 @@ jobs:
exit $lastExitCode
}

python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
if ($lastExitCode -ne 0) {
exit $lastExitCode
}
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/windows_cuda_plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
--build_dir build `
--skip_submodule_sync `
--parallel `
--nvcc_threads 1 `
--nvcc_threads 4 --flash_nvcc_threads 4 `
Comment thread
tianleiwu marked this conversation as resolved.
Outdated
--use_binskim_compliant_compile_flags `
--cmake_generator "Visual Studio 17 2022" `
--build_shared_lib `
Expand All @@ -84,6 +84,7 @@ jobs:
--use_vcpkg `
--use_vcpkg_ms_internal_asset_cache `
--enable_cuda_profiling `
--cmake_extra_defines onnxruntime_QUICK_BUILD=ON `
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 `
--cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/windows_tensorrt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ jobs:
exit $lastExitCode
}
# Execute the build process
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
if ($lastExitCode -ne 0) {
exit $lastExitCode
}
Expand Down Expand Up @@ -247,7 +247,7 @@ jobs:
exit $lastExitCode
}

python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
if ($lastExitCode -ne 0) {
exit $lastExitCode
}
Expand Down
106 changes: 106 additions & 0 deletions cmake/onnxruntime_cuda_source_filters.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,109 @@ macro(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
endif()
endmacro()

# Extract SM90/SM120 TMA warp-specialized generated source files from a CUDA source list.
# These files use CUTLASS 3.x features (GMMA, TMA) that are specific to SM90+ or SM120+.
# They are compiled in separate OBJECT libraries with restricted CUDA_ARCHITECTURES to:
#   1. Reduce compile time (avoid compiling heavy templates for unused architectures)
#   2. Reduce binary size (no dead device code for unsupported architectures)
#   3. Ensure correctness (SM90 code compiled at exactly 90a-real, SM120 at 120+)
#
# The per-source CUDA_ARCHITECTURES property does not work with the Visual Studio generator,
# so OBJECT libraries are needed.
#
# Usage:
#   onnxruntime_extract_sm_specific_cuda_sources(<cu_src_list_var>
#     SM90_SOURCES <output_var> SM120_SOURCES <output_var>)
#
# Arguments:
#   <cu_src_list_var>  Name of the list variable to scan. Matched files are removed from it.
#   SM90_SOURCES       Name of the variable that receives files matching
#                      "moe_gemm_tma_ws_sm90_*.generated.cu". Only populated when
#                      ORT_HAS_SM90_OR_LATER is true.
#   SM120_SOURCES      Name of the variable that receives files matching
#                      "moe_gemm_tma_ws_sm120_*.generated.cu". Only populated when "120" is an
#                      element of CMAKE_CUDA_ARCHITECTURES_ORIG.
#
# Both keywords are required. An output variable is left undefined when nothing was extracted
# for it, so callers can test it with a plain if(<output_var>).
#
# This is a macro (not a function) so that the list and the outputs are modified directly in
# the caller's scope.
macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST)
  cmake_parse_arguments(_EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "" ${ARGN})
  # Fail with a readable message instead of letting a bare set() error out below.
  if(NOT DEFINED _EXTRACT_SM90_SOURCES OR NOT DEFINED _EXTRACT_SM120_SOURCES)
    message(FATAL_ERROR
      "onnxruntime_extract_sm_specific_cuda_sources: SM90_SOURCES <output_var> and "
      "SM120_SOURCES <output_var> are both required")
  endif()

  # Extract SM90 TMA WS generated files.
  unset(${_EXTRACT_SM90_SOURCES})
  if(ORT_HAS_SM90_OR_LATER)
    # Filter a quoted copy so empty list elements survive and the scratch variable always exists.
    set(_EXTRACT_MATCHED "${${CU_SRC_LIST}}")
    list(FILTER _EXTRACT_MATCHED INCLUDE REGEX "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$")
    if(NOT "${_EXTRACT_MATCHED}" STREQUAL "")
      set(${_EXTRACT_SM90_SOURCES} "${_EXTRACT_MATCHED}")
      list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$")
    endif()
  endif()

  # Extract SM120 TMA WS generated files.
  # NOTE(review): this check presumes CMAKE_CUDA_ARCHITECTURES_ORIG holds bare architecture
  # numbers (no "a" / "-real" / "-virtual" suffixes) - confirm where that variable is set.
  unset(${_EXTRACT_SM120_SOURCES})
  if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
    set(_EXTRACT_MATCHED "${${CU_SRC_LIST}}")
    list(FILTER _EXTRACT_MATCHED INCLUDE REGEX "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$")
    if(NOT "${_EXTRACT_MATCHED}" STREQUAL "")
      set(${_EXTRACT_SM120_SOURCES} "${_EXTRACT_MATCHED}")
      list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$")
    endif()
  endif()

  # Macros share the caller's scope; do not leave the scratch list behind.
  unset(_EXTRACT_MATCHED)
endmacro()

# Extract Flash Attention CUDA source files into a separate list for compilation
# in a dedicated OBJECT library with SM80+ architectures and independent nvcc_threads.
# Flash Attention V2 kernels require SM80 (Ampere) or later — they contain
# __CUDA_ARCH__ >= 800 guards in kernel_traits.h and all files are *_sm80.cu.
# Compiling them separately allows:
#   1. Restricting CUDA_ARCHITECTURES to SM80+ (skip dead pre-Ampere passes)
#   2. Using --threads 1 (memory-intensive) while other targets use higher parallelism
#
# Usage:
#   onnxruntime_extract_flash_attention_sources(<cu_src_list_var>
#     FLASH_SOURCES <output_var>)
#
# Arguments:
#   <cu_src_list_var>  Name of the list variable to scan. Matched files are removed from it.
#   FLASH_SOURCES      Name of the variable that receives every entry whose path contains
#                      "/bert/flash_attention/" and ends in ".cu". Required.
#
# The output variable is left undefined when no file matched.
#
# This is a macro (not a function) so that the list and the output are modified directly in
# the caller's scope.
macro(onnxruntime_extract_flash_attention_sources CU_SRC_LIST)
  cmake_parse_arguments(_FA "" "FLASH_SOURCES" "" ${ARGN})
  # Fail with a readable message instead of letting a bare set() error out below.
  if(NOT DEFINED _FA_FLASH_SOURCES)
    message(FATAL_ERROR
      "onnxruntime_extract_flash_attention_sources: FLASH_SOURCES <output_var> is required")
  endif()

  unset(${_FA_FLASH_SOURCES})
  # Filter a quoted copy so empty list elements survive and the scratch variable always exists.
  set(_FA_MATCHED "${${CU_SRC_LIST}}")
  list(FILTER _FA_MATCHED INCLUDE REGEX "/bert/flash_attention/.*\\.cu$")
  if(NOT "${_FA_MATCHED}" STREQUAL "")
    set(${_FA_FLASH_SOURCES} "${_FA_MATCHED}")
    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "/bert/flash_attention/.*\\.cu$")
  endif()

  # Macros share the caller's scope; do not leave the scratch list behind.
  unset(_FA_MATCHED)
endmacro()

# Extract LLM CUDA source files into separate lists for per-architecture compilation.
# The LLM directory (contrib_ops/cuda/llm/) contains kernels with minimum SM75 support
# (fpA_intB_gemv/gemm enforce arch >= 75). SM90-specific launchers (fpA_intB_gemm
# launchers guarded by #ifndef EXCLUDE_SM_90) are extracted separately to be compiled
# at 90a-real (merged into the SM90 TMA OBJECT library).
#
# Note: SM90 TMA MoE GEMM files are already extracted by
# onnxruntime_extract_sm_specific_cuda_sources() before this macro is called.
#
# Usage:
#   onnxruntime_extract_llm_sources(<cu_src_list_var>
#     LLM_SOURCES <output_var>
#     LLM_SM90_SOURCES <output_var>)
#
# Arguments:
#   <cu_src_list_var>  Name of the list variable to scan. Every entry whose path contains
#                      "/contrib_ops/cuda/llm/" and ends in ".cu" is removed from it.
#   LLM_SOURCES        Name of the variable that receives the removed entries that are not
#                      SM90 launchers. Required.
#   LLM_SM90_SOURCES   Name of the variable that receives the removed entries matching
#                      "fpA_intB_gemm_launcher_<digits>.generated.cu". Required.
#
# An output variable is left undefined when nothing was extracted for it.
#
# This is a macro (not a function) so that the list and the outputs are modified directly in
# the caller's scope.
macro(onnxruntime_extract_llm_sources CU_SRC_LIST)
  cmake_parse_arguments(_LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "" ${ARGN})
  # Fail with a readable message instead of letting a bare set() error out below.
  if(NOT DEFINED _LLM_LLM_SOURCES OR NOT DEFINED _LLM_LLM_SM90_SOURCES)
    message(FATAL_ERROR
      "onnxruntime_extract_llm_sources: LLM_SOURCES <output_var> and "
      "LLM_SM90_SOURCES <output_var> are both required")
  endif()

  unset(${_LLM_LLM_SOURCES})
  unset(${_LLM_LLM_SM90_SOURCES})

  # Collect everything under contrib_ops/cuda/llm/ first; filter a quoted copy so empty list
  # elements survive and the scratch variable always exists.
  set(_LLM_GENERIC "${${CU_SRC_LIST}}")
  list(FILTER _LLM_GENERIC INCLUDE REGEX "/contrib_ops/cuda/llm/.*\\.cu$")
  if(NOT "${_LLM_GENERIC}" STREQUAL "")
    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "/contrib_ops/cuda/llm/.*\\.cu$")

    # Split the SM90-specific fpA_intB launchers (guarded by #ifndef EXCLUDE_SM_90) from the rest.
    set(_LLM_SM90 "${_LLM_GENERIC}")
    list(FILTER _LLM_SM90 INCLUDE REGEX "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$")
    list(FILTER _LLM_GENERIC EXCLUDE REGEX "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$")

    if(NOT "${_LLM_GENERIC}" STREQUAL "")
      set(${_LLM_LLM_SOURCES} "${_LLM_GENERIC}")
    endif()
    if(NOT "${_LLM_SM90}" STREQUAL "")
      set(${_LLM_LLM_SM90_SOURCES} "${_LLM_SM90}")
    endif()
  endif()

  # Macros share the caller's scope; do not leave the scratch lists behind.
  unset(_LLM_GENERIC)
  unset(_LLM_SM90)
endmacro()
Loading
Loading