From 207789febdd5d8e32517a57425157a774ec3d5e2 Mon Sep 17 00:00:00 2001 From: Tianlei WU Date: Thu, 21 May 2026 17:43:37 -0700 Subject: [PATCH 01/17] obj lib for moe gemm to speed up --- cmake/onnxruntime_cuda_source_filters.cmake | 45 +++++++++++++++ cmake/onnxruntime_providers_cuda.cmake | 49 ++++++++++++++++ cmake/onnxruntime_providers_cuda_plugin.cmake | 56 +++++++++++++++++++ 3 files changed, 150 insertions(+) diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake index a93686c379de1..94c25fd14ef2f 100644 --- a/cmake/onnxruntime_cuda_source_filters.cmake +++ b/cmake/onnxruntime_cuda_source_filters.cmake @@ -41,3 +41,48 @@ macro(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST) list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu") endif() endmacro() + +# Extract SM90/SM120 TMA warp-specialized generated source files from a CUDA source list. +# These files use CUTLASS 3.x features (GMMA, TMA) that are specific to SM90+ or SM120+. +# They are compiled in separate OBJECT libraries with restricted CUDA_ARCHITECTURES to: +# 1. Reduce compile time (avoid compiling heavy templates for unused architectures) +# 2. Reduce binary size (no dead device code for unsupported architectures) +# 3. Ensure correctness (SM90 code compiled at exactly 90a-real, SM120 at 120+) +# +# The per-source CUDA_ARCHITECTURES property does not work with the Visual Studio generator, +# so OBJECT libraries are needed. +# +# Usage: +# onnxruntime_extract_sm_specific_cuda_sources( +# SM90_SOURCES SM120_SOURCES ) +# +# Removes matched files from and stores them in the output variables. +macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST) + cmake_parse_arguments(_EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "" ${ARGN}) + + # Extract SM90 TMA WS generated files + set(${_EXTRACT_SM90_SOURCES}) + if(ORT_HAS_SM90_OR_LATER) + foreach(_src IN LISTS ${CU_SRC_LIST}) + if(_src MATCHES "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$") + list(APPEND ${_EXTRACT_SM90_SOURCES} "${_src}") + endif() + endforeach() + if(${_EXTRACT_SM90_SOURCES}) + list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM90_SOURCES}}) + endif() + endif() + + # Extract SM120 TMA WS generated files + set(${_EXTRACT_SM120_SOURCES}) + if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) + foreach(_src IN LISTS ${CU_SRC_LIST}) + if(_src MATCHES "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$") + list(APPEND ${_EXTRACT_SM120_SOURCES} "${_src}") + endif() + endforeach() + if(${_EXTRACT_SM120_SOURCES}) + list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM120_SOURCES}}) + endif() + endif() +endmacro() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index b28c35fd502ed..77c350a92263d 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -67,6 +67,10 @@ include(onnxruntime_cuda_source_filters.cmake) onnxruntime_filter_cuda_cu_sources(onnxruntime_cuda_contrib_ops_cu_srcs) + onnxruntime_extract_sm_specific_cuda_sources(onnxruntime_cuda_contrib_ops_cu_srcs + SM90_SOURCES onnxruntime_cuda_sm90_tma_srcs + SM120_SOURCES onnxruntime_cuda_sm120_tma_srcs + ) # disable contrib ops conditionally if(NOT onnxruntime_DISABLE_CONTRIB_OPS AND NOT onnxruntime_CUDA_MINIMAL) @@ -309,6 +313,13 @@ if(MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /Zc:__cplusplus>") target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /bigobj>") + # /permissive is required for CUTLASS cute headers and to work around MSVC template resolution + # issues with abseil headers when compiled through nvcc. + # See https://github.com/NVIDIA/cutlass/issues/3065 + target_compile_options(${target} PRIVATE + "$<$:/permissive>" + "$<$:SHELL:-Xcompiler /permissive>" + ) endif() onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers) @@ -434,6 +445,44 @@ config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj) endif() config_cuda_provider_shared_module(onnxruntime_providers_cuda) + + # Create OBJECT libraries for SM90/SM120 TMA WS generated files that must be compiled + # with restricted CUDA architectures. These files use CUTLASS 3.x SM90+/SM120+ features + # (GMMA, TMA) that cannot produce useful device code for older architectures. + if(onnxruntime_cuda_sm90_tma_srcs) + # SM90 TMA warp-specialized files use SM90-specific collective operations. + # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat. + set(_ort_has_sm90_plus FALSE) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 90) + set(_ort_has_sm90_plus TRUE) + break() + endif() + endforeach() + if(_ort_has_sm90_plus) + onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${onnxruntime_cuda_sm90_tma_srcs}) + set_target_properties(onnxruntime_providers_cuda_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real") + config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm90_tma) + target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm90_tma) + endif() + endif() + + if(onnxruntime_cuda_sm120_tma_srcs) + set(_ort_sm120_cuda_architectures) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 120) + list(APPEND _ort_sm120_cuda_architectures "${_arch}") + endif() + endforeach() + if(_ort_sm120_cuda_architectures) + onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs}) + set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}") + config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm120_tma) + target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm120_tma) + endif() + endif() # Cannot use glob because the file cuda_provider_options.h should not be exposed out. set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h" diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 7a76371b74132..10e9774347d9c 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -104,6 +104,10 @@ list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/transforme # Apply shared CUDA .cu source filtering (flash attention quick build, MoE GEMM FP4/FP8). include(onnxruntime_cuda_source_filters.cmake) onnxruntime_filter_cuda_cu_sources(CUDA_PLUGIN_EP_CU_SRCS) +onnxruntime_extract_sm_specific_cuda_sources(CUDA_PLUGIN_EP_CU_SRCS + SM90_SOURCES _cuda_plugin_sm90_tma_srcs + SM120_SOURCES _cuda_plugin_sm120_tma_srcs +) # Create shared library target using the ORT helper function for plugins onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin @@ -217,6 +221,58 @@ if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS) endif() +# SM90/SM120 TMA WS OBJECT libraries — compiled with restricted CUDA architectures. +if(_cuda_plugin_sm90_tma_srcs) + set(_plugin_has_sm90_plus FALSE) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 90) + set(_plugin_has_sm90_plus TRUE) + break() + endif() + endforeach() + if(_plugin_has_sm90_plus) + onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_cuda_plugin_sm90_tma_srcs}) + set_target_properties(onnxruntime_providers_cuda_plugin_sm90_tma PROPERTIES + CUDA_ARCHITECTURES "90a-real" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE + $) + target_compile_definitions(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE + $) + target_compile_options(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE + $) + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm90_tma) + endif() +endif() + +if(_cuda_plugin_sm120_tma_srcs) + set(_plugin_sm120_cuda_architectures) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 120) + list(APPEND _plugin_sm120_cuda_architectures "${_arch}") + endif() + endforeach() + if(_plugin_sm120_cuda_architectures) + onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm120_tma ${_cuda_plugin_sm120_tma_srcs}) + set_target_properties(onnxruntime_providers_cuda_plugin_sm120_tma PROPERTIES + CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE + $) + target_compile_definitions(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE + $) + target_compile_options(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE + $) + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm120_tma) + endif() +endif() + # --- Find cuDNN (may be at a custom path via onnxruntime_CUDNN_HOME) --- set(_CUDNN_SEARCH_PATHS "") if(onnxruntime_CUDNN_HOME) From 19fb1332cca52dc72f79eb25ac7524822e48c663 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 22 May 2026 13:05:20 -0700 Subject: [PATCH 02/17] object lib for flash attention and llm --- cmake/onnxruntime_cuda_source_filters.cmake | 61 +++++++++++ cmake/onnxruntime_providers_cuda.cmake | 66 +++++++++++- cmake/onnxruntime_providers_cuda_plugin.cmake | 102 +++++++++++++++--- cmake/onnxruntime_unittests.cmake | 14 +++ 4 files changed, 228 insertions(+), 15 deletions(-) diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake index 94c25fd14ef2f..761f43aab9f48 100644 --- a/cmake/onnxruntime_cuda_source_filters.cmake +++ b/cmake/onnxruntime_cuda_source_filters.cmake @@ -86,3 +86,64 @@ macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST) endif() endif() endmacro() + +# Extract Flash Attention CUDA source files into a separate list for compilation +# in a dedicated OBJECT library with SM80+ architectures and independent nvcc_threads. +# Flash Attention V2 kernels require SM80 (Ampere) or later — they contain +# __CUDA_ARCH__ >= 800 guards in kernel_traits.h and all files are *_sm80.cu. +# Compiling them separately allows: +# 1. Restricting CUDA_ARCHITECTURES to SM80+ (skip dead pre-Ampere passes) +# 2. Using --threads 1 (memory-intensive) while other targets use higher parallelism +# +# Usage: +# onnxruntime_extract_flash_attention_sources( +# FLASH_SOURCES ) +macro(onnxruntime_extract_flash_attention_sources CU_SRC_LIST) + cmake_parse_arguments(_FA "" "FLASH_SOURCES" "" ${ARGN}) + + set(${_FA_FLASH_SOURCES}) + foreach(_src IN LISTS ${CU_SRC_LIST}) + if(_src MATCHES "/bert/flash_attention/.*\\.cu$") + list(APPEND ${_FA_FLASH_SOURCES} "${_src}") + endif() + endforeach() + if(${_FA_FLASH_SOURCES}) + list(REMOVE_ITEM ${CU_SRC_LIST} ${${_FA_FLASH_SOURCES}}) + endif() +endmacro() + +# Extract LLM CUDA source files into separate lists for per-architecture compilation. +# The LLM directory (contrib_ops/cuda/llm/) contains kernels with minimum SM75 support +# (fpA_intB_gemv/gemm enforce arch >= 75). SM90-specific launchers (fpA_intB_gemm +# launchers guarded by #ifndef EXCLUDE_SM_90) are extracted separately to be compiled +# at 90a-real (merged into the SM90 TMA OBJECT library). +# +# Note: SM90 TMA MoE GEMM files are already extracted by +# onnxruntime_extract_sm_specific_cuda_sources() before this macro is called. +# +# Usage: +# onnxruntime_extract_llm_sources( +# LLM_SOURCES +# LLM_SM90_SOURCES ) +macro(onnxruntime_extract_llm_sources CU_SRC_LIST) + cmake_parse_arguments(_LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "" ${ARGN}) + + set(${_LLM_LLM_SOURCES}) + set(${_LLM_LLM_SM90_SOURCES}) + foreach(_src IN LISTS ${CU_SRC_LIST}) + if(_src MATCHES "/contrib_ops/cuda/llm/.*\\.cu$") + # SM90-specific fpA_intB launchers (guarded by #ifndef EXCLUDE_SM_90) + if(_src MATCHES "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$") + list(APPEND ${_LLM_LLM_SM90_SOURCES} "${_src}") + else() + list(APPEND ${_LLM_LLM_SOURCES} "${_src}") + endif() + endif() + endforeach() + if(${_LLM_LLM_SOURCES}) + list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SOURCES}}) + endif() + if(${_LLM_LLM_SM90_SOURCES}) + list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SM90_SOURCES}}) + endif() +endmacro() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 77c350a92263d..ddf9d9da9c1a0 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -71,6 +71,13 @@ SM90_SOURCES onnxruntime_cuda_sm90_tma_srcs SM120_SOURCES onnxruntime_cuda_sm120_tma_srcs ) + onnxruntime_extract_flash_attention_sources(onnxruntime_cuda_contrib_ops_cu_srcs + FLASH_SOURCES onnxruntime_cuda_flash_attention_srcs + ) + onnxruntime_extract_llm_sources(onnxruntime_cuda_contrib_ops_cu_srcs + LLM_SOURCES onnxruntime_cuda_llm_srcs + LLM_SM90_SOURCES onnxruntime_cuda_llm_sm90_srcs + ) # disable contrib ops conditionally if(NOT onnxruntime_DISABLE_CONTRIB_OPS AND NOT onnxruntime_CUDA_MINIMAL) @@ -224,8 +231,9 @@ # Note: CUDA 11.3+ supports parallel compilation # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads + # --threads is NOT set here; it is applied per-target after calling this function + # so that flash attention can use a different (lower) thread count. set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.") - target_compile_options(${target} PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") # suppress warnings like this: # cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never @@ -443,15 +451,19 @@ endfunction() if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj) + target_compile_options(onnxruntime_providers_cuda_obj PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") endif() config_cuda_provider_shared_module(onnxruntime_providers_cuda) + target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") # Create OBJECT libraries for SM90/SM120 TMA WS generated files that must be compiled # with restricted CUDA architectures. These files use CUTLASS 3.x SM90+/SM120+ features # (GMMA, TMA) that cannot produce useful device code for older architectures. - if(onnxruntime_cuda_sm90_tma_srcs) + if(onnxruntime_cuda_sm90_tma_srcs OR onnxruntime_cuda_llm_sm90_srcs) # SM90 TMA warp-specialized files use SM90-specific collective operations. # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat. + # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). + set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs}) set(_ort_has_sm90_plus FALSE) foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") @@ -461,9 +473,10 @@ endif() endforeach() if(_ort_has_sm90_plus) - onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${onnxruntime_cuda_sm90_tma_srcs}) + onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${_ort_sm90_all_srcs}) set_target_properties(onnxruntime_providers_cuda_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real") config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm90_tma) + target_compile_options(onnxruntime_providers_cuda_sm90_tma PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm90_tma) endif() endif() @@ -480,9 +493,56 @@ onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs}) set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}") config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm120_tma) + target_compile_options(onnxruntime_providers_cuda_sm120_tma PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm120_tma) endif() endif() + + # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads. + # Flash Attention V2 kernels require SM80 (Ampere) and are memory-intensive to compile. + # Isolating them allows the rest of the build to use higher --threads without OOM. + set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING + "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).") + if(onnxruntime_cuda_flash_attention_srcs) + set(_ort_flash_cuda_architectures) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 80) + list(APPEND _ort_flash_cuda_architectures "${_arch}") + endif() + endforeach() + if(_ort_flash_cuda_architectures) + onnxruntime_add_object_library(onnxruntime_providers_cuda_flash_attention ${onnxruntime_cuda_flash_attention_srcs}) + set_target_properties(onnxruntime_providers_cuda_flash_attention PROPERTIES + CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}") + config_cuda_provider_shared_module(onnxruntime_providers_cuda_flash_attention) + target_compile_options(onnxruntime_providers_cuda_flash_attention PRIVATE + "$<$:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">" + ) + target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_flash_attention) + endif() + endif() + + # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). + # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs. + if(onnxruntime_cuda_llm_srcs) + set(_ort_llm_cuda_architectures) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 75) + list(APPEND _ort_llm_cuda_architectures "${_arch}") + endif() + endforeach() + if(_ort_llm_cuda_architectures) + onnxruntime_add_object_library(onnxruntime_providers_cuda_llm ${onnxruntime_cuda_llm_srcs}) + set_target_properties(onnxruntime_providers_cuda_llm PROPERTIES + CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}") + config_cuda_provider_shared_module(onnxruntime_providers_cuda_llm) + target_compile_options(onnxruntime_providers_cuda_llm PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") + target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_llm) + endif() + endif() + # Cannot use glob because the file cuda_provider_options.h should not be exposed out. set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h" diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 10e9774347d9c..5a3df9d03c84b 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -108,6 +108,13 @@ onnxruntime_extract_sm_specific_cuda_sources(CUDA_PLUGIN_EP_CU_SRCS SM90_SOURCES _cuda_plugin_sm90_tma_srcs SM120_SOURCES _cuda_plugin_sm120_tma_srcs ) +onnxruntime_extract_flash_attention_sources(CUDA_PLUGIN_EP_CU_SRCS + FLASH_SOURCES _cuda_plugin_flash_attention_srcs +) +onnxruntime_extract_llm_sources(CUDA_PLUGIN_EP_CU_SRCS + LLM_SOURCES _cuda_plugin_llm_srcs + LLM_SM90_SOURCES _cuda_plugin_llm_sm90_srcs +) # Create shared library target using the ORT helper function for plugins onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin @@ -183,22 +190,23 @@ endif() if (DEFINED onnxruntime_NVCC_THREADS) set(onnxruntime_plugin_nvcc_threads "${onnxruntime_NVCC_THREADS}") else() - set(onnxruntime_plugin_nvcc_threads "1") + set(onnxruntime_plugin_nvcc_threads "4") endif() -target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE - "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">" - "$<$:--diag-suppress=177>" +# Shared CUDA compile options (excluding --threads, which is set per-target so that +# flash attention can use a lower thread count without duplicate-flag nvcc warnings). +set(_cuda_plugin_shared_compile_options + "$<$:--diag-suppress=177>" ) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE + list(APPEND _cuda_plugin_shared_compile_options "$<$:--static-global-template-stub=false>" "$<$:--diag-suppress=221>" "$<$:--diag-suppress=2908>" ) if (MSVC) - target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE + list(APPEND _cuda_plugin_shared_compile_options "$<$:SHELL:-Xcompiler /wd4505>" ) endif() @@ -209,8 +217,9 @@ include(cutlass) # TMA compile definitions — mirror config_cuda_provider_shared_module in onnxruntime_providers_cuda.cmake if(ORT_HAS_SM90_OR_LATER) - target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE $<$:-Xptxas=-w>) - target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE $<$:-DCUTLASS_ENABLE_GDC_FOR_SM90=1>) + list(APPEND _cuda_plugin_shared_compile_options + "$<$:-Xptxas=-w>" + "$<$:-DCUTLASS_ENABLE_GDC_FOR_SM90=1>") target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GEMMS) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GROUPED_GEMMS) endif() @@ -221,8 +230,16 @@ if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS) endif() +# Apply shared options + --threads to the parent plugin target. +target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE + ${_cuda_plugin_shared_compile_options} + "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">" +) + # SM90/SM120 TMA WS OBJECT libraries — compiled with restricted CUDA architectures. -if(_cuda_plugin_sm90_tma_srcs) +# Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). +if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs) + set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs}) set(_plugin_has_sm90_plus FALSE) foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") @@ -232,7 +249,7 @@ if(_cuda_plugin_sm90_tma_srcs) endif() endforeach() if(_plugin_has_sm90_plus) - onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_cuda_plugin_sm90_tma_srcs}) + onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_plugin_sm90_all_srcs}) set_target_properties(onnxruntime_providers_cuda_plugin_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real" CUDA_STANDARD 20 @@ -243,7 +260,8 @@ if(_cuda_plugin_sm90_tma_srcs) target_compile_definitions(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE $) target_compile_options(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE - $) + ${_cuda_plugin_shared_compile_options} + "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">") target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm90_tma) endif() endif() @@ -268,11 +286,71 @@ if(_cuda_plugin_sm120_tma_srcs) target_compile_definitions(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE $) target_compile_options(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE - $) + ${_cuda_plugin_shared_compile_options} + "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">") target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm120_tma) endif() endif() +# Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads. +# Flash Attention V2 kernels require SM80 and are memory-intensive to compile. +if(NOT DEFINED onnxruntime_FLASH_NVCC_THREADS) + set(onnxruntime_FLASH_NVCC_THREADS "1") +endif() +if(_cuda_plugin_flash_attention_srcs) + set(_plugin_flash_cuda_architectures) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 80) + list(APPEND _plugin_flash_cuda_architectures "${_arch}") + endif() + endforeach() + if(_plugin_flash_cuda_architectures) + onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_flash_attention ${_cuda_plugin_flash_attention_srcs}) + set_target_properties(onnxruntime_providers_cuda_plugin_flash_attention PROPERTIES + CUDA_ARCHITECTURES "${_plugin_flash_cuda_architectures}" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE + $) + target_compile_definitions(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE + $) + target_compile_options(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE + ${_cuda_plugin_shared_compile_options} + # Flash attention uses a lower nvcc --threads (memory-intensive compilation). + "$<$:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">") + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_flash_attention) + endif() +endif() + +# LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). +if(_cuda_plugin_llm_srcs) + set(_plugin_llm_cuda_architectures) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL 75) + list(APPEND _plugin_llm_cuda_architectures "${_arch}") + endif() + endforeach() + if(_plugin_llm_cuda_architectures) + onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_llm ${_cuda_plugin_llm_srcs}) + set_target_properties(onnxruntime_providers_cuda_plugin_llm PROPERTIES + CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories(onnxruntime_providers_cuda_plugin_llm PRIVATE + $) + target_compile_definitions(onnxruntime_providers_cuda_plugin_llm PRIVATE + $) + target_compile_options(onnxruntime_providers_cuda_plugin_llm PRIVATE + ${_cuda_plugin_shared_compile_options} + "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">") + target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_llm) + endif() +endif() + # --- Find cuDNN (may be at a custom path via onnxruntime_CUDNN_HOME) --- set(_CUDNN_SEARCH_PATHS "") if(onnxruntime_CUDNN_HOME) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index a061858fa068f..d5f7fd40aa918 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -981,10 +981,24 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) # onnxruntime_providers_cuda_ut is only for unittests. onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $) config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut) + target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils) + # Link architecture-specific OBJECT libraries (same as onnxruntime_providers_cuda). + if(TARGET onnxruntime_providers_cuda_sm90_tma) + target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_sm90_tma) + endif() + if(TARGET onnxruntime_providers_cuda_sm120_tma) + target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_sm120_tma) + endif() + if(TARGET onnxruntime_providers_cuda_flash_attention) + target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_flash_attention) + endif() + if(TARGET onnxruntime_providers_cuda_llm) + target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_llm) + endif() if (MSVC) # Cutlass code has an issue with the following: # warning C4100: 'magic': unreferenced formal parameter From 7861019d3d0c7ee64d4fd1df52eb207a3209d398 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 22 May 2026 13:40:18 -0700 Subject: [PATCH 03/17] use `--nvcc_threads 4 --flash_nvcc_threads 2` in pipelines --- .github/workflows/linux_cuda_ci.yml | 2 +- .github/workflows/linux_cuda_plugin_ci.yml | 2 +- .github/workflows/windows_cuda.yml | 4 +-- .github/workflows/windows_cuda_plugin.yml | 2 +- .github/workflows/windows_tensorrt.yml | 4 +-- tools/ci_build/build.py | 33 ++++++------------- tools/ci_build/build_args.py | 11 +++++-- .../custom-nuget-packaging-pipeline.yml | 2 +- .../stages/nuget-win-cuda-packaging-stage.yml | 8 ++--- .../stages/plugin-win-cuda-stage.yml | 4 +-- .../stages/py-win-gpu-stage.yml | 2 +- .../github/linux/build_cuda_plugin_package.sh | 2 +- .../linux/build_linux_python_package.sh | 2 +- 13 files changed, 36 insertions(+), 42 deletions(-) diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml index cae7a84f49442..0c611119e4964 100644 --- a/.github/workflows/linux_cuda_ci.yml +++ b/.github/workflows/linux_cuda_ci.yml @@ -29,7 +29,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1' docker_image_repo: onnxruntimecuda12manylinuxbuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 3b532c486cdfc..e874d2c09edd2 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -32,7 +32,7 @@ jobs: --use_binskim_compliant_compile_flags --build_wheel --parallel - --nvcc_threads 1 + --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 852d0164083c4..06e6a01cf0fa1 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -115,7 +115,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -235,7 +235,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 07083a5caa08a..8febd8c3d3fc3 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -73,7 +73,7 @@ jobs: --build_dir build ` --skip_submodule_sync ` --parallel ` - --nvcc_threads 1 ` + --nvcc_threads 4 --flash_nvcc_threads 2 ` --use_binskim_compliant_compile_flags ` --cmake_generator "Visual Studio 17 2022" ` --build_shared_lib ` diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 0a47f46aa8516..8c499c3c3b3aa 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -121,7 +121,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -247,7 +247,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index f42617ba1b04c..d06d462c79cbd 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -209,31 +209,14 @@ def number_of_nvcc_threads(args): if args.nvcc_threads >= 0: return args.nvcc_threads - nvcc_threads = 1 - try: - import psutil # noqa: PLC0415 + return 4 - available_memory = psutil.virtual_memory().available - if isinstance(available_memory, int) and available_memory > 0: - if available_memory >= 64 * 1024 * 1024 * 1024: - # When available memory is large enough, chance of OOM is small. - nvcc_threads = min(4, int(available_memory / (8 * 4 * 1024 * 1024 * 1024))) - else: - # NVCC need a lot of memory to compile 48 flash attention cu files. - # Here we select number of threads to ensure each thread has enough memory (>= 4 GB). - memory_per_thread = 4 * 1024 * 1024 * 1024 - fmha_cu_files = 48 - fmha_parallel_jobs = min(fmha_cu_files, number_of_parallel_jobs(args)) - nvcc_threads = max(1, int(available_memory / (memory_per_thread * fmha_parallel_jobs))) - print( - f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}" - ) - except ImportError: - print( - "Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. Use nvcc_threads=1" - ) - return nvcc_threads +def number_of_flash_nvcc_threads(args): + if args.flash_nvcc_threads >= 0: + return args.flash_nvcc_threads + + return number_of_nvcc_threads(args) # See https://learn.microsoft.com/en-us/vcpkg/commands/install @@ -724,6 +707,10 @@ def generate_build_tree( if args.use_cuda: nvcc_threads = number_of_nvcc_threads(args) cmake_args.append("-Donnxruntime_NVCC_THREADS=" + str(nvcc_threads)) + + flash_nvcc_threads = number_of_flash_nvcc_threads(args) + cmake_args.append("-Donnxruntime_FLASH_NVCC_THREADS=" + str(flash_nvcc_threads)) + cmake_args.append(f"-DCMAKE_CUDA_COMPILER={cuda_home}/bin/nvcc") add_default_definition(cmake_extra_defines, "onnxruntime_USE_CUDA", "ON") if args.cuda_version: diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index b40bf4c2b25c6..ca866a2db7ae6 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -647,9 +647,16 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None: cuda_group.add_argument( "--nvcc_threads", nargs="?", - default=-1, # -1 signifies auto-detect based on jobs/memory + default=4, type=int, - help="Max NVCC threads per parallel job (-1=auto).", + help="Max NVCC threads per parallel job (default is 4).", + ) + cuda_group.add_argument( + "--flash_nvcc_threads", + nargs="?", + default=-1, + type=int, + help="Max NVCC threads per parallel job for flash attention (default is same value of --nvcc_threads).", ) # CUDA-specific profiling cuda_group.add_argument( diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index ad6c04e7fd9d2..423ffb917571e 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -97,7 +97,7 @@ extends: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 1 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}" + buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 4 --flash_nvcc_threads 2 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}" runTests: false buildJava: false java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index a2a325f01209a..b072e22818eec 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -73,9 +73,9 @@ stages: packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} ${{ if ne(parameters.win_cudnn_home, '') }}: - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }} + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }} ${{ else }}: - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -96,9 +96,9 @@ stages: CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt ${{ if ne(parameters.win_cudnn_home, '') }}: - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }} + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }} ${{ else }}: - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml index 676e03ddd7ff6..5ff6def4c1e10 100644 --- a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml @@ -127,7 +127,7 @@ stages: --skip_submodule_sync --cmake_generator "$(VSGenerator)" --parallel - --nvcc_threads 1 + --nvcc_threads 4 --flash_nvcc_threads 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags @@ -154,7 +154,7 @@ stages: --skip_submodule_sync --cmake_generator "$(VSGenerator)" --parallel - --nvcc_threads 1 + --nvcc_threads 4 --flash_nvcc_threads 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index 45bc7db05592a..65216e7c59198 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -122,7 +122,7 @@ stages: --enable_pybind --enable_onnx_tests --parallel - --nvcc_threads 1 + --nvcc_threads 4 --flash_nvcc_threads 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh index 1b4e897b05389..ad57648f8af81 100755 --- a/tools/ci_build/github/linux/build_cuda_plugin_package.sh +++ b/tools/ci_build/github/linux/build_cuda_plugin_package.sh @@ -55,7 +55,7 @@ docker run --rm \ --config ${BUILD_CONFIG} \ --skip_submodule_sync \ --parallel \ - --nvcc_threads 1 \ + --nvcc_threads 4 --flash_nvcc_threads 2 \ --use_binskim_compliant_compile_flags \ --use_cuda \ --cuda_version=${SHORT_CUDA_VERSION} \ diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 7ba5406e00ec0..672582edf0518 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -86,7 +86,7 @@ if [ "$BUILD_DEVICE" == "GPU" ]; then CUDA_HOME=/usr/local/cuda fi #Enable CUDA EP. - BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=4" "--flash_nvcc_threads=2" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") # Enable TRT EP only if TensorRT is installed. if [ -f /usr/include/NvInfer.h ]; then BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/usr") From 21d7386819601d29018c2a895cd85975a12fb8a7 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 22 May 2026 20:21:50 -0700 Subject: [PATCH 04/17] exclude sm100 moe gemm kernels --- cmake/onnxruntime_providers_cuda.cmake | 4 ---- cmake/onnxruntime_providers_cuda_plugin.cmake | 3 --- 2 files changed, 7 deletions(-) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index ddf9d9da9c1a0..8d41197e3c13c 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -388,10 +388,6 @@ endif() endif() - if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - target_compile_definitions(${target} PRIVATE COMPILE_BLACKWELL_TMA_GROUPED_GEMMS) - endif() - if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) target_compile_definitions(${target} PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS) endif() diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 5a3df9d03c84b..7e4beb6bc1458 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -223,9 +223,6 @@ if(ORT_HAS_SM90_OR_LATER) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GEMMS) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GROUPED_GEMMS) endif() -if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_TMA_GROUPED_GEMMS) -endif() if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS) endif() From 6a3bc2866643b5b3fcd3391b2e2d699a7d5e478e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 22 May 2026 23:35:39 -0700 Subject: [PATCH 05/17] fix cuda 12.8 plugin CI --- cmake/onnxruntime_providers_cuda.cmake | 2 ++ cmake/onnxruntime_providers_cuda_plugin.cmake | 1 + 2 files changed, 3 insertions(+) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 8d41197e3c13c..79b5669d53d3c 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -249,6 +249,8 @@ endif() # skip diagnosis error caused by cuda header files target_compile_options(${target} PRIVATE "$<$:--diag-suppress=221>") + # NVCC false positive: assigning a [[nodiscard]] Status via operator= is flagged as discarding the value. + target_compile_options(${target} PRIVATE "$<$:--diag-suppress=2810>") # CUDA 12.8 also reports deprecated implicit by-copy 'this' captures from CUTLASS headers. target_compile_options(${target} PRIVATE "$<$:--diag-suppress=2908>") endif() diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 7e4beb6bc1458..5aa96a57670e3 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -202,6 +202,7 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND _cuda_plugin_shared_compile_options "$<$:--static-global-template-stub=false>" "$<$:--diag-suppress=221>" + "$<$:--diag-suppress=2810>" "$<$:--diag-suppress=2908>" ) From 41e2974963ada82dd8b734900cc4bee9f82e9590 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sat, 23 May 2026 00:18:40 -0700 Subject: [PATCH 06/17] use quick build in CI (except linux cuda CI). --- .github/workflows/linux_cuda_plugin_ci.yml | 1 + .github/workflows/linux_tensorrt_ci.yml | 2 +- .github/workflows/windows_cuda.yml | 2 +- .github/workflows/windows_cuda_plugin.yml | 1 + .github/workflows/windows_tensorrt.yml | 2 +- onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc | 5 ++++- onnxruntime/test/python/transformers/test_gqa.py | 1 + 7 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index e874d2c09edd2..2d8eb50e591bb 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -38,6 +38,7 @@ jobs: --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH' run_tests: false diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml index 263d9a98f0bb6..5879d455890cc 100644 --- a/.github/workflows/linux_tensorrt_ci.yml +++ b/.github/workflows/linux_tensorrt_ci.yml @@ -29,7 +29,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host' docker_image_repo: onnxruntimetensorrt86gpubuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 06e6a01cf0fa1..aeb3c3c9c5ef0 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -235,7 +235,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 8febd8c3d3fc3..0dfd7feda388a 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -84,6 +84,7 @@ jobs: --use_vcpkg ` --use_vcpkg_ms_internal_asset_cache ` --enable_cuda_profiling ` + --cmake_extra_defines onnxruntime_QUICK_BUILD=ON ` --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 ` --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 8c499c3c3b3aa..fa051ef694e52 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -121,7 +121,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc index f8bb8d0213099..8212b3b777c12 100644 --- a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc +++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc @@ -140,7 +140,10 @@ std::vector get_candidate_tiles( #ifdef ORT_QUICK_BUILD // Quick build: restrict SM80 tile shapes to the 3 instantiated tile sizes only. // This matches the reduced instantiations in fused_moe_gemm_sm80_f16.generated.cu. - (void)gemm_type; + // SIMT (float) kernels use a different tile shape that must still be returned. + if (gemm_type == CutlassGemmType::Simt) { + return {CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8}; + } return base_configs; #endif diff --git a/onnxruntime/test/python/transformers/test_gqa.py b/onnxruntime/test/python/transformers/test_gqa.py index 55c8b56ae027a..8a98c39d61eb9 100644 --- a/onnxruntime/test/python/transformers/test_gqa.py +++ b/onnxruntime/test/python/transformers/test_gqa.py @@ -2512,6 +2512,7 @@ def test_gqa_fp8_prompt(self): raise @unittest.skipIf(not has_cuda_device(89) or not has_fp8_kv_cache, "FP8 KV cache is not available, skipping tests.") + @unittest.skipIf(quick_build, "Quick build only has hdim128 flash attention kernels; head_size=48 needs hdim64.") def test_gqa_fp8_fallback_unsupported_head_size(self): """ Test GQA with FP8 KV cache on a head size not supported by XQA. From d5c70d53d79dac252f7cf0a53551683077e98a1b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sat, 23 May 2026 01:46:47 -0700 Subject: [PATCH 07/17] Fix plugin CI --- cmake/onnxruntime_providers_cuda.cmake | 2 ++ cmake/onnxruntime_providers_cuda_plugin.cmake | 21 ++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 79b5669d53d3c..c3ec700a48925 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -239,6 +239,8 @@ # cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never # referenced target_compile_options(${target} PRIVATE "$<$:--diag-suppress=177>") + # suppress cudafe "variable was set but never used" (#550-D) from flatbuffers/adapter headers + target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcudafe --diag_suppress=550>") # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 5aa96a57670e3..e74b8e83c3bb3 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -194,15 +194,23 @@ else() endif() # Shared CUDA compile options (excluding --threads, which is set per-target so that # flash attention can use a lower thread count without duplicate-flag nvcc warnings). +# These mirror the options from the parent plugin target and config_cuda_provider_shared_module +# so that OBJECT libraries compiled separately receive the same flags. set(_cuda_plugin_shared_compile_options + # Force NVCC onto C++20 explicitly. With the VS generator the CUDA_STANDARD + # property alone still leaves `-std=c++17` in AdditionalOptions. + "$<$:SHELL:--std c++20>" "$<$:--diag-suppress=177>" + # Suppress cudafe front-end diagnostic 550 (variable set but never used) from third-party headers. + "$<$:SHELL:-Xcudafe --diag_suppress=550>" + # Suppress cudafe [[nodiscard]] false positive on Status assignments. + "$<$:SHELL:-Xcudafe --diag_suppress=2810>" ) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND _cuda_plugin_shared_compile_options "$<$:--static-global-template-stub=false>" "$<$:--diag-suppress=221>" - "$<$:--diag-suppress=2810>" "$<$:--diag-suppress=2908>" ) @@ -213,6 +221,17 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) endif() endif() +if (MSVC) + list(APPEND _cuda_plugin_shared_compile_options + "$<$:SHELL:-Xcompiler /permissive>" + "$<$:SHELL:-Xcompiler /wd4834>" + "$<$:SHELL:-Xcompiler /wd4127>" + "$<$:SHELL:-Xcompiler /wd4211>" + "$<$:SHELL:-Xcompiler /Zc:__cplusplus>" + "$<$:SHELL:-Xcompiler /bigobj>" + ) +endif() + include(cudnn_frontend) include(cutlass) From 64f4acf64ceeda60fbfca42bc04ddc6dc444ac9b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sat, 23 May 2026 21:45:45 -0700 Subject: [PATCH 08/17] CI --nvcc_threads 8 --flash_nvcc_threads 4 --- .github/workflows/linux_cuda_ci.yml | 2 +- .github/workflows/linux_cuda_plugin_ci.yml | 2 +- .github/workflows/linux_tensorrt_ci.yml | 2 +- .github/workflows/windows_cuda.yml | 4 ++-- .github/workflows/windows_cuda_plugin.yml | 2 +- .github/workflows/windows_tensorrt.yml | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml index 0c611119e4964..2e9d5b92655dc 100644 --- a/.github/workflows/linux_cuda_ci.yml +++ b/.github/workflows/linux_cuda_ci.yml @@ -29,7 +29,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1' docker_image_repo: onnxruntimecuda12manylinuxbuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 2d8eb50e591bb..9406d389212ea 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -32,7 +32,7 @@ jobs: --use_binskim_compliant_compile_flags --build_wheel --parallel - --nvcc_threads 4 --flash_nvcc_threads 2 + --nvcc_threads 8 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml index 5879d455890cc..bb6d51f2f7190 100644 --- a/.github/workflows/linux_tensorrt_ci.yml +++ b/.github/workflows/linux_tensorrt_ci.yml @@ -29,7 +29,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host' docker_image_repo: onnxruntimetensorrt86gpubuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index aeb3c3c9c5ef0..c2df2cc56c1f4 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -115,7 +115,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -235,7 +235,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index 0dfd7feda388a..c3e3715b44528 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -73,7 +73,7 @@ jobs: --build_dir build ` --skip_submodule_sync ` --parallel ` - --nvcc_threads 4 --flash_nvcc_threads 2 ` + --nvcc_threads 8 --flash_nvcc_threads 4 ` --use_binskim_compliant_compile_flags ` --cmake_generator "Visual Studio 17 2022" ` --build_shared_lib ` diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index fa051ef694e52..49b36fdb03ce2 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -121,7 +121,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -247,7 +247,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } From d0afc5a96260c56cb4860356d38a03c587bb3476 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sun, 24 May 2026 01:21:30 -0700 Subject: [PATCH 09/17] --nvcc_threads 4 --flash_nvcc_threads 2 in packaging pipeline --- .../github/linux/build_cuda_c_api_package.sh | 3 +- tools/ci_build/github/linux/build_cuda_ci.sh | 53 ------------------- .../linux/build_tensorrt_c_api_package.sh | 3 +- 3 files changed, 4 insertions(+), 55 deletions(-) delete mode 100755 tools/ci_build/github/linux/build_cuda_ci.sh diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 606dcab3631c8..d5b4bde9589a7 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -19,7 +19,8 @@ docker run -e SYSTEM_COLLECTIONURI --rm \ --volume "$HOME/.gradle:/home/onnxruntimedev/.gradle" \ -e NIGHTLY_BUILD "onnxruntimecuda${CUDA_VERSION_MAJOR}build" \ /bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs \ ---build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib \ +--build_dir /build --config Release --skip_submodule_sync --use_binskim_compliant_compile_flags --build_shared_lib \ +--parallel --nvcc_threads 4 --flash_nvcc_threads 2 \ --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \ --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' \ diff --git a/tools/ci_build/github/linux/build_cuda_ci.sh b/tools/ci_build/github/linux/build_cuda_ci.sh deleted file mode 100755 index db48c31e0f2d0..0000000000000 --- a/tools/ci_build/github/linux/build_cuda_ci.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -set -ex -#Every cuda container has this $CUDA_VERSION env var set. -SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') - -BUILD_ARGS=('--config' - 'Release' - '--update' - '--build' - '--skip_submodule_sync' - '--build_shared_lib' - '--parallel' - '--use_vcpkg' - '--use_vcpkg_ms_internal_asset_cache' - '--use_binskim_compliant_compile_flags' - '--build_wheel' - '--enable_onnx_tests' - '--use_cuda' - "--cuda_version=$SHORT_CUDA_VERSION" - "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" - "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" - "--enable_cuda_profiling" - "--enable_pybind" - "--build_java" - "--cmake_extra_defines" - "CMAKE_CUDA_ARCHITECTURES=80" - "onnxruntime_BUILD_UNIT_TESTS=ON" - "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") -if [ -x "$(command -v ninja)" ]; then - BUILD_ARGS+=('--cmake_generator' 'Ninja') -fi - -if [ -d /build ]; then - BUILD_ARGS+=('--build_dir' '/build') -else - BUILD_ARGS+=('--build_dir' 'build') -fi - -if command -v ccache &> /dev/null; then - ccache --zero-stats - BUILD_ARGS+=("--use_cache") -fi - -if [ -f /opt/python/cp312-cp312/bin/python3 ]; then - PATH=/opt/python/cp312-cp312/bin:$PATH python tools/ci_build/build.py "${BUILD_ARGS[@]}" -else - python3 tools/ci_build/build.py "${BUILD_ARGS[@]}" -fi - -if command -v ccache &> /dev/null; then - # FIXME: can't use `-vv` for extra details b/c we're shipping with a decrepit version of ccache (3.something) that doesn't support it. - ccache --show-stats # -vv -fi diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index c027e4a0560a2..b8ffe9fe09679 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -20,7 +20,8 @@ docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volu --volume "$HOME/.gradle:/home/onnxruntimedev/.gradle" \ --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" -e NIGHTLY_BUILD "onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build" \ /bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests \ - --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs \ + --skip_submodule_sync --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs \ + --parallel --nvcc_threads 4 --flash_nvcc_threads 2 \ --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr \ --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' \ --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed" From c4d32b8f66ad12531a148e6b357d37a884476a46 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sun, 24 May 2026 01:48:37 -0700 Subject: [PATCH 10/17] Fix Windows_Packaging_TensorRT --- cmake/onnxruntime_providers_cuda.cmake | 7 +++++++ cmake/onnxruntime_providers_cuda_plugin.cmake | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index c3ec700a48925..06ee5867498bd 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -525,11 +525,18 @@ # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs. + # Excludes SM120+ real (native SASS) architectures because SM120-specific kernels are already + # compiled in the separate SM120 TMA OBJECT library, and compiling the general LLM code for + # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch + # (PTX) is kept so SM120 devices can JIT-compile the code. if(onnxruntime_cuda_llm_srcs) set(_ort_llm_cuda_architectures) foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") if(_arch_num GREATER_EQUAL 75) + if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$") + continue() + endif() list(APPEND _ort_llm_cuda_architectures "${_arch}") endif() endforeach() diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index e74b8e83c3bb3..4ea21e0f4617c 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -342,11 +342,17 @@ if(_cuda_plugin_flash_attention_srcs) endif() # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). +# Excludes SM120+ real (native SASS) architectures — SM120-specific kernels are compiled in +# the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX +# headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept. if(_cuda_plugin_llm_srcs) set(_plugin_llm_cuda_architectures) foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") if(_arch_num GREATER_EQUAL 75) + if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$") + continue() + endif() list(APPEND _plugin_llm_cuda_architectures "${_arch}") endif() endforeach() From f14741f8dfd1a7f5eac9e1b1341bac22b7f8c36f Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sun, 24 May 2026 01:49:27 -0700 Subject: [PATCH 11/17] cuda plugin packaging uses --nvcc_threads 2 --flash_nvcc_threads 1 --- tools/ci_build/github/linux/build_cuda_plugin_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh index ad57648f8af81..958744e000727 100755 --- a/tools/ci_build/github/linux/build_cuda_plugin_package.sh +++ b/tools/ci_build/github/linux/build_cuda_plugin_package.sh @@ -55,7 +55,7 @@ docker run --rm \ --config ${BUILD_CONFIG} \ --skip_submodule_sync \ --parallel \ - --nvcc_threads 4 --flash_nvcc_threads 2 \ + --nvcc_threads 2 --flash_nvcc_threads 1 \ --use_binskim_compliant_compile_flags \ --use_cuda \ --cuda_version=${SHORT_CUDA_VERSION} \ From 6ebd5e4e42995b7c871d2c18b6c219bb2ec489cd Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Sun, 24 May 2026 16:04:26 -0700 Subject: [PATCH 12/17] CI --nvcc_threads 4 --flash_nvcc_threads 4 --- .github/workflows/linux_cuda_ci.yml | 2 +- .github/workflows/linux_cuda_plugin_ci.yml | 2 +- .github/workflows/linux_tensorrt_ci.yml | 2 +- .github/workflows/windows_cuda.yml | 4 ++-- .github/workflows/windows_cuda_plugin.yml | 2 +- .github/workflows/windows_tensorrt.yml | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml index 2e9d5b92655dc..f50c0064dd956 100644 --- a/.github/workflows/linux_cuda_ci.yml +++ b/.github/workflows/linux_cuda_ci.yml @@ -29,7 +29,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1' docker_image_repo: onnxruntimecuda12manylinuxbuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index 9406d389212ea..e7fd32e864129 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -32,7 +32,7 @@ jobs: --use_binskim_compliant_compile_flags --build_wheel --parallel - --nvcc_threads 8 --flash_nvcc_threads 4 + --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml index bb6d51f2f7190..f5704dab8dcfa 100644 --- a/.github/workflows/linux_tensorrt_ci.yml +++ b/.github/workflows/linux_tensorrt_ci.yml @@ -29,7 +29,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host' docker_image_repo: onnxruntimetensorrt86gpubuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr --build_java --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index c2df2cc56c1f4..5c9ff551c2a34 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -115,7 +115,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -235,7 +235,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index c3e3715b44528..a77a407da8e9d 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -73,7 +73,7 @@ jobs: --build_dir build ` --skip_submodule_sync ` --parallel ` - --nvcc_threads 8 --flash_nvcc_threads 4 ` + --nvcc_threads 4 --flash_nvcc_threads 4 ` --use_binskim_compliant_compile_flags ` --cmake_generator "Visual Studio 17 2022" ` --build_shared_lib ` diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index 49b36fdb03ce2..d5710795942d1 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -121,7 +121,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -247,7 +247,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } From 999b02f18b90b7bafe6c57dd1dd011b530f05e25 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 25 May 2026 22:52:06 -0700 Subject: [PATCH 13/17] fix: address review feedback on nvcc_threads defaults and QUICK_BUILD - Remove nargs="?" from --nvcc_threads and --flash_nvcc_threads to prevent TypeError when passed without a value - Add onnxruntime_QUICK_BUILD=ON to Windows CUDA CI build job for consistency with the test job - Update CMake cache default for onnxruntime_NVCC_THREADS from 1 to 4 to match the build.py default --- .github/workflows/windows_cuda.yml | 2 +- cmake/onnxruntime_providers_cuda.cmake | 2 +- tools/ci_build/build_args.py | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 5c9ff551c2a34..53c7031c3c095 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -115,7 +115,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 06ee5867498bd..234ff643e2519 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -233,7 +233,7 @@ # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads # --threads is NOT set here; it is applied per-target after calling this function # so that flash attention can use a different (lower) thread count. - set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.") + set(onnxruntime_NVCC_THREADS "4" CACHE STRING "Number of threads that NVCC can use for compilation.") # suppress warnings like this: # cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index ca866a2db7ae6..278453a9ddea7 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -646,14 +646,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None: cuda_group.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.") cuda_group.add_argument( "--nvcc_threads", - nargs="?", default=4, type=int, help="Max NVCC threads per parallel job (default is 4).", ) cuda_group.add_argument( "--flash_nvcc_threads", - nargs="?", default=-1, type=int, help="Max NVCC threads per parallel job for flash attention (default is same value of --nvcc_threads).", From b5aae5cb2fcb4a62c2818c09734e1491d908ab82 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 26 May 2026 15:00:15 -0700 Subject: [PATCH 14/17] address feedbacks --- .github/workflows/linux_cuda_plugin_ci.yml | 3 +- .github/workflows/windows_cuda_plugin.yml | 3 +- cmake/onnxruntime_cuda_source_filters.cmake | 141 ++++++++++++------ cmake/onnxruntime_providers_cuda.cmake | 27 +--- cmake/onnxruntime_providers_cuda_plugin.cmake | 27 +--- .../stages/plugin-win-cuda-stage.yml | 6 +- .../stages/py-win-gpu-stage.yml | 3 +- .../github/linux/build_cuda_plugin_package.sh | 3 +- .../linux/build_linux_python_package.sh | 4 +- 9 files changed, 113 insertions(+), 104 deletions(-) diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml index e7fd32e864129..d2491f59812ab 100644 --- a/.github/workflows/linux_cuda_plugin_ci.yml +++ b/.github/workflows/linux_cuda_plugin_ci.yml @@ -32,7 +32,8 @@ jobs: --use_binskim_compliant_compile_flags --build_wheel --parallel - --nvcc_threads 4 --flash_nvcc_threads 4 + --nvcc_threads 4 + --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml index a77a407da8e9d..f9acdbd76a12d 100644 --- a/.github/workflows/windows_cuda_plugin.yml +++ b/.github/workflows/windows_cuda_plugin.yml @@ -73,7 +73,8 @@ jobs: --build_dir build ` --skip_submodule_sync ` --parallel ` - --nvcc_threads 4 --flash_nvcc_threads 4 ` + --nvcc_threads 4 ` + --flash_nvcc_threads 4 ` --use_binskim_compliant_compile_flags ` --cmake_generator "Visual Studio 17 2022" ` --build_shared_lib ` diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake index 761f43aab9f48..752f911d0fd1a 100644 --- a/cmake/onnxruntime_cuda_source_filters.cmake +++ b/cmake/onnxruntime_cuda_source_filters.cmake @@ -8,39 +8,43 @@ # Usage: # onnxruntime_filter_cuda_cu_sources() # -# The macro modifies the named list variable in the caller's scope. +# The function modifies the named list variable in the caller's scope. + +function(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST) + set(_list "${${CU_SRC_LIST}}") -macro(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST) # Quick build mode: Filter flash attention kernels for faster development iteration. # - We keep only hdim128 fp16 flash attention kernels in quick build mode. # - All other listed head dimensions are excluded (e.g., 32, 64, 96, 192, 256). # If new head dimensions are added or removed, update this list to match the supported set. if(onnxruntime_QUICK_BUILD) message(STATUS "Quick build mode enabled: Only building hdim128 fp16 flash attention kernels") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)") + list(FILTER _list EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)") endif() if(NOT onnxruntime_USE_FP4_QMOE) - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_.*\\.generated\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_.*\\.generated\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp4\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_.*\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_.*\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp4\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu") else() # CUDA 13 PTXAS does not complete the FP4 M=128/N=64 pingpong specializations in # this build configuration. The dispatcher routes that tile through cooperative # mainloop variants instead, so exclude only those unused generated units. - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_(fp16|bf16)_m128_n64_k[0-9]+_cm[12]_cn[12]_pp(_finalize)?\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_(fp16|bf16)_m128_n64_k[0-9]+_cm[12]_cn[12]_pp(_finalize)?\\.generated\\.cu") endif() if(NOT onnxruntime_USE_FP8_QMOE) - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_wfp8_.*\\.generated\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_fp8_.*\\.generated\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp8\\.cu") - list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm90_wfp8_.*\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_fp8_.*\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp8\\.cu") + list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu") endif() -endmacro() + + set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE) +endfunction() # Extract SM90/SM120 TMA warp-specialized generated source files from a CUDA source list. # These files use CUTLASS 3.x features (GMMA, TMA) that are specific to SM90+ or SM120+. @@ -57,35 +61,41 @@ endmacro() # SM90_SOURCES SM120_SOURCES ) # # Removes matched files from and stores them in the output variables. -macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST) - cmake_parse_arguments(_EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "" ${ARGN}) +function(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST) + cmake_parse_arguments(PARSE_ARGV 1 _EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "") + + set(_list "${${CU_SRC_LIST}}") # Extract SM90 TMA WS generated files - set(${_EXTRACT_SM90_SOURCES}) + set(_sm90_srcs) if(ORT_HAS_SM90_OR_LATER) - foreach(_src IN LISTS ${CU_SRC_LIST}) + foreach(_src IN LISTS _list) if(_src MATCHES "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$") - list(APPEND ${_EXTRACT_SM90_SOURCES} "${_src}") + list(APPEND _sm90_srcs "${_src}") endif() endforeach() - if(${_EXTRACT_SM90_SOURCES}) - list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM90_SOURCES}}) + if(_sm90_srcs) + list(REMOVE_ITEM _list ${_sm90_srcs}) endif() endif() # Extract SM120 TMA WS generated files - set(${_EXTRACT_SM120_SOURCES}) + set(_sm120_srcs) if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG) - foreach(_src IN LISTS ${CU_SRC_LIST}) + foreach(_src IN LISTS _list) if(_src MATCHES "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$") - list(APPEND ${_EXTRACT_SM120_SOURCES} "${_src}") + list(APPEND _sm120_srcs "${_src}") endif() endforeach() - if(${_EXTRACT_SM120_SOURCES}) - list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM120_SOURCES}}) + if(_sm120_srcs) + list(REMOVE_ITEM _list ${_sm120_srcs}) endif() endif() -endmacro() + + set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE) + set("${_EXTRACT_SM90_SOURCES}" "${_sm90_srcs}" PARENT_SCOPE) + set("${_EXTRACT_SM120_SOURCES}" "${_sm120_srcs}" PARENT_SCOPE) +endfunction() # Extract Flash Attention CUDA source files into a separate list for compilation # in a dedicated OBJECT library with SM80+ architectures and independent nvcc_threads. @@ -98,19 +108,23 @@ endmacro() # Usage: # onnxruntime_extract_flash_attention_sources( # FLASH_SOURCES ) -macro(onnxruntime_extract_flash_attention_sources CU_SRC_LIST) - cmake_parse_arguments(_FA "" "FLASH_SOURCES" "" ${ARGN}) +function(onnxruntime_extract_flash_attention_sources CU_SRC_LIST) + cmake_parse_arguments(PARSE_ARGV 1 _FA "" "FLASH_SOURCES" "") - set(${_FA_FLASH_SOURCES}) - foreach(_src IN LISTS ${CU_SRC_LIST}) + set(_list "${${CU_SRC_LIST}}") + set(_flash_srcs) + foreach(_src IN LISTS _list) if(_src MATCHES "/bert/flash_attention/.*\\.cu$") - list(APPEND ${_FA_FLASH_SOURCES} "${_src}") + list(APPEND _flash_srcs "${_src}") endif() endforeach() - if(${_FA_FLASH_SOURCES}) - list(REMOVE_ITEM ${CU_SRC_LIST} ${${_FA_FLASH_SOURCES}}) + if(_flash_srcs) + list(REMOVE_ITEM _list ${_flash_srcs}) endif() -endmacro() + + set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE) + set("${_FA_FLASH_SOURCES}" "${_flash_srcs}" PARENT_SCOPE) +endfunction() # Extract LLM CUDA source files into separate lists for per-architecture compilation. # The LLM directory (contrib_ops/cuda/llm/) contains kernels with minimum SM75 support @@ -119,31 +133,60 @@ endmacro() # at 90a-real (merged into the SM90 TMA OBJECT library). # # Note: SM90 TMA MoE GEMM files are already extracted by -# onnxruntime_extract_sm_specific_cuda_sources() before this macro is called. +# onnxruntime_extract_sm_specific_cuda_sources() before this function is called. # # Usage: # onnxruntime_extract_llm_sources( # LLM_SOURCES # LLM_SM90_SOURCES ) -macro(onnxruntime_extract_llm_sources CU_SRC_LIST) - cmake_parse_arguments(_LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "" ${ARGN}) +function(onnxruntime_extract_llm_sources CU_SRC_LIST) + cmake_parse_arguments(PARSE_ARGV 1 _LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "") - set(${_LLM_LLM_SOURCES}) - set(${_LLM_LLM_SM90_SOURCES}) - foreach(_src IN LISTS ${CU_SRC_LIST}) + set(_list "${${CU_SRC_LIST}}") + set(_llm_srcs) + set(_llm_sm90_srcs) + foreach(_src IN LISTS _list) if(_src MATCHES "/contrib_ops/cuda/llm/.*\\.cu$") # SM90-specific fpA_intB launchers (guarded by #ifndef EXCLUDE_SM_90) if(_src MATCHES "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$") - list(APPEND ${_LLM_LLM_SM90_SOURCES} "${_src}") + list(APPEND _llm_sm90_srcs "${_src}") else() - list(APPEND ${_LLM_LLM_SOURCES} "${_src}") + list(APPEND _llm_srcs "${_src}") endif() endif() endforeach() - if(${_LLM_LLM_SOURCES}) - list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SOURCES}}) + if(_llm_srcs) + list(REMOVE_ITEM _list ${_llm_srcs}) endif() - if(${_LLM_LLM_SM90_SOURCES}) - list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SM90_SOURCES}}) + if(_llm_sm90_srcs) + list(REMOVE_ITEM _list ${_llm_sm90_srcs}) endif() -endmacro() + + set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE) + set("${_LLM_LLM_SOURCES}" "${_llm_srcs}" PARENT_SCOPE) + set("${_LLM_LLM_SM90_SOURCES}" "${_llm_sm90_srcs}" PARENT_SCOPE) +endfunction() + +# Filter CMAKE_CUDA_ARCHITECTURES to only those >= a minimum SM version. +# Optionally excludes SM120+ real architectures (for LLM targets that hit +# CCCL tcgen05 PTX issues on Windows/MSVC when compiled for sm_120a native). +# +# Usage: +# onnxruntime_filter_cuda_archs( +# MIN_SM +# [EXCLUDE_SM120_REAL]) +function(onnxruntime_filter_cuda_archs OUTPUT_VAR) + cmake_parse_arguments(PARSE_ARGV 1 _FCA "EXCLUDE_SM120_REAL" "MIN_SM" "") + + set(_filtered) + foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") + if(_arch_num GREATER_EQUAL "${_FCA_MIN_SM}") + if(_FCA_EXCLUDE_SM120_REAL AND _arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$") + continue() + endif() + list(APPEND _filtered "${_arch}") + endif() + endforeach() + set("${OUTPUT_VAR}" "${_filtered}" PARENT_SCOPE) +endfunction() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 234ff643e2519..8ddd29763ff39 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -482,13 +482,7 @@ endif() if(onnxruntime_cuda_sm120_tma_srcs) - set(_ort_sm120_cuda_architectures) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 120) - list(APPEND _ort_sm120_cuda_architectures "${_arch}") - endif() - endforeach() + onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120) if(_ort_sm120_cuda_architectures) onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs}) set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}") @@ -504,13 +498,7 @@ set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).") if(onnxruntime_cuda_flash_attention_srcs) - set(_ort_flash_cuda_architectures) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 80) - list(APPEND _ort_flash_cuda_architectures "${_arch}") - endif() - endforeach() + onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80) if(_ort_flash_cuda_architectures) onnxruntime_add_object_library(onnxruntime_providers_cuda_flash_attention ${onnxruntime_cuda_flash_attention_srcs}) set_target_properties(onnxruntime_providers_cuda_flash_attention PROPERTIES @@ -530,16 +518,7 @@ # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch # (PTX) is kept so SM120 devices can JIT-compile the code. if(onnxruntime_cuda_llm_srcs) - set(_ort_llm_cuda_architectures) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 75) - if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$") - continue() - endif() - list(APPEND _ort_llm_cuda_architectures "${_arch}") - endif() - endforeach() + onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) if(_ort_llm_cuda_architectures) onnxruntime_add_object_library(onnxruntime_providers_cuda_llm ${onnxruntime_cuda_llm_srcs}) set_target_properties(onnxruntime_providers_cuda_llm PROPERTIES diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 4ea21e0f4617c..cd7810e7c4d91 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -284,13 +284,7 @@ if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs) endif() if(_cuda_plugin_sm120_tma_srcs) - set(_plugin_sm120_cuda_architectures) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 120) - list(APPEND _plugin_sm120_cuda_architectures "${_arch}") - endif() - endforeach() + onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120) if(_plugin_sm120_cuda_architectures) onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm120_tma ${_cuda_plugin_sm120_tma_srcs}) set_target_properties(onnxruntime_providers_cuda_plugin_sm120_tma PROPERTIES @@ -315,13 +309,7 @@ if(NOT DEFINED onnxruntime_FLASH_NVCC_THREADS) set(onnxruntime_FLASH_NVCC_THREADS "1") endif() if(_cuda_plugin_flash_attention_srcs) - set(_plugin_flash_cuda_architectures) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 80) - list(APPEND _plugin_flash_cuda_architectures "${_arch}") - endif() - endforeach() + onnxruntime_filter_cuda_archs(_plugin_flash_cuda_architectures MIN_SM 80) if(_plugin_flash_cuda_architectures) onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_flash_attention ${_cuda_plugin_flash_attention_srcs}) set_target_properties(onnxruntime_providers_cuda_plugin_flash_attention PROPERTIES @@ -346,16 +334,7 @@ endif() # the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX # headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept. if(_cuda_plugin_llm_srcs) - set(_plugin_llm_cuda_architectures) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 75) - if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$") - continue() - endif() - list(APPEND _plugin_llm_cuda_architectures "${_arch}") - endif() - endforeach() + onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) if(_plugin_llm_cuda_architectures) onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_llm ${_cuda_plugin_llm_srcs}) set_target_properties(onnxruntime_providers_cuda_plugin_llm PROPERTIES diff --git a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml index 5ff6def4c1e10..e2618165b9b58 100644 --- a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml @@ -127,7 +127,8 @@ stages: --skip_submodule_sync --cmake_generator "$(VSGenerator)" --parallel - --nvcc_threads 4 --flash_nvcc_threads 2 + --nvcc_threads 4 + --flash_nvcc_threads 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags @@ -154,7 +155,8 @@ stages: --skip_submodule_sync --cmake_generator "$(VSGenerator)" --parallel - --nvcc_threads 4 --flash_nvcc_threads 2 + --nvcc_threads 4 + --flash_nvcc_threads 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index 65216e7c59198..e49b94cbc2c56 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -122,7 +122,8 @@ stages: --enable_pybind --enable_onnx_tests --parallel - --nvcc_threads 4 --flash_nvcc_threads 2 + --nvcc_threads 4 + --flash_nvcc_threads 2 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh index 958744e000727..5feae049acb22 100755 --- a/tools/ci_build/github/linux/build_cuda_plugin_package.sh +++ b/tools/ci_build/github/linux/build_cuda_plugin_package.sh @@ -55,7 +55,8 @@ docker run --rm \ --config ${BUILD_CONFIG} \ --skip_submodule_sync \ --parallel \ - --nvcc_threads 2 --flash_nvcc_threads 1 \ + --nvcc_threads 2 \ + --flash_nvcc_threads 1 \ --use_binskim_compliant_compile_flags \ --use_cuda \ --cuda_version=${SHORT_CUDA_VERSION} \ diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 672582edf0518..1df9733247fc2 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -86,7 +86,9 @@ if [ "$BUILD_DEVICE" == "GPU" ]; then CUDA_HOME=/usr/local/cuda fi #Enable CUDA EP. - BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=4" "--flash_nvcc_threads=2" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME") + BUILD_ARGS+=("--nvcc_threads=4" "--flash_nvcc_threads=2") + BUILD_ARGS+=("--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") # Enable TRT EP only if TensorRT is installed. if [ -f /usr/include/NvInfer.h ]; then BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/usr") From 86c45a412e6c3ac5bb0f2ea8f09444d9c914a8d4 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 26 May 2026 15:23:48 -0700 Subject: [PATCH 15/17] refactoring --- cmake/onnxruntime_cuda_source_filters.cmake | 53 +++++++++++ cmake/onnxruntime_providers_cuda.cmake | 59 ++++++------- cmake/onnxruntime_providers_cuda_plugin.cmake | 88 ++++++------------- 3 files changed, 105 insertions(+), 95 deletions(-) diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake index 752f911d0fd1a..3af4b71b90988 100644 --- a/cmake/onnxruntime_cuda_source_filters.cmake +++ b/cmake/onnxruntime_cuda_source_filters.cmake @@ -190,3 +190,56 @@ function(onnxruntime_filter_cuda_archs OUTPUT_VAR) endforeach() set("${OUTPUT_VAR}" "${_filtered}" PARENT_SCOPE) endfunction() + +# Create a CUDA OBJECT library for the in-tree CUDA EP and link it to the parent target. +# Uses config_cuda_provider_shared_module() for full configuration (includes, link libs, +# PCH, platform flags, etc.), then applies nvcc --threads. +# +# Usage: +# onnxruntime_add_cuda_object_library( +# NAME +# PARENT +# CUDA_ARCHITECTURES +# NVCC_THREADS +# SOURCES ) +function(onnxruntime_add_cuda_object_library) + cmake_parse_arguments(PARSE_ARGV 0 _ARG "" "NAME;PARENT;CUDA_ARCHITECTURES;NVCC_THREADS" "SOURCES") + + onnxruntime_add_object_library("${_ARG_NAME}" ${_ARG_SOURCES}) + set_target_properties("${_ARG_NAME}" PROPERTIES CUDA_ARCHITECTURES "${_ARG_CUDA_ARCHITECTURES}") + config_cuda_provider_shared_module("${_ARG_NAME}") + target_compile_options("${_ARG_NAME}" PRIVATE + "$<$:SHELL:--threads \"${_ARG_NVCC_THREADS}\">") + target_link_libraries("${_ARG_PARENT}" PRIVATE "${_ARG_NAME}") +endfunction() + +# Create a CUDA OBJECT library for the plugin EP and link it to the parent target. +# Handles the boilerplate: set CUDA_ARCHITECTURES/CUDA_STANDARD, propagate includes +# and compile definitions from the parent, apply shared compile options + nvcc --threads. +# +# Usage: +# onnxruntime_add_cuda_plugin_object_library( +# NAME +# PARENT +# CUDA_ARCHITECTURES +# NVCC_THREADS +# COMPILE_OPTIONS +# SOURCES ) +function(onnxruntime_add_cuda_plugin_object_library) + cmake_parse_arguments(PARSE_ARGV 0 _ARG "" "NAME;PARENT;CUDA_ARCHITECTURES;NVCC_THREADS" "SOURCES;COMPILE_OPTIONS") + + onnxruntime_add_object_library("${_ARG_NAME}" ${_ARG_SOURCES}) + set_target_properties("${_ARG_NAME}" PROPERTIES + CUDA_ARCHITECTURES "${_ARG_CUDA_ARCHITECTURES}" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + target_include_directories("${_ARG_NAME}" PRIVATE + $) + target_compile_definitions("${_ARG_NAME}" PRIVATE + $) + target_compile_options("${_ARG_NAME}" PRIVATE + ${_ARG_COMPILE_OPTIONS} + "$<$:SHELL:--threads \"${_ARG_NVCC_THREADS}\">") + target_link_libraries("${_ARG_PARENT}" PRIVATE "${_ARG_NAME}") +endfunction() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 8ddd29763ff39..1c2f9291bb5d1 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -464,31 +464,26 @@ # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat. # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs}) - set(_ort_has_sm90_plus FALSE) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 90) - set(_ort_has_sm90_plus TRUE) - break() - endif() - endforeach() - if(_ort_has_sm90_plus) - onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${_ort_sm90_all_srcs}) - set_target_properties(onnxruntime_providers_cuda_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real") - config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm90_tma) - target_compile_options(onnxruntime_providers_cuda_sm90_tma PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") - target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm90_tma) + onnxruntime_filter_cuda_archs(_ort_sm90_check MIN_SM 90) + if(_ort_sm90_check) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_sm90_tma + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "90a-real" + NVCC_THREADS "${onnxruntime_NVCC_THREADS}" + SOURCES ${_ort_sm90_all_srcs}) endif() endif() if(onnxruntime_cuda_sm120_tma_srcs) onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120) if(_ort_sm120_cuda_architectures) - onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs}) - set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}") - config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm120_tma) - target_compile_options(onnxruntime_providers_cuda_sm120_tma PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") - target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm120_tma) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_sm120_tma + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}" + NVCC_THREADS "${onnxruntime_NVCC_THREADS}" + SOURCES ${onnxruntime_cuda_sm120_tma_srcs}) endif() endif() @@ -500,14 +495,12 @@ if(onnxruntime_cuda_flash_attention_srcs) onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80) if(_ort_flash_cuda_architectures) - onnxruntime_add_object_library(onnxruntime_providers_cuda_flash_attention ${onnxruntime_cuda_flash_attention_srcs}) - set_target_properties(onnxruntime_providers_cuda_flash_attention PROPERTIES - CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}") - config_cuda_provider_shared_module(onnxruntime_providers_cuda_flash_attention) - target_compile_options(onnxruntime_providers_cuda_flash_attention PRIVATE - "$<$:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">" - ) - target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_flash_attention) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_flash_attention + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}" + NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}" + SOURCES ${onnxruntime_cuda_flash_attention_srcs}) endif() endif() @@ -520,12 +513,12 @@ if(onnxruntime_cuda_llm_srcs) onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) if(_ort_llm_cuda_architectures) - onnxruntime_add_object_library(onnxruntime_providers_cuda_llm ${onnxruntime_cuda_llm_srcs}) - set_target_properties(onnxruntime_providers_cuda_llm PROPERTIES - CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}") - config_cuda_provider_shared_module(onnxruntime_providers_cuda_llm) - target_compile_options(onnxruntime_providers_cuda_llm PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") - target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_llm) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_llm + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}" + NVCC_THREADS "${onnxruntime_NVCC_THREADS}" + SOURCES ${onnxruntime_cuda_llm_srcs}) endif() endif() diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index cd7810e7c4d91..313285426e0ef 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -257,49 +257,28 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs) set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs}) - set(_plugin_has_sm90_plus FALSE) - foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES) - string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}") - if(_arch_num GREATER_EQUAL 90) - set(_plugin_has_sm90_plus TRUE) - break() - endif() - endforeach() - if(_plugin_has_sm90_plus) - onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_plugin_sm90_all_srcs}) - set_target_properties(onnxruntime_providers_cuda_plugin_sm90_tma PROPERTIES + onnxruntime_filter_cuda_archs(_plugin_sm90_check MIN_SM 90) + if(_plugin_sm90_check) + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_sm90_tma + PARENT onnxruntime_providers_cuda_plugin CUDA_ARCHITECTURES "90a-real" - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON - ) - target_include_directories(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE - $) - target_compile_definitions(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE - $) - target_compile_options(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE - ${_cuda_plugin_shared_compile_options} - "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">") - target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm90_tma) + NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_plugin_sm90_all_srcs}) endif() endif() if(_cuda_plugin_sm120_tma_srcs) onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120) if(_plugin_sm120_cuda_architectures) - onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm120_tma ${_cuda_plugin_sm120_tma_srcs}) - set_target_properties(onnxruntime_providers_cuda_plugin_sm120_tma PROPERTIES + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_sm120_tma + PARENT onnxruntime_providers_cuda_plugin CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}" - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON - ) - target_include_directories(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE - $) - target_compile_definitions(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE - $) - target_compile_options(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE - ${_cuda_plugin_shared_compile_options} - "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">") - target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm120_tma) + NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_cuda_plugin_sm120_tma_srcs}) endif() endif() @@ -311,21 +290,13 @@ endif() if(_cuda_plugin_flash_attention_srcs) onnxruntime_filter_cuda_archs(_plugin_flash_cuda_architectures MIN_SM 80) if(_plugin_flash_cuda_architectures) - onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_flash_attention ${_cuda_plugin_flash_attention_srcs}) - set_target_properties(onnxruntime_providers_cuda_plugin_flash_attention PROPERTIES + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_flash_attention + PARENT onnxruntime_providers_cuda_plugin CUDA_ARCHITECTURES "${_plugin_flash_cuda_architectures}" - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON - ) - target_include_directories(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE - $) - target_compile_definitions(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE - $) - target_compile_options(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE - ${_cuda_plugin_shared_compile_options} - # Flash attention uses a lower nvcc --threads (memory-intensive compilation). - "$<$:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">") - target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_flash_attention) + NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_cuda_plugin_flash_attention_srcs}) endif() endif() @@ -336,20 +307,13 @@ endif() if(_cuda_plugin_llm_srcs) onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) if(_plugin_llm_cuda_architectures) - onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_llm ${_cuda_plugin_llm_srcs}) - set_target_properties(onnxruntime_providers_cuda_plugin_llm PROPERTIES + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_llm + PARENT onnxruntime_providers_cuda_plugin CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}" - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON - ) - target_include_directories(onnxruntime_providers_cuda_plugin_llm PRIVATE - $) - target_compile_definitions(onnxruntime_providers_cuda_plugin_llm PRIVATE - $) - target_compile_options(onnxruntime_providers_cuda_plugin_llm PRIVATE - ${_cuda_plugin_shared_compile_options} - "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">") - target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_llm) + NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_cuda_plugin_llm_srcs}) endif() endif() From 6636c99d7e60528bfb8189996d629b0aa9382fa3 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 26 May 2026 15:28:14 -0700 Subject: [PATCH 16/17] handle disable contrib ops nicely --- cmake/onnxruntime_providers_cuda.cmake | 124 ++++++++++-------- cmake/onnxruntime_providers_cuda_plugin.cmake | 92 +++++++------ 2 files changed, 117 insertions(+), 99 deletions(-) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 1c2f9291bb5d1..b23c978d2d7b7 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -456,69 +456,79 @@ config_cuda_provider_shared_module(onnxruntime_providers_cuda) target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">") - # Create OBJECT libraries for SM90/SM120 TMA WS generated files that must be compiled + # Create OBJECT libraries for SM-specific contrib CUDA sources that must be compiled # with restricted CUDA architectures. These files use CUTLASS 3.x SM90+/SM120+ features # (GMMA, TMA) that cannot produce useful device code for older architectures. - if(onnxruntime_cuda_sm90_tma_srcs OR onnxruntime_cuda_llm_sm90_srcs) - # SM90 TMA warp-specialized files use SM90-specific collective operations. - # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat. - # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). - set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs}) - onnxruntime_filter_cuda_archs(_ort_sm90_check MIN_SM 90) - if(_ort_sm90_check) - onnxruntime_add_cuda_object_library( - NAME onnxruntime_providers_cuda_sm90_tma - PARENT onnxruntime_providers_cuda - CUDA_ARCHITECTURES "90a-real" - NVCC_THREADS "${onnxruntime_NVCC_THREADS}" - SOURCES ${_ort_sm90_all_srcs}) + # + # SM90/SM120 TMA and LLM OBJECT libraries contain MoE and MatMulNBits kernels (contrib ops). + # Flash Attention is also used by the ONNX domain Attention op, so it is included even + # when contrib ops are disabled. + if(NOT onnxruntime_CUDA_MINIMAL) + # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads. + # Flash Attention V2 kernels require SM80 (Ampere) and are memory-intensive to compile. + # Isolating them allows the rest of the build to use higher --threads without OOM. + # Included even with onnxruntime_DISABLE_CONTRIB_OPS because the ONNX domain Attention + # kernel depends on flash attention infrastructure in contrib_ops/cuda/bert/. + set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING + "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).") + if(onnxruntime_cuda_flash_attention_srcs) + onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80) + if(_ort_flash_cuda_architectures) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_flash_attention + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}" + NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}" + SOURCES ${onnxruntime_cuda_flash_attention_srcs}) + endif() endif() - endif() - if(onnxruntime_cuda_sm120_tma_srcs) - onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120) - if(_ort_sm120_cuda_architectures) - onnxruntime_add_cuda_object_library( - NAME onnxruntime_providers_cuda_sm120_tma - PARENT onnxruntime_providers_cuda - CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}" - NVCC_THREADS "${onnxruntime_NVCC_THREADS}" - SOURCES ${onnxruntime_cuda_sm120_tma_srcs}) - endif() - endif() + if(NOT onnxruntime_DISABLE_CONTRIB_OPS) + # SM90 TMA warp-specialized files use SM90-specific collective operations. + # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat. + # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). + if(onnxruntime_cuda_sm90_tma_srcs OR onnxruntime_cuda_llm_sm90_srcs) + set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs}) + onnxruntime_filter_cuda_archs(_ort_sm90_check MIN_SM 90) + if(_ort_sm90_check) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_sm90_tma + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "90a-real" + NVCC_THREADS "${onnxruntime_NVCC_THREADS}" + SOURCES ${_ort_sm90_all_srcs}) + endif() + endif() - # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads. - # Flash Attention V2 kernels require SM80 (Ampere) and are memory-intensive to compile. - # Isolating them allows the rest of the build to use higher --threads without OOM. - set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING - "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).") - if(onnxruntime_cuda_flash_attention_srcs) - onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80) - if(_ort_flash_cuda_architectures) - onnxruntime_add_cuda_object_library( - NAME onnxruntime_providers_cuda_flash_attention - PARENT onnxruntime_providers_cuda - CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}" - NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}" - SOURCES ${onnxruntime_cuda_flash_attention_srcs}) - endif() - endif() + if(onnxruntime_cuda_sm120_tma_srcs) + onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120) + if(_ort_sm120_cuda_architectures) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_sm120_tma + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}" + NVCC_THREADS "${onnxruntime_NVCC_THREADS}" + SOURCES ${onnxruntime_cuda_sm120_tma_srcs}) + endif() + endif() - # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). - # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs. - # Excludes SM120+ real (native SASS) architectures because SM120-specific kernels are already - # compiled in the separate SM120 TMA OBJECT library, and compiling the general LLM code for - # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch - # (PTX) is kept so SM120 devices can JIT-compile the code. - if(onnxruntime_cuda_llm_srcs) - onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) - if(_ort_llm_cuda_architectures) - onnxruntime_add_cuda_object_library( - NAME onnxruntime_providers_cuda_llm - PARENT onnxruntime_providers_cuda - CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}" - NVCC_THREADS "${onnxruntime_NVCC_THREADS}" - SOURCES ${onnxruntime_cuda_llm_srcs}) + # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). + # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs. + # Excludes SM120+ real (native SASS) architectures because SM120-specific kernels are already + # compiled in the separate SM120 TMA OBJECT library, and compiling the general LLM code for + # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch + # (PTX) is kept so SM120 devices can JIT-compile the code. + if(onnxruntime_cuda_llm_srcs) + onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) + if(_ort_llm_cuda_architectures) + onnxruntime_add_cuda_object_library( + NAME onnxruntime_providers_cuda_llm + PARENT onnxruntime_providers_cuda + CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}" + NVCC_THREADS "${onnxruntime_NVCC_THREADS}" + SOURCES ${onnxruntime_cuda_llm_srcs}) + endif() + endif() endif() endif() diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index 313285426e0ef..d8435e321c7ce 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -253,37 +253,14 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE "$<$:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">" ) -# SM90/SM120 TMA WS OBJECT libraries — compiled with restricted CUDA architectures. -# Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). -if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs) - set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs}) - onnxruntime_filter_cuda_archs(_plugin_sm90_check MIN_SM 90) - if(_plugin_sm90_check) - onnxruntime_add_cuda_plugin_object_library( - NAME onnxruntime_providers_cuda_plugin_sm90_tma - PARENT onnxruntime_providers_cuda_plugin - CUDA_ARCHITECTURES "90a-real" - NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" - COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} - SOURCES ${_plugin_sm90_all_srcs}) - endif() -endif() - -if(_cuda_plugin_sm120_tma_srcs) - onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120) - if(_plugin_sm120_cuda_architectures) - onnxruntime_add_cuda_plugin_object_library( - NAME onnxruntime_providers_cuda_plugin_sm120_tma - PARENT onnxruntime_providers_cuda_plugin - CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}" - NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" - COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} - SOURCES ${_cuda_plugin_sm120_tma_srcs}) - endif() -endif() +# SM-specific OBJECT libraries — compiled with restricted CUDA architectures. +# Flash Attention is also used by the ONNX domain Attention op, so it is always included. +# SM90/SM120 TMA and LLM contain MoE and MatMulNBits kernels (contrib ops only). # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads. # Flash Attention V2 kernels require SM80 and are memory-intensive to compile. +# Included even with onnxruntime_DISABLE_CONTRIB_OPS because the ONNX domain Attention +# kernel depends on flash attention infrastructure in contrib_ops/cuda/bert/. if(NOT DEFINED onnxruntime_FLASH_NVCC_THREADS) set(onnxruntime_FLASH_NVCC_THREADS "1") endif() @@ -300,20 +277,51 @@ if(_cuda_plugin_flash_attention_srcs) endif() endif() -# LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). -# Excludes SM120+ real (native SASS) architectures — SM120-specific kernels are compiled in -# the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX -# headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept. -if(_cuda_plugin_llm_srcs) - onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) - if(_plugin_llm_cuda_architectures) - onnxruntime_add_cuda_plugin_object_library( - NAME onnxruntime_providers_cuda_plugin_llm - PARENT onnxruntime_providers_cuda_plugin - CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}" - NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" - COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} - SOURCES ${_cuda_plugin_llm_srcs}) +if(NOT onnxruntime_DISABLE_CONTRIB_OPS) + # SM90 TMA warp-specialized files use SM90-specific collective operations. + # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90). + if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs) + set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs}) + onnxruntime_filter_cuda_archs(_plugin_sm90_check MIN_SM 90) + if(_plugin_sm90_check) + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_sm90_tma + PARENT onnxruntime_providers_cuda_plugin + CUDA_ARCHITECTURES "90a-real" + NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_plugin_sm90_all_srcs}) + endif() + endif() + + if(_cuda_plugin_sm120_tma_srcs) + onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120) + if(_plugin_sm120_cuda_architectures) + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_sm120_tma + PARENT onnxruntime_providers_cuda_plugin + CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}" + NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_cuda_plugin_sm120_tma_srcs}) + endif() + endif() + + # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75). + # Excludes SM120+ real (native SASS) architectures — SM120-specific kernels are compiled in + # the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX + # headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept. + if(_cuda_plugin_llm_srcs) + onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL) + if(_plugin_llm_cuda_architectures) + onnxruntime_add_cuda_plugin_object_library( + NAME onnxruntime_providers_cuda_plugin_llm + PARENT onnxruntime_providers_cuda_plugin + CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}" + NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}" + COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} + SOURCES ${_cuda_plugin_llm_srcs}) + endif() endif() endif() From e81b491bbae9d712b18595cd171c1f7df9f856be Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 26 May 2026 17:20:49 -0700 Subject: [PATCH 17/17] fix: address review feedback on flash attention quick-build and pre-SM80 fallback - Update quick-build comment to reflect that both FP16 and BF16 hdim128 kernels are intentionally retained (not just FP16). - Add fallback for pre-SM80 builds: when no SM80+ architectures are configured, flash attention sources are added back to the parent target so the linker can find host-side symbols referenced by flash_api.cc. --- cmake/onnxruntime_cuda_source_filters.cmake | 4 ++-- cmake/onnxruntime_providers_cuda.cmake | 5 +++++ cmake/onnxruntime_providers_cuda_plugin.cmake | 5 +++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake index 3af4b71b90988..4083154100554 100644 --- a/cmake/onnxruntime_cuda_source_filters.cmake +++ b/cmake/onnxruntime_cuda_source_filters.cmake @@ -14,11 +14,11 @@ function(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST) set(_list "${${CU_SRC_LIST}}") # Quick build mode: Filter flash attention kernels for faster development iteration. - # - We keep only hdim128 fp16 flash attention kernels in quick build mode. + # - We keep only hdim128 fp16 and bf16 flash attention kernels in quick build mode. # - All other listed head dimensions are excluded (e.g., 32, 64, 96, 192, 256). # If new head dimensions are added or removed, update this list to match the supported set. if(onnxruntime_QUICK_BUILD) - message(STATUS "Quick build mode enabled: Only building hdim128 fp16 flash attention kernels") + message(STATUS "Quick build mode enabled: Only building hdim128 fp16/bf16 flash attention kernels") list(FILTER _list EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)") endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index b23c978d2d7b7..08f51e92f50d5 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -480,6 +480,11 @@ CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}" NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}" SOURCES ${onnxruntime_cuda_flash_attention_srcs}) + else() + # No SM80+ architectures available: compile flash sources in parent target so the + # linker can find the host-side symbols referenced by flash_api.cc. The kernels + # themselves will be empty stubs due to __CUDA_ARCH__ >= 800 guards. + target_sources(onnxruntime_providers_cuda PRIVATE ${onnxruntime_cuda_flash_attention_srcs}) endif() endif() diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake index d8435e321c7ce..98dfde7c54328 100644 --- a/cmake/onnxruntime_providers_cuda_plugin.cmake +++ b/cmake/onnxruntime_providers_cuda_plugin.cmake @@ -274,6 +274,11 @@ if(_cuda_plugin_flash_attention_srcs) NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}" COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options} SOURCES ${_cuda_plugin_flash_attention_srcs}) + else() + # No SM80+ architectures available: compile flash sources in parent target so the + # linker can find the host-side symbols referenced by flash_api.cc. The kernels + # themselves will be empty stubs due to __CUDA_ARCH__ >= 800 guards. + target_sources(onnxruntime_providers_cuda_plugin PRIVATE ${_cuda_plugin_flash_attention_srcs}) endif() endif()