From 207789febdd5d8e32517a57425157a774ec3d5e2 Mon Sep 17 00:00:00 2001
From: Tianlei WU <tlwu@microsoft.com>
Date: Thu, 21 May 2026 17:43:37 -0700
Subject: [PATCH 01/17]  obj lib for moe gemm to speed up

---
 cmake/onnxruntime_cuda_source_filters.cmake   | 45 +++++++++++++++
 cmake/onnxruntime_providers_cuda.cmake        | 49 ++++++++++++++++
 cmake/onnxruntime_providers_cuda_plugin.cmake | 56 +++++++++++++++++++
 3 files changed, 150 insertions(+)

diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
index a93686c379de1..94c25fd14ef2f 100644
--- a/cmake/onnxruntime_cuda_source_filters.cmake
+++ b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -41,3 +41,48 @@ macro(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
     list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
   endif()
 endmacro()
+
+# Extract SM90/SM120 TMA warp-specialized generated source files from a CUDA source list.
+# These files use CUTLASS 3.x features (GMMA, TMA) that are specific to SM90+ or SM120+.
+# They are compiled in separate OBJECT libraries with restricted CUDA_ARCHITECTURES to:
+#   1. Reduce compile time (avoid compiling heavy templates for unused architectures)
+#   2. Reduce binary size (no dead device code for unsupported architectures)
+#   3. Ensure correctness (SM90 code compiled at exactly 90a-real, SM120 at 120+)
+#
+# The per-source CUDA_ARCHITECTURES property does not work with the Visual Studio generator,
+# so OBJECT libraries are needed.
+#
+# Usage:
+#   onnxruntime_extract_sm_specific_cuda_sources(<cu_src_list_var>
+#       SM90_SOURCES <output_var> SM120_SOURCES <output_var>)
+#
+# Removes matched files from <cu_src_list_var> and stores them in the output variables.
+macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST)
+  cmake_parse_arguments(_EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "" ${ARGN})
+
+  # Extract SM90 TMA WS generated files
+  set(${_EXTRACT_SM90_SOURCES})
+  if(ORT_HAS_SM90_OR_LATER)
+    foreach(_src IN LISTS ${CU_SRC_LIST})
+      if(_src MATCHES "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$")
+        list(APPEND ${_EXTRACT_SM90_SOURCES} "${_src}")
+      endif()
+    endforeach()
+    if(${_EXTRACT_SM90_SOURCES})
+      list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM90_SOURCES}})
+    endif()
+  endif()
+
+  # Extract SM120 TMA WS generated files
+  set(${_EXTRACT_SM120_SOURCES})
+  if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
+    foreach(_src IN LISTS ${CU_SRC_LIST})
+      if(_src MATCHES "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$")
+        list(APPEND ${_EXTRACT_SM120_SOURCES} "${_src}")
+      endif()
+    endforeach()
+    if(${_EXTRACT_SM120_SOURCES})
+      list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM120_SOURCES}})
+    endif()
+  endif()
+endmacro()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index b28c35fd502ed..77c350a92263d 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -67,6 +67,10 @@
 
   include(onnxruntime_cuda_source_filters.cmake)
   onnxruntime_filter_cuda_cu_sources(onnxruntime_cuda_contrib_ops_cu_srcs)
+  onnxruntime_extract_sm_specific_cuda_sources(onnxruntime_cuda_contrib_ops_cu_srcs
+    SM90_SOURCES onnxruntime_cuda_sm90_tma_srcs
+    SM120_SOURCES onnxruntime_cuda_sm120_tma_srcs
+  )
 
   # disable contrib ops conditionally
   if(NOT onnxruntime_DISABLE_CONTRIB_OPS AND NOT onnxruntime_CUDA_MINIMAL)
@@ -309,6 +313,13 @@
     if(MSVC)
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /Zc:__cplusplus>")
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>")
+      # /permissive is required for CUTLASS cute headers and to work around MSVC template resolution
+      # issues with abseil headers when compiled through nvcc.
+      # See https://github.com/NVIDIA/cutlass/issues/3065
+      target_compile_options(${target} PRIVATE
+        "$<$<COMPILE_LANGUAGE:CXX>:/permissive>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
+      )
     endif()
 
     onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers)
@@ -434,6 +445,44 @@
     config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj)
   endif()
   config_cuda_provider_shared_module(onnxruntime_providers_cuda)
+
+  # Create OBJECT libraries for SM90/SM120 TMA WS generated files that must be compiled
+  # with restricted CUDA architectures. These files use CUTLASS 3.x SM90+/SM120+ features
+  # (GMMA, TMA) that cannot produce useful device code for older architectures.
+  if(onnxruntime_cuda_sm90_tma_srcs)
+    # SM90 TMA warp-specialized files use SM90-specific collective operations.
+    # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat.
+    set(_ort_has_sm90_plus FALSE)
+    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+      if(_arch_num GREATER_EQUAL 90)
+        set(_ort_has_sm90_plus TRUE)
+        break()
+      endif()
+    endforeach()
+    if(_ort_has_sm90_plus)
+      onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${onnxruntime_cuda_sm90_tma_srcs})
+      set_target_properties(onnxruntime_providers_cuda_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real")
+      config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm90_tma)
+      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm90_tma)
+    endif()
+  endif()
+
+  if(onnxruntime_cuda_sm120_tma_srcs)
+    set(_ort_sm120_cuda_architectures)
+    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+      if(_arch_num GREATER_EQUAL 120)
+        list(APPEND _ort_sm120_cuda_architectures "${_arch}")
+      endif()
+    endforeach()
+    if(_ort_sm120_cuda_architectures)
+      onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs})
+      set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}")
+      config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm120_tma)
+      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm120_tma)
+    endif()
+  endif()
   # Cannot use glob because the file cuda_provider_options.h should not be exposed out.
   set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS
         "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h"
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 7a76371b74132..10e9774347d9c 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -104,6 +104,10 @@ list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/transforme
 # Apply shared CUDA .cu source filtering (flash attention quick build, MoE GEMM FP4/FP8).
 include(onnxruntime_cuda_source_filters.cmake)
 onnxruntime_filter_cuda_cu_sources(CUDA_PLUGIN_EP_CU_SRCS)
+onnxruntime_extract_sm_specific_cuda_sources(CUDA_PLUGIN_EP_CU_SRCS
+  SM90_SOURCES _cuda_plugin_sm90_tma_srcs
+  SM120_SOURCES _cuda_plugin_sm120_tma_srcs
+)
 
 # Create shared library target using the ORT helper function for plugins
 onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
@@ -217,6 +221,58 @@ if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS)
 endif()
 
+# SM90/SM120 TMA WS OBJECT libraries — compiled with restricted CUDA architectures.
+if(_cuda_plugin_sm90_tma_srcs)
+  set(_plugin_has_sm90_plus FALSE)
+  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+    if(_arch_num GREATER_EQUAL 90)
+      set(_plugin_has_sm90_plus TRUE)
+      break()
+    endif()
+  endforeach()
+  if(_plugin_has_sm90_plus)
+    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_cuda_plugin_sm90_tma_srcs})
+    set_target_properties(onnxruntime_providers_cuda_plugin_sm90_tma PROPERTIES
+      CUDA_ARCHITECTURES "90a-real"
+      CUDA_STANDARD 20
+      CUDA_STANDARD_REQUIRED ON
+    )
+    target_include_directories(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
+    target_compile_definitions(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
+    target_compile_options(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_OPTIONS>)
+    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm90_tma)
+  endif()
+endif()
+
+if(_cuda_plugin_sm120_tma_srcs)
+  set(_plugin_sm120_cuda_architectures)
+  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+    if(_arch_num GREATER_EQUAL 120)
+      list(APPEND _plugin_sm120_cuda_architectures "${_arch}")
+    endif()
+  endforeach()
+  if(_plugin_sm120_cuda_architectures)
+    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm120_tma ${_cuda_plugin_sm120_tma_srcs})
+    set_target_properties(onnxruntime_providers_cuda_plugin_sm120_tma PROPERTIES
+      CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}"
+      CUDA_STANDARD 20
+      CUDA_STANDARD_REQUIRED ON
+    )
+    target_include_directories(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
+    target_compile_definitions(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
+    target_compile_options(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_OPTIONS>)
+    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm120_tma)
+  endif()
+endif()
+
 # --- Find cuDNN (may be at a custom path via onnxruntime_CUDNN_HOME) ---
 set(_CUDNN_SEARCH_PATHS "")
 if(onnxruntime_CUDNN_HOME)

From 19fb1332cca52dc72f79eb25ac7524822e48c663 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Fri, 22 May 2026 13:05:20 -0700
Subject: [PATCH 02/17] object lib for flash attention and llm

---
 cmake/onnxruntime_cuda_source_filters.cmake   |  61 +++++++++++
 cmake/onnxruntime_providers_cuda.cmake        |  66 +++++++++++-
 cmake/onnxruntime_providers_cuda_plugin.cmake | 102 +++++++++++++++---
 cmake/onnxruntime_unittests.cmake             |  14 +++
 4 files changed, 228 insertions(+), 15 deletions(-)

diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
index 94c25fd14ef2f..761f43aab9f48 100644
--- a/cmake/onnxruntime_cuda_source_filters.cmake
+++ b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -86,3 +86,64 @@ macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST)
     endif()
   endif()
 endmacro()
+
+# Extract Flash Attention CUDA source files into a separate list for compilation
+# in a dedicated OBJECT library with SM80+ architectures and independent nvcc_threads.
+# Flash Attention V2 kernels require SM80 (Ampere) or later — they contain
+# __CUDA_ARCH__ >= 800 guards in kernel_traits.h and all files are *_sm80.cu.
+# Compiling them separately allows:
+#   1. Restricting CUDA_ARCHITECTURES to SM80+ (skip dead pre-Ampere passes)
+#   2. Using --threads 1 (memory-intensive) while other targets use higher parallelism
+#
+# Usage:
+#   onnxruntime_extract_flash_attention_sources(<cu_src_list_var>
+#       FLASH_SOURCES <output_var>)
+macro(onnxruntime_extract_flash_attention_sources CU_SRC_LIST)
+  cmake_parse_arguments(_FA "" "FLASH_SOURCES" "" ${ARGN})
+
+  set(${_FA_FLASH_SOURCES})
+  foreach(_src IN LISTS ${CU_SRC_LIST})
+    if(_src MATCHES "/bert/flash_attention/.*\\.cu$")
+      list(APPEND ${_FA_FLASH_SOURCES} "${_src}")
+    endif()
+  endforeach()
+  if(${_FA_FLASH_SOURCES})
+    list(REMOVE_ITEM ${CU_SRC_LIST} ${${_FA_FLASH_SOURCES}})
+  endif()
+endmacro()
+
+# Extract LLM CUDA source files into separate lists for per-architecture compilation.
+# The LLM directory (contrib_ops/cuda/llm/) contains kernels with minimum SM75 support
+# (fpA_intB_gemv/gemm enforce arch >= 75). SM90-specific launchers (fpA_intB_gemm
+# launchers guarded by #ifndef EXCLUDE_SM_90) are extracted separately to be compiled
+# at 90a-real (merged into the SM90 TMA OBJECT library).
+#
+# Note: SM90 TMA MoE GEMM files are already extracted by
+# onnxruntime_extract_sm_specific_cuda_sources() before this macro is called.
+#
+# Usage:
+#   onnxruntime_extract_llm_sources(<cu_src_list_var>
+#       LLM_SOURCES <output_var>
+#       LLM_SM90_SOURCES <output_var>)
+macro(onnxruntime_extract_llm_sources CU_SRC_LIST)
+  cmake_parse_arguments(_LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "" ${ARGN})
+
+  set(${_LLM_LLM_SOURCES})
+  set(${_LLM_LLM_SM90_SOURCES})
+  foreach(_src IN LISTS ${CU_SRC_LIST})
+    if(_src MATCHES "/contrib_ops/cuda/llm/.*\\.cu$")
+      # SM90-specific fpA_intB launchers (guarded by #ifndef EXCLUDE_SM_90)
+      if(_src MATCHES "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$")
+        list(APPEND ${_LLM_LLM_SM90_SOURCES} "${_src}")
+      else()
+        list(APPEND ${_LLM_LLM_SOURCES} "${_src}")
+      endif()
+    endif()
+  endforeach()
+  if(${_LLM_LLM_SOURCES})
+    list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SOURCES}})
+  endif()
+  if(${_LLM_LLM_SM90_SOURCES})
+    list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SM90_SOURCES}})
+  endif()
+endmacro()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 77c350a92263d..ddf9d9da9c1a0 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -71,6 +71,13 @@
     SM90_SOURCES onnxruntime_cuda_sm90_tma_srcs
     SM120_SOURCES onnxruntime_cuda_sm120_tma_srcs
   )
+  onnxruntime_extract_flash_attention_sources(onnxruntime_cuda_contrib_ops_cu_srcs
+    FLASH_SOURCES onnxruntime_cuda_flash_attention_srcs
+  )
+  onnxruntime_extract_llm_sources(onnxruntime_cuda_contrib_ops_cu_srcs
+    LLM_SOURCES onnxruntime_cuda_llm_srcs
+    LLM_SM90_SOURCES onnxruntime_cuda_llm_sm90_srcs
+  )
 
   # disable contrib ops conditionally
   if(NOT onnxruntime_DISABLE_CONTRIB_OPS AND NOT onnxruntime_CUDA_MINIMAL)
@@ -224,8 +231,9 @@
 
     # Note: CUDA 11.3+ supports parallel compilation
     # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads
+    # --threads is NOT set here; it is applied per-target after calling this function
+    # so that flash attention can use a different (lower) thread count.
     set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.")
-    target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
 
     # suppress warnings like this:
     #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
@@ -443,15 +451,19 @@
   endfunction()
   if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
     config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj)
+    target_compile_options(onnxruntime_providers_cuda_obj PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
   endif()
   config_cuda_provider_shared_module(onnxruntime_providers_cuda)
+  target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
 
   # Create OBJECT libraries for SM90/SM120 TMA WS generated files that must be compiled
   # with restricted CUDA architectures. These files use CUTLASS 3.x SM90+/SM120+ features
   # (GMMA, TMA) that cannot produce useful device code for older architectures.
-  if(onnxruntime_cuda_sm90_tma_srcs)
+  if(onnxruntime_cuda_sm90_tma_srcs OR onnxruntime_cuda_llm_sm90_srcs)
     # SM90 TMA warp-specialized files use SM90-specific collective operations.
     # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat.
+    # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
+    set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs})
     set(_ort_has_sm90_plus FALSE)
     foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
       string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
@@ -461,9 +473,10 @@
       endif()
     endforeach()
     if(_ort_has_sm90_plus)
-      onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${onnxruntime_cuda_sm90_tma_srcs})
+      onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${_ort_sm90_all_srcs})
       set_target_properties(onnxruntime_providers_cuda_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real")
       config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm90_tma)
+      target_compile_options(onnxruntime_providers_cuda_sm90_tma PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
       target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm90_tma)
     endif()
   endif()
@@ -480,9 +493,56 @@
       onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs})
       set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}")
       config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm120_tma)
+      target_compile_options(onnxruntime_providers_cuda_sm120_tma PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
       target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm120_tma)
     endif()
   endif()
+
+  # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads.
+  # Flash Attention V2 kernels require SM80 (Ampere) and are memory-intensive to compile.
+  # Isolating them allows the rest of the build to use higher --threads without OOM.
+  set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING
+      "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).")
+  if(onnxruntime_cuda_flash_attention_srcs)
+    set(_ort_flash_cuda_architectures)
+    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+      if(_arch_num GREATER_EQUAL 80)
+        list(APPEND _ort_flash_cuda_architectures "${_arch}")
+      endif()
+    endforeach()
+    if(_ort_flash_cuda_architectures)
+      onnxruntime_add_object_library(onnxruntime_providers_cuda_flash_attention ${onnxruntime_cuda_flash_attention_srcs})
+      set_target_properties(onnxruntime_providers_cuda_flash_attention PROPERTIES
+        CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}")
+      config_cuda_provider_shared_module(onnxruntime_providers_cuda_flash_attention)
+      target_compile_options(onnxruntime_providers_cuda_flash_attention PRIVATE
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">"
+      )
+      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_flash_attention)
+    endif()
+  endif()
+
+  # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
+  # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs.
+  if(onnxruntime_cuda_llm_srcs)
+    set(_ort_llm_cuda_architectures)
+    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+      if(_arch_num GREATER_EQUAL 75)
+        list(APPEND _ort_llm_cuda_architectures "${_arch}")
+      endif()
+    endforeach()
+    if(_ort_llm_cuda_architectures)
+      onnxruntime_add_object_library(onnxruntime_providers_cuda_llm ${onnxruntime_cuda_llm_srcs})
+      set_target_properties(onnxruntime_providers_cuda_llm PROPERTIES
+        CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}")
+      config_cuda_provider_shared_module(onnxruntime_providers_cuda_llm)
+      target_compile_options(onnxruntime_providers_cuda_llm PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
+      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_llm)
+    endif()
+  endif()
+
   # Cannot use glob because the file cuda_provider_options.h should not be exposed out.
   set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS
         "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h"
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 10e9774347d9c..5a3df9d03c84b 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -108,6 +108,13 @@ onnxruntime_extract_sm_specific_cuda_sources(CUDA_PLUGIN_EP_CU_SRCS
   SM90_SOURCES _cuda_plugin_sm90_tma_srcs
   SM120_SOURCES _cuda_plugin_sm120_tma_srcs
 )
+onnxruntime_extract_flash_attention_sources(CUDA_PLUGIN_EP_CU_SRCS
+  FLASH_SOURCES _cuda_plugin_flash_attention_srcs
+)
+onnxruntime_extract_llm_sources(CUDA_PLUGIN_EP_CU_SRCS
+  LLM_SOURCES _cuda_plugin_llm_srcs
+  LLM_SM90_SOURCES _cuda_plugin_llm_sm90_srcs
+)
 
 # Create shared library target using the ORT helper function for plugins
 onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
@@ -183,22 +190,23 @@ endif()
 if (DEFINED onnxruntime_NVCC_THREADS)
     set(onnxruntime_plugin_nvcc_threads "${onnxruntime_NVCC_THREADS}")
 else()
-    set(onnxruntime_plugin_nvcc_threads "1")
+    set(onnxruntime_plugin_nvcc_threads "4")
 endif()
-target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
-        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">"
-        "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>"
+# Shared CUDA compile options (excluding --threads, which is set per-target so that
+# flash attention can use a lower thread count without duplicate-flag nvcc warnings).
+set(_cuda_plugin_shared_compile_options
+    "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>"
 )
 
 if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
-    target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+    list(APPEND _cuda_plugin_shared_compile_options
             "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>"
             "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>"
             "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2908>"
     )
 
     if (MSVC)
-        target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+        list(APPEND _cuda_plugin_shared_compile_options
                 "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4505>"
         )
     endif()
@@ -209,8 +217,9 @@ include(cutlass)
 
 # TMA compile definitions — mirror config_cuda_provider_shared_module in onnxruntime_providers_cuda.cmake
 if(ORT_HAS_SM90_OR_LATER)
-  target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-w>)
-  target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-DCUTLASS_ENABLE_GDC_FOR_SM90=1>)
+  list(APPEND _cuda_plugin_shared_compile_options
+    "$<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-w>"
+    "$<$<COMPILE_LANGUAGE:CUDA>:-DCUTLASS_ENABLE_GDC_FOR_SM90=1>")
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GEMMS)
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GROUPED_GEMMS)
 endif()
@@ -221,8 +230,16 @@ if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS)
 endif()
 
+# Apply shared options + --threads to the parent plugin target.
+target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+  ${_cuda_plugin_shared_compile_options}
+  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">"
+)
+
 # SM90/SM120 TMA WS OBJECT libraries — compiled with restricted CUDA architectures.
-if(_cuda_plugin_sm90_tma_srcs)
+# Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
+if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs)
+  set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs})
   set(_plugin_has_sm90_plus FALSE)
   foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
     string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
@@ -232,7 +249,7 @@ if(_cuda_plugin_sm90_tma_srcs)
     endif()
   endforeach()
   if(_plugin_has_sm90_plus)
-    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_cuda_plugin_sm90_tma_srcs})
+    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_plugin_sm90_all_srcs})
     set_target_properties(onnxruntime_providers_cuda_plugin_sm90_tma PROPERTIES
       CUDA_ARCHITECTURES "90a-real"
       CUDA_STANDARD 20
@@ -243,7 +260,8 @@ if(_cuda_plugin_sm90_tma_srcs)
     target_compile_definitions(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
       $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
     target_compile_options(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_OPTIONS>)
+      ${_cuda_plugin_shared_compile_options}
+      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">")
     target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm90_tma)
   endif()
 endif()
@@ -268,11 +286,71 @@ if(_cuda_plugin_sm120_tma_srcs)
     target_compile_definitions(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
       $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
     target_compile_options(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_OPTIONS>)
+      ${_cuda_plugin_shared_compile_options}
+      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">")
     target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm120_tma)
   endif()
 endif()
 
+# Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads.
+# Flash Attention V2 kernels require SM80 and are memory-intensive to compile.
+if(NOT DEFINED onnxruntime_FLASH_NVCC_THREADS)
+  set(onnxruntime_FLASH_NVCC_THREADS "1")
+endif()
+if(_cuda_plugin_flash_attention_srcs)
+  set(_plugin_flash_cuda_architectures)
+  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+    if(_arch_num GREATER_EQUAL 80)
+      list(APPEND _plugin_flash_cuda_architectures "${_arch}")
+    endif()
+  endforeach()
+  if(_plugin_flash_cuda_architectures)
+    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_flash_attention ${_cuda_plugin_flash_attention_srcs})
+    set_target_properties(onnxruntime_providers_cuda_plugin_flash_attention PROPERTIES
+      CUDA_ARCHITECTURES "${_plugin_flash_cuda_architectures}"
+      CUDA_STANDARD 20
+      CUDA_STANDARD_REQUIRED ON
+    )
+    target_include_directories(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
+    target_compile_definitions(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
+    target_compile_options(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE
+      ${_cuda_plugin_shared_compile_options}
+      # Flash attention uses a lower nvcc --threads (memory-intensive compilation).
+      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">")
+    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_flash_attention)
+  endif()
+endif()
+
+# LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
+if(_cuda_plugin_llm_srcs)
+  set(_plugin_llm_cuda_architectures)
+  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+    if(_arch_num GREATER_EQUAL 75)
+      list(APPEND _plugin_llm_cuda_architectures "${_arch}")
+    endif()
+  endforeach()
+  if(_plugin_llm_cuda_architectures)
+    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_llm ${_cuda_plugin_llm_srcs})
+    set_target_properties(onnxruntime_providers_cuda_plugin_llm PROPERTIES
+      CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}"
+      CUDA_STANDARD 20
+      CUDA_STANDARD_REQUIRED ON
+    )
+    target_include_directories(onnxruntime_providers_cuda_plugin_llm PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
+    target_compile_definitions(onnxruntime_providers_cuda_plugin_llm PRIVATE
+      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
+    target_compile_options(onnxruntime_providers_cuda_plugin_llm PRIVATE
+      ${_cuda_plugin_shared_compile_options}
+      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">")
+    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_llm)
+  endif()
+endif()
+
 # --- Find cuDNN (may be at a custom path via onnxruntime_CUDNN_HOME) ---
 set(_CUDNN_SEARCH_PATHS "")
 if(onnxruntime_CUDNN_HOME)
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index a061858fa068f..d5f7fd40aa918 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -981,10 +981,24 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
   # onnxruntime_providers_cuda_ut is only for unittests.
   onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
   config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
+  target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
   onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
   add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils)
   target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
   target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils)
+  # Link architecture-specific OBJECT libraries (same as onnxruntime_providers_cuda).
+  if(TARGET onnxruntime_providers_cuda_sm90_tma)
+    target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_sm90_tma)
+  endif()
+  if(TARGET onnxruntime_providers_cuda_sm120_tma)
+    target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_sm120_tma)
+  endif()
+  if(TARGET onnxruntime_providers_cuda_flash_attention)
+    target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_flash_attention)
+  endif()
+  if(TARGET onnxruntime_providers_cuda_llm)
+    target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE onnxruntime_providers_cuda_llm)
+  endif()
   if (MSVC)
     # Cutlass code has an issue with the following:
     # warning C4100: 'magic': unreferenced formal parameter

From 7861019d3d0c7ee64d4fd1df52eb207a3209d398 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Fri, 22 May 2026 13:40:18 -0700
Subject: [PATCH 03/17] use `--nvcc_threads 4 --flash_nvcc_threads 2` in
 pipelines

---
 .github/workflows/linux_cuda_ci.yml           |  2 +-
 .github/workflows/linux_cuda_plugin_ci.yml    |  2 +-
 .github/workflows/windows_cuda.yml            |  4 +--
 .github/workflows/windows_cuda_plugin.yml     |  2 +-
 .github/workflows/windows_tensorrt.yml        |  4 +--
 tools/ci_build/build.py                       | 33 ++++++-------------
 tools/ci_build/build_args.py                  | 11 +++++--
 .../custom-nuget-packaging-pipeline.yml       |  2 +-
 .../stages/nuget-win-cuda-packaging-stage.yml |  8 ++---
 .../stages/plugin-win-cuda-stage.yml          |  4 +--
 .../stages/py-win-gpu-stage.yml               |  2 +-
 .../github/linux/build_cuda_plugin_package.sh |  2 +-
 .../linux/build_linux_python_package.sh       |  2 +-
 13 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
index cae7a84f49442..0c611119e4964 100644
--- a/.github/workflows/linux_cuda_ci.yml
+++ b/.github/workflows/linux_cuda_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
       docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
index 3b532c486cdfc..e874d2c09edd2 100644
--- a/.github/workflows/linux_cuda_plugin_ci.yml
+++ b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -32,7 +32,7 @@ jobs:
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
-        --nvcc_threads 1
+        --nvcc_threads 4 --flash_nvcc_threads 2
         --cuda_version=12.8
         --cuda_home=/usr/local/cuda-12.8
         --cudnn_home=/usr/local/cuda-12.8
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index 852d0164083c4..06e6a01cf0fa1 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -115,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -235,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
index 07083a5caa08a..8febd8c3d3fc3 100644
--- a/.github/workflows/windows_cuda_plugin.yml
+++ b/.github/workflows/windows_cuda_plugin.yml
@@ -73,7 +73,7 @@ jobs:
             --build_dir build `
             --skip_submodule_sync `
             --parallel `
-            --nvcc_threads 1 `
+            --nvcc_threads 4 --flash_nvcc_threads 2 `
             --use_binskim_compliant_compile_flags `
             --cmake_generator "Visual Studio 17 2022" `
             --build_shared_lib `
diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
index 0a47f46aa8516..8c499c3c3b3aa 100644
--- a/.github/workflows/windows_tensorrt.yml
+++ b/.github/workflows/windows_tensorrt.yml
@@ -121,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -247,7 +247,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index f42617ba1b04c..d06d462c79cbd 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -209,31 +209,14 @@ def number_of_nvcc_threads(args):
     if args.nvcc_threads >= 0:
         return args.nvcc_threads
 
-    nvcc_threads = 1
-    try:
-        import psutil  # noqa: PLC0415
+    return 4
 
-        available_memory = psutil.virtual_memory().available
-        if isinstance(available_memory, int) and available_memory > 0:
-            if available_memory >= 64 * 1024 * 1024 * 1024:
-                # When available memory is large enough, chance of OOM is small.
-                nvcc_threads = min(4, int(available_memory / (8 * 4 * 1024 * 1024 * 1024)))
-            else:
-                # NVCC need a lot of memory to compile 48 flash attention cu files.
-                # Here we select number of threads to ensure each thread has enough memory (>= 4 GB).
-                memory_per_thread = 4 * 1024 * 1024 * 1024
-                fmha_cu_files = 48
-                fmha_parallel_jobs = min(fmha_cu_files, number_of_parallel_jobs(args))
-                nvcc_threads = max(1, int(available_memory / (memory_per_thread * fmha_parallel_jobs)))
-                print(
-                    f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}"
-                )
-    except ImportError:
-        print(
-            "Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. Use nvcc_threads=1"
-        )
 
-    return nvcc_threads
+def number_of_flash_nvcc_threads(args):
+    if args.flash_nvcc_threads >= 0:
+        return args.flash_nvcc_threads
+
+    return number_of_nvcc_threads(args)
 
 
 # See https://learn.microsoft.com/en-us/vcpkg/commands/install
@@ -724,6 +707,10 @@ def generate_build_tree(
     if args.use_cuda:
         nvcc_threads = number_of_nvcc_threads(args)
         cmake_args.append("-Donnxruntime_NVCC_THREADS=" + str(nvcc_threads))
+
+        flash_nvcc_threads = number_of_flash_nvcc_threads(args)
+        cmake_args.append("-Donnxruntime_FLASH_NVCC_THREADS=" + str(flash_nvcc_threads))
+
         cmake_args.append(f"-DCMAKE_CUDA_COMPILER={cuda_home}/bin/nvcc")
         add_default_definition(cmake_extra_defines, "onnxruntime_USE_CUDA", "ON")
         if args.cuda_version:
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index b40bf4c2b25c6..ca866a2db7ae6 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -647,9 +647,16 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     cuda_group.add_argument(
         "--nvcc_threads",
         nargs="?",
-        default=-1,  # -1 signifies auto-detect based on jobs/memory
+        default=4,
         type=int,
-        help="Max NVCC threads per parallel job (-1=auto).",
+        help="Max NVCC threads per parallel job (default is 4).",
+    )
+    cuda_group.add_argument(
+        "--flash_nvcc_threads",
+        nargs="?",
+        default=-1,
+        type=int,
+        help="Max NVCC threads per parallel job for flash attention (default is same value of --nvcc_threads).",
     )
     # CUDA-specific profiling
     cuda_group.add_argument(
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
index ad6c04e7fd9d2..423ffb917571e 100644
--- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
@@ -97,7 +97,7 @@ extends:
           msbuildPlatform: x64
           packageName: x64-cuda
           CudaVersion: ${{ parameters.CudaVersion }}
-          buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 1 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}"
+          buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 4 --flash_nvcc_threads 2 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}"
           runTests: false
           buildJava: false
           java_artifact_id: onnxruntime_gpu
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
index a2a325f01209a..b072e22818eec 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
@@ -73,9 +73,9 @@ stages:
     packageName: x64-cuda
     CudaVersion: ${{ parameters.CudaVersion }}
     ${{ if ne(parameters.win_cudnn_home, '') }}:
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
     ${{ else }}:
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
     runTests: ${{ parameters.RunOnnxRuntimeTests }}
     buildJava: ${{ parameters.buildJava }}
     java_artifact_id: onnxruntime_gpu
@@ -96,9 +96,9 @@ stages:
     CudaVersion: ${{ parameters.CudaVersion }}
     packageName: x64-tensorrt
     ${{ if ne(parameters.win_cudnn_home, '') }}:
-      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
+      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
     ${{ else }}:
-      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
+      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
     runTests: ${{ parameters.RunOnnxRuntimeTests }}
     buildJava: ${{ parameters.buildJava }}
     java_artifact_id: onnxruntime_gpu
diff --git a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
index 676e03ddd7ff6..5ff6def4c1e10 100644
--- a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
@@ -127,7 +127,7 @@ stages:
               --skip_submodule_sync
               --cmake_generator "$(VSGenerator)"
               --parallel
-              --nvcc_threads 1
+              --nvcc_threads 4 --flash_nvcc_threads 2
               --use_vcpkg
               --use_vcpkg_ms_internal_asset_cache
               --use_binskim_compliant_compile_flags
@@ -154,7 +154,7 @@ stages:
               --skip_submodule_sync
               --cmake_generator "$(VSGenerator)"
               --parallel
-              --nvcc_threads 1
+              --nvcc_threads 4 --flash_nvcc_threads 2
               --use_vcpkg
               --use_vcpkg_ms_internal_asset_cache
               --use_binskim_compliant_compile_flags
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index 45bc7db05592a..65216e7c59198 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -122,7 +122,7 @@ stages:
             --enable_pybind
             --enable_onnx_tests
             --parallel
-            --nvcc_threads 1
+            --nvcc_threads 4 --flash_nvcc_threads 2
             --use_vcpkg
             --use_vcpkg_ms_internal_asset_cache
             --use_binskim_compliant_compile_flags
diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
index 1b4e897b05389..ad57648f8af81 100755
--- a/tools/ci_build/github/linux/build_cuda_plugin_package.sh
+++ b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
@@ -55,7 +55,7 @@ docker run --rm \
         --config ${BUILD_CONFIG} \
         --skip_submodule_sync \
         --parallel \
-        --nvcc_threads 1 \
+        --nvcc_threads 4 --flash_nvcc_threads 2 \
         --use_binskim_compliant_compile_flags \
         --use_cuda \
         --cuda_version=${SHORT_CUDA_VERSION} \
diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh
index 7ba5406e00ec0..672582edf0518 100755
--- a/tools/ci_build/github/linux/build_linux_python_package.sh
+++ b/tools/ci_build/github/linux/build_linux_python_package.sh
@@ -86,7 +86,7 @@ if [ "$BUILD_DEVICE" == "GPU" ]; then
         CUDA_HOME=/usr/local/cuda
     fi
     #Enable CUDA EP.
-    BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF")
+    BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=4" "--flash_nvcc_threads=2" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF")
     # Enable TRT EP only if TensorRT is installed.
     if [ -f /usr/include/NvInfer.h ]; then
         BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/usr")

From 21d7386819601d29018c2a895cd85975a12fb8a7 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Fri, 22 May 2026 20:21:50 -0700
Subject: [PATCH 04/17] exclude sm100 moe gemm kernels

---
 cmake/onnxruntime_providers_cuda.cmake        | 4 ----
 cmake/onnxruntime_providers_cuda_plugin.cmake | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index ddf9d9da9c1a0..8d41197e3c13c 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -388,10 +388,6 @@
       endif()
     endif()
 
-    if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
-      target_compile_definitions(${target} PRIVATE COMPILE_BLACKWELL_TMA_GROUPED_GEMMS)
-    endif()
-
     if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
       target_compile_definitions(${target} PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS)
     endif()
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 5a3df9d03c84b..7e4beb6bc1458 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -223,9 +223,6 @@ if(ORT_HAS_SM90_OR_LATER)
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GEMMS)
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_HOPPER_TMA_GROUPED_GEMMS)
 endif()
-if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
-  target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_TMA_GROUPED_GEMMS)
-endif()
 if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
   target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS)
 endif()

From 6a3bc2866643b5b3fcd3391b2e2d699a7d5e478e Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Fri, 22 May 2026 23:35:39 -0700
Subject: [PATCH 05/17] fix cuda 12.8 plugin CI

---
 cmake/onnxruntime_providers_cuda.cmake        | 2 ++
 cmake/onnxruntime_providers_cuda_plugin.cmake | 1 +
 2 files changed, 3 insertions(+)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 8d41197e3c13c..79b5669d53d3c 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -249,6 +249,8 @@
       endif()
       # skip diagnosis error caused by cuda header files
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>")
+      # NVCC false positive: assigning a [[nodiscard]] Status via operator= is flagged as discarding the value.
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2810>")
       # CUDA 12.8 also reports deprecated implicit by-copy 'this' captures from CUTLASS headers.
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2908>")
     endif()
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 7e4beb6bc1458..5aa96a57670e3 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -202,6 +202,7 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
     list(APPEND _cuda_plugin_shared_compile_options
             "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>"
             "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2810>"
             "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2908>"
     )
 

From 41e2974963ada82dd8b734900cc4bee9f82e9590 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sat, 23 May 2026 00:18:40 -0700
Subject: [PATCH 06/17] use quick build in CI (except linux cuda CI).

---
 .github/workflows/linux_cuda_plugin_ci.yml            | 1 +
 .github/workflows/linux_tensorrt_ci.yml               | 2 +-
 .github/workflows/windows_cuda.yml                    | 2 +-
 .github/workflows/windows_cuda_plugin.yml             | 1 +
 .github/workflows/windows_tensorrt.yml                | 2 +-
 onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc | 5 ++++-
 onnxruntime/test/python/transformers/test_gqa.py      | 1 +
 7 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
index e874d2c09edd2..2d8eb50e591bb 100644
--- a/.github/workflows/linux_cuda_plugin_ci.yml
+++ b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -38,6 +38,7 @@ jobs:
         --cudnn_home=/usr/local/cuda-12.8
         --enable_cuda_profiling
         --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        --cmake_extra_defines onnxruntime_QUICK_BUILD=ON
         --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
       python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH'
       run_tests: false
diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml
index 263d9a98f0bb6..5879d455890cc 100644
--- a/.github/workflows/linux_tensorrt_ci.yml
+++ b/.github/workflows/linux_tensorrt_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host'
       docker_image_repo: onnxruntimetensorrt86gpubuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index 06e6a01cf0fa1..aeb3c3c9c5ef0 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -235,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
index 8febd8c3d3fc3..0dfd7feda388a 100644
--- a/.github/workflows/windows_cuda_plugin.yml
+++ b/.github/workflows/windows_cuda_plugin.yml
@@ -84,6 +84,7 @@ jobs:
             --use_vcpkg `
             --use_vcpkg_ms_internal_asset_cache `
             --enable_cuda_profiling `
+            --cmake_extra_defines onnxruntime_QUICK_BUILD=ON `
             --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 `
             --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
 
diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
index 8c499c3c3b3aa..fa051ef694e52 100644
--- a/.github/workflows/windows_tensorrt.yml
+++ b/.github/workflows/windows_tensorrt.yml
@@ -121,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
index f8bb8d0213099..8212b3b777c12 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
@@ -140,7 +140,10 @@ std::vector<CutlassTileConfig> get_candidate_tiles(
 #ifdef ORT_QUICK_BUILD
   // Quick build: restrict SM80 tile shapes to the 3 instantiated tile sizes only.
   // This matches the reduced instantiations in fused_moe_gemm_sm80_f16.generated.cu.
-  (void)gemm_type;
+  // SIMT (float) kernels use a different tile shape that must still be returned.
+  if (gemm_type == CutlassGemmType::Simt) {
+    return {CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8};
+  }
   return base_configs;
 #endif
 
diff --git a/onnxruntime/test/python/transformers/test_gqa.py b/onnxruntime/test/python/transformers/test_gqa.py
index 55c8b56ae027a..8a98c39d61eb9 100644
--- a/onnxruntime/test/python/transformers/test_gqa.py
+++ b/onnxruntime/test/python/transformers/test_gqa.py
@@ -2512,6 +2512,7 @@ def test_gqa_fp8_prompt(self):
             raise
 
     @unittest.skipIf(not has_cuda_device(89) or not has_fp8_kv_cache, "FP8 KV cache is not available, skipping tests.")
+    @unittest.skipIf(quick_build, "Quick build only has hdim128 flash attention kernels; head_size=48 needs hdim64.")
     def test_gqa_fp8_fallback_unsupported_head_size(self):
         """
         Test GQA with FP8 KV cache on a head size not supported by XQA.

From d5c70d53d79dac252f7cf0a53551683077e98a1b Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sat, 23 May 2026 01:46:47 -0700
Subject: [PATCH 07/17] Fix plugin CI

---
 cmake/onnxruntime_providers_cuda.cmake        |  2 ++
 cmake/onnxruntime_providers_cuda_plugin.cmake | 21 ++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 79b5669d53d3c..c3ec700a48925 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -239,6 +239,8 @@
     #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
     #     referenced
     target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
+    # suppress cudafe "variable was set but never used" (#550-D) from flatbuffers/adapter headers
+    target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=550>")
 
     # Since CUDA 12.8, compiling diagnostics become stricter
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 5aa96a57670e3..e74b8e83c3bb3 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -194,15 +194,23 @@ else()
 endif()
 # Shared CUDA compile options (excluding --threads, which is set per-target so that
 # flash attention can use a lower thread count without duplicate-flag nvcc warnings).
+# These mirror the options from the parent plugin target and config_cuda_provider_shared_module
+# so that OBJECT libraries compiled separately receive the same flags.
 set(_cuda_plugin_shared_compile_options
+    # Force NVCC onto C++20 explicitly. With the VS generator the CUDA_STANDARD
+    # property alone still leaves `-std=c++17` in AdditionalOptions.
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--std c++20>"
     "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>"
+    # Suppress cudafe front-end diagnostic 550 (variable set but never used) from third-party headers.
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=550>"
+    # Suppress cudafe [[nodiscard]] false positive on Status assignments.
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=2810>"
 )
 
 if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
     list(APPEND _cuda_plugin_shared_compile_options
             "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>"
             "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>"
-            "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2810>"
             "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2908>"
     )
 
@@ -213,6 +221,17 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
     endif()
 endif()
 
+if (MSVC)
+    list(APPEND _cuda_plugin_shared_compile_options
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4834>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4127>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4211>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /Zc:__cplusplus>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>"
+    )
+endif()
+
 include(cudnn_frontend)
 include(cutlass)
 

From 64f4acf64ceeda60fbfca42bc04ddc6dc444ac9b Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sat, 23 May 2026 21:45:45 -0700
Subject: [PATCH 08/17] CI --nvcc_threads 8 --flash_nvcc_threads 4

---
 .github/workflows/linux_cuda_ci.yml        | 2 +-
 .github/workflows/linux_cuda_plugin_ci.yml | 2 +-
 .github/workflows/linux_tensorrt_ci.yml    | 2 +-
 .github/workflows/windows_cuda.yml         | 4 ++--
 .github/workflows/windows_cuda_plugin.yml  | 2 +-
 .github/workflows/windows_tensorrt.yml     | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
index 0c611119e4964..2e9d5b92655dc 100644
--- a/.github/workflows/linux_cuda_ci.yml
+++ b/.github/workflows/linux_cuda_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
       docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
index 2d8eb50e591bb..9406d389212ea 100644
--- a/.github/workflows/linux_cuda_plugin_ci.yml
+++ b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -32,7 +32,7 @@ jobs:
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
-        --nvcc_threads 4 --flash_nvcc_threads 2
+        --nvcc_threads 8 --flash_nvcc_threads 4
         --cuda_version=12.8
         --cuda_home=/usr/local/cuda-12.8
         --cudnn_home=/usr/local/cuda-12.8
diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml
index 5879d455890cc..bb6d51f2f7190 100644
--- a/.github/workflows/linux_tensorrt_ci.yml
+++ b/.github/workflows/linux_tensorrt_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host'
       docker_image_repo: onnxruntimetensorrt86gpubuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index aeb3c3c9c5ef0..c2df2cc56c1f4 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -115,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -235,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
index 0dfd7feda388a..c3e3715b44528 100644
--- a/.github/workflows/windows_cuda_plugin.yml
+++ b/.github/workflows/windows_cuda_plugin.yml
@@ -73,7 +73,7 @@ jobs:
             --build_dir build `
             --skip_submodule_sync `
             --parallel `
-            --nvcc_threads 4 --flash_nvcc_threads 2 `
+            --nvcc_threads 8 --flash_nvcc_threads 4 `
             --use_binskim_compliant_compile_flags `
             --cmake_generator "Visual Studio 17 2022" `
             --build_shared_lib `
diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
index fa051ef694e52..49b36fdb03ce2 100644
--- a/.github/workflows/windows_tensorrt.yml
+++ b/.github/workflows/windows_tensorrt.yml
@@ -121,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -247,7 +247,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }

From d0afc5a96260c56cb4860356d38a03c587bb3476 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sun, 24 May 2026 01:21:30 -0700
Subject: [PATCH 09/17] --nvcc_threads 4 --flash_nvcc_threads 2 in packaging
 pipeline

---
 .../github/linux/build_cuda_c_api_package.sh  |  3 +-
 tools/ci_build/github/linux/build_cuda_ci.sh  | 53 -------------------
 .../linux/build_tensorrt_c_api_package.sh     |  3 +-
 3 files changed, 4 insertions(+), 55 deletions(-)
 delete mode 100755 tools/ci_build/github/linux/build_cuda_ci.sh

diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh
index 606dcab3631c8..d5b4bde9589a7 100755
--- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh
+++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh
@@ -19,7 +19,8 @@ docker run -e SYSTEM_COLLECTIONURI --rm \
 --volume "$HOME/.gradle:/home/onnxruntimedev/.gradle" \
 -e NIGHTLY_BUILD "onnxruntimecuda${CUDA_VERSION_MAJOR}build" \
 /bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs \
---build_dir /build --config Release --skip_submodule_sync  --parallel --use_binskim_compliant_compile_flags --build_shared_lib \
+--build_dir /build --config Release --skip_submodule_sync --use_binskim_compliant_compile_flags --build_shared_lib \
+--parallel --nvcc_threads 4 --flash_nvcc_threads 2 \
 --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \
 --skip_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
 --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' \
diff --git a/tools/ci_build/github/linux/build_cuda_ci.sh b/tools/ci_build/github/linux/build_cuda_ci.sh
deleted file mode 100755
index db48c31e0f2d0..0000000000000
--- a/tools/ci_build/github/linux/build_cuda_ci.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-set -ex
-#Every cuda container has this $CUDA_VERSION env var set.
-SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed   's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/')
-
-BUILD_ARGS=('--config'
-            'Release'
-            '--update'
-            '--build'
-            '--skip_submodule_sync'
-            '--build_shared_lib'
-            '--parallel'
-            '--use_vcpkg'
-            '--use_vcpkg_ms_internal_asset_cache'
-            '--use_binskim_compliant_compile_flags'
-            '--build_wheel'
-            '--enable_onnx_tests'
-            '--use_cuda'
-            "--cuda_version=$SHORT_CUDA_VERSION"
-            "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
-            "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
-            "--enable_cuda_profiling"
-            "--enable_pybind"
-            "--build_java"
-            "--cmake_extra_defines"
-            "CMAKE_CUDA_ARCHITECTURES=80"
-            "onnxruntime_BUILD_UNIT_TESTS=ON"
-            "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
-if [ -x "$(command -v ninja)" ]; then
-    BUILD_ARGS+=('--cmake_generator' 'Ninja')
-fi
-
-if [ -d /build ]; then
-    BUILD_ARGS+=('--build_dir' '/build')
-else
-    BUILD_ARGS+=('--build_dir' 'build')
-fi
-
-if command -v ccache &> /dev/null; then
-    ccache --zero-stats
-    BUILD_ARGS+=("--use_cache")
-fi
-
-if [ -f /opt/python/cp312-cp312/bin/python3 ]; then
-    PATH=/opt/python/cp312-cp312/bin:$PATH python tools/ci_build/build.py "${BUILD_ARGS[@]}"
-else
-    python3 tools/ci_build/build.py "${BUILD_ARGS[@]}"
-fi
-
-if command -v ccache &> /dev/null; then
-    # FIXME: can't use `-vv` for extra details b/c we're shipping with a decrepit version of ccache (3.something) that doesn't support it.
-    ccache --show-stats # -vv
-fi
diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh
index c027e4a0560a2..b8ffe9fe09679 100755
--- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh
+++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh
@@ -20,7 +20,8 @@ docker run -e SYSTEM_COLLECTIONURI --rm --volume /data/onnx:/data/onnx:ro --volu
   --volume "$HOME/.gradle:/home/onnxruntimedev/.gradle" \
   --volume "$HOME/.onnx:/home/onnxruntimedev/.onnx" -e NIGHTLY_BUILD "onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build" \
   /bin/bash -c "/usr/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests \
-  --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs \
+  --skip_submodule_sync --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs \
+  --parallel --nvcc_threads 4 --flash_nvcc_threads 2 \
   --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr \
   --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}' 'onnxruntime_USE_FPA_INTB_GEMM=OFF' \
   --use_vcpkg --use_vcpkg_ms_internal_asset_cache && cd /build/Release && make install DESTDIR=/build/installed"

From c4d32b8f66ad12531a148e6b357d37a884476a46 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sun, 24 May 2026 01:48:37 -0700
Subject: [PATCH 10/17] Fix  Windows_Packaging_TensorRT

---
 cmake/onnxruntime_providers_cuda.cmake        | 7 +++++++
 cmake/onnxruntime_providers_cuda_plugin.cmake | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index c3ec700a48925..06ee5867498bd 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -525,11 +525,18 @@
 
   # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
   # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs.
+  # Excludes SM120+ real (native SASS) architectures because SM120-specific kernels are already
+  # compiled in the separate SM120 TMA OBJECT library, and compiling the general LLM code for
+  # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch
+  # (PTX) is kept so SM120 devices can JIT-compile the code.
   if(onnxruntime_cuda_llm_srcs)
     set(_ort_llm_cuda_architectures)
     foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
       string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
       if(_arch_num GREATER_EQUAL 75)
+        if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$")
+          continue()
+        endif()
         list(APPEND _ort_llm_cuda_architectures "${_arch}")
       endif()
     endforeach()
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index e74b8e83c3bb3..4ea21e0f4617c 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -342,11 +342,17 @@ if(_cuda_plugin_flash_attention_srcs)
 endif()
 
 # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
+# Excludes SM120+ real (native SASS) architectures — SM120-specific kernels are compiled in
+# the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX
+# headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept.
 if(_cuda_plugin_llm_srcs)
   set(_plugin_llm_cuda_architectures)
   foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
     string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
     if(_arch_num GREATER_EQUAL 75)
+      if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$")
+        continue()
+      endif()
       list(APPEND _plugin_llm_cuda_architectures "${_arch}")
     endif()
   endforeach()

From f14741f8dfd1a7f5eac9e1b1341bac22b7f8c36f Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sun, 24 May 2026 01:49:27 -0700
Subject: [PATCH 11/17] cuda plugin packaging uses --nvcc_threads 2
 --flash_nvcc_threads 1

---
 tools/ci_build/github/linux/build_cuda_plugin_package.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
index ad57648f8af81..958744e000727 100755
--- a/tools/ci_build/github/linux/build_cuda_plugin_package.sh
+++ b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
@@ -55,7 +55,7 @@ docker run --rm \
         --config ${BUILD_CONFIG} \
         --skip_submodule_sync \
         --parallel \
-        --nvcc_threads 4 --flash_nvcc_threads 2 \
+        --nvcc_threads 2 --flash_nvcc_threads 1 \
         --use_binskim_compliant_compile_flags \
         --use_cuda \
         --cuda_version=${SHORT_CUDA_VERSION} \

From 6ebd5e4e42995b7c871d2c18b6c219bb2ec489cd Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sun, 24 May 2026 16:04:26 -0700
Subject: [PATCH 12/17] CI --nvcc_threads 4 --flash_nvcc_threads 4

---
 .github/workflows/linux_cuda_ci.yml        | 2 +-
 .github/workflows/linux_cuda_plugin_ci.yml | 2 +-
 .github/workflows/linux_tensorrt_ci.yml    | 2 +-
 .github/workflows/windows_cuda.yml         | 4 ++--
 .github/workflows/windows_cuda_plugin.yml  | 2 +-
 .github/workflows/windows_tensorrt.yml     | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
index 2e9d5b92655dc..f50c0064dd956 100644
--- a/.github/workflows/linux_cuda_ci.yml
+++ b/.github/workflows/linux_cuda_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
       docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
index 9406d389212ea..e7fd32e864129 100644
--- a/.github/workflows/linux_cuda_plugin_ci.yml
+++ b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -32,7 +32,7 @@ jobs:
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
-        --nvcc_threads 8 --flash_nvcc_threads 4
+        --nvcc_threads 4 --flash_nvcc_threads 4
         --cuda_version=12.8
         --cuda_home=/usr/local/cuda-12.8
         --cudnn_home=/usr/local/cuda-12.8
diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml
index bb6d51f2f7190..f5704dab8dcfa 100644
--- a/.github/workflows/linux_tensorrt_ci.yml
+++ b/.github/workflows/linux_tensorrt_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host'
       docker_image_repo: onnxruntimetensorrt86gpubuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index c2df2cc56c1f4..5c9ff551c2a34 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -115,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -235,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
index c3e3715b44528..a77a407da8e9d 100644
--- a/.github/workflows/windows_cuda_plugin.yml
+++ b/.github/workflows/windows_cuda_plugin.yml
@@ -73,7 +73,7 @@ jobs:
             --build_dir build `
             --skip_submodule_sync `
             --parallel `
-            --nvcc_threads 8 --flash_nvcc_threads 4 `
+            --nvcc_threads 4 --flash_nvcc_threads 4 `
             --use_binskim_compliant_compile_flags `
             --cmake_generator "Visual Studio 17 2022" `
             --build_shared_lib `
diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
index 49b36fdb03ce2..d5710795942d1 100644
--- a/.github/workflows/windows_tensorrt.yml
+++ b/.github/workflows/windows_tensorrt.yml
@@ -121,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -247,7 +247,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 8 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }

From 999b02f18b90b7bafe6c57dd1dd011b530f05e25 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Mon, 25 May 2026 22:52:06 -0700
Subject: [PATCH 13/17] fix: address review feedback on nvcc_threads defaults
 and QUICK_BUILD

- Remove nargs="?" from --nvcc_threads and --flash_nvcc_threads to prevent
  TypeError when passed without a value
- Add onnxruntime_QUICK_BUILD=ON to Windows CUDA CI build job for consistency
  with the test job
- Update CMake cache default for onnxruntime_NVCC_THREADS from 1 to 4 to match
  the build.py default
---
 .github/workflows/windows_cuda.yml     | 2 +-
 cmake/onnxruntime_providers_cuda.cmake | 2 +-
 tools/ci_build/build_args.py           | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index 5c9ff551c2a34..53c7031c3c095 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -115,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 06ee5867498bd..234ff643e2519 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -233,7 +233,7 @@
     # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads
     # --threads is NOT set here; it is applied per-target after calling this function
     # so that flash attention can use a different (lower) thread count.
-    set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.")
+    set(onnxruntime_NVCC_THREADS "4" CACHE STRING "Number of threads that NVCC can use for compilation.")
 
     # suppress warnings like this:
     #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index ca866a2db7ae6..278453a9ddea7 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -646,14 +646,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     cuda_group.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.")
     cuda_group.add_argument(
         "--nvcc_threads",
-        nargs="?",
         default=4,
         type=int,
         help="Max NVCC threads per parallel job (default is 4).",
     )
     cuda_group.add_argument(
         "--flash_nvcc_threads",
-        nargs="?",
         default=-1,
         type=int,
         help="Max NVCC threads per parallel job for flash attention (default is same value of --nvcc_threads).",

From b5aae5cb2fcb4a62c2818c09734e1491d908ab82 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 26 May 2026 15:00:15 -0700
Subject: [PATCH 14/17] address feedbacks

---
 .github/workflows/linux_cuda_plugin_ci.yml    |   3 +-
 .github/workflows/windows_cuda_plugin.yml     |   3 +-
 cmake/onnxruntime_cuda_source_filters.cmake   | 141 ++++++++++++------
 cmake/onnxruntime_providers_cuda.cmake        |  27 +---
 cmake/onnxruntime_providers_cuda_plugin.cmake |  27 +---
 .../stages/plugin-win-cuda-stage.yml          |   6 +-
 .../stages/py-win-gpu-stage.yml               |   3 +-
 .../github/linux/build_cuda_plugin_package.sh |   3 +-
 .../linux/build_linux_python_package.sh       |   4 +-
 9 files changed, 113 insertions(+), 104 deletions(-)

diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
index e7fd32e864129..d2491f59812ab 100644
--- a/.github/workflows/linux_cuda_plugin_ci.yml
+++ b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -32,7 +32,8 @@ jobs:
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
-        --nvcc_threads 4 --flash_nvcc_threads 4
+        --nvcc_threads 4
+        --flash_nvcc_threads 4
         --cuda_version=12.8
         --cuda_home=/usr/local/cuda-12.8
         --cudnn_home=/usr/local/cuda-12.8
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
index a77a407da8e9d..f9acdbd76a12d 100644
--- a/.github/workflows/windows_cuda_plugin.yml
+++ b/.github/workflows/windows_cuda_plugin.yml
@@ -73,7 +73,8 @@ jobs:
             --build_dir build `
             --skip_submodule_sync `
             --parallel `
-            --nvcc_threads 4 --flash_nvcc_threads 4 `
+            --nvcc_threads 4 `
+            --flash_nvcc_threads 4 `
             --use_binskim_compliant_compile_flags `
             --cmake_generator "Visual Studio 17 2022" `
             --build_shared_lib `
diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
index 761f43aab9f48..752f911d0fd1a 100644
--- a/cmake/onnxruntime_cuda_source_filters.cmake
+++ b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -8,39 +8,43 @@
 # Usage:
 #   onnxruntime_filter_cuda_cu_sources(<list_variable_name>)
 #
-# The macro modifies the named list variable in the caller's scope.
+# The function modifies the named list variable in the caller's scope.
+
+function(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
+  set(_list "${${CU_SRC_LIST}}")
 
-macro(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
   # Quick build mode: Filter flash attention kernels for faster development iteration.
   #   - We keep only hdim128 fp16 flash attention kernels in quick build mode.
   #   - All other listed head dimensions are excluded (e.g., 32, 64, 96, 192, 256).
   #     If new head dimensions are added or removed, update this list to match the supported set.
   if(onnxruntime_QUICK_BUILD)
     message(STATUS "Quick build mode enabled: Only building hdim128 fp16 flash attention kernels")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)")
+    list(FILTER _list EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)")
   endif()
 
   if(NOT onnxruntime_USE_FP4_QMOE)
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_.*\\.generated\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_.*\\.generated\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp4\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_.*\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_.*\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp4\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
   else()
     # CUDA 13 PTXAS does not complete the FP4 M=128/N=64 pingpong specializations in
     # this build configuration. The dispatcher routes that tile through cooperative
     # mainloop variants instead, so exclude only those unused generated units.
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_(fp16|bf16)_m128_n64_k[0-9]+_cm[12]_cn[12]_pp(_finalize)?\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_(fp16|bf16)_m128_n64_k[0-9]+_cm[12]_cn[12]_pp(_finalize)?\\.generated\\.cu")
   endif()
 
   if(NOT onnxruntime_USE_FP8_QMOE)
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_wfp8_.*\\.generated\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_fp8_.*\\.generated\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp8\\.cu")
-    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm90_wfp8_.*\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp4_fp8_.*\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_tma_ws_sm120_fp8_fp4\\.generated\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_(fp16|bf16)_fp8\\.cu")
+    list(FILTER _list EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
   endif()
-endmacro()
+
+  set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE)
+endfunction()
 
 # Extract SM90/SM120 TMA warp-specialized generated source files from a CUDA source list.
 # These files use CUTLASS 3.x features (GMMA, TMA) that are specific to SM90+ or SM120+.
@@ -57,35 +61,41 @@ endmacro()
 #       SM90_SOURCES <output_var> SM120_SOURCES <output_var>)
 #
 # Removes matched files from <cu_src_list_var> and stores them in the output variables.
-macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST)
-  cmake_parse_arguments(_EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "" ${ARGN})
+function(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST)
+  cmake_parse_arguments(PARSE_ARGV 1 _EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "")
+
+  set(_list "${${CU_SRC_LIST}}")
 
   # Extract SM90 TMA WS generated files
-  set(${_EXTRACT_SM90_SOURCES})
+  set(_sm90_srcs)
   if(ORT_HAS_SM90_OR_LATER)
-    foreach(_src IN LISTS ${CU_SRC_LIST})
+    foreach(_src IN LISTS _list)
       if(_src MATCHES "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$")
-        list(APPEND ${_EXTRACT_SM90_SOURCES} "${_src}")
+        list(APPEND _sm90_srcs "${_src}")
       endif()
     endforeach()
-    if(${_EXTRACT_SM90_SOURCES})
-      list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM90_SOURCES}})
+    if(_sm90_srcs)
+      list(REMOVE_ITEM _list ${_sm90_srcs})
     endif()
   endif()
 
   # Extract SM120 TMA WS generated files
-  set(${_EXTRACT_SM120_SOURCES})
+  set(_sm120_srcs)
   if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
-    foreach(_src IN LISTS ${CU_SRC_LIST})
+    foreach(_src IN LISTS _list)
       if(_src MATCHES "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$")
-        list(APPEND ${_EXTRACT_SM120_SOURCES} "${_src}")
+        list(APPEND _sm120_srcs "${_src}")
       endif()
     endforeach()
-    if(${_EXTRACT_SM120_SOURCES})
-      list(REMOVE_ITEM ${CU_SRC_LIST} ${${_EXTRACT_SM120_SOURCES}})
+    if(_sm120_srcs)
+      list(REMOVE_ITEM _list ${_sm120_srcs})
     endif()
   endif()
-endmacro()
+
+  set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE)
+  set("${_EXTRACT_SM90_SOURCES}" "${_sm90_srcs}" PARENT_SCOPE)
+  set("${_EXTRACT_SM120_SOURCES}" "${_sm120_srcs}" PARENT_SCOPE)
+endfunction()
 
 # Extract Flash Attention CUDA source files into a separate list for compilation
 # in a dedicated OBJECT library with SM80+ architectures and independent nvcc_threads.
@@ -98,19 +108,23 @@ endmacro()
 # Usage:
 #   onnxruntime_extract_flash_attention_sources(<cu_src_list_var>
 #       FLASH_SOURCES <output_var>)
-macro(onnxruntime_extract_flash_attention_sources CU_SRC_LIST)
-  cmake_parse_arguments(_FA "" "FLASH_SOURCES" "" ${ARGN})
+function(onnxruntime_extract_flash_attention_sources CU_SRC_LIST)
+  cmake_parse_arguments(PARSE_ARGV 1 _FA "" "FLASH_SOURCES" "")
 
-  set(${_FA_FLASH_SOURCES})
-  foreach(_src IN LISTS ${CU_SRC_LIST})
+  set(_list "${${CU_SRC_LIST}}")
+  set(_flash_srcs)
+  foreach(_src IN LISTS _list)
     if(_src MATCHES "/bert/flash_attention/.*\\.cu$")
-      list(APPEND ${_FA_FLASH_SOURCES} "${_src}")
+      list(APPEND _flash_srcs "${_src}")
     endif()
   endforeach()
-  if(${_FA_FLASH_SOURCES})
-    list(REMOVE_ITEM ${CU_SRC_LIST} ${${_FA_FLASH_SOURCES}})
+  if(_flash_srcs)
+    list(REMOVE_ITEM _list ${_flash_srcs})
   endif()
-endmacro()
+
+  set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE)
+  set("${_FA_FLASH_SOURCES}" "${_flash_srcs}" PARENT_SCOPE)
+endfunction()
 
 # Extract LLM CUDA source files into separate lists for per-architecture compilation.
 # The LLM directory (contrib_ops/cuda/llm/) contains kernels with minimum SM75 support
@@ -119,31 +133,60 @@ endmacro()
 # at 90a-real (merged into the SM90 TMA OBJECT library).
 #
 # Note: SM90 TMA MoE GEMM files are already extracted by
-# onnxruntime_extract_sm_specific_cuda_sources() before this macro is called.
+# onnxruntime_extract_sm_specific_cuda_sources() before this function is called.
 #
 # Usage:
 #   onnxruntime_extract_llm_sources(<cu_src_list_var>
 #       LLM_SOURCES <output_var>
 #       LLM_SM90_SOURCES <output_var>)
-macro(onnxruntime_extract_llm_sources CU_SRC_LIST)
-  cmake_parse_arguments(_LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "" ${ARGN})
+function(onnxruntime_extract_llm_sources CU_SRC_LIST)
+  cmake_parse_arguments(PARSE_ARGV 1 _LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "")
 
-  set(${_LLM_LLM_SOURCES})
-  set(${_LLM_LLM_SM90_SOURCES})
-  foreach(_src IN LISTS ${CU_SRC_LIST})
+  set(_list "${${CU_SRC_LIST}}")
+  set(_llm_srcs)
+  set(_llm_sm90_srcs)
+  foreach(_src IN LISTS _list)
     if(_src MATCHES "/contrib_ops/cuda/llm/.*\\.cu$")
       # SM90-specific fpA_intB launchers (guarded by #ifndef EXCLUDE_SM_90)
       if(_src MATCHES "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$")
-        list(APPEND ${_LLM_LLM_SM90_SOURCES} "${_src}")
+        list(APPEND _llm_sm90_srcs "${_src}")
       else()
-        list(APPEND ${_LLM_LLM_SOURCES} "${_src}")
+        list(APPEND _llm_srcs "${_src}")
       endif()
     endif()
   endforeach()
-  if(${_LLM_LLM_SOURCES})
-    list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SOURCES}})
+  if(_llm_srcs)
+    list(REMOVE_ITEM _list ${_llm_srcs})
   endif()
-  if(${_LLM_LLM_SM90_SOURCES})
-    list(REMOVE_ITEM ${CU_SRC_LIST} ${${_LLM_LLM_SM90_SOURCES}})
+  if(_llm_sm90_srcs)
+    list(REMOVE_ITEM _list ${_llm_sm90_srcs})
   endif()
-endmacro()
+
+  set("${CU_SRC_LIST}" "${_list}" PARENT_SCOPE)
+  set("${_LLM_LLM_SOURCES}" "${_llm_srcs}" PARENT_SCOPE)
+  set("${_LLM_LLM_SM90_SOURCES}" "${_llm_sm90_srcs}" PARENT_SCOPE)
+endfunction()
+
+# Filter CMAKE_CUDA_ARCHITECTURES to only those >= a minimum SM version.
+# Optionally excludes SM120+ real architectures (for LLM targets that hit
+# CCCL tcgen05 PTX issues on Windows/MSVC when compiled for sm_120a native).
+#
+# Usage:
+#   onnxruntime_filter_cuda_archs(<output_var>
+#       MIN_SM <number>
+#       [EXCLUDE_SM120_REAL])
+function(onnxruntime_filter_cuda_archs OUTPUT_VAR)
+  cmake_parse_arguments(PARSE_ARGV 1 _FCA "EXCLUDE_SM120_REAL" "MIN_SM" "")
+
+  set(_filtered)
+  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
+    if(_arch_num GREATER_EQUAL "${_FCA_MIN_SM}")
+      if(_FCA_EXCLUDE_SM120_REAL AND _arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$")
+        continue()
+      endif()
+      list(APPEND _filtered "${_arch}")
+    endif()
+  endforeach()
+  set("${OUTPUT_VAR}" "${_filtered}" PARENT_SCOPE)
+endfunction()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 234ff643e2519..8ddd29763ff39 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -482,13 +482,7 @@
   endif()
 
   if(onnxruntime_cuda_sm120_tma_srcs)
-    set(_ort_sm120_cuda_architectures)
-    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-      if(_arch_num GREATER_EQUAL 120)
-        list(APPEND _ort_sm120_cuda_architectures "${_arch}")
-      endif()
-    endforeach()
+    onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120)
     if(_ort_sm120_cuda_architectures)
       onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs})
       set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}")
@@ -504,13 +498,7 @@
   set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING
       "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).")
   if(onnxruntime_cuda_flash_attention_srcs)
-    set(_ort_flash_cuda_architectures)
-    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-      if(_arch_num GREATER_EQUAL 80)
-        list(APPEND _ort_flash_cuda_architectures "${_arch}")
-      endif()
-    endforeach()
+    onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80)
     if(_ort_flash_cuda_architectures)
       onnxruntime_add_object_library(onnxruntime_providers_cuda_flash_attention ${onnxruntime_cuda_flash_attention_srcs})
       set_target_properties(onnxruntime_providers_cuda_flash_attention PROPERTIES
@@ -530,16 +518,7 @@
   # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch
   # (PTX) is kept so SM120 devices can JIT-compile the code.
   if(onnxruntime_cuda_llm_srcs)
-    set(_ort_llm_cuda_architectures)
-    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-      if(_arch_num GREATER_EQUAL 75)
-        if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$")
-          continue()
-        endif()
-        list(APPEND _ort_llm_cuda_architectures "${_arch}")
-      endif()
-    endforeach()
+    onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
     if(_ort_llm_cuda_architectures)
       onnxruntime_add_object_library(onnxruntime_providers_cuda_llm ${onnxruntime_cuda_llm_srcs})
       set_target_properties(onnxruntime_providers_cuda_llm PROPERTIES
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 4ea21e0f4617c..cd7810e7c4d91 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -284,13 +284,7 @@ if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs)
 endif()
 
 if(_cuda_plugin_sm120_tma_srcs)
-  set(_plugin_sm120_cuda_architectures)
-  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-    if(_arch_num GREATER_EQUAL 120)
-      list(APPEND _plugin_sm120_cuda_architectures "${_arch}")
-    endif()
-  endforeach()
+  onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120)
   if(_plugin_sm120_cuda_architectures)
     onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm120_tma ${_cuda_plugin_sm120_tma_srcs})
     set_target_properties(onnxruntime_providers_cuda_plugin_sm120_tma PROPERTIES
@@ -315,13 +309,7 @@ if(NOT DEFINED onnxruntime_FLASH_NVCC_THREADS)
   set(onnxruntime_FLASH_NVCC_THREADS "1")
 endif()
 if(_cuda_plugin_flash_attention_srcs)
-  set(_plugin_flash_cuda_architectures)
-  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-    if(_arch_num GREATER_EQUAL 80)
-      list(APPEND _plugin_flash_cuda_architectures "${_arch}")
-    endif()
-  endforeach()
+  onnxruntime_filter_cuda_archs(_plugin_flash_cuda_architectures MIN_SM 80)
   if(_plugin_flash_cuda_architectures)
     onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_flash_attention ${_cuda_plugin_flash_attention_srcs})
     set_target_properties(onnxruntime_providers_cuda_plugin_flash_attention PROPERTIES
@@ -346,16 +334,7 @@ endif()
 # the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX
 # headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept.
 if(_cuda_plugin_llm_srcs)
-  set(_plugin_llm_cuda_architectures)
-  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-    if(_arch_num GREATER_EQUAL 75)
-      if(_arch_num GREATER_EQUAL 120 AND _arch MATCHES "-real$")
-        continue()
-      endif()
-      list(APPEND _plugin_llm_cuda_architectures "${_arch}")
-    endif()
-  endforeach()
+  onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
   if(_plugin_llm_cuda_architectures)
     onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_llm ${_cuda_plugin_llm_srcs})
     set_target_properties(onnxruntime_providers_cuda_plugin_llm PROPERTIES
diff --git a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
index 5ff6def4c1e10..e2618165b9b58 100644
--- a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
@@ -127,7 +127,8 @@ stages:
               --skip_submodule_sync
               --cmake_generator "$(VSGenerator)"
               --parallel
-              --nvcc_threads 4 --flash_nvcc_threads 2
+              --nvcc_threads 4
+              --flash_nvcc_threads 2
               --use_vcpkg
               --use_vcpkg_ms_internal_asset_cache
               --use_binskim_compliant_compile_flags
@@ -154,7 +155,8 @@ stages:
               --skip_submodule_sync
               --cmake_generator "$(VSGenerator)"
               --parallel
-              --nvcc_threads 4 --flash_nvcc_threads 2
+              --nvcc_threads 4
+              --flash_nvcc_threads 2
               --use_vcpkg
               --use_vcpkg_ms_internal_asset_cache
               --use_binskim_compliant_compile_flags
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index 65216e7c59198..e49b94cbc2c56 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -122,7 +122,8 @@ stages:
             --enable_pybind
             --enable_onnx_tests
             --parallel
-            --nvcc_threads 4 --flash_nvcc_threads 2
+            --nvcc_threads 4
+            --flash_nvcc_threads 2
             --use_vcpkg
             --use_vcpkg_ms_internal_asset_cache
             --use_binskim_compliant_compile_flags
diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
index 958744e000727..5feae049acb22 100755
--- a/tools/ci_build/github/linux/build_cuda_plugin_package.sh
+++ b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
@@ -55,7 +55,8 @@ docker run --rm \
         --config ${BUILD_CONFIG} \
         --skip_submodule_sync \
         --parallel \
-        --nvcc_threads 2 --flash_nvcc_threads 1 \
+        --nvcc_threads 2 \
+        --flash_nvcc_threads 1 \
         --use_binskim_compliant_compile_flags \
         --use_cuda \
         --cuda_version=${SHORT_CUDA_VERSION} \
diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh
index 672582edf0518..1df9733247fc2 100755
--- a/tools/ci_build/github/linux/build_linux_python_package.sh
+++ b/tools/ci_build/github/linux/build_linux_python_package.sh
@@ -86,7 +86,9 @@ if [ "$BUILD_DEVICE" == "GPU" ]; then
         CUDA_HOME=/usr/local/cuda
     fi
     #Enable CUDA EP.
-    BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=4" "--flash_nvcc_threads=2" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF")
+    BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME")
+    BUILD_ARGS+=("--nvcc_threads=4" "--flash_nvcc_threads=2")
+    BUILD_ARGS+=("--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF")
     # Enable TRT EP only if TensorRT is installed.
     if [ -f /usr/include/NvInfer.h ]; then
         BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/usr")

From 86c45a412e6c3ac5bb0f2ea8f09444d9c914a8d4 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 26 May 2026 15:23:48 -0700
Subject: [PATCH 15/17] refactoring

---
 cmake/onnxruntime_cuda_source_filters.cmake   | 53 +++++++++++
 cmake/onnxruntime_providers_cuda.cmake        | 59 ++++++-------
 cmake/onnxruntime_providers_cuda_plugin.cmake | 88 ++++++-------------
 3 files changed, 105 insertions(+), 95 deletions(-)

diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
index 752f911d0fd1a..3af4b71b90988 100644
--- a/cmake/onnxruntime_cuda_source_filters.cmake
+++ b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -190,3 +190,56 @@ function(onnxruntime_filter_cuda_archs OUTPUT_VAR)
   endforeach()
   set("${OUTPUT_VAR}" "${_filtered}" PARENT_SCOPE)
 endfunction()
+
+# Create a CUDA OBJECT library for the in-tree CUDA EP and link it to the parent target.
+# Uses config_cuda_provider_shared_module() for full configuration (includes, link libs,
+# PCH, platform flags, etc.), then applies nvcc --threads.
+#
+# Usage:
+#   onnxruntime_add_cuda_object_library(
+#       NAME <target_name>
+#       PARENT <parent_target>
+#       CUDA_ARCHITECTURES <arch_list>
+#       NVCC_THREADS <thread_count>
+#       SOURCES <source_files...>)
+function(onnxruntime_add_cuda_object_library)
+  cmake_parse_arguments(PARSE_ARGV 0 _ARG "" "NAME;PARENT;CUDA_ARCHITECTURES;NVCC_THREADS" "SOURCES")
+
+  onnxruntime_add_object_library("${_ARG_NAME}" ${_ARG_SOURCES})
+  set_target_properties("${_ARG_NAME}" PROPERTIES CUDA_ARCHITECTURES "${_ARG_CUDA_ARCHITECTURES}")
+  config_cuda_provider_shared_module("${_ARG_NAME}")
+  target_compile_options("${_ARG_NAME}" PRIVATE
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${_ARG_NVCC_THREADS}\">")
+  target_link_libraries("${_ARG_PARENT}" PRIVATE "${_ARG_NAME}")
+endfunction()
+
+# Create a CUDA OBJECT library for the plugin EP and link it to the parent target.
+# Handles the boilerplate: set CUDA_ARCHITECTURES/CUDA_STANDARD, propagate includes
+# and compile definitions from the parent, apply shared compile options + nvcc --threads.
+#
+# Usage:
+#   onnxruntime_add_cuda_plugin_object_library(
+#       NAME <target_name>
+#       PARENT <parent_target>
+#       CUDA_ARCHITECTURES <arch_list>
+#       NVCC_THREADS <thread_count>
+#       COMPILE_OPTIONS <options...>
+#       SOURCES <source_files...>)
+function(onnxruntime_add_cuda_plugin_object_library)
+  cmake_parse_arguments(PARSE_ARGV 0 _ARG "" "NAME;PARENT;CUDA_ARCHITECTURES;NVCC_THREADS" "SOURCES;COMPILE_OPTIONS")
+
+  onnxruntime_add_object_library("${_ARG_NAME}" ${_ARG_SOURCES})
+  set_target_properties("${_ARG_NAME}" PROPERTIES
+    CUDA_ARCHITECTURES "${_ARG_CUDA_ARCHITECTURES}"
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+  )
+  target_include_directories("${_ARG_NAME}" PRIVATE
+    $<TARGET_PROPERTY:${_ARG_PARENT},INCLUDE_DIRECTORIES>)
+  target_compile_definitions("${_ARG_NAME}" PRIVATE
+    $<TARGET_PROPERTY:${_ARG_PARENT},COMPILE_DEFINITIONS>)
+  target_compile_options("${_ARG_NAME}" PRIVATE
+    ${_ARG_COMPILE_OPTIONS}
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${_ARG_NVCC_THREADS}\">")
+  target_link_libraries("${_ARG_PARENT}" PRIVATE "${_ARG_NAME}")
+endfunction()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 8ddd29763ff39..1c2f9291bb5d1 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -464,31 +464,26 @@
     # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat.
     # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
     set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs})
-    set(_ort_has_sm90_plus FALSE)
-    foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-      string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-      if(_arch_num GREATER_EQUAL 90)
-        set(_ort_has_sm90_plus TRUE)
-        break()
-      endif()
-    endforeach()
-    if(_ort_has_sm90_plus)
-      onnxruntime_add_object_library(onnxruntime_providers_cuda_sm90_tma ${_ort_sm90_all_srcs})
-      set_target_properties(onnxruntime_providers_cuda_sm90_tma PROPERTIES CUDA_ARCHITECTURES "90a-real")
-      config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm90_tma)
-      target_compile_options(onnxruntime_providers_cuda_sm90_tma PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
-      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm90_tma)
+    onnxruntime_filter_cuda_archs(_ort_sm90_check MIN_SM 90)
+    if(_ort_sm90_check)
+      onnxruntime_add_cuda_object_library(
+        NAME onnxruntime_providers_cuda_sm90_tma
+        PARENT onnxruntime_providers_cuda
+        CUDA_ARCHITECTURES "90a-real"
+        NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
+        SOURCES ${_ort_sm90_all_srcs})
     endif()
   endif()
 
   if(onnxruntime_cuda_sm120_tma_srcs)
     onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120)
     if(_ort_sm120_cuda_architectures)
-      onnxruntime_add_object_library(onnxruntime_providers_cuda_sm120_tma ${onnxruntime_cuda_sm120_tma_srcs})
-      set_target_properties(onnxruntime_providers_cuda_sm120_tma PROPERTIES CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}")
-      config_cuda_provider_shared_module(onnxruntime_providers_cuda_sm120_tma)
-      target_compile_options(onnxruntime_providers_cuda_sm120_tma PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
-      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_sm120_tma)
+      onnxruntime_add_cuda_object_library(
+        NAME onnxruntime_providers_cuda_sm120_tma
+        PARENT onnxruntime_providers_cuda
+        CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}"
+        NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
+        SOURCES ${onnxruntime_cuda_sm120_tma_srcs})
     endif()
   endif()
 
@@ -500,14 +495,12 @@
   if(onnxruntime_cuda_flash_attention_srcs)
     onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80)
     if(_ort_flash_cuda_architectures)
-      onnxruntime_add_object_library(onnxruntime_providers_cuda_flash_attention ${onnxruntime_cuda_flash_attention_srcs})
-      set_target_properties(onnxruntime_providers_cuda_flash_attention PROPERTIES
-        CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}")
-      config_cuda_provider_shared_module(onnxruntime_providers_cuda_flash_attention)
-      target_compile_options(onnxruntime_providers_cuda_flash_attention PRIVATE
-        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">"
-      )
-      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_flash_attention)
+      onnxruntime_add_cuda_object_library(
+        NAME onnxruntime_providers_cuda_flash_attention
+        PARENT onnxruntime_providers_cuda
+        CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}"
+        NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
+        SOURCES ${onnxruntime_cuda_flash_attention_srcs})
     endif()
   endif()
 
@@ -520,12 +513,12 @@
   if(onnxruntime_cuda_llm_srcs)
     onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
     if(_ort_llm_cuda_architectures)
-      onnxruntime_add_object_library(onnxruntime_providers_cuda_llm ${onnxruntime_cuda_llm_srcs})
-      set_target_properties(onnxruntime_providers_cuda_llm PROPERTIES
-        CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}")
-      config_cuda_provider_shared_module(onnxruntime_providers_cuda_llm)
-      target_compile_options(onnxruntime_providers_cuda_llm PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
-      target_link_libraries(onnxruntime_providers_cuda PRIVATE onnxruntime_providers_cuda_llm)
+      onnxruntime_add_cuda_object_library(
+        NAME onnxruntime_providers_cuda_llm
+        PARENT onnxruntime_providers_cuda
+        CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}"
+        NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
+        SOURCES ${onnxruntime_cuda_llm_srcs})
     endif()
   endif()
 
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index cd7810e7c4d91..313285426e0ef 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -257,49 +257,28 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
 # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
 if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs)
   set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs})
-  set(_plugin_has_sm90_plus FALSE)
-  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    string(REGEX MATCH "^([0-9]+)" _arch_num "${_arch}")
-    if(_arch_num GREATER_EQUAL 90)
-      set(_plugin_has_sm90_plus TRUE)
-      break()
-    endif()
-  endforeach()
-  if(_plugin_has_sm90_plus)
-    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm90_tma ${_plugin_sm90_all_srcs})
-    set_target_properties(onnxruntime_providers_cuda_plugin_sm90_tma PROPERTIES
+  onnxruntime_filter_cuda_archs(_plugin_sm90_check MIN_SM 90)
+  if(_plugin_sm90_check)
+    onnxruntime_add_cuda_plugin_object_library(
+      NAME onnxruntime_providers_cuda_plugin_sm90_tma
+      PARENT onnxruntime_providers_cuda_plugin
       CUDA_ARCHITECTURES "90a-real"
-      CUDA_STANDARD 20
-      CUDA_STANDARD_REQUIRED ON
-    )
-    target_include_directories(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
-    target_compile_definitions(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
-    target_compile_options(onnxruntime_providers_cuda_plugin_sm90_tma PRIVATE
-      ${_cuda_plugin_shared_compile_options}
-      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">")
-    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm90_tma)
+      NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
+      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+      SOURCES ${_plugin_sm90_all_srcs})
   endif()
 endif()
 
 if(_cuda_plugin_sm120_tma_srcs)
   onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120)
   if(_plugin_sm120_cuda_architectures)
-    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_sm120_tma ${_cuda_plugin_sm120_tma_srcs})
-    set_target_properties(onnxruntime_providers_cuda_plugin_sm120_tma PROPERTIES
+    onnxruntime_add_cuda_plugin_object_library(
+      NAME onnxruntime_providers_cuda_plugin_sm120_tma
+      PARENT onnxruntime_providers_cuda_plugin
       CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}"
-      CUDA_STANDARD 20
-      CUDA_STANDARD_REQUIRED ON
-    )
-    target_include_directories(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
-    target_compile_definitions(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
-    target_compile_options(onnxruntime_providers_cuda_plugin_sm120_tma PRIVATE
-      ${_cuda_plugin_shared_compile_options}
-      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">")
-    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_sm120_tma)
+      NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
+      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+      SOURCES ${_cuda_plugin_sm120_tma_srcs})
   endif()
 endif()
 
@@ -311,21 +290,13 @@ endif()
 if(_cuda_plugin_flash_attention_srcs)
   onnxruntime_filter_cuda_archs(_plugin_flash_cuda_architectures MIN_SM 80)
   if(_plugin_flash_cuda_architectures)
-    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_flash_attention ${_cuda_plugin_flash_attention_srcs})
-    set_target_properties(onnxruntime_providers_cuda_plugin_flash_attention PROPERTIES
+    onnxruntime_add_cuda_plugin_object_library(
+      NAME onnxruntime_providers_cuda_plugin_flash_attention
+      PARENT onnxruntime_providers_cuda_plugin
       CUDA_ARCHITECTURES "${_plugin_flash_cuda_architectures}"
-      CUDA_STANDARD 20
-      CUDA_STANDARD_REQUIRED ON
-    )
-    target_include_directories(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
-    target_compile_definitions(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
-    target_compile_options(onnxruntime_providers_cuda_plugin_flash_attention PRIVATE
-      ${_cuda_plugin_shared_compile_options}
-      # Flash attention uses a lower nvcc --threads (memory-intensive compilation).
-      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_FLASH_NVCC_THREADS}\">")
-    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_flash_attention)
+      NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
+      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+      SOURCES ${_cuda_plugin_flash_attention_srcs})
   endif()
 endif()
 
@@ -336,20 +307,13 @@ endif()
 if(_cuda_plugin_llm_srcs)
   onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
   if(_plugin_llm_cuda_architectures)
-    onnxruntime_add_object_library(onnxruntime_providers_cuda_plugin_llm ${_cuda_plugin_llm_srcs})
-    set_target_properties(onnxruntime_providers_cuda_plugin_llm PROPERTIES
+    onnxruntime_add_cuda_plugin_object_library(
+      NAME onnxruntime_providers_cuda_plugin_llm
+      PARENT onnxruntime_providers_cuda_plugin
       CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}"
-      CUDA_STANDARD 20
-      CUDA_STANDARD_REQUIRED ON
-    )
-    target_include_directories(onnxruntime_providers_cuda_plugin_llm PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,INCLUDE_DIRECTORIES>)
-    target_compile_definitions(onnxruntime_providers_cuda_plugin_llm PRIVATE
-      $<TARGET_PROPERTY:onnxruntime_providers_cuda_plugin,COMPILE_DEFINITIONS>)
-    target_compile_options(onnxruntime_providers_cuda_plugin_llm PRIVATE
-      ${_cuda_plugin_shared_compile_options}
-      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">")
-    target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE onnxruntime_providers_cuda_plugin_llm)
+      NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
+      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+      SOURCES ${_cuda_plugin_llm_srcs})
   endif()
 endif()
 

From 6636c99d7e60528bfb8189996d629b0aa9382fa3 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 26 May 2026 15:28:14 -0700
Subject: [PATCH 16/17] handle disable contrib ops nicely

---
 cmake/onnxruntime_providers_cuda.cmake        | 124 ++++++++++--------
 cmake/onnxruntime_providers_cuda_plugin.cmake |  92 +++++++------
 2 files changed, 117 insertions(+), 99 deletions(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 1c2f9291bb5d1..b23c978d2d7b7 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -456,69 +456,79 @@
   config_cuda_provider_shared_module(onnxruntime_providers_cuda)
   target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
 
-  # Create OBJECT libraries for SM90/SM120 TMA WS generated files that must be compiled
+  # Create OBJECT libraries for SM-specific contrib CUDA sources that must be compiled
   # with restricted CUDA architectures. These files use CUTLASS 3.x SM90+/SM120+ features
   # (GMMA, TMA) that cannot produce useful device code for older architectures.
-  if(onnxruntime_cuda_sm90_tma_srcs OR onnxruntime_cuda_llm_sm90_srcs)
-    # SM90 TMA warp-specialized files use SM90-specific collective operations.
-    # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat.
-    # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
-    set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs})
-    onnxruntime_filter_cuda_archs(_ort_sm90_check MIN_SM 90)
-    if(_ort_sm90_check)
-      onnxruntime_add_cuda_object_library(
-        NAME onnxruntime_providers_cuda_sm90_tma
-        PARENT onnxruntime_providers_cuda
-        CUDA_ARCHITECTURES "90a-real"
-        NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
-        SOURCES ${_ort_sm90_all_srcs})
+  #
+  # SM90/SM120 TMA and LLM OBJECT libraries contain MoE and MatMulNBits kernels (contrib ops).
+  # Flash Attention is also used by the ONNX domain Attention op, so it is included even
+  # when contrib ops are disabled.
+  if(NOT onnxruntime_CUDA_MINIMAL)
+    # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads.
+    # Flash Attention V2 kernels require SM80 (Ampere) and are memory-intensive to compile.
+    # Isolating them allows the rest of the build to use higher --threads without OOM.
+    # Included even with onnxruntime_DISABLE_CONTRIB_OPS because the ONNX domain Attention
+    # kernel depends on flash attention infrastructure in contrib_ops/cuda/bert/.
+    set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING
+        "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).")
+    if(onnxruntime_cuda_flash_attention_srcs)
+      onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80)
+      if(_ort_flash_cuda_architectures)
+        onnxruntime_add_cuda_object_library(
+          NAME onnxruntime_providers_cuda_flash_attention
+          PARENT onnxruntime_providers_cuda
+          CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}"
+          NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
+          SOURCES ${onnxruntime_cuda_flash_attention_srcs})
+      endif()
     endif()
-  endif()
 
-  if(onnxruntime_cuda_sm120_tma_srcs)
-    onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120)
-    if(_ort_sm120_cuda_architectures)
-      onnxruntime_add_cuda_object_library(
-        NAME onnxruntime_providers_cuda_sm120_tma
-        PARENT onnxruntime_providers_cuda
-        CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}"
-        NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
-        SOURCES ${onnxruntime_cuda_sm120_tma_srcs})
-    endif()
-  endif()
+    if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
+      # SM90 TMA warp-specialized files use SM90-specific collective operations.
+      # Compile at exactly 90a-real: SM120+ GPUs run SM90 native code via forward compat.
+      # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
+      if(onnxruntime_cuda_sm90_tma_srcs OR onnxruntime_cuda_llm_sm90_srcs)
+        set(_ort_sm90_all_srcs ${onnxruntime_cuda_sm90_tma_srcs} ${onnxruntime_cuda_llm_sm90_srcs})
+        onnxruntime_filter_cuda_archs(_ort_sm90_check MIN_SM 90)
+        if(_ort_sm90_check)
+          onnxruntime_add_cuda_object_library(
+            NAME onnxruntime_providers_cuda_sm90_tma
+            PARENT onnxruntime_providers_cuda
+            CUDA_ARCHITECTURES "90a-real"
+            NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
+            SOURCES ${_ort_sm90_all_srcs})
+        endif()
+      endif()
 
-  # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads.
-  # Flash Attention V2 kernels require SM80 (Ampere) and are memory-intensive to compile.
-  # Isolating them allows the rest of the build to use higher --threads without OOM.
-  set(onnxruntime_FLASH_NVCC_THREADS "1" CACHE STRING
-      "Number of NVCC threads for Flash Attention compilation (memory-intensive, keep low).")
-  if(onnxruntime_cuda_flash_attention_srcs)
-    onnxruntime_filter_cuda_archs(_ort_flash_cuda_architectures MIN_SM 80)
-    if(_ort_flash_cuda_architectures)
-      onnxruntime_add_cuda_object_library(
-        NAME onnxruntime_providers_cuda_flash_attention
-        PARENT onnxruntime_providers_cuda
-        CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}"
-        NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
-        SOURCES ${onnxruntime_cuda_flash_attention_srcs})
-    endif()
-  endif()
+      if(onnxruntime_cuda_sm120_tma_srcs)
+        onnxruntime_filter_cuda_archs(_ort_sm120_cuda_architectures MIN_SM 120)
+        if(_ort_sm120_cuda_architectures)
+          onnxruntime_add_cuda_object_library(
+            NAME onnxruntime_providers_cuda_sm120_tma
+            PARENT onnxruntime_providers_cuda
+            CUDA_ARCHITECTURES "${_ort_sm120_cuda_architectures}"
+            NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
+            SOURCES ${onnxruntime_cuda_sm120_tma_srcs})
+        endif()
+      endif()
 
-  # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
-  # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs.
-  # Excludes SM120+ real (native SASS) architectures because SM120-specific kernels are already
-  # compiled in the separate SM120 TMA OBJECT library, and compiling the general LLM code for
-  # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch
-  # (PTX) is kept so SM120 devices can JIT-compile the code.
-  if(onnxruntime_cuda_llm_srcs)
-    onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
-    if(_ort_llm_cuda_architectures)
-      onnxruntime_add_cuda_object_library(
-        NAME onnxruntime_providers_cuda_llm
-        PARENT onnxruntime_providers_cuda
-        CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}"
-        NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
-        SOURCES ${onnxruntime_cuda_llm_srcs})
+      # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
+      # Restricts CUDA_ARCHITECTURES to avoid compiling heavy CUTLASS templates for pre-Turing GPUs.
+      # Excludes SM120+ real (native SASS) architectures because SM120-specific kernels are already
+      # compiled in the separate SM120 TMA OBJECT library, and compiling the general LLM code for
+      # sm_120a triggers CCCL tcgen05 PTX headers that fail on Windows/MSVC. The virtual arch
+      # (PTX) is kept so SM120 devices can JIT-compile the code.
+      if(onnxruntime_cuda_llm_srcs)
+        onnxruntime_filter_cuda_archs(_ort_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
+        if(_ort_llm_cuda_architectures)
+          onnxruntime_add_cuda_object_library(
+            NAME onnxruntime_providers_cuda_llm
+            PARENT onnxruntime_providers_cuda
+            CUDA_ARCHITECTURES "${_ort_llm_cuda_architectures}"
+            NVCC_THREADS "${onnxruntime_NVCC_THREADS}"
+            SOURCES ${onnxruntime_cuda_llm_srcs})
+        endif()
+      endif()
     endif()
   endif()
 
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index 313285426e0ef..d8435e321c7ce 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -253,37 +253,14 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
   "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">"
 )
 
-# SM90/SM120 TMA WS OBJECT libraries — compiled with restricted CUDA architectures.
-# Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
-if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs)
-  set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs})
-  onnxruntime_filter_cuda_archs(_plugin_sm90_check MIN_SM 90)
-  if(_plugin_sm90_check)
-    onnxruntime_add_cuda_plugin_object_library(
-      NAME onnxruntime_providers_cuda_plugin_sm90_tma
-      PARENT onnxruntime_providers_cuda_plugin
-      CUDA_ARCHITECTURES "90a-real"
-      NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
-      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
-      SOURCES ${_plugin_sm90_all_srcs})
-  endif()
-endif()
-
-if(_cuda_plugin_sm120_tma_srcs)
-  onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120)
-  if(_plugin_sm120_cuda_architectures)
-    onnxruntime_add_cuda_plugin_object_library(
-      NAME onnxruntime_providers_cuda_plugin_sm120_tma
-      PARENT onnxruntime_providers_cuda_plugin
-      CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}"
-      NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
-      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
-      SOURCES ${_cuda_plugin_sm120_tma_srcs})
-  endif()
-endif()
+# SM-specific OBJECT libraries — compiled with restricted CUDA architectures.
+# Flash Attention is also used by the ONNX domain Attention op, so it is always included.
+# SM90/SM120 TMA and LLM contain MoE and MatMulNBits kernels (contrib ops only).
 
 # Flash Attention OBJECT library: SM80+ only, with independent nvcc_threads.
 # Flash Attention V2 kernels require SM80 and are memory-intensive to compile.
+# Included even with onnxruntime_DISABLE_CONTRIB_OPS because the ONNX domain Attention
+# kernel depends on flash attention infrastructure in contrib_ops/cuda/bert/.
 if(NOT DEFINED onnxruntime_FLASH_NVCC_THREADS)
   set(onnxruntime_FLASH_NVCC_THREADS "1")
 endif()
@@ -300,20 +277,51 @@ if(_cuda_plugin_flash_attention_srcs)
   endif()
 endif()
 
-# LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
-# Excludes SM120+ real (native SASS) architectures — SM120-specific kernels are compiled in
-# the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX
-# headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept.
-if(_cuda_plugin_llm_srcs)
-  onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
-  if(_plugin_llm_cuda_architectures)
-    onnxruntime_add_cuda_plugin_object_library(
-      NAME onnxruntime_providers_cuda_plugin_llm
-      PARENT onnxruntime_providers_cuda_plugin
-      CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}"
-      NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
-      COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
-      SOURCES ${_cuda_plugin_llm_srcs})
+if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
+  # SM90 TMA warp-specialized files use SM90-specific collective operations.
+  # Also includes fpA_intB SM90 launchers (guarded by #ifndef EXCLUDE_SM_90).
+  if(_cuda_plugin_sm90_tma_srcs OR _cuda_plugin_llm_sm90_srcs)
+    set(_plugin_sm90_all_srcs ${_cuda_plugin_sm90_tma_srcs} ${_cuda_plugin_llm_sm90_srcs})
+    onnxruntime_filter_cuda_archs(_plugin_sm90_check MIN_SM 90)
+    if(_plugin_sm90_check)
+      onnxruntime_add_cuda_plugin_object_library(
+        NAME onnxruntime_providers_cuda_plugin_sm90_tma
+        PARENT onnxruntime_providers_cuda_plugin
+        CUDA_ARCHITECTURES "90a-real"
+        NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
+        COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+        SOURCES ${_plugin_sm90_all_srcs})
+    endif()
+  endif()
+
+  if(_cuda_plugin_sm120_tma_srcs)
+    onnxruntime_filter_cuda_archs(_plugin_sm120_cuda_architectures MIN_SM 120)
+    if(_plugin_sm120_cuda_architectures)
+      onnxruntime_add_cuda_plugin_object_library(
+        NAME onnxruntime_providers_cuda_plugin_sm120_tma
+        PARENT onnxruntime_providers_cuda_plugin
+        CUDA_ARCHITECTURES "${_plugin_sm120_cuda_architectures}"
+        NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
+        COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+        SOURCES ${_cuda_plugin_sm120_tma_srcs})
+    endif()
+  endif()
+
+  # LLM OBJECT library: SM75+ (backward compatible with fpA_intB_gemv/gemm which support SM75).
+  # Excludes SM120+ real (native SASS) architectures — SM120-specific kernels are compiled in
+  # the separate SM120 TMA OBJECT library, and the general LLM code triggers CCCL tcgen05 PTX
+  # headers that fail on Windows/MSVC when compiled for sm_120a. Virtual arch (PTX) is kept.
+  if(_cuda_plugin_llm_srcs)
+    onnxruntime_filter_cuda_archs(_plugin_llm_cuda_architectures MIN_SM 75 EXCLUDE_SM120_REAL)
+    if(_plugin_llm_cuda_architectures)
+      onnxruntime_add_cuda_plugin_object_library(
+        NAME onnxruntime_providers_cuda_plugin_llm
+        PARENT onnxruntime_providers_cuda_plugin
+        CUDA_ARCHITECTURES "${_plugin_llm_cuda_architectures}"
+        NVCC_THREADS "${onnxruntime_plugin_nvcc_threads}"
+        COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
+        SOURCES ${_cuda_plugin_llm_srcs})
+    endif()
   endif()
 endif()
 

From e81b491bbae9d712b18595cd171c1f7df9f856be Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 26 May 2026 17:20:49 -0700
Subject: [PATCH 17/17] fix: address review feedback on flash attention
 quick-build and pre-SM80 fallback

- Update quick-build comment to reflect that both FP16 and BF16 hdim128 kernels
  are intentionally retained (not just FP16).
- Add fallback for pre-SM80 builds: when no SM80+ architectures are configured,
  flash attention sources are added back to the parent target so the linker can
  find host-side symbols referenced by flash_api.cc.
---
 cmake/onnxruntime_cuda_source_filters.cmake   | 4 ++--
 cmake/onnxruntime_providers_cuda.cmake        | 5 +++++
 cmake/onnxruntime_providers_cuda_plugin.cmake | 5 +++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
index 3af4b71b90988..4083154100554 100644
--- a/cmake/onnxruntime_cuda_source_filters.cmake
+++ b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -14,11 +14,11 @@ function(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
   set(_list "${${CU_SRC_LIST}}")
 
   # Quick build mode: Filter flash attention kernels for faster development iteration.
-  #   - We keep only hdim128 fp16 flash attention kernels in quick build mode.
+  #   - We keep only hdim128 fp16 and bf16 flash attention kernels in quick build mode.
   #   - All other listed head dimensions are excluded (e.g., 32, 64, 96, 192, 256).
   #     If new head dimensions are added or removed, update this list to match the supported set.
   if(onnxruntime_QUICK_BUILD)
-    message(STATUS "Quick build mode enabled: Only building hdim128 fp16 flash attention kernels")
+    message(STATUS "Quick build mode enabled: Only building hdim128 fp16/bf16 flash attention kernels")
     list(FILTER _list EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)")
   endif()
 
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index b23c978d2d7b7..08f51e92f50d5 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -480,6 +480,11 @@
           CUDA_ARCHITECTURES "${_ort_flash_cuda_architectures}"
           NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
           SOURCES ${onnxruntime_cuda_flash_attention_srcs})
+      else()
+        # No SM80+ architectures available: compile flash sources in parent target so the
+        # linker can find the host-side symbols referenced by flash_api.cc. The kernels
+        # themselves will be empty stubs due to __CUDA_ARCH__ >= 800 guards.
+        target_sources(onnxruntime_providers_cuda PRIVATE ${onnxruntime_cuda_flash_attention_srcs})
       endif()
     endif()
 
diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
index d8435e321c7ce..98dfde7c54328 100644
--- a/cmake/onnxruntime_providers_cuda_plugin.cmake
+++ b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -274,6 +274,11 @@ if(_cuda_plugin_flash_attention_srcs)
       NVCC_THREADS "${onnxruntime_FLASH_NVCC_THREADS}"
       COMPILE_OPTIONS ${_cuda_plugin_shared_compile_options}
       SOURCES ${_cuda_plugin_flash_attention_srcs})
+  else()
+    # No SM80+ architectures available: compile flash sources in parent target so the
+    # linker can find the host-side symbols referenced by flash_api.cc. The kernels
+    # themselves will be empty stubs due to __CUDA_ARCH__ >= 800 guards.
+    target_sources(onnxruntime_providers_cuda_plugin PRIVATE ${_cuda_plugin_flash_attention_srcs})
   endif()
 endif()