microsoft · tianleiwu · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 23, 2026
diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
       docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory

diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -32,12 +32,13 @@ jobs:
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
-        --nvcc_threads 1
+        --nvcc_threads 4 --flash_nvcc_threads 4
         --cuda_version=12.8
         --cuda_home=/usr/local/cuda-12.8
         --cudnn_home=/usr/local/cuda-12.8
         --enable_cuda_profiling
         --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+        --cmake_extra_defines onnxruntime_QUICK_BUILD=ON
         --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
       python_path_prefix: 'PATH=/opt/python/cp312-cp312/bin:$PATH'
       run_tests: false

diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1 --build-arg TRT_VERSION=10.14.1.48-1.cuda12.9 --network=host'
       docker_image_repo: onnxruntimetensorrt86gpubuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --use_tensorrt --tensorrt_home /usr  --build_java --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory

diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
@@ -115,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -235,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }

diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
@@ -73,7 +73,7 @@ jobs:
             --build_dir build `
             --skip_submodule_sync `
             --parallel `
-            --nvcc_threads 1 `
+            --nvcc_threads 4 --flash_nvcc_threads 4 `
             --use_binskim_compliant_compile_flags `
             --cmake_generator "Visual Studio 17 2022" `
             --build_shared_lib `
@@ -84,6 +84,7 @@ jobs:
             --use_vcpkg `
             --use_vcpkg_ms_internal_asset_cache `
             --enable_cuda_profiling `
+            --cmake_extra_defines onnxruntime_QUICK_BUILD=ON `
             --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 `
             --cmake_extra_defines onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON
 

diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
@@ -121,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines onnxruntime_QUICK_BUILD=ON --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -247,7 +247,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 4 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }

diff --git a/cmake/onnxruntime_cuda_source_filters.cmake b/cmake/onnxruntime_cuda_source_filters.cmake
@@ -41,3 +41,109 @@ macro(onnxruntime_filter_cuda_cu_sources CU_SRC_LIST)
     list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_kernels_fp8_fp4\\.cu")
   endif()
 endmacro()
+
+# Extract SM90/SM120 TMA warp-specialized generated source files from a CUDA source list.
+# These files use CUTLASS 3.x features (GMMA, TMA) that are specific to SM90+ or SM120+.
+# They are compiled in separate OBJECT libraries with restricted CUDA_ARCHITECTURES to:
+#   1. Reduce compile time (avoid compiling heavy templates for unused architectures)
+#   2. Reduce binary size (no dead device code for unsupported architectures)
+#   3. Ensure correctness (SM90 code compiled at exactly 90a-real, SM120 at 120+)
+#
+# The per-source CUDA_ARCHITECTURES property does not work with the Visual Studio generator,
+# so OBJECT libraries are needed.
+#
+# Usage:
+#   onnxruntime_extract_sm_specific_cuda_sources(<cu_src_list_var>
+#       SM90_SOURCES <output_var> SM120_SOURCES <output_var>)
+#
+# Removes matched files from <cu_src_list_var> and stores them in the output variables.
+macro(onnxruntime_extract_sm_specific_cuda_sources CU_SRC_LIST)
+  cmake_parse_arguments(_EXTRACT "" "SM90_SOURCES;SM120_SOURCES" "" ${ARGN})
+  # Both output variables are required. Check up front so that a missing keyword
+  # produces a readable error instead of an obscure failure inside set() below.
+  if(NOT DEFINED _EXTRACT_SM90_SOURCES OR NOT DEFINED _EXTRACT_SM120_SOURCES)
+    message(FATAL_ERROR "onnxruntime_extract_sm_specific_cuda_sources: "
+      "SM90_SOURCES <output_var> and SM120_SOURCES <output_var> are both required")
+  endif()
+
+  # SM90 TMA WS generated files. They are moved out of the input list only when
+  # ORT_HAS_SM90_OR_LATER is true; otherwise the output is an empty list and the
+  # input list is left untouched.
+  set(${_EXTRACT_SM90_SOURCES} "")
+  if(ORT_HAS_SM90_OR_LATER)
+    set(${_EXTRACT_SM90_SOURCES} "${${CU_SRC_LIST}}")
+    list(FILTER ${_EXTRACT_SM90_SOURCES} INCLUDE REGEX "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$")
+    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm90_.*\\.generated\\.cu$")
+  endif()
+
+  # SM120 TMA WS generated files. IN_LIST compares whole entries, so only an exact "120"
+  # in CMAKE_CUDA_ARCHITECTURES_ORIG enables this branch; a suffixed entry such as "120a"
+  # would not. NOTE(review): presumably that list is normalized before this call - confirm.
+  set(${_EXTRACT_SM120_SOURCES} "")
+  if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
+    set(${_EXTRACT_SM120_SOURCES} "${${CU_SRC_LIST}}")
+    list(FILTER ${_EXTRACT_SM120_SOURCES} INCLUDE REGEX "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$")
+    list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "moe_gemm_tma_ws_sm120_.*\\.generated\\.cu$")
+  endif()
+endmacro()
+
+# Extract Flash Attention CUDA source files into a separate list for compilation
+# in a dedicated OBJECT library with SM80+ architectures and independent nvcc_threads.
+# Flash Attention V2 kernels require SM80 (Ampere) or later — they contain
+# __CUDA_ARCH__ >= 800 guards in kernel_traits.h and all files are *_sm80.cu.
+# Compiling them separately allows:
+#   1. Restricting CUDA_ARCHITECTURES to SM80+ (skip dead pre-Ampere passes)
+#   2. Giving these memory-intensive kernels their own nvcc --threads setting, independent of other targets
+#
+# Usage:
+#   onnxruntime_extract_flash_attention_sources(<cu_src_list_var>
+#       FLASH_SOURCES <output_var>)
+macro(onnxruntime_extract_flash_attention_sources CU_SRC_LIST)
+  cmake_parse_arguments(_FA "" "FLASH_SOURCES" "" ${ARGN})
+  if(NOT DEFINED _FA_FLASH_SOURCES)
+    message(FATAL_ERROR "onnxruntime_extract_flash_attention_sources: FLASH_SOURCES <output_var> is required")
+  endif()
+
+  # Copy the input list and keep only .cu files under a bert/flash_attention/ directory,
+  # then strip those same files from the input. The output is an empty list (and the
+  # input is left as it was) when nothing matches.
+  set(${_FA_FLASH_SOURCES} "${${CU_SRC_LIST}}")
+  list(FILTER ${_FA_FLASH_SOURCES} INCLUDE REGEX "/bert/flash_attention/.*\\.cu$")
+  list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "/bert/flash_attention/.*\\.cu$")
+endmacro()
+
+# Extract LLM CUDA source files into separate lists for per-architecture compilation.
+# The LLM directory (contrib_ops/cuda/llm/) contains kernels with minimum SM75 support
+# (fpA_intB_gemv/gemm enforce arch >= 75). SM90-specific launchers (fpA_intB_gemm
+# launchers guarded by #ifndef EXCLUDE_SM_90) are extracted separately to be compiled
+# at 90a-real (merged into the SM90 TMA OBJECT library).
+#
+# Note: SM90 TMA MoE GEMM files are already extracted by
+# onnxruntime_extract_sm_specific_cuda_sources() before this macro is called.
+#
+# Usage:
+#   onnxruntime_extract_llm_sources(<cu_src_list_var>
+#       LLM_SOURCES <output_var>
+#       LLM_SM90_SOURCES <output_var>)
+macro(onnxruntime_extract_llm_sources CU_SRC_LIST)
+  cmake_parse_arguments(_LLM "" "LLM_SOURCES;LLM_SM90_SOURCES" "" ${ARGN})
+  # Both output variables are required; fail with a readable message when one is missing.
+  if(NOT DEFINED _LLM_LLM_SOURCES OR NOT DEFINED _LLM_LLM_SM90_SOURCES)
+    message(FATAL_ERROR "onnxruntime_extract_llm_sources: "
+      "LLM_SOURCES <output_var> and LLM_SM90_SOURCES <output_var> are both required")
+  endif()
+
+  # Step 1: move every .cu file under contrib_ops/cuda/llm/ out of the input list.
+  # Files outside that directory are never touched by this macro.
+  set(${_LLM_LLM_SOURCES} "${${CU_SRC_LIST}}")
+  list(FILTER ${_LLM_LLM_SOURCES} INCLUDE REGEX "/contrib_ops/cuda/llm/.*\\.cu$")
+  list(FILTER ${CU_SRC_LIST} EXCLUDE REGEX "/contrib_ops/cuda/llm/.*\\.cu$")
+
+  # Step 2: split the extracted files. The generated fpA_intB launchers
+  # (fpA_intB_gemm_launcher_<N>.generated.cu, guarded by #ifndef EXCLUDE_SM_90) go to
+  # LLM_SM90_SOURCES; everything else stays in LLM_SOURCES. Either output may end up
+  # as an empty list.
+  set(${_LLM_LLM_SM90_SOURCES} "${${_LLM_LLM_SOURCES}}")
+  list(FILTER ${_LLM_LLM_SM90_SOURCES} INCLUDE REGEX "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$")
+  list(FILTER ${_LLM_LLM_SOURCES} EXCLUDE REGEX "fpA_intB_gemm_launcher_[0-9]+\\.generated\\.cu$")
+endmacro()