use --nvcc_threads 4 --flash_nvcc_threads 2 in pipelines

tianleiwu · tianleiwu · commit 9091c9d33c49 · 2026-05-22T13:14:24.000-07:00
diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml
@@ -29,7 +29,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
       docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
       docker_image_repo: onnxruntimecuda12manylinuxbuild
-      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
       run_tests: false            # <<< Do not run tests in this job
       upload_build_output: true   # <<< Upload the build/Release directory
diff --git a/.github/workflows/linux_cuda_plugin_ci.yml b/.github/workflows/linux_cuda_plugin_ci.yml
@@ -32,7 +32,7 @@ jobs:
         --use_binskim_compliant_compile_flags
         --build_wheel
         --parallel
-        --nvcc_threads 1
+        --nvcc_threads 4 --flash_nvcc_threads 2
         --cuda_version=12.8
         --cuda_home=/usr/local/cuda-12.8
         --cudnn_home=/usr/local/cuda-12.8
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
@@ -115,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -235,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_cuda_plugin.yml b/.github/workflows/windows_cuda_plugin.yml
@@ -73,7 +73,7 @@ jobs:
             --build_dir build `
             --skip_submodule_sync `
             --parallel `
-            --nvcc_threads 1 `
+            --nvcc_threads 4 --flash_nvcc_threads 2 `
             --use_binskim_compliant_compile_flags `
             --cmake_generator "Visual Studio 17 2022" `
             --build_shared_lib `
diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
@@ -121,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -247,7 +247,7 @@ jobs:
             exit $lastExitCode
           }
 
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/docs/cuda_plugin_ep/build_optimization_plan.md b/docs/cuda_plugin_ep/build_optimization_plan.md
@@ -0,0 +1,107 @@
+# Plan: Speed Up CUDA Build via Per-Library Architecture Splitting and nvcc_threads Optimization
+
+## TL;DR
+
+Split the monolithic CUDA provider compilation into architecture-specific OBJECT libraries for
+flash_attention (SM80+, `--threads 1`) and llm/ (SM75+), allowing the main target and llm to use
+higher `nvcc_threads` for faster parallel compilation. Merge fpA_intB SM90 launchers into the
+existing SM90 TMA OBJECT library.
+
+## Architecture Requirements (Verified)
+
+| Directory | Min SM | Notes |
+|-----------|--------|-------|
+| `bert/flash_attention/` (48 .cu) | **SM80** | `__CUDA_ARCH__ >= 800` in kernel_traits.h |
+| `llm/fpA_intB_gemv/` (11 .cu) | **SM75** | `ORT_ENFORCE(arch >= 75)` |
+| `llm/fpA_intB_gemm/` (8 base .cu) | **SM75** | SM75+ base support |
+| `llm/fpA_intB_gemm/launchers/` (2 .cu) | **SM90** | `#ifndef EXCLUDE_SM_90` |
+| `llm/moe_gemm/` (14 root .cu) | **SM75** | CUTLASS stages=2 fallback for SM75 |
+| `llm/moe_gemm/launchers/fused_moe_sm80` (2 .cu) | **SM80** | `#ifndef EXCLUDE_SM_80` (has arch guard in code, safe to compile at SM75+) |
+| `llm/moe_gemm/launchers/` SM90 TMA (324 .cu) | SM90 | **Already extracted** |
+| `llm/moe_gemm/launchers/` SM120 TMA (11 .cu) | SM120 | **Already extracted** |
+| `llm/kernels/` (1 .cu) | SM50 | BF16 guarded by `__CUDA_ARCH__ >= 800` |
+
+## Steps
+
+### Phase 1: Flash Attention OBJECT Library
+
+1. Add macro `onnxruntime_extract_flash_attention_sources()` in
+   `cmake/onnxruntime_cuda_source_filters.cmake` — extracts `*/bert/flash_attention/*.cu`
+   from the main CU source list.
+
+2. In both provider cmake files, call this macro after existing filtering. Create OBJECT library:
+   - `CUDA_ARCHITECTURES` = entries from `CMAKE_CUDA_ARCHITECTURES` where arch >= 80
+   - `--threads ${onnxruntime_FLASH_NVCC_THREADS}`
+   - Same includes/compile defs as parent (`config_cuda_provider_shared_module()`)
+   - Link into parent
+
+3. Add CMake cache option: `onnxruntime_FLASH_NVCC_THREADS` (default `"1"`, type STRING)
+
+### Phase 2: LLM OBJECT Library (SM75+ — Backward Compatible)
+
+4. Add macro `onnxruntime_extract_llm_sources()` — extracts `*/contrib_ops/cuda/llm/*.cu`,
+   then further extracts SM90 launcher files (`fpA_intB_gemm_launcher_*.generated.cu`) into a
+   separate output variable.
+
+5. Create `onnxruntime_providers_cuda_llm` OBJECT library:
+   - `CUDA_ARCHITECTURES` = entries from `CMAKE_CUDA_ARCHITECTURES` where arch >= 75
+   - `--threads ${onnxruntime_NVCC_THREADS}` (user can now safely set to 2-4)
+   - Contains all llm/ .cu files EXCEPT SM90 TMA (already extracted) and fpA_intB SM90 launchers
+
+6. **Merge fpA_intB SM90 launchers** (`fpA_intB_gemm_launcher_1.generated.cu`,
+   `fpA_intB_gemm_launcher_2.generated.cu`) into existing
+   `onnxruntime_providers_cuda_sm90_tma` OBJECT library — both need
+   `CUDA_ARCHITECTURES "90a-real"` and `COMPILE_HOPPER_TMA_GEMMS`.
+
+### Phase 3: nvcc_threads Configuration
+
+7. Define `onnxruntime_FLASH_NVCC_THREADS` (default `"1"`). Flash attention target uses this.
+   Main target and LLM target use existing `onnxruntime_NVCC_THREADS` (can be raised to 2-4
+   since flash attention is isolated).
+
+### Phase 4: Mirror in Plugin Build
+
+8. Identical pattern in `onnxruntime_providers_cuda_plugin.cmake`:
+   - `onnxruntime_providers_cuda_plugin_flash_attention` (SM80+, threads from `onnxruntime_FLASH_NVCC_THREADS`)
+   - `onnxruntime_providers_cuda_plugin_llm` (SM75+)
+   - fpA_intB SM90 launchers merged into `onnxruntime_providers_cuda_plugin_sm90_tma`
+
+### Phase 5: Build Script for Testing
+
+9. Create `.env/cuda_build_time_test.sh` (based on `.env/cuda13_all.sh`):
+   - `CMAKE_CUDA_ARCHITECTURES="75-real;80-real;86-real;89-real;90-real;100-real;120-real;120-virtual"`
+   - `onnxruntime_NVCC_THREADS=4`
+   - `onnxruntime_FLASH_NVCC_THREADS=1`
+   - Build with timing, report total duration
+
+## Relevant Files
+
+- `cmake/onnxruntime_cuda_source_filters.cmake` — new macros
+- `cmake/onnxruntime_providers_cuda.cmake` — create flash_attention and llm OBJECT libraries
+- `cmake/onnxruntime_providers_cuda_plugin.cmake` — mirror for plugin build
+- `.env/cuda_build_time_test.sh` — new build script for benchmarking
+
+## Verification
+
+1. Check `build.ninja`: flash_attention files have only SM80+ `--generate-code`; llm files have SM75+
+2. Build with `onnxruntime_NVCC_THREADS=4`, `onnxruntime_FLASH_NVCC_THREADS=1` and confirm the build does not run out of memory (OOM)
+3. Compare total build time before/after using multi-arch build script
+4. Run: `./onnxruntime_test_all --gtest_filter=*FlashAttention*:*MoE*:*FpAIntB*`
+5. Run: `python test_gqa.py`, `python test_moe_cuda.py`
+6. Confirm there are no link errors in either the in-tree or the plugin build
+
+## Decisions
+
+- **LLM = SM75+** (not SM80+) — preserves backward compatibility for `fpA_intB_gemv/gemm`
+- **Flash attention = SM80+** — all kernel files are `_sm80` suffixed with arch guards
+- **fpA_intB SM90 launchers merged into SM90 TMA lib** — both need "90a-real" + `COMPILE_HOPPER_TMA_GEMMS`
+- **`onnxruntime_FLASH_NVCC_THREADS`** = new option (default 1); `onnxruntime_NVCC_THREADS` remains the setting for the main and LLM targets
+- **`onnxruntime_QUICK_BUILD`** filtering applies before OBJECT library creation (no behavior change)
+- SM90/SM120 TMA MoE extraction unchanged (existing mechanism)
+
+## Reference
+
+- TRT-LLM pattern: `~/tensorrt-llm/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt`
+  — uses `set_cuda_architectures()` per sub-library with separate OBJECT targets
+- Flash Attention official: `~/flash-attention/setup.py`
+  — defaults to `NVCC_THREADS=4`, architectures `80;90;100;110;120`, ~5GB per nvcc thread
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
@@ -209,31 +209,13 @@ def number_of_nvcc_threads(args):
     if args.nvcc_threads >= 0:
         return args.nvcc_threads
 
-    nvcc_threads = 1
-    try:
-        import psutil  # noqa: PLC0415
+    return 4
 
-        available_memory = psutil.virtual_memory().available
-        if isinstance(available_memory, int) and available_memory > 0:
-            if available_memory >= 64 * 1024 * 1024 * 1024:
-                # When available memory is large enough, chance of OOM is small.
-                nvcc_threads = min(4, int(available_memory / (8 * 4 * 1024 * 1024 * 1024)))
-            else:
-                # NVCC need a lot of memory to compile 48 flash attention cu files.
-                # Here we select number of threads to ensure each thread has enough memory (>= 4 GB).
-                memory_per_thread = 4 * 1024 * 1024 * 1024
-                fmha_cu_files = 48
-                fmha_parallel_jobs = min(fmha_cu_files, number_of_parallel_jobs(args))
-                nvcc_threads = max(1, int(available_memory / (memory_per_thread * fmha_parallel_jobs)))
-                print(
-                    f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}"
-                )
-    except ImportError:
-        print(
-            "Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. Use nvcc_threads=1"
-        )
+def number_of_flash_nvcc_threads(args):
+    if args.flash_nvcc_threads >= 0:
+        return args.flash_nvcc_threads
 
-    return nvcc_threads
+    return number_of_nvcc_threads(args)
 
 
 # See https://learn.microsoft.com/en-us/vcpkg/commands/install
@@ -724,6 +706,10 @@ def generate_build_tree(
     if args.use_cuda:
         nvcc_threads = number_of_nvcc_threads(args)
         cmake_args.append("-Donnxruntime_NVCC_THREADS=" + str(nvcc_threads))
+
+        flash_nvcc_threads = number_of_flash_nvcc_threads(args)
+        cmake_args.append("-Donnxruntime_FLASH_NVCC_THREADS=" + str(flash_nvcc_threads))
+
         cmake_args.append(f"-DCMAKE_CUDA_COMPILER={cuda_home}/bin/nvcc")
         add_default_definition(cmake_extra_defines, "onnxruntime_USE_CUDA", "ON")
         if args.cuda_version:
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
@@ -647,9 +647,16 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     cuda_group.add_argument(
         "--nvcc_threads",
         nargs="?",
-        default=-1,  # -1 signifies auto-detect based on jobs/memory
+        default=4,
         type=int,
-        help="Max NVCC threads per parallel job (-1=auto).",
+        help="Max NVCC threads per parallel job (default is 4).",
+    )
+    cuda_group.add_argument(
+        "--flash_nvcc_threads",
+        nargs="?",
+        default=-1,
+        type=int,
+        help="Max NVCC threads per parallel job for flash attention (default is same value of --nvcc_threads).",
     )
     # CUDA-specific profiling
     cuda_group.add_argument(
diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml
@@ -97,7 +97,7 @@ extends:
           msbuildPlatform: x64
           packageName: x64-cuda
           CudaVersion: ${{ parameters.CudaVersion }}
-          buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 1 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}"
+          buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 4 --flash_nvcc_threads 2 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}"
           runTests: false
           buildJava: false
           java_artifact_id: onnxruntime_gpu
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
@@ -73,9 +73,9 @@ stages:
     packageName: x64-cuda
     CudaVersion: ${{ parameters.CudaVersion }}
     ${{ if ne(parameters.win_cudnn_home, '') }}:
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
     ${{ else }}:
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
     runTests: ${{ parameters.RunOnnxRuntimeTests }}
     buildJava: ${{ parameters.buildJava }}
     java_artifact_id: onnxruntime_gpu
@@ -96,9 +96,9 @@ stages:
     CudaVersion: ${{ parameters.CudaVersion }}
     packageName: x64-tensorrt
     ${{ if ne(parameters.win_cudnn_home, '') }}:
-      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
+      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
     ${{ else }}:
-      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
+      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
     runTests: ${{ parameters.RunOnnxRuntimeTests }}
     buildJava: ${{ parameters.buildJava }}
     java_artifact_id: onnxruntime_gpu
diff --git a/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/plugin-win-cuda-stage.yml
@@ -127,7 +127,7 @@ stages:
               --skip_submodule_sync
               --cmake_generator "$(VSGenerator)"
               --parallel
-              --nvcc_threads 1
+              --nvcc_threads 4 --flash_nvcc_threads 2
               --use_vcpkg
               --use_vcpkg_ms_internal_asset_cache
               --use_binskim_compliant_compile_flags
@@ -154,7 +154,7 @@ stages:
               --skip_submodule_sync
               --cmake_generator "$(VSGenerator)"
               --parallel
-              --nvcc_threads 1
+              --nvcc_threads 4 --flash_nvcc_threads 2
               --use_vcpkg
               --use_vcpkg_ms_internal_asset_cache
               --use_binskim_compliant_compile_flags
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -122,7 +122,7 @@ stages:
             --enable_pybind
             --enable_onnx_tests
             --parallel
-            --nvcc_threads 1
+            --nvcc_threads 4 --flash_nvcc_threads 2
             --use_vcpkg
             --use_vcpkg_ms_internal_asset_cache
             --use_binskim_compliant_compile_flags
diff --git a/tools/ci_build/github/linux/build_cuda_plugin_package.sh b/tools/ci_build/github/linux/build_cuda_plugin_package.sh
diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh

Original file line number	Diff line number	Diff line change
`@@ -115,7 +115,7 @@ jobs:`
`115`	`115`	`exit $lastExitCode`
`116`	`116`	`}`
`117`	`117`	`# Execute the build process`
`118`		- python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
	`118`	+ python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
`119`	`119`	`if ($lastExitCode -ne 0) {`
`120`	`120`	`exit $lastExitCode`
`121`	`121`	`}`
`@@ -235,7 +235,7 @@ jobs:`
`235`	`235`	`exit $lastExitCode`
`236`	`236`	`}`
`237`	`237`
`238`		- python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
	`238`	+ python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
`239`	`239`	`if ($lastExitCode -ne 0) {`
`240`	`240`	`exit $lastExitCode`
`241`	`241`	`}`
Original file line number	Diff line number	Diff line change
`@@ -121,7 +121,7 @@ jobs:`
`121`	`121`	`exit $lastExitCode`
`122`	`122`	`}`
`123`	`123`	`# Execute the build process`
`124`		- python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
	`124`	+ python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
`125`	`125`	`if ($lastExitCode -ne 0) {`
`126`	`126`	`exit $lastExitCode`
`127`	`127`	`}`
`@@ -247,7 +247,7 @@ jobs:`
`247`	`247`	`exit $lastExitCode`
`248`	`248`	`}`
`249`	`249`
`250`		- python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
	`250`	+ python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
`251`	`251`	`if ($lastExitCode -ne 0) {`
`252`	`252`	`exit $lastExitCode`
`253`	`253`	`}`