Skip to content

Commit 9091c9d

Browse files
committed
use --nvcc_threads 4 --flash_nvcc_threads 2 in pipelines
1 parent 19fb133 commit 9091c9d

14 files changed

Lines changed: 142 additions & 42 deletions

.github/workflows/linux_cuda_ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
3030
docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
3131
docker_image_repo: onnxruntimecuda12manylinuxbuild
32-
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
32+
extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
3333
python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
3434
run_tests: false # <<< Do not run tests in this job
3535
upload_build_output: true # <<< Upload the build/Release directory

.github/workflows/linux_cuda_plugin_ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
--use_binskim_compliant_compile_flags
3333
--build_wheel
3434
--parallel
35-
--nvcc_threads 1
35+
--nvcc_threads 4 --flash_nvcc_threads 2
3636
--cuda_version=12.8
3737
--cuda_home=/usr/local/cuda-12.8
3838
--cudnn_home=/usr/local/cuda-12.8

.github/workflows/windows_cuda.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ jobs:
115115
exit $lastExitCode
116116
}
117117
# Execute the build process
118-
python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
118+
python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
119119
if ($lastExitCode -ne 0) {
120120
exit $lastExitCode
121121
}
@@ -235,7 +235,7 @@ jobs:
235235
exit $lastExitCode
236236
}
237237
238-
python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
238+
python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
239239
if ($lastExitCode -ne 0) {
240240
exit $lastExitCode
241241
}

.github/workflows/windows_cuda_plugin.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ jobs:
7373
--build_dir build `
7474
--skip_submodule_sync `
7575
--parallel `
76-
--nvcc_threads 1 `
76+
--nvcc_threads 4 --flash_nvcc_threads 2 `
7777
--use_binskim_compliant_compile_flags `
7878
--cmake_generator "Visual Studio 17 2022" `
7979
--build_shared_lib `

.github/workflows/windows_tensorrt.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ jobs:
121121
exit $lastExitCode
122122
}
123123
# Execute the build process
124-
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
124+
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
125125
if ($lastExitCode -ne 0) {
126126
exit $lastExitCode
127127
}
@@ -247,7 +247,7 @@ jobs:
247247
exit $lastExitCode
248248
}
249249
250-
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
250+
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 4 --flash_nvcc_threads 2 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
251251
if ($lastExitCode -ne 0) {
252252
exit $lastExitCode
253253
}
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Plan: Speed Up CUDA Build via Per-Library Architecture Splitting and nvcc_threads Optimization
2+
3+
## TL;DR
4+
5+
Split the monolithic CUDA provider compilation into architecture-specific OBJECT libraries for
6+
flash_attention (SM80+, `--threads 1`) and llm/ (SM75+), allowing the main target and llm to use
7+
higher `nvcc_threads` for faster parallel compilation. Merge fpA_intB SM90 launchers into the
8+
existing SM90 TMA OBJECT library.
9+
10+
## Architecture Requirements (Verified)
11+
12+
| Directory | Min SM | Notes |
13+
|-----------|--------|-------|
14+
| `bert/flash_attention/` (48 .cu) | **SM80** | `__CUDA_ARCH__ >= 800` in kernel_traits.h |
15+
| `llm/fpA_intB_gemv/` (11 .cu) | **SM75** | `ORT_ENFORCE(arch >= 75)` |
16+
| `llm/fpA_intB_gemm/` (8 base .cu) | **SM75** | SM75+ base support |
17+
| `llm/fpA_intB_gemm/launchers/` (2 .cu) | **SM90** | `#ifndef EXCLUDE_SM_90` |
18+
| `llm/moe_gemm/` (14 root .cu) | **SM75** | CUTLASS stages=2 fallback for SM75 |
19+
| `llm/moe_gemm/launchers/fused_moe_sm80` (2 .cu) | **SM80** | `#ifndef EXCLUDE_SM_80` (has an arch guard in code, so it is safe to compile at SM75+) |
20+
| `llm/moe_gemm/launchers/` SM90 TMA (324 .cu) | SM90 | **Already extracted** |
21+
| `llm/moe_gemm/launchers/` SM120 TMA (11 .cu) | SM120 | **Already extracted** |
22+
| `llm/kernels/` (1 .cu) | SM50 | BF16 guarded by `__CUDA_ARCH__ >= 800` |
23+
24+
## Steps
25+
26+
### Phase 1: Flash Attention OBJECT Library
27+
28+
1. Add macro `onnxruntime_extract_flash_attention_sources()` in
29+
`cmake/onnxruntime_cuda_source_filters.cmake` — extracts `*/bert/flash_attention/*.cu`
30+
from the main CU source list.
31+
32+
2. In both provider cmake files, call this macro after the existing filtering. Create an OBJECT library:
33+
- `CUDA_ARCHITECTURES` = entries from `CMAKE_CUDA_ARCHITECTURES` where arch >= 80
34+
- `--threads ${onnxruntime_FLASH_NVCC_THREADS}`
35+
- Same includes/compile defs as parent (`config_cuda_provider_shared_module()`)
36+
- Link into parent
37+
38+
3. Add CMake cache option: `onnxruntime_FLASH_NVCC_THREADS` (default `"1"`, type STRING)
39+
40+
### Phase 2: LLM OBJECT Library (SM75+ — Backward Compatible)
41+
42+
4. Add macro `onnxruntime_extract_llm_sources()` — extracts `*/contrib_ops/cuda/llm/*.cu`,
43+
then further extracts SM90 launcher files (`fpA_intB_gemm_launcher_*.generated.cu`) into a
44+
separate output variable.
45+
46+
5. Create `onnxruntime_providers_cuda_llm` OBJECT library:
47+
- `CUDA_ARCHITECTURES` = entries from `CMAKE_CUDA_ARCHITECTURES` where arch >= 75
48+
- `--threads ${onnxruntime_NVCC_THREADS}` (user can now safely set to 2-4)
49+
- Contains all llm/ .cu files EXCEPT SM90 TMA (already extracted) and fpA_intB SM90 launchers
50+
51+
6. **Merge fpA_intB SM90 launchers** (`fpA_intB_gemm_launcher_1.generated.cu`,
52+
`fpA_intB_gemm_launcher_2.generated.cu`) into existing
53+
`onnxruntime_providers_cuda_sm90_tma` OBJECT library — both need
54+
`CUDA_ARCHITECTURES "90a-real"` and `COMPILE_HOPPER_TMA_GEMMS`.
55+
56+
### Phase 3: nvcc_threads Configuration
57+
58+
7. Define `onnxruntime_FLASH_NVCC_THREADS` (default `"1"`). Flash attention target uses this.
59+
Main target and LLM target use existing `onnxruntime_NVCC_THREADS` (can be raised to 2-4
60+
since flash attention is isolated).
61+
62+
### Phase 4: Mirror in Plugin Build
63+
64+
8. Identical pattern in `onnxruntime_providers_cuda_plugin.cmake`:
65+
- `onnxruntime_providers_cuda_plugin_flash_attention` (SM80+, threads from `onnxruntime_FLASH_NVCC_THREADS`)
66+
- `onnxruntime_providers_cuda_plugin_llm` (SM75+)
67+
- fpA_intB SM90 launchers merged into `onnxruntime_providers_cuda_plugin_sm90_tma`
68+
69+
### Phase 5: Build Script for Testing
70+
71+
9. Create `.env/cuda_build_time_test.sh` (based on `.env/cuda13_all.sh`):
72+
- `CMAKE_CUDA_ARCHITECTURES="75-real;80-real;86-real;89-real;90-real;100-real;120-real;120-virtual"`
73+
- `onnxruntime_NVCC_THREADS=4`
74+
- `onnxruntime_FLASH_NVCC_THREADS=1`
75+
- Build with timing, report total duration
76+
77+
## Relevant Files
78+
79+
- `cmake/onnxruntime_cuda_source_filters.cmake` — new macros
80+
- `cmake/onnxruntime_providers_cuda.cmake` — create flash_attention and llm OBJECT libraries
81+
- `cmake/onnxruntime_providers_cuda_plugin.cmake` — mirror for plugin build
82+
- `.env/cuda_build_time_test.sh` — new build script for benchmarking
83+
84+
## Verification
85+
86+
1. Check `build.ninja`: flash_attention files have only SM80+ `--generate-code`; llm files have SM75+
87+
2. Build with `onnxruntime_NVCC_THREADS=4`, `onnxruntime_FLASH_NVCC_THREADS=1` — no OOM
88+
3. Compare total build time before/after using multi-arch build script
89+
4. Run: `./onnxruntime_test_all --gtest_filter=*FlashAttention*:*MoE*:*FpAIntB*`
90+
5. Run: `python test_gqa.py`, `python test_moe_cuda.py`
91+
6. No link errors in either the in-tree or the plugin build
92+
93+
## Decisions
94+
95+
- **LLM = SM75+** (not SM80+) — preserves backward compatibility for `fpA_intB_gemv/gemm`
96+
- **Flash attention = SM80+** — all kernel files carry the `_sm80` suffix and have arch guards
97+
- **fpA_intB SM90 launchers merged into SM90 TMA lib** — both need "90a-real" + `COMPILE_HOPPER_TMA_GEMMS`
98+
- **`onnxruntime_FLASH_NVCC_THREADS`** = new option (default 1); `onnxruntime_NVCC_THREADS` remains
99+
- **`onnxruntime_QUICK_BUILD`** filtering applies before OBJECT library creation (no behavior change)
100+
- SM90/SM120 TMA MoE extraction unchanged (existing mechanism)
101+
102+
## Reference
103+
104+
- TRT-LLM pattern: `~/tensorrt-llm/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt`
105+
— uses `set_cuda_architectures()` per sub-library with separate OBJECT targets
106+
- Flash Attention official: `~/flash-attention/setup.py`
107+
— defaults to `NVCC_THREADS=4`, architectures `80;90;100;110;120`, ~5GB per nvcc thread

tools/ci_build/build.py

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -209,31 +209,13 @@ def number_of_nvcc_threads(args):
209209
if args.nvcc_threads >= 0:
210210
return args.nvcc_threads
211211

212-
nvcc_threads = 1
213-
try:
214-
import psutil # noqa: PLC0415
212+
return 4
215213

216-
available_memory = psutil.virtual_memory().available
217-
if isinstance(available_memory, int) and available_memory > 0:
218-
if available_memory >= 64 * 1024 * 1024 * 1024:
219-
# When available memory is large enough, chance of OOM is small.
220-
nvcc_threads = min(4, int(available_memory / (8 * 4 * 1024 * 1024 * 1024)))
221-
else:
222-
# NVCC need a lot of memory to compile 48 flash attention cu files.
223-
# Here we select number of threads to ensure each thread has enough memory (>= 4 GB).
224-
memory_per_thread = 4 * 1024 * 1024 * 1024
225-
fmha_cu_files = 48
226-
fmha_parallel_jobs = min(fmha_cu_files, number_of_parallel_jobs(args))
227-
nvcc_threads = max(1, int(available_memory / (memory_per_thread * fmha_parallel_jobs)))
228-
print(
229-
f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}"
230-
)
231-
except ImportError:
232-
print(
233-
"Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. Use nvcc_threads=1"
234-
)
214+
def number_of_flash_nvcc_threads(args):
215+
if args.flash_nvcc_threads >= 0:
216+
return args.flash_nvcc_threads
235217

236-
return nvcc_threads
218+
return number_of_nvcc_threads(args)
237219

238220

239221
# See https://learn.microsoft.com/en-us/vcpkg/commands/install
@@ -724,6 +706,10 @@ def generate_build_tree(
724706
if args.use_cuda:
725707
nvcc_threads = number_of_nvcc_threads(args)
726708
cmake_args.append("-Donnxruntime_NVCC_THREADS=" + str(nvcc_threads))
709+
710+
flash_nvcc_threads = number_of_flash_nvcc_threads(args)
711+
cmake_args.append("-Donnxruntime_FLASH_NVCC_THREADS=" + str(flash_nvcc_threads))
712+
727713
cmake_args.append(f"-DCMAKE_CUDA_COMPILER={cuda_home}/bin/nvcc")
728714
add_default_definition(cmake_extra_defines, "onnxruntime_USE_CUDA", "ON")
729715
if args.cuda_version:

tools/ci_build/build_args.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -647,9 +647,16 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
647647
cuda_group.add_argument(
648648
"--nvcc_threads",
649649
nargs="?",
650-
default=-1, # -1 signifies auto-detect based on jobs/memory
650+
default=4,
651651
type=int,
652-
help="Max NVCC threads per parallel job (-1=auto).",
652+
help="Max NVCC threads per parallel job (default is 4).",
653+
)
654+
cuda_group.add_argument(
655+
"--flash_nvcc_threads",
656+
nargs="?",
657+
default=-1,
658+
type=int,
659+
help="Max NVCC threads per parallel job for flash attention (default is same value of --nvcc_threads).",
653660
)
654661
# CUDA-specific profiling
655662
cuda_group.add_argument(

tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ extends:
9797
msbuildPlatform: x64
9898
packageName: x64-cuda
9999
CudaVersion: ${{ parameters.CudaVersion }}
100-
buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 1 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}"
100+
buildparameter: --use_cuda --cuda_home=${{ variables.win_cuda_home }} --enable_onnx_tests --nvcc_threads 4 --flash_nvcc_threads 2 --caller_framework WinAI --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ variables.CmakeCudaArchitectures }}"
101101
runTests: false
102102
buildJava: false
103103
java_artifact_id: onnxruntime_gpu

tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,9 +73,9 @@ stages:
7373
packageName: x64-cuda
7474
CudaVersion: ${{ parameters.CudaVersion }}
7575
${{ if ne(parameters.win_cudnn_home, '') }}:
76-
buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
76+
buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
7777
${{ else }}:
78-
buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
78+
buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
7979
runTests: ${{ parameters.RunOnnxRuntimeTests }}
8080
buildJava: ${{ parameters.buildJava }}
8181
java_artifact_id: onnxruntime_gpu
@@ -96,9 +96,9 @@ stages:
9696
CudaVersion: ${{ parameters.CudaVersion }}
9797
packageName: x64-tensorrt
9898
${{ if ne(parameters.win_cudnn_home, '') }}:
99-
buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
99+
buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}" --cudnn_home=${{ parameters.win_cudnn_home }}
100100
${{ else }}:
101-
buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 1 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
101+
buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --nvcc_threads 4 --flash_nvcc_threads 2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=${{ parameters.CudaArchs }}"
102102
runTests: ${{ parameters.RunOnnxRuntimeTests }}
103103
buildJava: ${{ parameters.buildJava }}
104104
java_artifact_id: onnxruntime_gpu

0 commit comments

Comments
 (0)