From d09c3385bcee044434796eb724e99a872b5cce3b Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Fri, 5 Jun 2026 06:16:34 +0000 Subject: [PATCH 1/8] [https://nvbugs/6224637][test] unwaive associated tests Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 68102bdf0380..2429761bd850 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -15,25 +15,22 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewi accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6224637) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162120) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/6162122) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dflash SKIP (https://nvbugs/6156233) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6341371) From 302e3db9bf042a958f0a8e6934f94e4c0c6a6d1b Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Mon, 15 Jun 2026 14:23:46 +0000 Subject: [PATCH 2/8] Handle NCCL NVLS init hangs in unwaived tests Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- cpp/tensorrt_llm/runtime/ncclCommunicator.cpp | 234 ++++++++++++++++-- .../defs/accuracy/test_llm_api_pytorch.py | 21 +- 2 files changed, 235 insertions(+), 20 deletions(-) diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp index dc79fd5f48e2..3e72fa715b4a 100644 --- a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp +++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,11 +23,188 @@ #include #endif // ENABLE_MULTI_DEVICE +#include +#include +#include +#include +#include + using namespace tensorrt_llm::runtime; namespace { #if ENABLE_MULTI_DEVICE +constexpr int kDefaultNcclCommInitTimeoutMs = 60'000; +constexpr int kNcclCommInitPollIntervalMs = 20; +constexpr char const* kNcclCommInitTimeoutEnv = "TLLM_NCCL_COMM_INIT_TIMEOUT_MS"; +constexpr char const* kNcclNvlsEnableEnv = "NCCL_NVLS_ENABLE"; + +struct NcclInitResult +{ + ncclComm_t comm{nullptr}; + ncclResult_t result{ncclSuccess}; + bool timedOut{false}; + + [[nodiscard]] bool isSuccess() const + { + return result == ncclSuccess; + } +}; + +struct NcclInitStatus +{ + bool failed{false}; + bool timedOut{false}; +}; + +int getNcclCommInitTimeoutMs() +{ + auto const* env = std::getenv(kNcclCommInitTimeoutEnv); + int const timeoutMs = env == nullptr ? 0 : std::atoi(env); + return timeoutMs > 0 ? timeoutMs : kDefaultNcclCommInitTimeoutMs; +} + +bool canSuggestNvlsDisable() +{ + auto const* nvlsEnable = std::getenv(kNcclNvlsEnableEnv); + return nvlsEnable == nullptr || std::string{nvlsEnable} == "2"; +} + +void setRuntimeConnectIfUnset() +{ + // Need static connection initialization for accurate KV cache size estimation. +#if defined(_WIN32) + if (getenv("NCCL_RUNTIME_CONNECT") == nullptr) + { + _putenv_s("NCCL_RUNTIME_CONNECT", "0"); + } +#else + setenv("NCCL_RUNTIME_CONNECT", "0", 0); +#endif // _WIN32 +} + +void abortNcclComm(ncclComm_t comm) +{ + if (comm == nullptr) + { + return; + } + + auto const result = ncclCommAbort(comm); + if (result != ncclSuccess) + { + TLLM_LOG_WARNING("Failed to abort NCCL communicator: %s.", ncclGetErrorString(result)); + } +} + +NcclInitResult initNcclCommWithTimeout(ncclUniqueId const& id, int worldSize, int rank, int timeoutMs) +{ + NcclInitResult initResult; + ncclConfig_t config = NCCL_CONFIG_INITIALIZER; + config.blocking = 0; + + auto result = ncclCommInitRankConfig(&initResult.comm, worldSize, id, rank, &config); + if (result != ncclSuccess && result != ncclInProgress) + { + initResult.result = result; + return initResult; + } + if (result == ncclSuccess) + { + initResult.result = ncclSuccess; + return initResult; + } + if (initResult.comm == nullptr) + { + initResult.result = result; + return initResult; + } + + auto const deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds{timeoutMs}; + while (true) + { + ncclResult_t asyncResult = ncclSuccess; + result = ncclCommGetAsyncError(initResult.comm, &asyncResult); + if (result != ncclSuccess) + { + initResult.result = result; + return initResult; + } + if (asyncResult != ncclInProgress) + { + initResult.result = asyncResult; + return initResult; + } + if (std::chrono::steady_clock::now() >= deadline) + { + initResult.result = ncclInProgress; + initResult.timedOut = true; + return initResult; + } + std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs}); + } +} + +NcclInitStatus getNcclInitStatus(NcclInitResult const& result, tensorrt_llm::mpi::MpiComm const& mpiComm) +{ + std::array localStatus{result.isSuccess() ? 0 : 1, result.timedOut ? 1 : 0}; + std::array globalStatus{}; + mpiComm.allreduce( + localStatus.data(), globalStatus.data(), 2, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MAX); + return {globalStatus[0] != 0, globalStatus[1] != 0}; +} + +bool allRanksCanUseNvlsDisableWorkaround(tensorrt_llm::mpi::MpiComm const& mpiComm) +{ + int const localCanDisable = canSuggestNvlsDisable() ? 1 : 0; + int globalCanDisable = 0; + mpiComm.allreduce( + &localCanDisable, &globalCanDisable, 1, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MIN); + + return globalCanDisable != 0; +} + +void checkNcclResult(ncclComm_t comm, ncclResult_t result, char const* operation) +{ + if (result == ncclSuccess) + { + return; + } + if (result != ncclInProgress) + { + TLLM_NCCL_CHECK(result); + } + + while (true) + { + ncclResult_t asyncResult = ncclSuccess; + result = ncclCommGetAsyncError(comm, &asyncResult); + if (result != ncclSuccess) + { + TLLM_THROW("NCCL %s failed while polling communicator status: %s.", operation, ncclGetErrorString(result)); + } + if (asyncResult == ncclSuccess) + { + return; + } + if (asyncResult != ncclInProgress) + { + TLLM_THROW("NCCL %s failed asynchronously: %s.", operation, ncclGetErrorString(asyncResult)); + } + std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs}); + } +} + +ncclUniqueId createAndBroadcastNcclId(int rank, tensorrt_llm::mpi::MpiComm const& mpiComm) +{ + ncclUniqueId id; + if (rank == 0) + { + TLLM_NCCL_CHECK(ncclGetUniqueId(&id)); + } + mpiComm.bcastValue(id, 0); + return id; +} ncclDataType_t toNcclType(nvinfer1::DataType dataType) { @@ -53,7 +230,7 @@ void NcclCommunicator::send( void const* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const { #if ENABLE_MULTI_DEVICE - TLLM_NCCL_CHECK(ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get())); + checkNcclResult(mComm, ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "send"); #else TLLM_THROW("Multi device support is disabled."); #endif // ENABLE_MULTI_DEVICE @@ -63,7 +240,7 @@ void NcclCommunicator::receive( void* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const { #if ENABLE_MULTI_DEVICE - TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get())); + checkNcclResult(mComm, ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "receive"); #else TLLM_THROW("Multi device support is disabled."); #endif // ENABLE_MULTI_DEVICE @@ -73,22 +250,45 @@ ncclComm_t NcclCommunicator::createComm(int worldSize, int rank, mpi::MpiComm co { #if ENABLE_MULTI_DEVICE - ncclUniqueId id; - if (rank == 0) + setRuntimeConnectIfUnset(); + auto const timeoutMs = getNcclCommInitTimeoutMs(); + + auto id = createAndBroadcastNcclId(rank, mpiComm); + auto initResult = initNcclCommWithTimeout(id, worldSize, rank, timeoutMs); + auto const initStatus = getNcclInitStatus(initResult, mpiComm); + + if (initStatus.failed) { - ncclGetUniqueId(&id); + if (initStatus.timedOut) + { + if (allRanksCanUseNvlsDisableWorkaround(mpiComm)) + { + TLLM_THROW( + "NCCL communicator initialization timed out after %d ms on at least one rank. This may indicate " + "an NVLS multicast resource setup failure in Fabric Manager. Set NCCL_NVLS_ENABLE=0 before " + "process startup and retry. TensorRT-LLM does not retry in-process because NCCL may not recover " + "after a timed-out NVLS initialization.", + timeoutMs); + } + TLLM_THROW( + "NCCL communicator initialization timed out after %d ms on at least one rank. NCCL_NVLS_ENABLE is " + "explicitly set, so TensorRT-LLM will not override it.", + timeoutMs); + } + if (!initResult.isSuccess()) + { + abortNcclComm(initResult.comm); + } + mpiComm.barrier(); + if (!initResult.isSuccess()) + { + TLLM_THROW( + "NCCL communicator initialization failed on rank %d: %s.", rank, ncclGetErrorString(initResult.result)); + } + TLLM_THROW("NCCL communicator initialization failed on at least one peer rank."); } - mpiComm.bcastValue(id, 0); - ncclComm_t comm; -// Need static connection initialization for accurate KV cache size estimation -#if defined(_WIN32) - if (getenv("NCCL_RUNTIME_CONNECT") == nullptr) - _putenv_s("NCCL_RUNTIME_CONNECT", "0"); -#else - setenv("NCCL_RUNTIME_CONNECT", "0", 0); -#endif // _WIN32 - TLLM_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, id, rank)); - return comm; + + return initResult.comm; #else // Python runtime requires instantiation of a communicator even though it may never be used to enable // pipeline parallel code-path. To enable this, have an empty communicator with uninitialized state. diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ff21b81babe7..2a3ffa39e5e9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -48,6 +48,8 @@ JsonModeEval, LlmapiAccuracyTestHarness, LongBenchV1, LongBenchV2) +_NCCL_NVLS_DISABLED_ENV = {"NCCL_NVLS_ENABLE": "0"} + # Keep helper definitions below imports so new imports do not need E402 # suppressions in this legacy test file. @@ -75,6 +77,11 @@ def patched_start_mpi_pool(self): patched_start_mpi_pool) +def disable_nccl_nvls_for_test(mocker): + mocker.patch.dict(os.environ, _NCCL_NVLS_DISABLED_ENV) + patch_mpi_pool_session_for_env(mocker, _NCCL_NVLS_DISABLED_ENV) + + def _get_default_torch_compile_config(torch_compile): return TorchCompileConfig(enable_fullgraph=True, enable_piecewise_cuda_graph=True, @@ -1698,7 +1705,10 @@ def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn): ids=["tp4", "ep4", "tp2pp2", "pp4"]) def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, attention_dp, cuda_graph, overlap_scheduler, - torch_compile): + torch_compile, mocker): + if pp_size > 1: + disable_nccl_nvls_for_test(mocker) + if pp_size > 1 and mtp_nextn > 0: num_hidden_layers = 30 pp_partition = [num_hidden_layers // pp_size + 1] * pp_size @@ -1954,7 +1964,10 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn, def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, torch_compile, - sampler_async_worker): + sampler_async_worker, mocker): + if pp_size > 1: + disable_nccl_nvls_for_test(mocker) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = _get_default_torch_compile_config(torch_compile) pytorch_config = dict( @@ -2390,12 +2403,14 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph, def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, low_precision_combine, tp_size, pp_size, ep_size, torch_compile, mtp_nextn, - moe_backend): + moe_backend, mocker): sm_version = get_sm_version() if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") if moe_backend == "CUTEDSL" and sm_version not in (100, 103): pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only") + if pp_size > 1: + disable_nccl_nvls_for_test(mocker) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp. From e46ca0706eb7429c7c0a20acc954224b2363be0d Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Wed, 17 Jun 2026 03:10:19 +0000 Subject: [PATCH 3/8] [https://nvbugs/6224637][test] restore failing GB200 waive Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2429761bd850..ff02f37bd39a 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -29,6 +29,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394) From 0a24f02a3eb8b83b35f8f344db5e2eb30a2f96b1 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Wed, 17 Jun 2026 04:25:20 +0000 Subject: [PATCH 4/8] [https://nvbugs/6224637][test] keep waive update deletion-only Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index ff02f37bd39a..ce70f437ee04 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -21,15 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mt accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162120) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162122) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] SKIP (https://nvbugs/6162122) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6162122) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/6162122) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394) From c9ec5ec31fd39e574d377a8d287b3fec745c6c1a Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Sat, 20 Jun 2026 08:04:59 +0000 Subject: [PATCH 5/8] [https://nvbugs/6224637][test] remove GB200 hang workarounds Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- cpp/tensorrt_llm/runtime/ncclCommunicator.cpp | 234 ++---------------- .../defs/accuracy/test_llm_api_pytorch.py | 21 +- 2 files changed, 20 insertions(+), 235 deletions(-) diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp index 3e72fa715b4a..dc79fd5f48e2 100644 --- a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp +++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,188 +23,11 @@ #include #endif // ENABLE_MULTI_DEVICE -#include -#include -#include -#include -#include - using namespace tensorrt_llm::runtime; namespace { #if ENABLE_MULTI_DEVICE -constexpr int kDefaultNcclCommInitTimeoutMs = 60'000; -constexpr int kNcclCommInitPollIntervalMs = 20; -constexpr char const* kNcclCommInitTimeoutEnv = "TLLM_NCCL_COMM_INIT_TIMEOUT_MS"; -constexpr char const* kNcclNvlsEnableEnv = "NCCL_NVLS_ENABLE"; - -struct NcclInitResult -{ - ncclComm_t comm{nullptr}; - ncclResult_t result{ncclSuccess}; - bool timedOut{false}; - - [[nodiscard]] bool isSuccess() const - { - return result == ncclSuccess; - } -}; - -struct NcclInitStatus -{ - bool failed{false}; - bool timedOut{false}; -}; - -int getNcclCommInitTimeoutMs() -{ - auto const* env = std::getenv(kNcclCommInitTimeoutEnv); - int const timeoutMs = env == nullptr ? 0 : std::atoi(env); - return timeoutMs > 0 ? timeoutMs : kDefaultNcclCommInitTimeoutMs; -} - -bool canSuggestNvlsDisable() -{ - auto const* nvlsEnable = std::getenv(kNcclNvlsEnableEnv); - return nvlsEnable == nullptr || std::string{nvlsEnable} == "2"; -} - -void setRuntimeConnectIfUnset() -{ - // Need static connection initialization for accurate KV cache size estimation. -#if defined(_WIN32) - if (getenv("NCCL_RUNTIME_CONNECT") == nullptr) - { - _putenv_s("NCCL_RUNTIME_CONNECT", "0"); - } -#else - setenv("NCCL_RUNTIME_CONNECT", "0", 0); -#endif // _WIN32 -} - -void abortNcclComm(ncclComm_t comm) -{ - if (comm == nullptr) - { - return; - } - - auto const result = ncclCommAbort(comm); - if (result != ncclSuccess) - { - TLLM_LOG_WARNING("Failed to abort NCCL communicator: %s.", ncclGetErrorString(result)); - } -} - -NcclInitResult initNcclCommWithTimeout(ncclUniqueId const& id, int worldSize, int rank, int timeoutMs) -{ - NcclInitResult initResult; - ncclConfig_t config = NCCL_CONFIG_INITIALIZER; - config.blocking = 0; - - auto result = ncclCommInitRankConfig(&initResult.comm, worldSize, id, rank, &config); - if (result != ncclSuccess && result != ncclInProgress) - { - initResult.result = result; - return initResult; - } - if (result == ncclSuccess) - { - initResult.result = ncclSuccess; - return initResult; - } - if (initResult.comm == nullptr) - { - initResult.result = result; - return initResult; - } - - auto const deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds{timeoutMs}; - while (true) - { - ncclResult_t asyncResult = ncclSuccess; - result = ncclCommGetAsyncError(initResult.comm, &asyncResult); - if (result != ncclSuccess) - { - initResult.result = result; - return initResult; - } - if (asyncResult != ncclInProgress) - { - initResult.result = asyncResult; - return initResult; - } - if (std::chrono::steady_clock::now() >= deadline) - { - initResult.result = ncclInProgress; - initResult.timedOut = true; - return initResult; - } - std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs}); - } -} - -NcclInitStatus getNcclInitStatus(NcclInitResult const& result, tensorrt_llm::mpi::MpiComm const& mpiComm) -{ - std::array localStatus{result.isSuccess() ? 0 : 1, result.timedOut ? 1 : 0}; - std::array globalStatus{}; - mpiComm.allreduce( - localStatus.data(), globalStatus.data(), 2, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MAX); - return {globalStatus[0] != 0, globalStatus[1] != 0}; -} - -bool allRanksCanUseNvlsDisableWorkaround(tensorrt_llm::mpi::MpiComm const& mpiComm) -{ - int const localCanDisable = canSuggestNvlsDisable() ? 1 : 0; - int globalCanDisable = 0; - mpiComm.allreduce( - &localCanDisable, &globalCanDisable, 1, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MIN); - - return globalCanDisable != 0; -} - -void checkNcclResult(ncclComm_t comm, ncclResult_t result, char const* operation) -{ - if (result == ncclSuccess) - { - return; - } - if (result != ncclInProgress) - { - TLLM_NCCL_CHECK(result); - } - - while (true) - { - ncclResult_t asyncResult = ncclSuccess; - result = ncclCommGetAsyncError(comm, &asyncResult); - if (result != ncclSuccess) - { - TLLM_THROW("NCCL %s failed while polling communicator status: %s.", operation, ncclGetErrorString(result)); - } - if (asyncResult == ncclSuccess) - { - return; - } - if (asyncResult != ncclInProgress) - { - TLLM_THROW("NCCL %s failed asynchronously: %s.", operation, ncclGetErrorString(asyncResult)); - } - std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs}); - } -} - -ncclUniqueId createAndBroadcastNcclId(int rank, tensorrt_llm::mpi::MpiComm const& mpiComm) -{ - ncclUniqueId id; - if (rank == 0) - { - TLLM_NCCL_CHECK(ncclGetUniqueId(&id)); - } - mpiComm.bcastValue(id, 0); - return id; -} ncclDataType_t toNcclType(nvinfer1::DataType dataType) { @@ -230,7 +53,7 @@ void NcclCommunicator::send( void const* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const { #if ENABLE_MULTI_DEVICE - checkNcclResult(mComm, ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "send"); + TLLM_NCCL_CHECK(ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get())); #else TLLM_THROW("Multi device support is disabled."); #endif // ENABLE_MULTI_DEVICE @@ -240,7 +63,7 @@ void NcclCommunicator::receive( void* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const { #if ENABLE_MULTI_DEVICE - checkNcclResult(mComm, ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "receive"); + TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get())); #else TLLM_THROW("Multi device support is disabled."); #endif // ENABLE_MULTI_DEVICE @@ -250,45 +73,22 @@ ncclComm_t NcclCommunicator::createComm(int worldSize, int rank, mpi::MpiComm co { #if ENABLE_MULTI_DEVICE - setRuntimeConnectIfUnset(); - auto const timeoutMs = getNcclCommInitTimeoutMs(); - - auto id = createAndBroadcastNcclId(rank, mpiComm); - auto initResult = initNcclCommWithTimeout(id, worldSize, rank, timeoutMs); - auto const initStatus = getNcclInitStatus(initResult, mpiComm); - - if (initStatus.failed) + ncclUniqueId id; + if (rank == 0) { - if (initStatus.timedOut) - { - if (allRanksCanUseNvlsDisableWorkaround(mpiComm)) - { - TLLM_THROW( - "NCCL communicator initialization timed out after %d ms on at least one rank. This may indicate " - "an NVLS multicast resource setup failure in Fabric Manager. Set NCCL_NVLS_ENABLE=0 before " - "process startup and retry. TensorRT-LLM does not retry in-process because NCCL may not recover " - "after a timed-out NVLS initialization.", - timeoutMs); - } - TLLM_THROW( - "NCCL communicator initialization timed out after %d ms on at least one rank. NCCL_NVLS_ENABLE is " - "explicitly set, so TensorRT-LLM will not override it.", - timeoutMs); - } - if (!initResult.isSuccess()) - { - abortNcclComm(initResult.comm); - } - mpiComm.barrier(); - if (!initResult.isSuccess()) - { - TLLM_THROW( - "NCCL communicator initialization failed on rank %d: %s.", rank, ncclGetErrorString(initResult.result)); - } - TLLM_THROW("NCCL communicator initialization failed on at least one peer rank."); + ncclGetUniqueId(&id); } - - return initResult.comm; + mpiComm.bcastValue(id, 0); + ncclComm_t comm; +// Need static connection initialization for accurate KV cache size estimation +#if defined(_WIN32) + if (getenv("NCCL_RUNTIME_CONNECT") == nullptr) + _putenv_s("NCCL_RUNTIME_CONNECT", "0"); +#else + setenv("NCCL_RUNTIME_CONNECT", "0", 0); +#endif // _WIN32 + TLLM_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, id, rank)); + return comm; #else // Python runtime requires instantiation of a communicator even though it may never be used to enable // pipeline parallel code-path. To enable this, have an empty communicator with uninitialized state. diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 2a3ffa39e5e9..ff21b81babe7 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -48,8 +48,6 @@ JsonModeEval, LlmapiAccuracyTestHarness, LongBenchV1, LongBenchV2) -_NCCL_NVLS_DISABLED_ENV = {"NCCL_NVLS_ENABLE": "0"} - # Keep helper definitions below imports so new imports do not need E402 # suppressions in this legacy test file. @@ -77,11 +75,6 @@ def patched_start_mpi_pool(self): patched_start_mpi_pool) -def disable_nccl_nvls_for_test(mocker): - mocker.patch.dict(os.environ, _NCCL_NVLS_DISABLED_ENV) - patch_mpi_pool_session_for_env(mocker, _NCCL_NVLS_DISABLED_ENV) - - def _get_default_torch_compile_config(torch_compile): return TorchCompileConfig(enable_fullgraph=True, enable_piecewise_cuda_graph=True, @@ -1705,10 +1698,7 @@ def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn): ids=["tp4", "ep4", "tp2pp2", "pp4"]) def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, attention_dp, cuda_graph, overlap_scheduler, - torch_compile, mocker): - if pp_size > 1: - disable_nccl_nvls_for_test(mocker) - + torch_compile): if pp_size > 1 and mtp_nextn > 0: num_hidden_layers = 30 pp_partition = [num_hidden_layers // pp_size + 1] * pp_size @@ -1964,10 +1954,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn, def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, torch_compile, - sampler_async_worker, mocker): - if pp_size > 1: - disable_nccl_nvls_for_test(mocker) - + sampler_async_worker): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) torch_compile_config = _get_default_torch_compile_config(torch_compile) pytorch_config = dict( @@ -2403,14 +2390,12 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph, def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, low_precision_combine, tp_size, pp_size, ep_size, torch_compile, mtp_nextn, - moe_backend, mocker): + moe_backend): sm_version = get_sm_version() if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") if moe_backend == "CUTEDSL" and sm_version not in (100, 103): pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only") - if pp_size > 1: - disable_nccl_nvls_for_test(mocker) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp. From eb7647ecaa4aab329ee118ab404e742f6c143f08 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Wed, 24 Jun 2026 03:05:32 +0000 Subject: [PATCH 6/8] [NVBUG-6224637][fix] Enable CuTe DSL BF16 kernels on SM100 PP Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- tensorrt_llm/_torch/modules/attention.py | 6 ++++-- tensorrt_llm/_torch/modules/gated_mlp.py | 2 ++ tensorrt_llm/llmapi/llm_args.py | 10 +++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index 28d5cff5c3d8..3dba1084a8cf 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -529,7 +529,8 @@ def __init__( force_dynamic_quantization=config.force_dynamic_quantization, disable_deep_gemm=disable_deep_gemm, use_custom_cublas_mm=use_custom_cublas_mm, - use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm) + use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=self.use_cute_dsl_bf16_gemm) self.quant_config = config.get_quant_config() self.attn_backend = config.attn_backend @@ -1462,7 +1463,8 @@ def __init__( reduce_output=reduce_output, allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, - use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm) + use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=self.use_cute_dsl_bf16_gemm) def yarn_get_mscale(scale=1, mscale=1): if scale <= 1: diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index 7e2fdcaeca20..1bb29e8e3640 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -84,6 +84,7 @@ def __init__( allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm, disable_deep_gemm=disable_deep_gemm, fused_weight_shard_indices_mapping=gateup_shard_indices_mapping, use_custom_cublas_mm=use_custom_cublas_mm, @@ -114,6 +115,7 @@ def __init__( allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm, disable_deep_gemm=disable_deep_gemm, use_custom_cublas_mm=use_custom_cublas_mm, ) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 609e11c5a27a..4f0a624d13f1 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -46,7 +46,8 @@ from tensorrt_llm.lora_helper import (LoraConfig, get_default_trtllm_modules_to_hf_modules) -from .._utils import _str_to_torch_dtype_dict, mpi_rank, prefer_pinned +from .._utils import (_str_to_torch_dtype_dict, is_sm_100f, mpi_rank, + prefer_pinned) # yapf: disable # isort: off @@ -5077,6 +5078,13 @@ def validate_ray_placement_config(self) -> 'TorchLlmArgs': @model_validator(mode='after') def validate_cute_dsl_bf16(self) -> 'TorchLlmArgs': + if (not (self.use_cute_dsl_bf16_bmm and self.use_cute_dsl_bf16_gemm) + and self.pipeline_parallel_size > 1 and is_sm_100f()): + logger.info("Automatically enabling CuTe DSL BF16 BMM and GEMM for " + "SM100/SM103 PP.") + self.use_cute_dsl_bf16_bmm = True + self.use_cute_dsl_bf16_gemm = True + if self.use_cute_dsl_bf16_bmm or self.use_cute_dsl_bf16_gemm: major, minor = torch.cuda.get_device_capability() sm = major * 10 + minor From a1c811f900e1657712812b087c446623b13600b5 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Wed, 24 Jun 2026 08:45:09 +0000 Subject: [PATCH 7/8] [NVBUG-6224637][test] Remove rebase-added waiver Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index ce70f437ee04..f81f8ebe6652 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -20,7 +20,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mt accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403) From ce916714a78e4ee5844d0745ea7842d9c0acc763 Mon Sep 17 00:00:00 2001 From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:19:48 +0000 Subject: [PATCH 8/8] [NVBUG-6224637][fix] Guard CuTe BF16 GEMM config for VisualGen Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> --- tensorrt_llm/_torch/modules/gated_mlp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index 1bb29e8e3640..4870db50b44c 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -44,6 +44,8 @@ def __init__( self.use_cute_dsl_blockscaling_mm = use_cute_dsl_blockscaling_mm config = config or ModelConfig() + use_cute_dsl_bf16_gemm = getattr(config, "use_cute_dsl_bf16_gemm", + False) self.mapping = config.mapping if overridden_tp_size is not None: assert config.mapping.tp_size % overridden_tp_size == 0 @@ -84,7 +86,7 @@ def __init__( allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, - use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm, + use_cute_dsl_bf16_gemm=use_cute_dsl_bf16_gemm, disable_deep_gemm=disable_deep_gemm, fused_weight_shard_indices_mapping=gateup_shard_indices_mapping, use_custom_cublas_mm=use_custom_cublas_mm, @@ -115,7 +117,7 @@ def __init__( allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, - use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm, + use_cute_dsl_bf16_gemm=use_cute_dsl_bf16_gemm, disable_deep_gemm=disable_deep_gemm, use_custom_cublas_mm=use_custom_cublas_mm, )