From d09c3385bcee044434796eb724e99a872b5cce3b Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Fri, 5 Jun 2026 06:16:34 +0000
Subject: [PATCH 1/8] [https://nvbugs/6224637][test] unwaive associated tests

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 68102bdf0380..2429761bd850 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -15,25 +15,22 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewi
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6224637)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6278337)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6224637)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6224637)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6224637)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6224637)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6224637)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6224637)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162120)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/6162122)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6224637)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dflash SKIP (https://nvbugs/6156233)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6341371)

From 302e3db9bf042a958f0a8e6934f94e4c0c6a6d1b Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Mon, 15 Jun 2026 14:23:46 +0000
Subject: [PATCH 2/8] Handle NCCL NVLS init hangs in unwaived tests

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 cpp/tensorrt_llm/runtime/ncclCommunicator.cpp | 234 ++++++++++++++++--
 .../defs/accuracy/test_llm_api_pytorch.py     |  21 +-
 2 files changed, 235 insertions(+), 20 deletions(-)

diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
index dc79fd5f48e2..3e72fa715b4a 100644
--- a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
+++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,11 +23,188 @@
 #include <nccl.h>
 #endif // ENABLE_MULTI_DEVICE
 
+#include <array>
+#include <chrono>
+#include <cstdlib>
+#include <string>
+#include <thread>
+
 using namespace tensorrt_llm::runtime;
 
 namespace
 {
 #if ENABLE_MULTI_DEVICE
+constexpr int kDefaultNcclCommInitTimeoutMs = 60'000;
+constexpr int kNcclCommInitPollIntervalMs = 20;
+constexpr char const* kNcclCommInitTimeoutEnv = "TLLM_NCCL_COMM_INIT_TIMEOUT_MS";
+constexpr char const* kNcclNvlsEnableEnv = "NCCL_NVLS_ENABLE";
+
+struct NcclInitResult
+{
+    ncclComm_t comm{nullptr};
+    ncclResult_t result{ncclSuccess};
+    bool timedOut{false};
+
+    [[nodiscard]] bool isSuccess() const
+    {
+        return result == ncclSuccess;
+    }
+};
+
+struct NcclInitStatus
+{
+    bool failed{false};
+    bool timedOut{false};
+};
+
+int getNcclCommInitTimeoutMs()
+{
+    auto const* env = std::getenv(kNcclCommInitTimeoutEnv);
+    int const timeoutMs = env == nullptr ? 0 : std::atoi(env);
+    return timeoutMs > 0 ? timeoutMs : kDefaultNcclCommInitTimeoutMs;
+}
+
+bool canSuggestNvlsDisable()
+{
+    auto const* nvlsEnable = std::getenv(kNcclNvlsEnableEnv);
+    return nvlsEnable == nullptr || std::string{nvlsEnable} == "2";
+}
+
+void setRuntimeConnectIfUnset()
+{
+    // Need static connection initialization for accurate KV cache size estimation.
+#if defined(_WIN32)
+    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
+    {
+        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
+    }
+#else
+    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
+#endif // _WIN32
+}
+
+void abortNcclComm(ncclComm_t comm)
+{
+    if (comm == nullptr)
+    {
+        return;
+    }
+
+    auto const result = ncclCommAbort(comm);
+    if (result != ncclSuccess)
+    {
+        TLLM_LOG_WARNING("Failed to abort NCCL communicator: %s.", ncclGetErrorString(result));
+    }
+}
+
+NcclInitResult initNcclCommWithTimeout(ncclUniqueId const& id, int worldSize, int rank, int timeoutMs)
+{
+    NcclInitResult initResult;
+    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+    config.blocking = 0;
+
+    auto result = ncclCommInitRankConfig(&initResult.comm, worldSize, id, rank, &config);
+    if (result != ncclSuccess && result != ncclInProgress)
+    {
+        initResult.result = result;
+        return initResult;
+    }
+    if (result == ncclSuccess)
+    {
+        initResult.result = ncclSuccess;
+        return initResult;
+    }
+    if (initResult.comm == nullptr)
+    {
+        initResult.result = result;
+        return initResult;
+    }
+
+    auto const deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds{timeoutMs};
+    while (true)
+    {
+        ncclResult_t asyncResult = ncclSuccess;
+        result = ncclCommGetAsyncError(initResult.comm, &asyncResult);
+        if (result != ncclSuccess)
+        {
+            initResult.result = result;
+            return initResult;
+        }
+        if (asyncResult != ncclInProgress)
+        {
+            initResult.result = asyncResult;
+            return initResult;
+        }
+        if (std::chrono::steady_clock::now() >= deadline)
+        {
+            initResult.result = ncclInProgress;
+            initResult.timedOut = true;
+            return initResult;
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
+    }
+}
+
+NcclInitStatus getNcclInitStatus(NcclInitResult const& result, tensorrt_llm::mpi::MpiComm const& mpiComm)
+{
+    std::array<int, 2> localStatus{result.isSuccess() ? 0 : 1, result.timedOut ? 1 : 0};
+    std::array<int, 2> globalStatus{};
+    mpiComm.allreduce(
+        localStatus.data(), globalStatus.data(), 2, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MAX);
+    return {globalStatus[0] != 0, globalStatus[1] != 0};
+}
+
+bool allRanksCanUseNvlsDisableWorkaround(tensorrt_llm::mpi::MpiComm const& mpiComm)
+{
+    int const localCanDisable = canSuggestNvlsDisable() ? 1 : 0;
+    int globalCanDisable = 0;
+    mpiComm.allreduce(
+        &localCanDisable, &globalCanDisable, 1, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MIN);
+
+    return globalCanDisable != 0;
+}
+
+void checkNcclResult(ncclComm_t comm, ncclResult_t result, char const* operation)
+{
+    if (result == ncclSuccess)
+    {
+        return;
+    }
+    if (result != ncclInProgress)
+    {
+        TLLM_NCCL_CHECK(result);
+    }
+
+    while (true)
+    {
+        ncclResult_t asyncResult = ncclSuccess;
+        result = ncclCommGetAsyncError(comm, &asyncResult);
+        if (result != ncclSuccess)
+        {
+            TLLM_THROW("NCCL %s failed while polling communicator status: %s.", operation, ncclGetErrorString(result));
+        }
+        if (asyncResult == ncclSuccess)
+        {
+            return;
+        }
+        if (asyncResult != ncclInProgress)
+        {
+            TLLM_THROW("NCCL %s failed asynchronously: %s.", operation, ncclGetErrorString(asyncResult));
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
+    }
+}
+
+ncclUniqueId createAndBroadcastNcclId(int rank, tensorrt_llm::mpi::MpiComm const& mpiComm)
+{
+    ncclUniqueId id;
+    if (rank == 0)
+    {
+        TLLM_NCCL_CHECK(ncclGetUniqueId(&id));
+    }
+    mpiComm.bcastValue(id, 0);
+    return id;
+}
 
 ncclDataType_t toNcclType(nvinfer1::DataType dataType)
 {
@@ -53,7 +230,7 @@ void NcclCommunicator::send(
     void const* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    TLLM_NCCL_CHECK(ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
+    checkNcclResult(mComm, ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "send");
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -63,7 +240,7 @@ void NcclCommunicator::receive(
     void* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
+    checkNcclResult(mComm, ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "receive");
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -73,22 +250,45 @@ ncclComm_t NcclCommunicator::createComm(int worldSize, int rank, mpi::MpiComm co
 {
 #if ENABLE_MULTI_DEVICE
 
-    ncclUniqueId id;
-    if (rank == 0)
+    setRuntimeConnectIfUnset();
+    auto const timeoutMs = getNcclCommInitTimeoutMs();
+
+    auto id = createAndBroadcastNcclId(rank, mpiComm);
+    auto initResult = initNcclCommWithTimeout(id, worldSize, rank, timeoutMs);
+    auto const initStatus = getNcclInitStatus(initResult, mpiComm);
+
+    if (initStatus.failed)
     {
-        ncclGetUniqueId(&id);
+        if (initStatus.timedOut)
+        {
+            if (allRanksCanUseNvlsDisableWorkaround(mpiComm))
+            {
+                TLLM_THROW(
+                    "NCCL communicator initialization timed out after %d ms on at least one rank. This may indicate "
+                    "an NVLS multicast resource setup failure in Fabric Manager. Set NCCL_NVLS_ENABLE=0 before "
+                    "process startup and retry. TensorRT-LLM does not retry in-process because NCCL may not recover "
+                    "after a timed-out NVLS initialization.",
+                    timeoutMs);
+            }
+            TLLM_THROW(
+                "NCCL communicator initialization timed out after %d ms on at least one rank. NCCL_NVLS_ENABLE is "
+                "explicitly set, so TensorRT-LLM will not override it.",
+                timeoutMs);
+        }
+        if (!initResult.isSuccess())
+        {
+            abortNcclComm(initResult.comm);
+        }
+        mpiComm.barrier();
+        if (!initResult.isSuccess())
+        {
+            TLLM_THROW(
+                "NCCL communicator initialization failed on rank %d: %s.", rank, ncclGetErrorString(initResult.result));
+        }
+        TLLM_THROW("NCCL communicator initialization failed on at least one peer rank.");
     }
-    mpiComm.bcastValue(id, 0);
-    ncclComm_t comm;
-// Need static connection initialization for accurate KV cache size estimation
-#if defined(_WIN32)
-    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
-        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
-#else
-    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
-#endif // _WIN32
-    TLLM_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, id, rank));
-    return comm;
+
+    return initResult.comm;
 #else
     // Python runtime requires instantiation of a communicator even though it may never be used to enable
     // pipeline parallel code-path. To enable this, have an empty communicator with uninitialized state.
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ff21b81babe7..2a3ffa39e5e9 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -48,6 +48,8 @@
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV1, LongBenchV2)
 
+_NCCL_NVLS_DISABLED_ENV = {"NCCL_NVLS_ENABLE": "0"}
+
 
 # Keep helper definitions below imports so new imports do not need E402
 # suppressions in this legacy test file.
@@ -75,6 +77,11 @@ def patched_start_mpi_pool(self):
                         patched_start_mpi_pool)
 
 
+def disable_nccl_nvls_for_test(mocker):
+    mocker.patch.dict(os.environ, _NCCL_NVLS_DISABLED_ENV)
+    patch_mpi_pool_session_for_env(mocker, _NCCL_NVLS_DISABLED_ENV)
+
+
 def _get_default_torch_compile_config(torch_compile):
     return TorchCompileConfig(enable_fullgraph=True,
                               enable_piecewise_cuda_graph=True,
@@ -1698,7 +1705,10 @@ def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn):
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
-                            torch_compile):
+                            torch_compile, mocker):
+        if pp_size > 1:
+            disable_nccl_nvls_for_test(mocker)
+
         if pp_size > 1 and mtp_nextn > 0:
             num_hidden_layers = 30
             pp_partition = [num_hidden_layers // pp_size + 1] * pp_size
@@ -1954,7 +1964,10 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile,
-                                    sampler_async_worker):
+                                    sampler_async_worker, mocker):
+        if pp_size > 1:
+            disable_nccl_nvls_for_test(mocker)
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         pytorch_config = dict(
@@ -2390,12 +2403,14 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph,
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, low_precision_combine, tp_size,
                          pp_size, ep_size, torch_compile, mtp_nextn,
-                         moe_backend):
+                         moe_backend, mocker):
         sm_version = get_sm_version()
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
         if moe_backend == "CUTEDSL" and sm_version not in (100, 103):
             pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only")
+        if pp_size > 1:
+            disable_nccl_nvls_for_test(mocker)
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.

From e46ca0706eb7429c7c0a20acc954224b2363be0d Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 17 Jun 2026 03:10:19 +0000
Subject: [PATCH 3/8] [https://nvbugs/6224637][test] restore failing GB200
 waive

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 2429761bd850..ff02f37bd39a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -29,6 +29,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394)

From 0a24f02a3eb8b83b35f8f344db5e2eb30a2f96b1 Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 17 Jun 2026 04:25:20 +0000
Subject: [PATCH 4/8] [https://nvbugs/6224637][test] keep waive update
 deletion-only

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index ff02f37bd39a..ce70f437ee04 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -21,15 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mt
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162120)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6162122)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] SKIP (https://nvbugs/6162122)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] SKIP (https://nvbugs/6162122)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/6162122)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/6162122)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394)

From c9ec5ec31fd39e574d377a8d287b3fec745c6c1a Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Sat, 20 Jun 2026 08:04:59 +0000
Subject: [PATCH 5/8] [https://nvbugs/6224637][test] remove GB200 hang
 workarounds

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 cpp/tensorrt_llm/runtime/ncclCommunicator.cpp | 234 ++----------------
 .../defs/accuracy/test_llm_api_pytorch.py     |  21 +-
 2 files changed, 20 insertions(+), 235 deletions(-)

diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
index 3e72fa715b4a..dc79fd5f48e2 100644
--- a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
+++ b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,188 +23,11 @@
 #include <nccl.h>
 #endif // ENABLE_MULTI_DEVICE
 
-#include <array>
-#include <chrono>
-#include <cstdlib>
-#include <string>
-#include <thread>
-
 using namespace tensorrt_llm::runtime;
 
 namespace
 {
 #if ENABLE_MULTI_DEVICE
-constexpr int kDefaultNcclCommInitTimeoutMs = 60'000;
-constexpr int kNcclCommInitPollIntervalMs = 20;
-constexpr char const* kNcclCommInitTimeoutEnv = "TLLM_NCCL_COMM_INIT_TIMEOUT_MS";
-constexpr char const* kNcclNvlsEnableEnv = "NCCL_NVLS_ENABLE";
-
-struct NcclInitResult
-{
-    ncclComm_t comm{nullptr};
-    ncclResult_t result{ncclSuccess};
-    bool timedOut{false};
-
-    [[nodiscard]] bool isSuccess() const
-    {
-        return result == ncclSuccess;
-    }
-};
-
-struct NcclInitStatus
-{
-    bool failed{false};
-    bool timedOut{false};
-};
-
-int getNcclCommInitTimeoutMs()
-{
-    auto const* env = std::getenv(kNcclCommInitTimeoutEnv);
-    int const timeoutMs = env == nullptr ? 0 : std::atoi(env);
-    return timeoutMs > 0 ? timeoutMs : kDefaultNcclCommInitTimeoutMs;
-}
-
-bool canSuggestNvlsDisable()
-{
-    auto const* nvlsEnable = std::getenv(kNcclNvlsEnableEnv);
-    return nvlsEnable == nullptr || std::string{nvlsEnable} == "2";
-}
-
-void setRuntimeConnectIfUnset()
-{
-    // Need static connection initialization for accurate KV cache size estimation.
-#if defined(_WIN32)
-    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
-    {
-        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
-    }
-#else
-    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
-#endif // _WIN32
-}
-
-void abortNcclComm(ncclComm_t comm)
-{
-    if (comm == nullptr)
-    {
-        return;
-    }
-
-    auto const result = ncclCommAbort(comm);
-    if (result != ncclSuccess)
-    {
-        TLLM_LOG_WARNING("Failed to abort NCCL communicator: %s.", ncclGetErrorString(result));
-    }
-}
-
-NcclInitResult initNcclCommWithTimeout(ncclUniqueId const& id, int worldSize, int rank, int timeoutMs)
-{
-    NcclInitResult initResult;
-    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
-    config.blocking = 0;
-
-    auto result = ncclCommInitRankConfig(&initResult.comm, worldSize, id, rank, &config);
-    if (result != ncclSuccess && result != ncclInProgress)
-    {
-        initResult.result = result;
-        return initResult;
-    }
-    if (result == ncclSuccess)
-    {
-        initResult.result = ncclSuccess;
-        return initResult;
-    }
-    if (initResult.comm == nullptr)
-    {
-        initResult.result = result;
-        return initResult;
-    }
-
-    auto const deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds{timeoutMs};
-    while (true)
-    {
-        ncclResult_t asyncResult = ncclSuccess;
-        result = ncclCommGetAsyncError(initResult.comm, &asyncResult);
-        if (result != ncclSuccess)
-        {
-            initResult.result = result;
-            return initResult;
-        }
-        if (asyncResult != ncclInProgress)
-        {
-            initResult.result = asyncResult;
-            return initResult;
-        }
-        if (std::chrono::steady_clock::now() >= deadline)
-        {
-            initResult.result = ncclInProgress;
-            initResult.timedOut = true;
-            return initResult;
-        }
-        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
-    }
-}
-
-NcclInitStatus getNcclInitStatus(NcclInitResult const& result, tensorrt_llm::mpi::MpiComm const& mpiComm)
-{
-    std::array<int, 2> localStatus{result.isSuccess() ? 0 : 1, result.timedOut ? 1 : 0};
-    std::array<int, 2> globalStatus{};
-    mpiComm.allreduce(
-        localStatus.data(), globalStatus.data(), 2, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MAX);
-    return {globalStatus[0] != 0, globalStatus[1] != 0};
-}
-
-bool allRanksCanUseNvlsDisableWorkaround(tensorrt_llm::mpi::MpiComm const& mpiComm)
-{
-    int const localCanDisable = canSuggestNvlsDisable() ? 1 : 0;
-    int globalCanDisable = 0;
-    mpiComm.allreduce(
-        &localCanDisable, &globalCanDisable, 1, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MIN);
-
-    return globalCanDisable != 0;
-}
-
-void checkNcclResult(ncclComm_t comm, ncclResult_t result, char const* operation)
-{
-    if (result == ncclSuccess)
-    {
-        return;
-    }
-    if (result != ncclInProgress)
-    {
-        TLLM_NCCL_CHECK(result);
-    }
-
-    while (true)
-    {
-        ncclResult_t asyncResult = ncclSuccess;
-        result = ncclCommGetAsyncError(comm, &asyncResult);
-        if (result != ncclSuccess)
-        {
-            TLLM_THROW("NCCL %s failed while polling communicator status: %s.", operation, ncclGetErrorString(result));
-        }
-        if (asyncResult == ncclSuccess)
-        {
-            return;
-        }
-        if (asyncResult != ncclInProgress)
-        {
-            TLLM_THROW("NCCL %s failed asynchronously: %s.", operation, ncclGetErrorString(asyncResult));
-        }
-        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
-    }
-}
-
-ncclUniqueId createAndBroadcastNcclId(int rank, tensorrt_llm::mpi::MpiComm const& mpiComm)
-{
-    ncclUniqueId id;
-    if (rank == 0)
-    {
-        TLLM_NCCL_CHECK(ncclGetUniqueId(&id));
-    }
-    mpiComm.bcastValue(id, 0);
-    return id;
-}
 
 ncclDataType_t toNcclType(nvinfer1::DataType dataType)
 {
@@ -230,7 +53,7 @@ void NcclCommunicator::send(
     void const* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    checkNcclResult(mComm, ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "send");
+    TLLM_NCCL_CHECK(ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -240,7 +63,7 @@ void NcclCommunicator::receive(
     void* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    checkNcclResult(mComm, ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "receive");
+    TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -250,45 +73,22 @@ ncclComm_t NcclCommunicator::createComm(int worldSize, int rank, mpi::MpiComm co
 {
 #if ENABLE_MULTI_DEVICE
 
-    setRuntimeConnectIfUnset();
-    auto const timeoutMs = getNcclCommInitTimeoutMs();
-
-    auto id = createAndBroadcastNcclId(rank, mpiComm);
-    auto initResult = initNcclCommWithTimeout(id, worldSize, rank, timeoutMs);
-    auto const initStatus = getNcclInitStatus(initResult, mpiComm);
-
-    if (initStatus.failed)
+    ncclUniqueId id;
+    if (rank == 0)
     {
-        if (initStatus.timedOut)
-        {
-            if (allRanksCanUseNvlsDisableWorkaround(mpiComm))
-            {
-                TLLM_THROW(
-                    "NCCL communicator initialization timed out after %d ms on at least one rank. This may indicate "
-                    "an NVLS multicast resource setup failure in Fabric Manager. Set NCCL_NVLS_ENABLE=0 before "
-                    "process startup and retry. TensorRT-LLM does not retry in-process because NCCL may not recover "
-                    "after a timed-out NVLS initialization.",
-                    timeoutMs);
-            }
-            TLLM_THROW(
-                "NCCL communicator initialization timed out after %d ms on at least one rank. NCCL_NVLS_ENABLE is "
-                "explicitly set, so TensorRT-LLM will not override it.",
-                timeoutMs);
-        }
-        if (!initResult.isSuccess())
-        {
-            abortNcclComm(initResult.comm);
-        }
-        mpiComm.barrier();
-        if (!initResult.isSuccess())
-        {
-            TLLM_THROW(
-                "NCCL communicator initialization failed on rank %d: %s.", rank, ncclGetErrorString(initResult.result));
-        }
-        TLLM_THROW("NCCL communicator initialization failed on at least one peer rank.");
+        TLLM_NCCL_CHECK(ncclGetUniqueId(&id)); // Fail fast: never broadcast an id that was not generated.
     }
-
-    return initResult.comm;
+    mpiComm.bcastValue(id, 0);
+    ncclComm_t comm;
+// Need static connection initialization for accurate KV cache size estimation
+#if defined(_WIN32)
+    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
+        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
+#else
+    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
+#endif // _WIN32
+    TLLM_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, id, rank));
+    return comm;
 #else
     // Python runtime requires instantiation of a communicator even though it may never be used to enable
     // pipeline parallel code-path. To enable this, have an empty communicator with uninitialized state.
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 2a3ffa39e5e9..ff21b81babe7 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -48,8 +48,6 @@
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV1, LongBenchV2)
 
-_NCCL_NVLS_DISABLED_ENV = {"NCCL_NVLS_ENABLE": "0"}
-
 
 # Keep helper definitions below imports so new imports do not need E402
 # suppressions in this legacy test file.
@@ -77,11 +75,6 @@ def patched_start_mpi_pool(self):
                         patched_start_mpi_pool)
 
 
-def disable_nccl_nvls_for_test(mocker):
-    mocker.patch.dict(os.environ, _NCCL_NVLS_DISABLED_ENV)
-    patch_mpi_pool_session_for_env(mocker, _NCCL_NVLS_DISABLED_ENV)
-
-
 def _get_default_torch_compile_config(torch_compile):
     return TorchCompileConfig(enable_fullgraph=True,
                               enable_piecewise_cuda_graph=True,
@@ -1705,10 +1698,7 @@ def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn):
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
-                            torch_compile, mocker):
-        if pp_size > 1:
-            disable_nccl_nvls_for_test(mocker)
-
+                            torch_compile):
         if pp_size > 1 and mtp_nextn > 0:
             num_hidden_layers = 30
             pp_partition = [num_hidden_layers // pp_size + 1] * pp_size
@@ -1964,10 +1954,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile,
-                                    sampler_async_worker, mocker):
-        if pp_size > 1:
-            disable_nccl_nvls_for_test(mocker)
-
+                                    sampler_async_worker):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         pytorch_config = dict(
@@ -2403,14 +2390,12 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph,
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, low_precision_combine, tp_size,
                          pp_size, ep_size, torch_compile, mtp_nextn,
-                         moe_backend, mocker):
+                         moe_backend):
         sm_version = get_sm_version()
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
         if moe_backend == "CUTEDSL" and sm_version not in (100, 103):
             pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only")
-        if pp_size > 1:
-            disable_nccl_nvls_for_test(mocker)
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.

From eb7647ecaa4aab329ee118ab404e742f6c143f08 Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 24 Jun 2026 03:05:32 +0000
Subject: [PATCH 6/8] [NVBUG-6224637][fix] Enable CuTe DSL BF16 kernels on
 SM100 PP

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tensorrt_llm/_torch/modules/attention.py |  6 ++++--
 tensorrt_llm/_torch/modules/gated_mlp.py |  2 ++
 tensorrt_llm/llmapi/llm_args.py          | 10 +++++++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
index 28d5cff5c3d8..3dba1084a8cf 100644
--- a/tensorrt_llm/_torch/modules/attention.py
+++ b/tensorrt_llm/_torch/modules/attention.py
@@ -529,7 +529,8 @@ def __init__(
             force_dynamic_quantization=config.force_dynamic_quantization,
             disable_deep_gemm=disable_deep_gemm,
             use_custom_cublas_mm=use_custom_cublas_mm,
-            use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm)
+            use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm,
+            use_cute_dsl_bf16_gemm=self.use_cute_dsl_bf16_gemm)
 
         self.quant_config = config.get_quant_config()
         self.attn_backend = config.attn_backend
@@ -1462,7 +1463,8 @@ def __init__(
             reduce_output=reduce_output,
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
-            use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm)
+            use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm,
+            use_cute_dsl_bf16_gemm=self.use_cute_dsl_bf16_gemm)
 
         def yarn_get_mscale(scale=1, mscale=1):
             if scale <= 1:
diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py
index 7e2fdcaeca20..1bb29e8e3640 100644
--- a/tensorrt_llm/_torch/modules/gated_mlp.py
+++ b/tensorrt_llm/_torch/modules/gated_mlp.py
@@ -84,6 +84,7 @@ def __init__(
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
             use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
+            use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm,
             disable_deep_gemm=disable_deep_gemm,
             fused_weight_shard_indices_mapping=gateup_shard_indices_mapping,
             use_custom_cublas_mm=use_custom_cublas_mm,
@@ -114,6 +115,7 @@ def __init__(
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
             use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
+            use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm,
             disable_deep_gemm=disable_deep_gemm,
             use_custom_cublas_mm=use_custom_cublas_mm,
         )
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 609e11c5a27a..4f0a624d13f1 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -46,7 +46,8 @@
 from tensorrt_llm.lora_helper import (LoraConfig,
                                       get_default_trtllm_modules_to_hf_modules)
 
-from .._utils import _str_to_torch_dtype_dict, mpi_rank, prefer_pinned
+from .._utils import (_str_to_torch_dtype_dict, is_sm_100f, mpi_rank,
+                      prefer_pinned)
 
 # yapf: disable
 # isort: off
@@ -5077,6 +5078,13 @@ def validate_ray_placement_config(self) -> 'TorchLlmArgs':
 
     @model_validator(mode='after')
     def validate_cute_dsl_bf16(self) -> 'TorchLlmArgs':
+        if (not (self.use_cute_dsl_bf16_bmm and self.use_cute_dsl_bf16_gemm)
+                and self.pipeline_parallel_size > 1 and is_sm_100f()):
+            logger.info("Automatically enabling CuTe DSL BF16 BMM and GEMM for "
+                        "SM100/SM103 PP.")
+            self.use_cute_dsl_bf16_bmm = True
+            self.use_cute_dsl_bf16_gemm = True
+
         if self.use_cute_dsl_bf16_bmm or self.use_cute_dsl_bf16_gemm:
             major, minor = torch.cuda.get_device_capability()
             sm = major * 10 + minor

From a1c811f900e1657712812b087c446623b13600b5 Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 24 Jun 2026 08:45:09 +0000
Subject: [PATCH 7/8] [NVBUG-6224637][test] Remove rebase-added waiver

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index ce70f437ee04..f81f8ebe6652 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -20,7 +20,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mt
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_nvfp4_4gpus[tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6185146)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403)

From ce916714a78e4ee5844d0745ea7842d9c0acc763 Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:19:48 +0000
Subject: [PATCH 8/8] [NVBUG-6224637][fix] Guard CuTe BF16 GEMM config for
 VisualGen

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tensorrt_llm/_torch/modules/gated_mlp.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py
index 1bb29e8e3640..4870db50b44c 100644
--- a/tensorrt_llm/_torch/modules/gated_mlp.py
+++ b/tensorrt_llm/_torch/modules/gated_mlp.py
@@ -44,6 +44,8 @@ def __init__(
         self.use_cute_dsl_blockscaling_mm = use_cute_dsl_blockscaling_mm
 
         config = config or ModelConfig()
+        use_cute_dsl_bf16_gemm = getattr(config, "use_cute_dsl_bf16_gemm",
+                                         False)
         self.mapping = config.mapping
         if overridden_tp_size is not None:
             assert config.mapping.tp_size % overridden_tp_size == 0
@@ -84,7 +86,7 @@ def __init__(
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
             use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
-            use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm,
+            use_cute_dsl_bf16_gemm=use_cute_dsl_bf16_gemm,
             disable_deep_gemm=disable_deep_gemm,
             fused_weight_shard_indices_mapping=gateup_shard_indices_mapping,
             use_custom_cublas_mm=use_custom_cublas_mm,
@@ -115,7 +117,7 @@ def __init__(
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
             use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
-            use_cute_dsl_bf16_gemm=config.use_cute_dsl_bf16_gemm,
+            use_cute_dsl_bf16_gemm=use_cute_dsl_bf16_gemm,
             disable_deep_gemm=disable_deep_gemm,
             use_custom_cublas_mm=use_custom_cublas_mm,
         )