[https://nvbugs/6224637][test] remove GB200 hang workarounds

yuxianq · yuxianq · commit c9ec5ec31fd3 · 2026-06-24T08:27:39.000Z
Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,188 +23,11 @@
 #include <nccl.h>
 #endif // ENABLE_MULTI_DEVICE
 
-#include <array>
-#include <chrono>
-#include <cstdlib>
-#include <string>
-#include <thread>
-
 using namespace tensorrt_llm::runtime;
 
 namespace
 {
 #if ENABLE_MULTI_DEVICE
-constexpr int kDefaultNcclCommInitTimeoutMs = 60'000;
-constexpr int kNcclCommInitPollIntervalMs = 20;
-constexpr char const* kNcclCommInitTimeoutEnv = "TLLM_NCCL_COMM_INIT_TIMEOUT_MS";
-constexpr char const* kNcclNvlsEnableEnv = "NCCL_NVLS_ENABLE";
-
-struct NcclInitResult
-{
-    ncclComm_t comm{nullptr};
-    ncclResult_t result{ncclSuccess};
-    bool timedOut{false};
-
-    [[nodiscard]] bool isSuccess() const
-    {
-        return result == ncclSuccess;
-    }
-};
-
-struct NcclInitStatus
-{
-    bool failed{false};
-    bool timedOut{false};
-};
-
-int getNcclCommInitTimeoutMs()
-{
-    auto const* env = std::getenv(kNcclCommInitTimeoutEnv);
-    int const timeoutMs = env == nullptr ? 0 : std::atoi(env);
-    return timeoutMs > 0 ? timeoutMs : kDefaultNcclCommInitTimeoutMs;
-}
-
-bool canSuggestNvlsDisable()
-{
-    auto const* nvlsEnable = std::getenv(kNcclNvlsEnableEnv);
-    return nvlsEnable == nullptr || std::string{nvlsEnable} == "2";
-}
-
-void setRuntimeConnectIfUnset()
-{
-    // Need static connection initialization for accurate KV cache size estimation.
-#if defined(_WIN32)
-    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
-    {
-        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
-    }
-#else
-    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
-#endif // _WIN32
-}
-
-void abortNcclComm(ncclComm_t comm)
-{
-    if (comm == nullptr)
-    {
-        return;
-    }
-
-    auto const result = ncclCommAbort(comm);
-    if (result != ncclSuccess)
-    {
-        TLLM_LOG_WARNING("Failed to abort NCCL communicator: %s.", ncclGetErrorString(result));
-    }
-}
-
-NcclInitResult initNcclCommWithTimeout(ncclUniqueId const& id, int worldSize, int rank, int timeoutMs)
-{
-    NcclInitResult initResult;
-    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
-    config.blocking = 0;
-
-    auto result = ncclCommInitRankConfig(&initResult.comm, worldSize, id, rank, &config);
-    if (result != ncclSuccess && result != ncclInProgress)
-    {
-        initResult.result = result;
-        return initResult;
-    }
-    if (result == ncclSuccess)
-    {
-        initResult.result = ncclSuccess;
-        return initResult;
-    }
-    if (initResult.comm == nullptr)
-    {
-        initResult.result = result;
-        return initResult;
-    }
-
-    auto const deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds{timeoutMs};
-    while (true)
-    {
-        ncclResult_t asyncResult = ncclSuccess;
-        result = ncclCommGetAsyncError(initResult.comm, &asyncResult);
-        if (result != ncclSuccess)
-        {
-            initResult.result = result;
-            return initResult;
-        }
-        if (asyncResult != ncclInProgress)
-        {
-            initResult.result = asyncResult;
-            return initResult;
-        }
-        if (std::chrono::steady_clock::now() >= deadline)
-        {
-            initResult.result = ncclInProgress;
-            initResult.timedOut = true;
-            return initResult;
-        }
-        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
-    }
-}
-
-NcclInitStatus getNcclInitStatus(NcclInitResult const& result, tensorrt_llm::mpi::MpiComm const& mpiComm)
-{
-    std::array<int, 2> localStatus{result.isSuccess() ? 0 : 1, result.timedOut ? 1 : 0};
-    std::array<int, 2> globalStatus{};
-    mpiComm.allreduce(
-        localStatus.data(), globalStatus.data(), 2, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MAX);
-    return {globalStatus[0] != 0, globalStatus[1] != 0};
-}
-
-bool allRanksCanUseNvlsDisableWorkaround(tensorrt_llm::mpi::MpiComm const& mpiComm)
-{
-    int const localCanDisable = canSuggestNvlsDisable() ? 1 : 0;
-    int globalCanDisable = 0;
-    mpiComm.allreduce(
-        &localCanDisable, &globalCanDisable, 1, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MIN);
-
-    return globalCanDisable != 0;
-}
-
-void checkNcclResult(ncclComm_t comm, ncclResult_t result, char const* operation)
-{
-    if (result == ncclSuccess)
-    {
-        return;
-    }
-    if (result != ncclInProgress)
-    {
-        TLLM_NCCL_CHECK(result);
-    }
-
-    while (true)
-    {
-        ncclResult_t asyncResult = ncclSuccess;
-        result = ncclCommGetAsyncError(comm, &asyncResult);
-        if (result != ncclSuccess)
-        {
-            TLLM_THROW("NCCL %s failed while polling communicator status: %s.", operation, ncclGetErrorString(result));
-        }
-        if (asyncResult == ncclSuccess)
-        {
-            return;
-        }
-        if (asyncResult != ncclInProgress)
-        {
-            TLLM_THROW("NCCL %s failed asynchronously: %s.", operation, ncclGetErrorString(asyncResult));
-        }
-        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
-    }
-}
-
-ncclUniqueId createAndBroadcastNcclId(int rank, tensorrt_llm::mpi::MpiComm const& mpiComm)
-{
-    ncclUniqueId id;
-    if (rank == 0)
-    {
-        TLLM_NCCL_CHECK(ncclGetUniqueId(&id));
-    }
-    mpiComm.bcastValue(id, 0);
-    return id;
-}
 
 ncclDataType_t toNcclType(nvinfer1::DataType dataType)
 {
@@ -230,7 +53,7 @@ void NcclCommunicator::send(
     void const* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    checkNcclResult(mComm, ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "send");
+    TLLM_NCCL_CHECK(ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -240,7 +63,7 @@ void NcclCommunicator::receive(
     void* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    checkNcclResult(mComm, ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "receive");
+    TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -250,45 +73,22 @@ ncclComm_t NcclCommunicator::createComm(int worldSize, int rank, mpi::MpiComm co
 {
 #if ENABLE_MULTI_DEVICE
 
-    setRuntimeConnectIfUnset();
-    auto const timeoutMs = getNcclCommInitTimeoutMs();
-
-    auto id = createAndBroadcastNcclId(rank, mpiComm);
-    auto initResult = initNcclCommWithTimeout(id, worldSize, rank, timeoutMs);
-    auto const initStatus = getNcclInitStatus(initResult, mpiComm);
-
-    if (initStatus.failed)
+    ncclUniqueId id;
+    if (rank == 0)
     {
-        if (initStatus.timedOut)
-        {
-            if (allRanksCanUseNvlsDisableWorkaround(mpiComm))
-            {
-                TLLM_THROW(
-                    "NCCL communicator initialization timed out after %d ms on at least one rank. This may indicate "
-                    "an NVLS multicast resource setup failure in Fabric Manager. Set NCCL_NVLS_ENABLE=0 before "
-                    "process startup and retry. TensorRT-LLM does not retry in-process because NCCL may not recover "
-                    "after a timed-out NVLS initialization.",
-                    timeoutMs);
-            }
-            TLLM_THROW(
-                "NCCL communicator initialization timed out after %d ms on at least one rank. NCCL_NVLS_ENABLE is "
-                "explicitly set, so TensorRT-LLM will not override it.",
-                timeoutMs);
-        }
-        if (!initResult.isSuccess())
-        {
-            abortNcclComm(initResult.comm);
-        }
-        mpiComm.barrier();
-        if (!initResult.isSuccess())
-        {
-            TLLM_THROW(
-                "NCCL communicator initialization failed on rank %d: %s.", rank, ncclGetErrorString(initResult.result));
-        }
-        TLLM_THROW("NCCL communicator initialization failed on at least one peer rank.");
+        ncclGetUniqueId(&id);
     }
-
-    return initResult.comm;
+    mpiComm.bcastValue(id, 0);
+    ncclComm_t comm;
+// Need static connection initialization for accurate KV cache size estimation
+#if defined(_WIN32)
+    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
+        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
+#else
+    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
+#endif // _WIN32
+    TLLM_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, id, rank));
+    return comm;
 #else
     // Python runtime requires instantiation of a communicator even though it may never be used to enable
     // pipeline parallel code-path. To enable this, have an empty communicator with uninitialized state.
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -48,8 +48,6 @@
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV1, LongBenchV2)
 
-_NCCL_NVLS_DISABLED_ENV = {"NCCL_NVLS_ENABLE": "0"}
-
 
 # Keep helper definitions below imports so new imports do not need E402
 # suppressions in this legacy test file.
@@ -77,11 +75,6 @@ def patched_start_mpi_pool(self):
                         patched_start_mpi_pool)
 
 
-def disable_nccl_nvls_for_test(mocker):
-    mocker.patch.dict(os.environ, _NCCL_NVLS_DISABLED_ENV)
-    patch_mpi_pool_session_for_env(mocker, _NCCL_NVLS_DISABLED_ENV)
-
-
 def _get_default_torch_compile_config(torch_compile):
     return TorchCompileConfig(enable_fullgraph=True,
                               enable_piecewise_cuda_graph=True,
@@ -1705,10 +1698,7 @@ def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn):
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
-                            torch_compile, mocker):
-        if pp_size > 1:
-            disable_nccl_nvls_for_test(mocker)
-
+                            torch_compile):
         if pp_size > 1 and mtp_nextn > 0:
             num_hidden_layers = 30
             pp_partition = [num_hidden_layers // pp_size + 1] * pp_size
@@ -1964,10 +1954,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile,
-                                    sampler_async_worker, mocker):
-        if pp_size > 1:
-            disable_nccl_nvls_for_test(mocker)
-
+                                    sampler_async_worker):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         pytorch_config = dict(
@@ -2403,14 +2390,12 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph,
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, low_precision_combine, tp_size,
                          pp_size, ep_size, torch_compile, mtp_nextn,
-                         moe_backend, mocker):
+                         moe_backend):
         sm_version = get_sm_version()
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
         if moe_backend == "CUTEDSL" and sm_version not in (100, 103):
             pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only")
-        if pp_size > 1:
-            disable_nccl_nvls_for_test(mocker)
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.