Handle NCCL NVLS init hangs in unwaived tests

yuxianq · yuxianq · commit 302e3db9bf04 · 2026-06-24T08:27:33.000Z
Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp b/cpp/tensorrt_llm/runtime/ncclCommunicator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,11 +23,188 @@
 #include <nccl.h>
 #endif // ENABLE_MULTI_DEVICE
 
+#include <array>
+#include <chrono>
+#include <cstdlib>
+#include <string>
+#include <thread>
+
 using namespace tensorrt_llm::runtime;
 
 namespace
 {
 #if ENABLE_MULTI_DEVICE
+constexpr int kDefaultNcclCommInitTimeoutMs = 60'000; // Used when the timeout env var is unset or not positive.
+constexpr int kNcclCommInitPollIntervalMs = 20; // Sleep between async-status polls, in milliseconds.
+constexpr char const* kNcclCommInitTimeoutEnv = "TLLM_NCCL_COMM_INIT_TIMEOUT_MS"; // Overrides the init timeout.
+constexpr char const* kNcclNvlsEnableEnv = "NCCL_NVLS_ENABLE"; // Env var inspected by canSuggestNvlsDisable().
+
+struct NcclInitResult
+{
+    ncclComm_t comm{nullptr}; // Handle written by ncclCommInitRankConfig; may still be initializing.
+    ncclResult_t result{ncclSuccess}; // Last observed status; ncclInProgress if init had not finished.
+    bool timedOut{false}; // Set only when the polling deadline passed before init settled.
+    // Success is judged from `result` alone; `timedOut` is not consulted here.
+    [[nodiscard]] bool isSuccess() const
+    {
+        return result == ncclSuccess;
+    }
+};
+
+struct NcclInitStatus
+{
+    bool failed{false}; // True if init did not succeed on at least one rank (MAX-reduced in getNcclInitStatus).
+    bool timedOut{false}; // True if init timed out on at least one rank.
+};
+
+int getNcclCommInitTimeoutMs() // Timeout override in ms; unset, non-numeric, <= 0 or beyond int range -> default.
+{
+    auto const* env = std::getenv(kNcclCommInitTimeoutEnv);
+    long const ms = env == nullptr ? 0L : std::strtol(env, nullptr, 10); // saturates; atoi overflow is UB
+    return ms > 0 && static_cast<int>(ms) == ms ? static_cast<int>(ms) : kDefaultNcclCommInitTimeoutMs;
+}
+
+bool canSuggestNvlsDisable() // True when NCCL_NVLS_ENABLE is unset or set to exactly "2".
+{
+    char const* const rawSetting = std::getenv(kNcclNvlsEnableEnv);
+    return std::string{rawSetting == nullptr ? "2" : rawSetting} == "2";
+}
+
+void setRuntimeConnectIfUnset()
+{
+    // Static connection init is needed for accurate KV cache size estimation; an existing value is kept.
+#if defined(_WIN32)
+    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr) // only set the variable when it is absent
+    {
+        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
+    }
+#else
+    setenv("NCCL_RUNTIME_CONNECT", "0", 0); // overwrite=0: keep any value already in the environment
+#endif // _WIN32
+}
+
+void abortNcclComm(ncclComm_t comm)
+{
+    // Best-effort cleanup: a null handle is a no-op, and an abort failure is logged as a warning
+    // instead of being propagated to the caller.
+    if (comm != nullptr)
+    {
+        auto const abortStatus = ncclCommAbort(comm);
+        if (abortStatus != ncclSuccess)
+        {
+            TLLM_LOG_WARNING("Failed to abort NCCL communicator: %s.", ncclGetErrorString(abortStatus));
+        }
+    }
+}
+
+NcclInitResult initNcclCommWithTimeout(ncclUniqueId const& id, int worldSize, int rank, int timeoutMs)
+{
+    // Starts a non-blocking communicator init and polls it until it settles or `timeoutMs` elapses.
+    // The returned struct carries the handle, the last observed status, and a timeout flag.
+    // On timeout the status stays ncclInProgress and the handle is returned without being aborted.
+    NcclInitResult outcome;
+    ncclConfig_t nonBlockingConfig = NCCL_CONFIG_INITIALIZER;
+    nonBlockingConfig.blocking = 0;
+
+    auto const launchStatus = ncclCommInitRankConfig(&outcome.comm, worldSize, id, rank, &nonBlockingConfig);
+    // Nothing to poll when the call already finished (success or error) or produced no handle.
+    if (launchStatus != ncclInProgress || outcome.comm == nullptr)
+    {
+        outcome.result = launchStatus;
+        return outcome;
+    }
+
+    // The deadline is measured on the monotonic clock.
+    using Clock = std::chrono::steady_clock;
+    auto const deadline = Clock::now() + std::chrono::milliseconds{timeoutMs};
+    for (;;)
+    {
+        ncclResult_t asyncState = ncclSuccess;
+        auto const queryStatus = ncclCommGetAsyncError(outcome.comm, &asyncState);
+        if (queryStatus != ncclSuccess)
+        {
+            // The status query itself failed; report that error.
+            outcome.result = queryStatus;
+            return outcome;
+        }
+        if (asyncState != ncclInProgress)
+        {
+            // Initialization settled, either successfully or with an asynchronous error.
+            outcome.result = asyncState;
+            return outcome;
+        }
+        if (Clock::now() >= deadline)
+        {
+            // Still pending at the deadline: flag the timeout and leave the status as in-progress.
+            outcome.result = ncclInProgress;
+            outcome.timedOut = true;
+            return outcome;
+        }
+        // Back off briefly before polling again.
+        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
+    }
+}
+
+NcclInitStatus getNcclInitStatus(NcclInitResult const& result, tensorrt_llm::mpi::MpiComm const& mpiComm)
+{
+    // MAX-reduce two 0/1 flags {init failed, init timed out}: a flag raised on any rank is seen by all.
+    std::array<int, 2> const own{result.isSuccess() ? 0 : 1, result.timedOut ? 1 : 0};
+    std::array<int, 2> any{};
+    mpiComm.allreduce(own.data(), any.data(), 2, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MAX);
+    return {any[0] != 0, any[1] != 0};
+}
+
+bool allRanksCanUseNvlsDisableWorkaround(tensorrt_llm::mpi::MpiComm const& mpiComm)
+{
+    // MIN-reduce a 0/1 vote so the answer is true only if every rank reports canSuggestNvlsDisable().
+    int const localVote = static_cast<int>(canSuggestNvlsDisable());
+    int reducedVote = 0;
+    mpiComm.allreduce(
+        &localVote, &reducedVote, 1, tensorrt_llm::mpi::MpiType::kINT32, tensorrt_llm::mpi::MpiOp::MIN);
+    return reducedVote != 0;
+}
+
+void checkNcclResult(ncclComm_t comm, ncclResult_t result, char const* operation)
+{
+    // Blocks until a possibly non-blocking NCCL call settles (no deadline); `operation` labels error text.
+    if (result == ncclSuccess)
+    {
+        return;
+    }
+    if (result != ncclInProgress)
+    {
+        TLLM_NCCL_CHECK(result); // immediate hard error (presumably throws -- confirm against the macro)
+    }
+    for (;;)
+    {
+        ncclResult_t pending = ncclSuccess;
+        auto const rc = ncclCommGetAsyncError(comm, &pending);
+        if (rc != ncclSuccess)
+        {
+            TLLM_THROW("NCCL %s failed while polling communicator status: %s.", operation, ncclGetErrorString(rc));
+        }
+        if (pending == ncclSuccess)
+        {
+            return;
+        }
+        if (pending != ncclInProgress)
+        {
+            TLLM_THROW("NCCL %s failed asynchronously: %s.", operation, ncclGetErrorString(pending));
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds{kNcclCommInitPollIntervalMs});
+    }
+}
+
+ncclUniqueId createAndBroadcastNcclId(int rank, tensorrt_llm::mpi::MpiComm const& mpiComm)
+{
+    ncclUniqueId uniqueId; // generated on rank 0 only; every rank then joins bcastValue with argument 0
+    if (0 == rank)
+    {
+        TLLM_NCCL_CHECK(ncclGetUniqueId(&uniqueId));
+    }
+    mpiComm.bcastValue(uniqueId, 0);
+    return uniqueId;
+}
 
 ncclDataType_t toNcclType(nvinfer1::DataType dataType)
 {
@@ -53,7 +230,7 @@ void NcclCommunicator::send(
     void const* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    TLLM_NCCL_CHECK(ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
+    checkNcclResult(mComm, ncclSend(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "send");
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -63,7 +240,7 @@ void NcclCommunicator::receive(
     void* sendbuff, size_t count, nvinfer1::DataType dataType, int peer, CudaStream const& stream) const
 {
 #if ENABLE_MULTI_DEVICE
-    TLLM_NCCL_CHECK(ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()));
+    checkNcclResult(mComm, ncclRecv(sendbuff, count, toNcclType(dataType), peer, mComm, stream.get()), "receive");
 #else
     TLLM_THROW("Multi device support is disabled.");
 #endif // ENABLE_MULTI_DEVICE
@@ -73,22 +250,45 @@ ncclComm_t NcclCommunicator::createComm(int worldSize, int rank, mpi::MpiComm co
 {
 #if ENABLE_MULTI_DEVICE
 
-    ncclUniqueId id;
-    if (rank == 0)
+    setRuntimeConnectIfUnset();
+    auto const timeoutMs = getNcclCommInitTimeoutMs();
+
+    auto id = createAndBroadcastNcclId(rank, mpiComm);
+    auto initResult = initNcclCommWithTimeout(id, worldSize, rank, timeoutMs);
+    auto const initStatus = getNcclInitStatus(initResult, mpiComm);
+
+    if (initStatus.failed)
     {
-        ncclGetUniqueId(&id);
+        if (initStatus.timedOut)
+        {
+            if (allRanksCanUseNvlsDisableWorkaround(mpiComm))
+            {
+                TLLM_THROW(
+                    "NCCL communicator initialization timed out after %d ms on at least one rank. This may indicate "
+                    "an NVLS multicast resource setup failure in Fabric Manager. Set NCCL_NVLS_ENABLE=0 before "
+                    "process startup and retry. TensorRT-LLM does not retry in-process because NCCL may not recover "
+                    "after a timed-out NVLS initialization.",
+                    timeoutMs);
+            }
+            TLLM_THROW(
+                "NCCL communicator initialization timed out after %d ms on at least one rank. NCCL_NVLS_ENABLE is "
+                "explicitly set, so TensorRT-LLM will not override it.",
+                timeoutMs);
+        }
+        if (!initResult.isSuccess())
+        {
+            abortNcclComm(initResult.comm);
+        }
+        mpiComm.barrier();
+        if (!initResult.isSuccess())
+        {
+            TLLM_THROW(
+                "NCCL communicator initialization failed on rank %d: %s.", rank, ncclGetErrorString(initResult.result));
+        }
+        TLLM_THROW("NCCL communicator initialization failed on at least one peer rank.");
     }
-    mpiComm.bcastValue(id, 0);
-    ncclComm_t comm;
-// Need static connection initialization for accurate KV cache size estimation
-#if defined(_WIN32)
-    if (getenv("NCCL_RUNTIME_CONNECT") == nullptr)
-        _putenv_s("NCCL_RUNTIME_CONNECT", "0");
-#else
-    setenv("NCCL_RUNTIME_CONNECT", "0", 0);
-#endif // _WIN32
-    TLLM_NCCL_CHECK(ncclCommInitRank(&comm, worldSize, id, rank));
-    return comm;
+
+    return initResult.comm;
 #else
     // Python runtime requires instantiation of a communicator even though it may never be used to enable
     // pipeline parallel code-path. To enable this, have an empty communicator with uninitialized state.
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -48,6 +48,8 @@
                             JsonModeEval, LlmapiAccuracyTestHarness,
                             LongBenchV1, LongBenchV2)
 
+_NCCL_NVLS_DISABLED_ENV = {"NCCL_NVLS_ENABLE": "0"}
+
 
 # Keep helper definitions below imports so new imports do not need E402
 # suppressions in this legacy test file.
@@ -75,6 +77,11 @@ def patched_start_mpi_pool(self):
                         patched_start_mpi_pool)
 
 
+def disable_nccl_nvls_for_test(mocker):  # applies NCCL_NVLS_ENABLE=0
+    mocker.patch.dict(os.environ, _NCCL_NVLS_DISABLED_ENV)  # this process
+    patch_mpi_pool_session_for_env(mocker, _NCCL_NVLS_DISABLED_ENV)
+
+
 def _get_default_torch_compile_config(torch_compile):
     return TorchCompileConfig(enable_fullgraph=True,
                               enable_piecewise_cuda_graph=True,
@@ -1698,7 +1705,10 @@ def test_bfloat16_4gpus_kv_cache_aware_routing(self, mtp_nextn):
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
     def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                             attention_dp, cuda_graph, overlap_scheduler,
-                            torch_compile):
+                            torch_compile, mocker):
+        if pp_size > 1:
+            disable_nccl_nvls_for_test(mocker)
+
         if pp_size > 1 and mtp_nextn > 0:
             num_hidden_layers = 30
             pp_partition = [num_hidden_layers // pp_size + 1] * pp_size
@@ -1954,7 +1964,10 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                                     fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler, torch_compile,
-                                    sampler_async_worker):
+                                    sampler_async_worker, mocker):
+        if pp_size > 1:
+            disable_nccl_nvls_for_test(mocker)
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = _get_default_torch_compile_config(torch_compile)
         pytorch_config = dict(
@@ -2390,12 +2403,14 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph,
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, low_precision_combine, tp_size,
                          pp_size, ep_size, torch_compile, mtp_nextn,
-                         moe_backend):
+                         moe_backend, mocker):
         sm_version = get_sm_version()
         if moe_backend == "TRTLLM" and sm_version in (120, 121):
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
         if moe_backend == "CUTEDSL" and sm_version not in (100, 103):
             pytest.skip(f"{moe_backend} backend supports SM 100 and 103 only")
+        if pp_size > 1:
+            disable_nccl_nvls_for_test(mocker)
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.