From 780a2b744c11737d6f58a7d0448154956ecf42a0 Mon Sep 17 00:00:00 2001 From: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com> Date: Tue, 19 May 2026 04:55:34 -0700 Subject: [PATCH 1/4] [nvbugs/6062416][fix] Reorder NCCLWindowAllocator rank-sync to survive sticky ncclMemAlloc errors NCCLWindowAllocator::allocateAndRegisterBuffer used a per-rank cudaMalloc'd flag + ncclAllReduce(min) to make a per-rank ncclMemAlloc failure non-hanging. The intent was that all ranks reach the collective allreduce regardless of ncclMemAlloc's outcome, so the failing rank can signal "abort" instead of silently skipping the collective. However the cudaMalloc for the rank-sync flag ran AFTER ncclMemAlloc. When ncclMemAlloc returns ncclUnhandledCudaError (the symptom in the MGMN Llama-3.3-70B TP=2 run), the CUDA runtime can be left with a sticky last-error. The next runtime CUDA call (cudaMalloc for the rank-sync flag) then inherits that error, TLLM_CUDA_CHECK throws, and the failing rank unwinds out of allocateAndRegisterBuffer without ever reaching the collective ncclAllReduce(min). Healthy ranks proceed into the collective and block, producing the hang the bug reports. Fix: - Pre-allocate rankSyncFlag before ncclMemAlloc, so the flag is always ready on every rank irrespective of what ncclMemAlloc does. - After a failed ncclMemAlloc, drain any sticky CUDA last-error via cudaGetLastError() so the subsequent cudaMemcpy/ncclAllReduce on the CUDA stream see a clean device state. Both changes are confined to allocateAndRegisterBuffer. On the success path they are no-ops; on the failure path they keep the failing rank on the same collective control flow as healthy ranks, restoring the graceful fallback to plain NCCL allreduce that PR #12125 originally introduced. Signed-off-by: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com> --- cpp/tensorrt_llm/common/ncclUtils.cpp | 29 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp index 243cc7b2884..a1deb6a35e7 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.cpp +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -562,20 +562,31 @@ bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle) { - // Step 1: Allocate symmetric memory (per-rank, non-collective — can fail asymmetrically). + // Step 1: Pre-allocate the rank-sync flag *before* ncclMemAlloc. ncclMemAlloc can fail + // asymmetrically with ncclUnhandledCudaError on configurations where the symmetric/VMM path + // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we + // deferred this cudaMalloc until after the failure, the sticky error would propagate into + // cudaMalloc, TLLM_CUDA_CHECK would throw, and the failing rank would never reach the + // collective ncclAllReduce(min) below — hanging every other rank that *did* succeed. + int* rankSyncFlag = nullptr; + TLLM_CUDA_CHECK(cudaMalloc(&rankSyncFlag, sizeof(int))); + CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception + + // Step 2: Allocate symmetric memory (per-rank, non-collective — can fail asymmetrically). + // If ncclMemAlloc fails, drain any sticky CUDA last-error so the subsequent cudaMemcpy and + // ncclAllReduce(min) observe a clean device state and the failing rank reaches the collective + // below on the same control path as healthy ranks. void* ncclPtr = nullptr; TLLM_NCCL_CHECK_WARN(ncclMemAlloc(&ncclPtr, size)); int const localAllocOk = (ncclPtr != nullptr) ? 1 : 0; NcclMemGuard ncclGuard{ncclPtr}; // frees ncclPtr on any early return or exception + if (!localAllocOk) + { + (void) cudaGetLastError(); + } - // Step 2: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang. - // Synchronize the per-rank alloc status using a small cudaMalloc flag (not ncclMemAlloc, so - // OOM on symmetric memory does not prevent us from allocating the flag). - int* rankSyncFlag = nullptr; - TLLM_CUDA_CHECK(cudaMalloc(&rankSyncFlag, sizeof(int))); - CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception - - // Step 3: Populate flag, reduce with min across ranks (0 if any rank failed), then read back. + // Step 3: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang. + // Populate flag, reduce with min across ranks (0 if any rank failed), then read back. // H2D failure is non-fatal: warn and continue — device flag may be stale but the allreduce // must still be reached by all ranks. allreduce and D2H failures are catastrophic (throw). auto stream = at::cuda::getCurrentCUDAStream().stream(); From 352495da24674ec35a788d8f9786826955ca3788 Mon Sep 17 00:00:00 2001 From: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com> Date: Tue, 19 May 2026 06:40:22 -0700 Subject: [PATCH 2/4] [nvbugs/6062416][fix] Skip caching invalid window buffers when ncclMemAlloc fallback fires When ncclMemAlloc fails on any rank, allocateAndRegisterBuffer collectively agrees on the failure and returns an empty NCCLWindowBuffer so all ranks fall back to plain NCCL. Previously requestBuffer pushed that empty buffer back into the per-comm pool with inUse=true; releaseBuffer is a no-op for a nullptr pointer, so the entry stayed pinned forever. Repeated calls (e.g., autotuner warmup steps) accumulated stale, permanently "in use" entries. Only cache valid buffers. This complements the rank-sync ordering fix in allocateAndRegisterBuffer. Signed-off-by: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com> --- cpp/tensorrt_llm/common/ncclUtils.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp index a1deb6a35e7..02e387f6608 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.cpp +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -460,7 +460,13 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size "[NCCLUtil] Allocating new NCCL window buffer for comm %p, size=%zu", static_cast(comm), size); int handle = static_cast(commBuffers.size()); NCCLWindowBuffer buffer = allocateAndRegisterBuffer(comm, size, handle); - commBuffers.push_back({buffer, true}); + // Only cache valid buffers. allocateAndRegisterBuffer returns an empty buffer when any rank + // failed ncclMemAlloc (collective fallback to plain allreduce); caching it would leak a + // permanently "in use" empty entry per request because releaseBuffer is a no-op for nullptr. + if (buffer.isValid()) + { + commBuffers.push_back({buffer, true}); + } return buffer; } From 47c83573c458718b9f00ee64fbbc3858c780c988 Mon Sep 17 00:00:00 2001 From: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com> Date: Tue, 19 May 2026 07:59:24 -0700 Subject: [PATCH 3/4] [nvbugs/6062416][fix] Cache per-comm NCCL symmetric-memory unavailability After the collective ncclAllReduce(min) inside allocateAndRegisterBuffer agrees that at least one rank could not ncclMemAlloc, all subsequent requestBuffer() calls for the same comm short-circuit to an empty NCCLWindowBuffer instead of re-running the failing ncclMemAlloc + rank-sync allreduce on every autotuner trial. This is cluster-safe because the unavailable decision is driven by a collective allreduce, so all ranks agree and remain in sync without further communication. The cache is invalidated when the comm is torn down. Signed-off-by: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com> --- cpp/tensorrt_llm/common/ncclUtils.cpp | 20 ++++++++++++++++++++ cpp/tensorrt_llm/common/ncclUtils.h | 6 ++++++ 2 files changed, 26 insertions(+) diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp index 02e387f6608..5837451cf7f 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.cpp +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -416,6 +416,17 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size // This is cheap even if no buffers exist yet - cleanup will just return early registerBufferCleanup(comm); + // If a previous allocateAndRegisterBuffer call collectively concluded that this comm + // cannot use NCCL symmetric memory, short-circuit so callers transparently fall back to + // regular allreduce. This avoids re-running ncclMemAlloc + the rank-sync allreduce on + // every autotuner trial, which would otherwise spam warnings and stress the failing path. + // The decision is collective (driven by an ncclAllReduce(min) inside allocateAndRegisterBuffer), + // so all ranks reach the same conclusion and stay in sync without further communication. + if (mSymmetricUnavailable.find(comm) != mSymmetricUnavailable.end()) + { + return NCCLWindowBuffer(); + } + // Check if we have an available buffer of at least the requested size for this communicator // Use best-fit: find the smallest buffer that's >= requested size auto& commBuffers = mBufferPool[comm]; @@ -467,6 +478,13 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size { commBuffers.push_back({buffer, true}); } + else + { + // The collective allreduce inside allocateAndRegisterBuffer agreed that at least one + // rank could not allocate symmetric memory. Mark this comm so future requests don't + // retry the failing path on every autotuner trial. + mSymmetricUnavailable.insert(comm); + } return buffer; } @@ -700,6 +718,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept { // No buffers to clean up, but mark as cleaned mRegisteredComms.erase(comm); + mSymmetricUnavailable.erase(comm); return; } @@ -775,6 +794,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept mBufferPool.erase(commIt); mRegisteredComms.erase(comm); + mSymmetricUnavailable.erase(comm); } #endif // NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0) diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h index 672613f32e4..f99e4aef564 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.h +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -282,6 +282,12 @@ class NCCLWindowAllocator mutable std::mutex mMutex; std::unordered_map> mBufferPool; std::unordered_set mRegisteredComms; + // Comms whose symmetric memory path is known to fail collectively (e.g. H100 PCIe without + // NVLink fabric where ncclMemAlloc returns ncclUnhandledCudaError on at least one rank). + // Once recorded, subsequent requestBuffer() calls short-circuit to NCCLWindowBuffer{} so we + // don't repeatedly trigger the warning, the rank-sync allreduce, and the sticky-error drain + // for every autotuner trial. + std::unordered_set mSymmetricUnavailable; }; // RAII wrapper for NCCL window buffers From bcccd8d8ff40d8316cc2c0b10042aa1d58c16b23 Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Wed, 24 Jun 2026 06:56:50 -0700 Subject: [PATCH 4/4] [nvbugs/6062416][fix] Cache NCCL window allocation failures by size Signed-off-by: Ludwig Schneider --- cpp/tensorrt_llm/common/ncclUtils.cpp | 101 ++++++++++------ cpp/tensorrt_llm/common/ncclUtils.h | 23 ++-- .../unit_tests/multi_gpu/ncclUtilsTest.cpp | 114 +++++++++++++++++- 3 files changed, 192 insertions(+), 46 deletions(-) diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp index 5837451cf7f..e36cfddcd40 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.cpp +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ namespace { -// RAII guard for cudaMalloc — frees the pointer on destruction, logging a warning on failure. +// RAII guard for cudaMalloc. Frees the pointer on destruction, logging a warning on failure. struct CudaMallocGuard { void* ptr{nullptr}; @@ -56,7 +56,7 @@ struct CudaMallocGuard CudaMallocGuard& operator=(CudaMallocGuard const&) = delete; }; -// RAII guard for ncclMemAlloc — frees the pointer on destruction, logging a warning on failure. +// RAII guard for ncclMemAlloc. Frees the pointer on destruction, logging a warning on failure. struct NcclMemGuard { void* ptr{nullptr}; @@ -416,17 +416,6 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size // This is cheap even if no buffers exist yet - cleanup will just return early registerBufferCleanup(comm); - // If a previous allocateAndRegisterBuffer call collectively concluded that this comm - // cannot use NCCL symmetric memory, short-circuit so callers transparently fall back to - // regular allreduce. This avoids re-running ncclMemAlloc + the rank-sync allreduce on - // every autotuner trial, which would otherwise spam warnings and stress the failing path. - // The decision is collective (driven by an ncclAllReduce(min) inside allocateAndRegisterBuffer), - // so all ranks reach the same conclusion and stay in sync without further communication. - if (mSymmetricUnavailable.find(comm) != mSymmetricUnavailable.end()) - { - return NCCLWindowBuffer(); - } - // Check if we have an available buffer of at least the requested size for this communicator // Use best-fit: find the smallest buffer that's >= requested size auto& commBuffers = mBufferPool[comm]; @@ -451,6 +440,17 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size return bestFit->buffer; } + // If a previous allocateAndRegisterBuffer call collectively failed for this comm at a size + // no larger than this request, do not retry the known-failing new allocation path. Smaller + // requests and already-pooled buffers can still use NCCL windows. + auto const failureIt = mMinSymmetricFailureSize.find(comm); + if (failureIt != mMinSymmetricFailureSize.end() && size >= failureIt->second) + { + TLLM_LOG_DEBUG("[NCCLUtil] Skipping NCCL window allocation for comm %p, size=%zu; known failure threshold=%zu", + static_cast(comm), size, failureIt->second); + return NCCLWindowBuffer(); + } + // No available buffer found, avoid registration during CUDA graph capture auto stream = at::cuda::getCurrentCUDAStream(); cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone; @@ -480,15 +480,38 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size } else { - // The collective allreduce inside allocateAndRegisterBuffer agreed that at least one - // rank could not allocate symmetric memory. Mark this comm so future requests don't - // retry the failing path on every autotuner trial. - mSymmetricUnavailable.insert(comm); + // The collective allreduce inside allocateAndRegisterBuffer agreed that this request + // cannot use symmetric memory on at least one rank. Remember the smallest failing + // request size so repeated too-large autotuner probes do not keep stressing this path. + recordSymmetricFailureLocked(comm, size); } return buffer; } +void NCCLWindowAllocator::recordSymmetricFailureLocked(ncclComm_t comm, size_t size) +{ + auto failureIt = mMinSymmetricFailureSize.find(comm); + if (failureIt == mMinSymmetricFailureSize.end()) + { + mMinSymmetricFailureSize.emplace(comm, size); + } + else if (size < failureIt->second) + { + failureIt->second = size; + } +} + +cudaError_t NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed( + int localAllocOk, CudaGetLastErrorFunc getLastError) noexcept +{ + if (localAllocOk == 0) + { + return getLastError(); + } + return cudaSuccess; +} + NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const { if (!comm || !ptr) @@ -586,35 +609,37 @@ bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle) { - // Step 1: Pre-allocate the rank-sync flag *before* ncclMemAlloc. ncclMemAlloc can fail + // Step 1: Pre-allocate the rank-sync flag before ncclMemAlloc. ncclMemAlloc can fail // asymmetrically with ncclUnhandledCudaError on configurations where the symmetric/VMM path - // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we + // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we // deferred this cudaMalloc until after the failure, the sticky error would propagate into // cudaMalloc, TLLM_CUDA_CHECK would throw, and the failing rank would never reach the - // collective ncclAllReduce(min) below — hanging every other rank that *did* succeed. + // collective ncclAllReduce(min) below, hanging every other rank that did succeed. int* rankSyncFlag = nullptr; TLLM_CUDA_CHECK(cudaMalloc(&rankSyncFlag, sizeof(int))); CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception + auto stream = at::cuda::getCurrentCUDAStream().stream(); + TLLM_CUDA_CHECK(cudaMemsetAsync(rankSyncFlag, 0, sizeof(int), stream)); - // Step 2: Allocate symmetric memory (per-rank, non-collective — can fail asymmetrically). - // If ncclMemAlloc fails, drain any sticky CUDA last-error so the subsequent cudaMemcpy and - // ncclAllReduce(min) observe a clean device state and the failing rank reaches the collective - // below on the same control path as healthy ranks. + // Step 2: Allocate symmetric memory. This per-rank, non-collective call can fail + // asymmetrically. When it fails, NCCL may leave a sticky CUDA error behind; clear it before + // the stream-ordered flag copy and collective fallback so the failing rank still reaches + // ncclAllReduce with the other ranks. void* ncclPtr = nullptr; TLLM_NCCL_CHECK_WARN(ncclMemAlloc(&ncclPtr, size)); int const localAllocOk = (ncclPtr != nullptr) ? 1 : 0; NcclMemGuard ncclGuard{ncclPtr}; // frees ncclPtr on any early return or exception - if (!localAllocOk) - { - (void) cudaGetLastError(); - } + clearCudaErrorIfSymmetricAllocationFailed(localAllocOk); - // Step 3: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang. + // Step 3: ncclCommWindowRegister is collective. If any rank skips it, all other ranks hang. // Populate flag, reduce with min across ranks (0 if any rank failed), then read back. - // H2D failure is non-fatal: warn and continue — device flag may be stale but the allreduce - // must still be reached by all ranks. allreduce and D2H failures are catastrophic (throw). - auto stream = at::cuda::getCurrentCUDAStream().stream(); - TLLM_CUDA_CHECK_WARN(cudaMemcpy(rankSyncFlag, &localAllocOk, sizeof(int), cudaMemcpyHostToDevice)); + // The flag is initialized to 0, so H2D failure is non-fatal and conservatively falls back + // to regular NCCL while still reaching the collective. allreduce and D2H failures throw. + if (localAllocOk != 0) + { + TLLM_CUDA_CHECK_WARN( + cudaMemcpyAsync(rankSyncFlag, &localAllocOk, sizeof(localAllocOk), cudaMemcpyHostToDevice, stream)); + } TLLM_NCCL_CHECK(ncclAllReduce(rankSyncFlag, rankSyncFlag, 1, ncclInt32, ncclMin, comm, stream)); TLLM_CUDA_CHECK_WARN(cudaStreamSynchronize(stream)); @@ -634,7 +659,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, return NCCLWindowBuffer{}; // ncclGuard frees ncclPtr } - // Step 4: Register with NCCL as a window (collective — all ranks must reach this call). + // Step 4: Register with NCCL as a window. This is collective, so all ranks must reach it. // Failure here is non-fatal: warn and fall back to regular allreduce. // ncclGuard frees ncclPtr on return. ncclWindow_t window = nullptr; @@ -645,7 +670,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, return NCCLWindowBuffer{}; } - // Step 5: Success — transfer ownership to the returned buffer. + // Step 5: Success. Transfer ownership to the returned buffer. ncclGuard.release(); NCCLWindowBuffer buffer{ncclPtr, handle, size, window}; TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p", @@ -718,7 +743,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept { // No buffers to clean up, but mark as cleaned mRegisteredComms.erase(comm); - mSymmetricUnavailable.erase(comm); + mMinSymmetricFailureSize.erase(comm); return; } @@ -794,7 +819,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept mBufferPool.erase(commIt); mRegisteredComms.erase(comm); - mSymmetricUnavailable.erase(comm); + mMinSymmetricFailureSize.erase(comm); } #endif // NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0) diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h index f99e4aef564..fdc88f1aaab 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.h +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -258,12 +258,23 @@ class NCCLWindowAllocator NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete; private: + friend class NCCLWindowAllocatorTestAccess; + NCCLWindowAllocator() = default; ~NCCLWindowAllocator() = default; // Allocate a new buffer and register it with NCCL as a window NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle); + // Record a failed new symmetric allocation (assumes mMutex is already locked). + void recordSymmetricFailureLocked(ncclComm_t comm, size_t size); + + using CudaGetLastErrorFunc = cudaError_t (*)(); + + // Drain the sticky CUDA error left by a failed symmetric allocation. + static cudaError_t clearCudaErrorIfSymmetricAllocationFailed( + int localAllocOk, CudaGetLastErrorFunc getLastError = cudaGetLastError) noexcept; + // Search for a buffer by pointer (assumes mMutex is already locked) NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const; @@ -282,12 +293,10 @@ class NCCLWindowAllocator mutable std::mutex mMutex; std::unordered_map> mBufferPool; std::unordered_set mRegisteredComms; - // Comms whose symmetric memory path is known to fail collectively (e.g. H100 PCIe without - // NVLink fabric where ncclMemAlloc returns ncclUnhandledCudaError on at least one rank). - // Once recorded, subsequent requestBuffer() calls short-circuit to NCCLWindowBuffer{} so we - // don't repeatedly trigger the warning, the rank-sync allreduce, and the sticky-error drain - // for every autotuner trial. - std::unordered_set mSymmetricUnavailable; + // Smallest request size that is known to fail collectively for each communicator. + // Requests below the recorded size may still succeed and already-pooled buffers are always + // reused before consulting this cache. + std::unordered_map mMinSymmetricFailureSize; }; // RAII wrapper for NCCL window buffers diff --git a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp index 2dca1418ecf..4f3309431a2 100644 --- a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include "tensorrt_llm/runtime/utils/mpiUtils.h" #include +#include #include #include #include @@ -38,6 +39,36 @@ namespace nccl_util = tensorrt_llm::common::nccl_util; using tensorrt_llm::getComm; +namespace tensorrt_llm::common::nccl_util +{ +class NCCLWindowAllocatorTestAccess +{ +public: + static void recordSymmetricFailure(NCCLWindowAllocator& allocator, ncclComm_t comm, size_t size) + { + std::lock_guard lock(allocator.mMutex); + allocator.recordSymmetricFailureLocked(comm, size); + } + + static cudaError_t clearCudaErrorIfSymmetricAllocationFailed( + int localAllocOk, NCCLWindowAllocator::CudaGetLastErrorFunc getLastError = cudaGetLastError) + { + return NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed(localAllocOk, getLastError); + } +}; +} // namespace tensorrt_llm::common::nccl_util + +namespace +{ +int gCudaGetLastErrorCallCount = 0; + +cudaError_t fakeCudaGetLastError() +{ + ++gCudaGetLastErrorCallCount; + return cudaErrorLaunchFailure; +} +} // namespace + TEST(NCCLWindowSupportTest, RuntimeVersionAndGB10Gate) { #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0) @@ -337,6 +368,87 @@ TEST_F(NCCLWindowAllocatorTest, BestFitReuse) allocator.releaseBuffer(*mComm, buffer768KB.ptr); } +TEST_F(NCCLWindowAllocatorTest, FailureCacheIsSizeAwareForNewAllocations) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + auto testComm = createSplitComm(*mComm, 0, mRank); + + constexpr size_t failureSize = 1024 * 1024; + nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, failureSize); + + auto smallBuffer = allocator.requestBuffer(*testComm, failureSize / 2); + ASSERT_TRUE(smallBuffer.isValid()); + EXPECT_EQ(allocator.getBufferCount(*testComm), 1); + + auto failedBuffer = allocator.requestBuffer(*testComm, failureSize); + EXPECT_FALSE(failedBuffer.isValid()); + EXPECT_EQ(allocator.getBufferCount(*testComm), 1); + + allocator.releaseBuffer(*testComm, smallBuffer.ptr); + testComm.reset(); +} + +TEST_F(NCCLWindowAllocatorTest, FailureCacheDoesNotDisableReusableBuffers) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + auto testComm = createSplitComm(*mComm, 0, mRank); + + auto buffer1MB = allocator.requestBuffer(*testComm, 1024 * 1024); + ASSERT_TRUE(buffer1MB.isValid()); + void* ptr1MB = buffer1MB.ptr; + allocator.releaseBuffer(*testComm, ptr1MB); + + nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 512 * 1024); + + auto reusedBuffer = allocator.requestBuffer(*testComm, 768 * 1024); + ASSERT_TRUE(reusedBuffer.isValid()); + EXPECT_EQ(reusedBuffer.ptr, ptr1MB); + EXPECT_EQ(allocator.getBufferCount(*testComm), 1); + allocator.releaseBuffer(*testComm, reusedBuffer.ptr); + + auto failedBuffer = allocator.requestBuffer(*testComm, 2 * 1024 * 1024); + EXPECT_FALSE(failedBuffer.isValid()); + EXPECT_EQ(allocator.getBufferCount(*testComm), 1); + + testComm.reset(); +} + +TEST_F(NCCLWindowAllocatorTest, FailureCacheKeepsSmallestFailureSize) +{ + auto& allocator = nccl_util::NCCLWindowAllocator::getInstance(); + auto testComm = createSplitComm(*mComm, 0, mRank); + + nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 2 * 1024 * 1024); + nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 1024 * 1024); + + auto smallBuffer = allocator.requestBuffer(*testComm, 768 * 1024); + ASSERT_TRUE(smallBuffer.isValid()); + EXPECT_EQ(allocator.getBufferCount(*testComm), 1); + + auto failedBuffer = allocator.requestBuffer(*testComm, 1536 * 1024); + EXPECT_FALSE(failedBuffer.isValid()); + EXPECT_EQ(allocator.getBufferCount(*testComm), 1); + + allocator.releaseBuffer(*testComm, smallBuffer.ptr); + testComm.reset(); +} + +TEST_F(NCCLWindowAllocatorTest, ClearsCudaErrorAfterLocalAllocationFailure) +{ + auto const clearCudaErrorIfFailed = [](int localAllocOk) + { + return nccl_util::NCCLWindowAllocatorTestAccess::clearCudaErrorIfSymmetricAllocationFailed( + localAllocOk, fakeCudaGetLastError); + }; + + gCudaGetLastErrorCallCount = 0; + EXPECT_EQ(clearCudaErrorIfFailed(1), cudaSuccess); + EXPECT_EQ(gCudaGetLastErrorCallCount, 0); + + EXPECT_EQ(clearCudaErrorIfFailed(0), cudaErrorLaunchFailure); + EXPECT_EQ(gCudaGetLastErrorCallCount, 1); +} + TEST_F(NCCLWindowAllocatorTest, MultipleBuffers) { auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();