Skip to content

Commit a79e22a

Browse files
committed
[nvbugs/6062416][fix] Cache NCCL window allocation failures by size
Signed-off-by: Ludwig Schneider <lschneider@nvidia.com>
1 parent 665b70e commit a79e22a

3 files changed

Lines changed: 192 additions & 46 deletions

File tree

cpp/tensorrt_llm/common/ncclUtils.cpp

Lines changed: 63 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@
2727
namespace
2828
{
2929

30-
// RAII guard for cudaMalloc — frees the pointer on destruction, logging a warning on failure.
30+
// RAII guard for cudaMalloc. Frees the pointer on destruction, logging a warning on failure.
3131
struct CudaMallocGuard
3232
{
3333
void* ptr{nullptr};
@@ -56,7 +56,7 @@ struct CudaMallocGuard
5656
CudaMallocGuard& operator=(CudaMallocGuard const&) = delete;
5757
};
5858

59-
// RAII guard for ncclMemAlloc — frees the pointer on destruction, logging a warning on failure.
59+
// RAII guard for ncclMemAlloc. Frees the pointer on destruction, logging a warning on failure.
6060
struct NcclMemGuard
6161
{
6262
void* ptr{nullptr};
@@ -416,17 +416,6 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
416416
// This is cheap even if no buffers exist yet - cleanup will just return early
417417
registerBufferCleanup(comm);
418418

419-
// If a previous allocateAndRegisterBuffer call collectively concluded that this comm
420-
// cannot use NCCL symmetric memory, short-circuit so callers transparently fall back to
421-
// regular allreduce. This avoids re-running ncclMemAlloc + the rank-sync allreduce on
422-
// every autotuner trial, which would otherwise spam warnings and stress the failing path.
423-
// The decision is collective (driven by an ncclAllReduce(min) inside allocateAndRegisterBuffer),
424-
// so all ranks reach the same conclusion and stay in sync without further communication.
425-
if (mSymmetricUnavailable.find(comm) != mSymmetricUnavailable.end())
426-
{
427-
return NCCLWindowBuffer();
428-
}
429-
430419
// Check if we have an available buffer of at least the requested size for this communicator
431420
// Use best-fit: find the smallest buffer that's >= requested size
432421
auto& commBuffers = mBufferPool[comm];
@@ -451,6 +440,17 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
451440
return bestFit->buffer;
452441
}
453442

443+
// If a previous allocateAndRegisterBuffer call collectively failed for this comm at a size
444+
// no larger than this request, do not retry the known-failing new allocation path. Smaller
445+
// requests and already-pooled buffers can still use NCCL windows.
446+
auto const failureIt = mMinSymmetricFailureSize.find(comm);
447+
if (failureIt != mMinSymmetricFailureSize.end() && size >= failureIt->second)
448+
{
449+
TLLM_LOG_DEBUG("[NCCLUtil] Skipping NCCL window allocation for comm %p, size=%zu; known failure threshold=%zu",
450+
static_cast<void*>(comm), size, failureIt->second);
451+
return NCCLWindowBuffer();
452+
}
453+
454454
// No available buffer found, avoid registration during CUDA graph capture
455455
auto stream = at::cuda::getCurrentCUDAStream();
456456
cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
@@ -480,15 +480,38 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
480480
}
481481
else
482482
{
483-
// The collective allreduce inside allocateAndRegisterBuffer agreed that at least one
484-
// rank could not allocate symmetric memory. Mark this comm so future requests don't
485-
// retry the failing path on every autotuner trial.
486-
mSymmetricUnavailable.insert(comm);
483+
// The collective allreduce inside allocateAndRegisterBuffer agreed that this request
484+
// cannot use symmetric memory on at least one rank. Remember the smallest failing
485+
// request size so repeated too-large autotuner probes do not keep stressing this path.
486+
recordSymmetricFailureLocked(comm, size);
487487
}
488488

489489
return buffer;
490490
}
491491

492+
void NCCLWindowAllocator::recordSymmetricFailureLocked(ncclComm_t comm, size_t size)
493+
{
494+
auto failureIt = mMinSymmetricFailureSize.find(comm);
495+
if (failureIt == mMinSymmetricFailureSize.end())
496+
{
497+
mMinSymmetricFailureSize.emplace(comm, size);
498+
}
499+
else if (size < failureIt->second)
500+
{
501+
failureIt->second = size;
502+
}
503+
}
504+
505+
cudaError_t NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed(
506+
int localAllocOk, CudaGetLastErrorFunc getLastError) noexcept
507+
{
508+
if (localAllocOk == 0)
509+
{
510+
return getLastError();
511+
}
512+
return cudaSuccess;
513+
}
514+
492515
NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const
493516
{
494517
if (!comm || !ptr)
@@ -586,35 +609,37 @@ bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept
586609

587610
NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle)
588611
{
589-
// Step 1: Pre-allocate the rank-sync flag *before* ncclMemAlloc. ncclMemAlloc can fail
612+
// Step 1: Pre-allocate the rank-sync flag before ncclMemAlloc. ncclMemAlloc can fail
590613
// asymmetrically with ncclUnhandledCudaError on configurations where the symmetric/VMM path
591-
// is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
614+
// is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
592615
// deferred this cudaMalloc until after the failure, the sticky error would propagate into
593616
// cudaMalloc, TLLM_CUDA_CHECK would throw, and the failing rank would never reach the
594-
// collective ncclAllReduce(min) belowhanging every other rank that *did* succeed.
617+
// collective ncclAllReduce(min) below, hanging every other rank that did succeed.
595618
int* rankSyncFlag = nullptr;
596619
TLLM_CUDA_CHECK(cudaMalloc(&rankSyncFlag, sizeof(int)));
597620
CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception
621+
auto stream = at::cuda::getCurrentCUDAStream().stream();
622+
TLLM_CUDA_CHECK(cudaMemsetAsync(rankSyncFlag, 0, sizeof(int), stream));
598623

599-
// Step 2: Allocate symmetric memory (per-rank, non-collective can fail asymmetrically).
600-
// If ncclMemAlloc fails, drain any sticky CUDA last-error so the subsequent cudaMemcpy and
601-
// ncclAllReduce(min) observe a clean device state and the failing rank reaches the collective
602-
// below on the same control path as healthy ranks.
624+
// Step 2: Allocate symmetric memory. This per-rank, non-collective call can fail
625+
// asymmetrically. When it fails, NCCL may leave a sticky CUDA error behind; clear it before
626+
// the stream-ordered flag copy and collective fallback so the failing rank still reaches
627+
// ncclAllReduce with the other ranks.
603628
void* ncclPtr = nullptr;
604629
TLLM_NCCL_CHECK_WARN(ncclMemAlloc(&ncclPtr, size));
605630
int const localAllocOk = (ncclPtr != nullptr) ? 1 : 0;
606631
NcclMemGuard ncclGuard{ncclPtr}; // frees ncclPtr on any early return or exception
607-
if (!localAllocOk)
608-
{
609-
(void) cudaGetLastError();
610-
}
632+
clearCudaErrorIfSymmetricAllocationFailed(localAllocOk);
611633

612-
// Step 3: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang.
634+
// Step 3: ncclCommWindowRegister is collective. If any rank skips it, all other ranks hang.
613635
// Populate flag, reduce with min across ranks (0 if any rank failed), then read back.
614-
// H2D failure is non-fatal: warn and continue — device flag may be stale but the allreduce
615-
// must still be reached by all ranks. allreduce and D2H failures are catastrophic (throw).
616-
auto stream = at::cuda::getCurrentCUDAStream().stream();
617-
TLLM_CUDA_CHECK_WARN(cudaMemcpy(rankSyncFlag, &localAllocOk, sizeof(int), cudaMemcpyHostToDevice));
636+
// The flag is initialized to 0, so H2D failure is non-fatal and conservatively falls back
637+
// to regular NCCL while still reaching the collective. allreduce and D2H failures throw.
638+
if (localAllocOk != 0)
639+
{
640+
TLLM_CUDA_CHECK_WARN(
641+
cudaMemcpyAsync(rankSyncFlag, &localAllocOk, sizeof(localAllocOk), cudaMemcpyHostToDevice, stream));
642+
}
618643
TLLM_NCCL_CHECK(ncclAllReduce(rankSyncFlag, rankSyncFlag, 1, ncclInt32, ncclMin, comm, stream));
619644
TLLM_CUDA_CHECK_WARN(cudaStreamSynchronize(stream));
620645

@@ -634,7 +659,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
634659
return NCCLWindowBuffer{}; // ncclGuard frees ncclPtr
635660
}
636661

637-
// Step 4: Register with NCCL as a window (collective all ranks must reach this call).
662+
// Step 4: Register with NCCL as a window. This is collective, so all ranks must reach it.
638663
// Failure here is non-fatal: warn and fall back to regular allreduce.
639664
// ncclGuard frees ncclPtr on return.
640665
ncclWindow_t window = nullptr;
@@ -645,7 +670,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
645670
return NCCLWindowBuffer{};
646671
}
647672

648-
// Step 5: Success — transfer ownership to the returned buffer.
673+
// Step 5: Success. Transfer ownership to the returned buffer.
649674
ncclGuard.release();
650675
NCCLWindowBuffer buffer{ncclPtr, handle, size, window};
651676
TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p",
@@ -718,7 +743,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
718743
{
719744
// No buffers to clean up, but mark as cleaned
720745
mRegisteredComms.erase(comm);
721-
mSymmetricUnavailable.erase(comm);
746+
mMinSymmetricFailureSize.erase(comm);
722747
return;
723748
}
724749

@@ -794,7 +819,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
794819

795820
mBufferPool.erase(commIt);
796821
mRegisteredComms.erase(comm);
797-
mSymmetricUnavailable.erase(comm);
822+
mMinSymmetricFailureSize.erase(comm);
798823
}
799824

800825
#endif // NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)

cpp/tensorrt_llm/common/ncclUtils.h

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -258,12 +258,23 @@ class NCCLWindowAllocator
258258
NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete;
259259

260260
private:
261+
friend class NCCLWindowAllocatorTestAccess;
262+
261263
NCCLWindowAllocator() = default;
262264
~NCCLWindowAllocator() = default;
263265

264266
// Allocate a new buffer and register it with NCCL as a window
265267
NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle);
266268

269+
// Record a failed new symmetric allocation (assumes mMutex is already locked).
270+
void recordSymmetricFailureLocked(ncclComm_t comm, size_t size);
271+
272+
using CudaGetLastErrorFunc = cudaError_t (*)();
273+
274+
// Drain the sticky CUDA error left by a failed symmetric allocation.
275+
static cudaError_t clearCudaErrorIfSymmetricAllocationFailed(
276+
int localAllocOk, CudaGetLastErrorFunc getLastError = cudaGetLastError) noexcept;
277+
267278
// Search for a buffer by pointer (assumes mMutex is already locked)
268279
NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const;
269280

@@ -282,12 +293,10 @@ class NCCLWindowAllocator
282293
mutable std::mutex mMutex;
283294
std::unordered_map<ncclComm_t, std::vector<BufferEntry>> mBufferPool;
284295
std::unordered_set<ncclComm_t> mRegisteredComms;
285-
// Comms whose symmetric memory path is known to fail collectively (e.g. H100 PCIe without
286-
// NVLink fabric where ncclMemAlloc returns ncclUnhandledCudaError on at least one rank).
287-
// Once recorded, subsequent requestBuffer() calls short-circuit to NCCLWindowBuffer{} so we
288-
// don't repeatedly trigger the warning, the rank-sync allreduce, and the sticky-error drain
289-
// for every autotuner trial.
290-
std::unordered_set<ncclComm_t> mSymmetricUnavailable;
296+
// Smallest request size that is known to fail collectively for each communicator.
297+
// Requests below the recorded size may still succeed and already-pooled buffers are always
298+
// reused before consulting this cache.
299+
std::unordered_map<ncclComm_t, size_t> mMinSymmetricFailureSize;
291300
};
292301

293302
// RAII wrapper for NCCL window buffers

cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
2222
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
2323

2424
#include <gtest/gtest.h>
25+
#include <mutex>
2526
#include <nccl.h>
2627
#include <thread>
2728
#include <vector>
@@ -38,6 +39,36 @@ namespace nccl_util = tensorrt_llm::common::nccl_util;
3839

3940
using tensorrt_llm::getComm;
4041

42+
namespace tensorrt_llm::common::nccl_util
43+
{
44+
class NCCLWindowAllocatorTestAccess
45+
{
46+
public:
47+
static void recordSymmetricFailure(NCCLWindowAllocator& allocator, ncclComm_t comm, size_t size)
48+
{
49+
std::lock_guard<std::mutex> lock(allocator.mMutex);
50+
allocator.recordSymmetricFailureLocked(comm, size);
51+
}
52+
53+
static cudaError_t clearCudaErrorIfSymmetricAllocationFailed(
54+
int localAllocOk, NCCLWindowAllocator::CudaGetLastErrorFunc getLastError = cudaGetLastError)
55+
{
56+
return NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed(localAllocOk, getLastError);
57+
}
58+
};
59+
} // namespace tensorrt_llm::common::nccl_util
60+
61+
namespace
62+
{
63+
int gCudaGetLastErrorCallCount = 0;
64+
65+
cudaError_t fakeCudaGetLastError()
66+
{
67+
++gCudaGetLastErrorCallCount;
68+
return cudaErrorLaunchFailure;
69+
}
70+
} // namespace
71+
4172
TEST(NCCLWindowSupportTest, RuntimeVersionAndGB10Gate)
4273
{
4374
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)
@@ -337,6 +368,87 @@ TEST_F(NCCLWindowAllocatorTest, BestFitReuse)
337368
allocator.releaseBuffer(*mComm, buffer768KB.ptr);
338369
}
339370

371+
TEST_F(NCCLWindowAllocatorTest, FailureCacheIsSizeAwareForNewAllocations)
372+
{
373+
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
374+
auto testComm = createSplitComm(*mComm, 0, mRank);
375+
376+
constexpr size_t failureSize = 1024 * 1024;
377+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, failureSize);
378+
379+
auto smallBuffer = allocator.requestBuffer(*testComm, failureSize / 2);
380+
ASSERT_TRUE(smallBuffer.isValid());
381+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
382+
383+
auto failedBuffer = allocator.requestBuffer(*testComm, failureSize);
384+
EXPECT_FALSE(failedBuffer.isValid());
385+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
386+
387+
allocator.releaseBuffer(*testComm, smallBuffer.ptr);
388+
testComm.reset();
389+
}
390+
391+
TEST_F(NCCLWindowAllocatorTest, FailureCacheDoesNotDisableReusableBuffers)
392+
{
393+
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
394+
auto testComm = createSplitComm(*mComm, 0, mRank);
395+
396+
auto buffer1MB = allocator.requestBuffer(*testComm, 1024 * 1024);
397+
ASSERT_TRUE(buffer1MB.isValid());
398+
void* ptr1MB = buffer1MB.ptr;
399+
allocator.releaseBuffer(*testComm, ptr1MB);
400+
401+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 512 * 1024);
402+
403+
auto reusedBuffer = allocator.requestBuffer(*testComm, 768 * 1024);
404+
ASSERT_TRUE(reusedBuffer.isValid());
405+
EXPECT_EQ(reusedBuffer.ptr, ptr1MB);
406+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
407+
allocator.releaseBuffer(*testComm, reusedBuffer.ptr);
408+
409+
auto failedBuffer = allocator.requestBuffer(*testComm, 2 * 1024 * 1024);
410+
EXPECT_FALSE(failedBuffer.isValid());
411+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
412+
413+
testComm.reset();
414+
}
415+
416+
TEST_F(NCCLWindowAllocatorTest, FailureCacheKeepsSmallestFailureSize)
417+
{
418+
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
419+
auto testComm = createSplitComm(*mComm, 0, mRank);
420+
421+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 2 * 1024 * 1024);
422+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 1024 * 1024);
423+
424+
auto smallBuffer = allocator.requestBuffer(*testComm, 768 * 1024);
425+
ASSERT_TRUE(smallBuffer.isValid());
426+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
427+
428+
auto failedBuffer = allocator.requestBuffer(*testComm, 1536 * 1024);
429+
EXPECT_FALSE(failedBuffer.isValid());
430+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
431+
432+
allocator.releaseBuffer(*testComm, smallBuffer.ptr);
433+
testComm.reset();
434+
}
435+
436+
TEST_F(NCCLWindowAllocatorTest, ClearsCudaErrorAfterLocalAllocationFailure)
437+
{
438+
auto const clearCudaErrorIfFailed = [](int localAllocOk)
439+
{
440+
return nccl_util::NCCLWindowAllocatorTestAccess::clearCudaErrorIfSymmetricAllocationFailed(
441+
localAllocOk, fakeCudaGetLastError);
442+
};
443+
444+
gCudaGetLastErrorCallCount = 0;
445+
EXPECT_EQ(clearCudaErrorIfFailed(1), cudaSuccess);
446+
EXPECT_EQ(gCudaGetLastErrorCallCount, 0);
447+
448+
EXPECT_EQ(clearCudaErrorIfFailed(0), cudaErrorLaunchFailure);
449+
EXPECT_EQ(gCudaGetLastErrorCallCount, 1);
450+
}
451+
340452
TEST_F(NCCLWindowAllocatorTest, MultipleBuffers)
341453
{
342454
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();

0 commit comments

Comments
 (0)