Skip to content

Commit 5664998

Browse files
committed
[nvbugs/6062416][fix] Cache NCCL window allocation failures by size
Signed-off-by: Ludwig Schneider <lschneider@nvidia.com>
1 parent 467abbb commit 5664998

3 files changed

Lines changed: 192 additions & 46 deletions

File tree

cpp/tensorrt_llm/common/ncclUtils.cpp

Lines changed: 63 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@
2727
namespace
2828
{
2929

30-
// RAII guard for cudaMalloc — frees the pointer on destruction, logging a warning on failure.
30+
// RAII guard for cudaMalloc. Frees the pointer on destruction, logging a warning on failure.
3131
struct CudaMallocGuard
3232
{
3333
void* ptr{nullptr};
@@ -56,7 +56,7 @@ struct CudaMallocGuard
5656
CudaMallocGuard& operator=(CudaMallocGuard const&) = delete;
5757
};
5858

59-
// RAII guard for ncclMemAlloc — frees the pointer on destruction, logging a warning on failure.
59+
// RAII guard for ncclMemAlloc. Frees the pointer on destruction, logging a warning on failure.
6060
struct NcclMemGuard
6161
{
6262
void* ptr{nullptr};
@@ -320,17 +320,6 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
320320
// This is cheap even if no buffers exist yet - cleanup will just return early
321321
registerBufferCleanup(comm);
322322

323-
// If a previous allocateAndRegisterBuffer call collectively concluded that this comm
324-
// cannot use NCCL symmetric memory, short-circuit so callers transparently fall back to
325-
// regular allreduce. This avoids re-running ncclMemAlloc + the rank-sync allreduce on
326-
// every autotuner trial, which would otherwise spam warnings and stress the failing path.
327-
// The decision is collective (driven by an ncclAllReduce(min) inside allocateAndRegisterBuffer),
328-
// so all ranks reach the same conclusion and stay in sync without further communication.
329-
if (mSymmetricUnavailable.find(comm) != mSymmetricUnavailable.end())
330-
{
331-
return NCCLWindowBuffer();
332-
}
333-
334323
// Check if we have an available buffer of at least the requested size for this communicator
335324
// Use best-fit: find the smallest buffer that's >= requested size
336325
auto& commBuffers = mBufferPool[comm];
@@ -355,6 +344,17 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
355344
return bestFit->buffer;
356345
}
357346

347+
// If a previous allocateAndRegisterBuffer call collectively failed for this comm at a size
348+
// no larger than this request, do not retry the known-failing new allocation path. Smaller
349+
// requests and already-pooled buffers can still use NCCL windows.
350+
auto const failureIt = mMinSymmetricFailureSize.find(comm);
351+
if (failureIt != mMinSymmetricFailureSize.end() && size >= failureIt->second)
352+
{
353+
TLLM_LOG_DEBUG("[NCCLUtil] Skipping NCCL window allocation for comm %p, size=%zu; known failure threshold=%zu",
354+
static_cast<void*>(comm), size, failureIt->second);
355+
return NCCLWindowBuffer();
356+
}
357+
358358
// No available buffer found, avoid registration during CUDA graph capture
359359
auto stream = at::cuda::getCurrentCUDAStream();
360360
cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
@@ -384,15 +384,38 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
384384
}
385385
else
386386
{
387-
// The collective allreduce inside allocateAndRegisterBuffer agreed that at least one
388-
// rank could not allocate symmetric memory. Mark this comm so future requests don't
389-
// retry the failing path on every autotuner trial.
390-
mSymmetricUnavailable.insert(comm);
387+
// The collective allreduce inside allocateAndRegisterBuffer agreed that this request
388+
// cannot use symmetric memory on at least one rank. Remember the smallest failing
389+
// request size so repeated too-large autotuner probes do not keep stressing this path.
390+
recordSymmetricFailureLocked(comm, size);
391391
}
392392

393393
return buffer;
394394
}
395395

396+
void NCCLWindowAllocator::recordSymmetricFailureLocked(ncclComm_t comm, size_t size)
397+
{
398+
auto failureIt = mMinSymmetricFailureSize.find(comm);
399+
if (failureIt == mMinSymmetricFailureSize.end())
400+
{
401+
mMinSymmetricFailureSize.emplace(comm, size);
402+
}
403+
else if (size < failureIt->second)
404+
{
405+
failureIt->second = size;
406+
}
407+
}
408+
409+
cudaError_t NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed(
410+
int localAllocOk, CudaGetLastErrorFunc getLastError) noexcept
411+
{
412+
if (localAllocOk == 0)
413+
{
414+
return getLastError();
415+
}
416+
return cudaSuccess;
417+
}
418+
396419
NCCLWindowBuffer NCCLWindowAllocator::searchBuffer(ncclComm_t comm, void* ptr) const
397420
{
398421
if (!comm || !ptr)
@@ -490,35 +513,37 @@ bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept
490513

491514
NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle)
492515
{
493-
// Step 1: Pre-allocate the rank-sync flag *before* ncclMemAlloc. ncclMemAlloc can fail
516+
// Step 1: Pre-allocate the rank-sync flag before ncclMemAlloc. ncclMemAlloc can fail
494517
// asymmetrically with ncclUnhandledCudaError on configurations where the symmetric/VMM path
495-
// is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
518+
// is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
496519
// deferred this cudaMalloc until after the failure, the sticky error would propagate into
497520
// cudaMalloc, TLLM_CUDA_CHECK would throw, and the failing rank would never reach the
498-
// collective ncclAllReduce(min) belowhanging every other rank that *did* succeed.
521+
// collective ncclAllReduce(min) below, hanging every other rank that did succeed.
499522
int* rankSyncFlag = nullptr;
500523
TLLM_CUDA_CHECK(cudaMalloc(&rankSyncFlag, sizeof(int)));
501524
CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception
525+
auto stream = at::cuda::getCurrentCUDAStream().stream();
526+
TLLM_CUDA_CHECK(cudaMemsetAsync(rankSyncFlag, 0, sizeof(int), stream));
502527

503-
// Step 2: Allocate symmetric memory (per-rank, non-collective can fail asymmetrically).
504-
// If ncclMemAlloc fails, drain any sticky CUDA last-error so the subsequent cudaMemcpy and
505-
// ncclAllReduce(min) observe a clean device state and the failing rank reaches the collective
506-
// below on the same control path as healthy ranks.
528+
// Step 2: Allocate symmetric memory. This per-rank, non-collective call can fail
529+
// asymmetrically. When it fails, NCCL may leave a sticky CUDA error behind; clear it before
530+
// the stream-ordered flag copy and collective fallback so the failing rank still reaches
531+
// ncclAllReduce with the other ranks.
507532
void* ncclPtr = nullptr;
508533
TLLM_NCCL_CHECK_WARN(ncclMemAlloc(&ncclPtr, size));
509534
int const localAllocOk = (ncclPtr != nullptr) ? 1 : 0;
510535
NcclMemGuard ncclGuard{ncclPtr}; // frees ncclPtr on any early return or exception
511-
if (!localAllocOk)
512-
{
513-
(void) cudaGetLastError();
514-
}
536+
clearCudaErrorIfSymmetricAllocationFailed(localAllocOk);
515537

516-
// Step 3: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang.
538+
// Step 3: ncclCommWindowRegister is collective. If any rank skips it, all other ranks hang.
517539
// Populate flag, reduce with min across ranks (0 if any rank failed), then read back.
518-
// H2D failure is non-fatal: warn and continue — device flag may be stale but the allreduce
519-
// must still be reached by all ranks. allreduce and D2H failures are catastrophic (throw).
520-
auto stream = at::cuda::getCurrentCUDAStream().stream();
521-
TLLM_CUDA_CHECK_WARN(cudaMemcpy(rankSyncFlag, &localAllocOk, sizeof(int), cudaMemcpyHostToDevice));
540+
// The flag is initialized to 0, so H2D failure is non-fatal and conservatively falls back
541+
// to regular NCCL while still reaching the collective. allreduce and D2H failures throw.
542+
if (localAllocOk != 0)
543+
{
544+
TLLM_CUDA_CHECK_WARN(
545+
cudaMemcpyAsync(rankSyncFlag, &localAllocOk, sizeof(localAllocOk), cudaMemcpyHostToDevice, stream));
546+
}
522547
TLLM_NCCL_CHECK(ncclAllReduce(rankSyncFlag, rankSyncFlag, 1, ncclInt32, ncclMin, comm, stream));
523548
TLLM_CUDA_CHECK_WARN(cudaStreamSynchronize(stream));
524549

@@ -538,7 +563,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
538563
return NCCLWindowBuffer{}; // ncclGuard frees ncclPtr
539564
}
540565

541-
// Step 4: Register with NCCL as a window (collective all ranks must reach this call).
566+
// Step 4: Register with NCCL as a window. This is collective, so all ranks must reach it.
542567
// Failure here is non-fatal: warn and fall back to regular allreduce.
543568
// ncclGuard frees ncclPtr on return.
544569
ncclWindow_t window = nullptr;
@@ -549,7 +574,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
549574
return NCCLWindowBuffer{};
550575
}
551576

552-
// Step 5: Success — transfer ownership to the returned buffer.
577+
// Step 5: Success. Transfer ownership to the returned buffer.
553578
ncclGuard.release();
554579
NCCLWindowBuffer buffer{ncclPtr, handle, size, window};
555580
TLLM_LOG_TRACE("[NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p",
@@ -622,7 +647,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
622647
{
623648
// No buffers to clean up, but mark as cleaned
624649
mRegisteredComms.erase(comm);
625-
mSymmetricUnavailable.erase(comm);
650+
mMinSymmetricFailureSize.erase(comm);
626651
return;
627652
}
628653

@@ -698,7 +723,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
698723

699724
mBufferPool.erase(commIt);
700725
mRegisteredComms.erase(comm);
701-
mSymmetricUnavailable.erase(comm);
726+
mMinSymmetricFailureSize.erase(comm);
702727
}
703728

704729
#endif // NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)

cpp/tensorrt_llm/common/ncclUtils.h

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -265,12 +265,23 @@ class NCCLWindowAllocator
265265
NCCLWindowAllocator& operator=(NCCLWindowAllocator&&) = delete;
266266

267267
private:
268+
friend class NCCLWindowAllocatorTestAccess;
269+
268270
NCCLWindowAllocator() = default;
269271
~NCCLWindowAllocator() = default;
270272

271273
// Allocate a new buffer and register it with NCCL as a window
272274
NCCLWindowBuffer allocateAndRegisterBuffer(ncclComm_t comm, size_t size, int handle);
273275

276+
// Record a failed new symmetric allocation (assumes mMutex is already locked).
277+
void recordSymmetricFailureLocked(ncclComm_t comm, size_t size);
278+
279+
using CudaGetLastErrorFunc = cudaError_t (*)();
280+
281+
// Drain the sticky CUDA error left by a failed symmetric allocation.
282+
static cudaError_t clearCudaErrorIfSymmetricAllocationFailed(
283+
int localAllocOk, CudaGetLastErrorFunc getLastError = cudaGetLastError) noexcept;
284+
274285
// Search for a buffer by pointer (assumes mMutex is already locked)
275286
NCCLWindowBuffer searchBufferLocked(ncclComm_t comm, void* ptr) const;
276287

@@ -289,12 +300,10 @@ class NCCLWindowAllocator
289300
mutable std::mutex mMutex;
290301
std::unordered_map<ncclComm_t, std::vector<BufferEntry>> mBufferPool;
291302
std::unordered_set<ncclComm_t> mRegisteredComms;
292-
// Comms whose symmetric memory path is known to fail collectively (e.g. H100 PCIe without
293-
// NVLink fabric where ncclMemAlloc returns ncclUnhandledCudaError on at least one rank).
294-
// Once recorded, subsequent requestBuffer() calls short-circuit to NCCLWindowBuffer{} so we
295-
// don't repeatedly trigger the warning, the rank-sync allreduce, and the sticky-error drain
296-
// for every autotuner trial.
297-
std::unordered_set<ncclComm_t> mSymmetricUnavailable;
303+
// Smallest request size that is known to fail collectively for each communicator.
304+
// Requests below the recorded size may still succeed and already-pooled buffers are always
305+
// reused before consulting this cache.
306+
std::unordered_map<ncclComm_t, size_t> mMinSymmetricFailureSize;
298307
};
299308

300309
// RAII wrapper for NCCL window buffers

cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
2222
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
2323

2424
#include <gtest/gtest.h>
25+
#include <mutex>
2526
#include <nccl.h>
2627
#include <thread>
2728
#include <vector>
@@ -38,6 +39,36 @@ namespace nccl_util = tensorrt_llm::common::nccl_util;
3839

3940
using tensorrt_llm::getComm;
4041

42+
namespace tensorrt_llm::common::nccl_util
43+
{
44+
class NCCLWindowAllocatorTestAccess
45+
{
46+
public:
47+
static void recordSymmetricFailure(NCCLWindowAllocator& allocator, ncclComm_t comm, size_t size)
48+
{
49+
std::lock_guard<std::mutex> lock(allocator.mMutex);
50+
allocator.recordSymmetricFailureLocked(comm, size);
51+
}
52+
53+
static cudaError_t clearCudaErrorIfSymmetricAllocationFailed(
54+
int localAllocOk, NCCLWindowAllocator::CudaGetLastErrorFunc getLastError = cudaGetLastError)
55+
{
56+
return NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed(localAllocOk, getLastError);
57+
}
58+
};
59+
} // namespace tensorrt_llm::common::nccl_util
60+
61+
namespace
62+
{
63+
int gCudaGetLastErrorCallCount = 0;
64+
65+
cudaError_t fakeCudaGetLastError()
66+
{
67+
++gCudaGetLastErrorCallCount;
68+
return cudaErrorLaunchFailure;
69+
}
70+
} // namespace
71+
4172
// Helper function to create a split communicator for testing
4273
// This allows us to test cleanup behavior explicitly by controlling the lifetime
4374
std::shared_ptr<ncclComm_t> createSplitComm(ncclComm_t parentComm, int color, int key)
@@ -321,6 +352,87 @@ TEST_F(NCCLWindowAllocatorTest, BestFitReuse)
321352
allocator.releaseBuffer(*mComm, buffer768KB.ptr);
322353
}
323354

355+
TEST_F(NCCLWindowAllocatorTest, FailureCacheIsSizeAwareForNewAllocations)
356+
{
357+
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
358+
auto testComm = createSplitComm(*mComm, 0, mRank);
359+
360+
constexpr size_t failureSize = 1024 * 1024;
361+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, failureSize);
362+
363+
auto smallBuffer = allocator.requestBuffer(*testComm, failureSize / 2);
364+
ASSERT_TRUE(smallBuffer.isValid());
365+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
366+
367+
auto failedBuffer = allocator.requestBuffer(*testComm, failureSize);
368+
EXPECT_FALSE(failedBuffer.isValid());
369+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
370+
371+
allocator.releaseBuffer(*testComm, smallBuffer.ptr);
372+
testComm.reset();
373+
}
374+
375+
TEST_F(NCCLWindowAllocatorTest, FailureCacheDoesNotDisableReusableBuffers)
376+
{
377+
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
378+
auto testComm = createSplitComm(*mComm, 0, mRank);
379+
380+
auto buffer1MB = allocator.requestBuffer(*testComm, 1024 * 1024);
381+
ASSERT_TRUE(buffer1MB.isValid());
382+
void* ptr1MB = buffer1MB.ptr;
383+
allocator.releaseBuffer(*testComm, ptr1MB);
384+
385+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 512 * 1024);
386+
387+
auto reusedBuffer = allocator.requestBuffer(*testComm, 768 * 1024);
388+
ASSERT_TRUE(reusedBuffer.isValid());
389+
EXPECT_EQ(reusedBuffer.ptr, ptr1MB);
390+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
391+
allocator.releaseBuffer(*testComm, reusedBuffer.ptr);
392+
393+
auto failedBuffer = allocator.requestBuffer(*testComm, 2 * 1024 * 1024);
394+
EXPECT_FALSE(failedBuffer.isValid());
395+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
396+
397+
testComm.reset();
398+
}
399+
400+
TEST_F(NCCLWindowAllocatorTest, FailureCacheKeepsSmallestFailureSize)
401+
{
402+
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();
403+
auto testComm = createSplitComm(*mComm, 0, mRank);
404+
405+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 2 * 1024 * 1024);
406+
nccl_util::NCCLWindowAllocatorTestAccess::recordSymmetricFailure(allocator, *testComm, 1024 * 1024);
407+
408+
auto smallBuffer = allocator.requestBuffer(*testComm, 768 * 1024);
409+
ASSERT_TRUE(smallBuffer.isValid());
410+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
411+
412+
auto failedBuffer = allocator.requestBuffer(*testComm, 1536 * 1024);
413+
EXPECT_FALSE(failedBuffer.isValid());
414+
EXPECT_EQ(allocator.getBufferCount(*testComm), 1);
415+
416+
allocator.releaseBuffer(*testComm, smallBuffer.ptr);
417+
testComm.reset();
418+
}
419+
420+
TEST_F(NCCLWindowAllocatorTest, ClearsCudaErrorAfterLocalAllocationFailure)
421+
{
422+
auto const clearCudaErrorIfFailed = [](int localAllocOk)
423+
{
424+
return nccl_util::NCCLWindowAllocatorTestAccess::clearCudaErrorIfSymmetricAllocationFailed(
425+
localAllocOk, fakeCudaGetLastError);
426+
};
427+
428+
gCudaGetLastErrorCallCount = 0;
429+
EXPECT_EQ(clearCudaErrorIfFailed(1), cudaSuccess);
430+
EXPECT_EQ(gCudaGetLastErrorCallCount, 0);
431+
432+
EXPECT_EQ(clearCudaErrorIfFailed(0), cudaErrorLaunchFailure);
433+
EXPECT_EQ(gCudaGetLastErrorCallCount, 1);
434+
}
435+
324436
TEST_F(NCCLWindowAllocatorTest, MultipleBuffers)
325437
{
326438
auto& allocator = nccl_util::NCCLWindowAllocator::getInstance();

0 commit comments

Comments
 (0)