11/*
2- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+ * Copyright (c) 2025-2026 , NVIDIA CORPORATION. All rights reserved.
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
2727namespace
2828{
2929
30- // RAII guard for cudaMalloc — frees the pointer on destruction, logging a warning on failure.
30+ // RAII guard for cudaMalloc. Frees the pointer on destruction, logging a warning on failure.
3131struct CudaMallocGuard
3232{
3333 void * ptr{nullptr };
@@ -56,7 +56,7 @@ struct CudaMallocGuard
5656 CudaMallocGuard& operator =(CudaMallocGuard const &) = delete ;
5757};
5858
59- // RAII guard for ncclMemAlloc — frees the pointer on destruction, logging a warning on failure.
59+ // RAII guard for ncclMemAlloc. Frees the pointer on destruction, logging a warning on failure.
6060struct NcclMemGuard
6161{
6262 void * ptr{nullptr };
@@ -320,17 +320,6 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
320320 // This is cheap even if no buffers exist yet - cleanup will just return early
321321 registerBufferCleanup (comm);
322322
323- // If a previous allocateAndRegisterBuffer call collectively concluded that this comm
324- // cannot use NCCL symmetric memory, short-circuit so callers transparently fall back to
325- // regular allreduce. This avoids re-running ncclMemAlloc + the rank-sync allreduce on
326- // every autotuner trial, which would otherwise spam warnings and stress the failing path.
327- // The decision is collective (driven by an ncclAllReduce(min) inside allocateAndRegisterBuffer),
328- // so all ranks reach the same conclusion and stay in sync without further communication.
329- if (mSymmetricUnavailable .find (comm) != mSymmetricUnavailable .end ())
330- {
331- return NCCLWindowBuffer ();
332- }
333-
334323 // Check if we have an available buffer of at least the requested size for this communicator
335324 // Use best-fit: find the smallest buffer that's >= requested size
336325 auto & commBuffers = mBufferPool [comm];
@@ -355,6 +344,17 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
355344 return bestFit->buffer ;
356345 }
357346
347+ // If a previous allocateAndRegisterBuffer call collectively failed for this comm at a size
348+ // no larger than this request, do not retry the known-failing new allocation path. Smaller
349+ // requests and already-pooled buffers can still use NCCL windows.
350+ auto const failureIt = mMinSymmetricFailureSize .find (comm);
351+ if (failureIt != mMinSymmetricFailureSize .end () && size >= failureIt->second )
352+ {
353+ TLLM_LOG_DEBUG (" [NCCLUtil] Skipping NCCL window allocation for comm %p, size=%zu; known failure threshold=%zu" ,
354+ static_cast <void *>(comm), size, failureIt->second );
355+ return NCCLWindowBuffer ();
356+ }
357+
358358 // No available buffer found, avoid registration during CUDA graph capture
359359 auto stream = at::cuda::getCurrentCUDAStream ();
360360 cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
@@ -384,15 +384,38 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
384384 }
385385 else
386386 {
387- // The collective allreduce inside allocateAndRegisterBuffer agreed that at least one
388- // rank could not allocate symmetric memory. Mark this comm so future requests don't
389- // retry the failing path on every autotuner trial .
390- mSymmetricUnavailable . insert (comm);
387+ // The collective allreduce inside allocateAndRegisterBuffer agreed that this request
388+ // cannot use symmetric memory on at least one rank. Remember the smallest failing
389+ // request size so repeated too-large autotuner probes do not keep stressing this path .
390+ recordSymmetricFailureLocked (comm, size );
391391 }
392392
393393 return buffer;
394394}
395395
396+ void NCCLWindowAllocator::recordSymmetricFailureLocked (ncclComm_t comm, size_t size)
397+ {
398+ auto failureIt = mMinSymmetricFailureSize .find (comm);
399+ if (failureIt == mMinSymmetricFailureSize .end ())
400+ {
401+ mMinSymmetricFailureSize .emplace (comm, size);
402+ }
403+ else if (size < failureIt->second )
404+ {
405+ failureIt->second = size;
406+ }
407+ }
408+
409+ cudaError_t NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed (
410+ int localAllocOk, CudaGetLastErrorFunc getLastError) noexcept
411+ {
412+ if (localAllocOk == 0 )
413+ {
414+ return getLastError ();
415+ }
416+ return cudaSuccess;
417+ }
418+
396419NCCLWindowBuffer NCCLWindowAllocator::searchBuffer (ncclComm_t comm, void * ptr) const
397420{
398421 if (!comm || !ptr)
@@ -490,35 +513,37 @@ bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept
490513
491514NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer (ncclComm_t comm, size_t size, int handle)
492515{
493- // Step 1: Pre-allocate the rank-sync flag * before* ncclMemAlloc. ncclMemAlloc can fail
516+ // Step 1: Pre-allocate the rank-sync flag before ncclMemAlloc. ncclMemAlloc can fail
494517 // asymmetrically with ncclUnhandledCudaError on configurations where the symmetric/VMM path
495- // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
518+ // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
496519 // deferred this cudaMalloc until after the failure, the sticky error would propagate into
497520 // cudaMalloc, TLLM_CUDA_CHECK would throw, and the failing rank would never reach the
498- // collective ncclAllReduce(min) below — hanging every other rank that * did* succeed.
521+ // collective ncclAllReduce(min) below, hanging every other rank that did succeed.
499522 int * rankSyncFlag = nullptr ;
500523 TLLM_CUDA_CHECK (cudaMalloc (&rankSyncFlag, sizeof (int )));
501524 CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception
525+ auto stream = at::cuda::getCurrentCUDAStream ().stream ();
526+ TLLM_CUDA_CHECK (cudaMemsetAsync (rankSyncFlag, 0 , sizeof (int ), stream));
502527
503- // Step 2: Allocate symmetric memory ( per-rank, non-collective — can fail asymmetrically).
504- // If ncclMemAlloc fails, drain any sticky CUDA last- error so the subsequent cudaMemcpy and
505- // ncclAllReduce(min) observe a clean device state and the failing rank reaches the collective
506- // below on the same control path as healthy ranks.
528+ // Step 2: Allocate symmetric memory. This per-rank, non-collective call can fail
529+ // asymmetrically. When it fails, NCCL may leave a sticky CUDA error behind; clear it before
530+ // the stream-ordered flag copy and collective fallback so the failing rank still reaches
531+ // ncclAllReduce with the other ranks.
507532 void * ncclPtr = nullptr ;
508533 TLLM_NCCL_CHECK_WARN (ncclMemAlloc (&ncclPtr, size));
509534 int const localAllocOk = (ncclPtr != nullptr ) ? 1 : 0 ;
510535 NcclMemGuard ncclGuard{ncclPtr}; // frees ncclPtr on any early return or exception
511- if (!localAllocOk)
512- {
513- (void ) cudaGetLastError ();
514- }
536+ clearCudaErrorIfSymmetricAllocationFailed (localAllocOk);
515537
516- // Step 3: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang.
538+ // Step 3: ncclCommWindowRegister is collective. If any rank skips it, all other ranks hang.
517539 // Populate flag, reduce with min across ranks (0 if any rank failed), then read back.
518- // H2D failure is non-fatal: warn and continue — device flag may be stale but the allreduce
519- // must still be reached by all ranks. allreduce and D2H failures are catastrophic (throw).
520- auto stream = at::cuda::getCurrentCUDAStream ().stream ();
521- TLLM_CUDA_CHECK_WARN (cudaMemcpy (rankSyncFlag, &localAllocOk, sizeof (int ), cudaMemcpyHostToDevice));
540+ // The flag is initialized to 0, so H2D failure is non-fatal and conservatively falls back
541+ // to regular NCCL while still reaching the collective. allreduce and D2H failures throw.
542+ if (localAllocOk != 0 )
543+ {
544+ TLLM_CUDA_CHECK_WARN (
545+ cudaMemcpyAsync (rankSyncFlag, &localAllocOk, sizeof (localAllocOk), cudaMemcpyHostToDevice, stream));
546+ }
522547 TLLM_NCCL_CHECK (ncclAllReduce (rankSyncFlag, rankSyncFlag, 1 , ncclInt32, ncclMin, comm, stream));
523548 TLLM_CUDA_CHECK_WARN (cudaStreamSynchronize (stream));
524549
@@ -538,7 +563,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
538563 return NCCLWindowBuffer{}; // ncclGuard frees ncclPtr
539564 }
540565
541- // Step 4: Register with NCCL as a window ( collective — all ranks must reach this call) .
566+ // Step 4: Register with NCCL as a window. This is collective, so all ranks must reach it .
542567 // Failure here is non-fatal: warn and fall back to regular allreduce.
543568 // ncclGuard frees ncclPtr on return.
544569 ncclWindow_t window = nullptr ;
@@ -549,7 +574,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
549574 return NCCLWindowBuffer{};
550575 }
551576
552- // Step 5: Success — transfer ownership to the returned buffer.
577+ // Step 5: Success. Transfer ownership to the returned buffer.
553578 ncclGuard.release ();
554579 NCCLWindowBuffer buffer{ncclPtr, handle, size, window};
555580 TLLM_LOG_TRACE (" [NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p" ,
@@ -622,7 +647,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
622647 {
623648 // No buffers to clean up, but mark as cleaned
624649 mRegisteredComms .erase (comm);
625- mSymmetricUnavailable .erase (comm);
650+ mMinSymmetricFailureSize .erase (comm);
626651 return ;
627652 }
628653
@@ -698,7 +723,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
698723
699724 mBufferPool .erase (commIt);
700725 mRegisteredComms .erase (comm);
701- mSymmetricUnavailable .erase (comm);
726+ mMinSymmetricFailureSize .erase (comm);
702727}
703728
704729#endif // NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)
0 commit comments