11/*
2- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+ * Copyright (c) 2025-2026 , NVIDIA CORPORATION. All rights reserved.
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
2727namespace
2828{
2929
30- // RAII guard for cudaMalloc — frees the pointer on destruction, logging a warning on failure.
30+ // RAII guard for cudaMalloc. Frees the pointer on destruction, logging a warning on failure.
3131struct CudaMallocGuard
3232{
3333 void * ptr{nullptr };
@@ -56,7 +56,7 @@ struct CudaMallocGuard
5656 CudaMallocGuard& operator =(CudaMallocGuard const &) = delete ;
5757};
5858
59- // RAII guard for ncclMemAlloc — frees the pointer on destruction, logging a warning on failure.
59+ // RAII guard for ncclMemAlloc. Frees the pointer on destruction, logging a warning on failure.
6060struct NcclMemGuard
6161{
6262 void * ptr{nullptr };
@@ -416,17 +416,6 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
416416 // This is cheap even if no buffers exist yet - cleanup will just return early
417417 registerBufferCleanup (comm);
418418
419- // If a previous allocateAndRegisterBuffer call collectively concluded that this comm
420- // cannot use NCCL symmetric memory, short-circuit so callers transparently fall back to
421- // regular allreduce. This avoids re-running ncclMemAlloc + the rank-sync allreduce on
422- // every autotuner trial, which would otherwise spam warnings and stress the failing path.
423- // The decision is collective (driven by an ncclAllReduce(min) inside allocateAndRegisterBuffer),
424- // so all ranks reach the same conclusion and stay in sync without further communication.
425- if (mSymmetricUnavailable .find (comm) != mSymmetricUnavailable .end ())
426- {
427- return NCCLWindowBuffer ();
428- }
429-
430419 // Check if we have an available buffer of at least the requested size for this communicator
431420 // Use best-fit: find the smallest buffer that's >= requested size
432421 auto & commBuffers = mBufferPool [comm];
@@ -451,6 +440,17 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
451440 return bestFit->buffer ;
452441 }
453442
443+ // If a previous allocateAndRegisterBuffer call collectively failed for this comm at a size
444+ // no larger than this request, do not retry the known-failing new allocation path. Smaller
445+ // requests and already-pooled buffers can still use NCCL windows.
446+ auto const failureIt = mMinSymmetricFailureSize .find (comm);
447+ if (failureIt != mMinSymmetricFailureSize .end () && size >= failureIt->second )
448+ {
449+ TLLM_LOG_DEBUG (" [NCCLUtil] Skipping NCCL window allocation for comm %p, size=%zu; known failure threshold=%zu" ,
450+ static_cast <void *>(comm), size, failureIt->second );
451+ return NCCLWindowBuffer ();
452+ }
453+
454454 // No available buffer found, avoid registration during CUDA graph capture
455455 auto stream = at::cuda::getCurrentCUDAStream ();
456456 cudaStreamCaptureStatus capture_status = cudaStreamCaptureStatusNone;
@@ -480,15 +480,38 @@ NCCLWindowBuffer NCCLWindowAllocator::requestBuffer(ncclComm_t comm, size_t size
480480 }
481481 else
482482 {
483- // The collective allreduce inside allocateAndRegisterBuffer agreed that at least one
484- // rank could not allocate symmetric memory. Mark this comm so future requests don't
485- // retry the failing path on every autotuner trial .
486- mSymmetricUnavailable . insert (comm);
483+ // The collective allreduce inside allocateAndRegisterBuffer agreed that this request
484+ // cannot use symmetric memory on at least one rank. Remember the smallest failing
485+ // request size so repeated too-large autotuner probes do not keep stressing this path .
486+ recordSymmetricFailureLocked (comm, size );
487487 }
488488
489489 return buffer;
490490}
491491
492+ void NCCLWindowAllocator::recordSymmetricFailureLocked (ncclComm_t comm, size_t size)
493+ {
494+ auto failureIt = mMinSymmetricFailureSize .find (comm);
495+ if (failureIt == mMinSymmetricFailureSize .end ())
496+ {
497+ mMinSymmetricFailureSize .emplace (comm, size);
498+ }
499+ else if (size < failureIt->second )
500+ {
501+ failureIt->second = size;
502+ }
503+ }
504+
505+ cudaError_t NCCLWindowAllocator::clearCudaErrorIfSymmetricAllocationFailed (
506+ int localAllocOk, CudaGetLastErrorFunc getLastError) noexcept
507+ {
508+ if (localAllocOk == 0 )
509+ {
510+ return getLastError ();
511+ }
512+ return cudaSuccess;
513+ }
514+
492515NCCLWindowBuffer NCCLWindowAllocator::searchBuffer (ncclComm_t comm, void * ptr) const
493516{
494517 if (!comm || !ptr)
@@ -586,35 +609,37 @@ bool NCCLWindowAllocator::isCommValid(ncclComm_t comm) const noexcept
586609
587610NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer (ncclComm_t comm, size_t size, int handle)
588611{
589- // Step 1: Pre-allocate the rank-sync flag * before* ncclMemAlloc. ncclMemAlloc can fail
612+ // Step 1: Pre-allocate the rank-sync flag before ncclMemAlloc. ncclMemAlloc can fail
590613 // asymmetrically with ncclUnhandledCudaError on configurations where the symmetric/VMM path
591- // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
614+ // is unavailable; that failure may leave a sticky CUDA last-error on the device. If we
592615 // deferred this cudaMalloc until after the failure, the sticky error would propagate into
593616 // cudaMalloc, TLLM_CUDA_CHECK would throw, and the failing rank would never reach the
594- // collective ncclAllReduce(min) below — hanging every other rank that * did* succeed.
617+ // collective ncclAllReduce(min) below, hanging every other rank that did succeed.
595618 int * rankSyncFlag = nullptr ;
596619 TLLM_CUDA_CHECK (cudaMalloc (&rankSyncFlag, sizeof (int )));
597620 CudaMallocGuard flagGuard{rankSyncFlag}; // frees rankSyncFlag on any early return or exception
621+ auto stream = at::cuda::getCurrentCUDAStream ().stream ();
622+ TLLM_CUDA_CHECK (cudaMemsetAsync (rankSyncFlag, 0 , sizeof (int ), stream));
598623
599- // Step 2: Allocate symmetric memory ( per-rank, non-collective — can fail asymmetrically).
600- // If ncclMemAlloc fails, drain any sticky CUDA last- error so the subsequent cudaMemcpy and
601- // ncclAllReduce(min) observe a clean device state and the failing rank reaches the collective
602- // below on the same control path as healthy ranks.
624+ // Step 2: Allocate symmetric memory. This per-rank, non-collective call can fail
625+ // asymmetrically. When it fails, NCCL may leave a sticky CUDA error behind; clear it before
626+ // the stream-ordered flag copy and collective fallback so the failing rank still reaches
627+ // ncclAllReduce with the other ranks.
603628 void * ncclPtr = nullptr ;
604629 TLLM_NCCL_CHECK_WARN (ncclMemAlloc (&ncclPtr, size));
605630 int const localAllocOk = (ncclPtr != nullptr ) ? 1 : 0 ;
606631 NcclMemGuard ncclGuard{ncclPtr}; // frees ncclPtr on any early return or exception
607- if (!localAllocOk)
608- {
609- (void ) cudaGetLastError ();
610- }
632+ clearCudaErrorIfSymmetricAllocationFailed (localAllocOk);
611633
612- // Step 3: ncclCommWindowRegister is collective — if any rank skips it, all other ranks hang.
634+ // Step 3: ncclCommWindowRegister is collective. If any rank skips it, all other ranks hang.
613635 // Populate flag, reduce with min across ranks (0 if any rank failed), then read back.
614- // H2D failure is non-fatal: warn and continue — device flag may be stale but the allreduce
615- // must still be reached by all ranks. allreduce and D2H failures are catastrophic (throw).
616- auto stream = at::cuda::getCurrentCUDAStream ().stream ();
617- TLLM_CUDA_CHECK_WARN (cudaMemcpy (rankSyncFlag, &localAllocOk, sizeof (int ), cudaMemcpyHostToDevice));
636+ // The flag is initialized to 0, so H2D failure is non-fatal and conservatively falls back
637+ // to regular NCCL while still reaching the collective. allreduce and D2H failures throw.
638+ if (localAllocOk != 0 )
639+ {
640+ TLLM_CUDA_CHECK_WARN (
641+ cudaMemcpyAsync (rankSyncFlag, &localAllocOk, sizeof (localAllocOk), cudaMemcpyHostToDevice, stream));
642+ }
618643 TLLM_NCCL_CHECK (ncclAllReduce (rankSyncFlag, rankSyncFlag, 1 , ncclInt32, ncclMin, comm, stream));
619644 TLLM_CUDA_CHECK_WARN (cudaStreamSynchronize (stream));
620645
@@ -634,7 +659,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
634659 return NCCLWindowBuffer{}; // ncclGuard frees ncclPtr
635660 }
636661
637- // Step 4: Register with NCCL as a window ( collective — all ranks must reach this call) .
662+ // Step 4: Register with NCCL as a window. This is collective, so all ranks must reach it .
638663 // Failure here is non-fatal: warn and fall back to regular allreduce.
639664 // ncclGuard frees ncclPtr on return.
640665 ncclWindow_t window = nullptr ;
@@ -645,7 +670,7 @@ NCCLWindowBuffer NCCLWindowAllocator::allocateAndRegisterBuffer(ncclComm_t comm,
645670 return NCCLWindowBuffer{};
646671 }
647672
648- // Step 5: Success — transfer ownership to the returned buffer.
673+ // Step 5: Success. Transfer ownership to the returned buffer.
649674 ncclGuard.release ();
650675 NCCLWindowBuffer buffer{ncclPtr, handle, size, window};
651676 TLLM_LOG_TRACE (" [NCCLUtil] Allocated and registered NCCL window buffer: handle=%d, ptr=%p, size=%zu, window=%p" ,
@@ -718,7 +743,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
718743 {
719744 // No buffers to clean up, but mark as cleaned
720745 mRegisteredComms .erase (comm);
721- mSymmetricUnavailable .erase (comm);
746+ mMinSymmetricFailureSize .erase (comm);
722747 return ;
723748 }
724749
@@ -794,7 +819,7 @@ void NCCLWindowAllocator::cleanupBuffersForComm(ncclComm_t comm) noexcept
794819
795820 mBufferPool .erase (commIt);
796821 mRegisteredComms .erase (comm);
797- mSymmetricUnavailable .erase (comm);
822+ mMinSymmetricFailureSize .erase (comm);
798823}
799824
800825#endif // NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)
0 commit comments