NVIDIA
diff --git a/tests/pytorch/distributed/test_gtp.py b/tests/pytorch/distributed/test_gtp.py
Lines changed: 116 additions & 3 deletions
diff --git a/tests/pytorch/distributed/test_tp_gtp.py b/tests/pytorch/distributed/test_tp_gtp.py
Lines changed: 1 addition & 4 deletions
diff --git a/transformer_engine/pytorch/distributed.py b/transformer_engine/pytorch/distributed.py
Lines changed: 10 additions & 9 deletions
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
 
@@ -28,6 +28,7 @@
 20. TestGTPPrefetchDisabled      – weight_prefetch=False: single-pass forward still works (multi-GPU)
 21. TestFuseWgradAccumulation    – fuse_wgrad_accumulation=True: wgrad→main_grad (multi-GPU)
 22. TestGTPGradAccumHook         – main_grad updated after reduce-scatter backward (multi-GPU)
+23. TestWaitAsyncCommsFallback   – wait_async_comms(finalize_after_drain=True) inline-accumulation fallback when _wgrad_rs_handle is None (single-process)
 
 Multi-GPU tests use torch.multiprocessing.spawn and are skipped when fewer
 than the required CUDA devices are available.
@@ -71,8 +72,6 @@ def reset_fp8_state():
 def reset_gtp_globals():
     """Reset all GTP mutable class/module-level state between tests."""
     yield
-    GTPShardedParam._first_weight_flag = True
-    GTPShardedParam._pending_rs_weight = None
     GTPShardedParam._chain_state = {}
 
 
@@ -1486,3 +1485,117 @@ class TestGTPGradAccumHook:
     def test_main_grad_updated_after_backward(self):
         _requires_multi_gpu(4)
         _run_distributed(_worker_main_grad_updated_after_bwd, 4)
+
+
+# ---------------------------------------------------------------------------
+# 23. wait_async_comms(finalize_after_drain=True) inline-accumulation fallback
+# ---------------------------------------------------------------------------
+
+
+class TestWaitAsyncCommsFallback:
+    """Exercises the inline-accumulation fallback inside
+    ``wait_async_comms(finalize_after_drain=True)``: when a param is in
+    ``_inflight_comm_params`` (async AG was issued) but its ``_wgrad_rs_handle``
+    is ``None`` (no async RS handle to drain), the inner
+    ``_wait_reduce_scatter`` call no-ops and the outer loop must inline the
+    accumulation itself (main_grad.add_ + ticket release + flag set).
+
+    Production flows rarely hit this combination — chain-interior params have
+    both async AG and async RS, and chain-head sync RS doesn't enter
+    ``_inflight_comm_params`` via bwd AG. We construct the state by hand to
+    pin down the fallback's contract.
+    """
+
+    class _FakeGroup:
+        def size(self): return 1
+        def rank(self): return 0
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+    def test_fallback_accumulates_when_no_rs_handle(self):
+        dtype = torch.bfloat16
+        p = GTPShardedParam(torch.zeros(8, 4, dtype=dtype, device="cuda"))
+        p.group = self._FakeGroup()
+        p.expert_idx = None
+        p.pad_length = 0
+        p.chain_id = gtp_module.GTPChain.UNGRAPHED.value
+        p._quantizer = None
+        p.is_routed_expert = False      # ⇒ self._weights property returns [self]
+        p.main_grad = torch.zeros(8, 4, dtype=dtype, device="cuda")
+        p._prefetch_handle = None       # _wait_param_gather is no-op
+        p._wgrad_rs_handle = None       # _wait_reduce_scatter is no-op → fallback fires
+        p._cached_ag_stream = None
+        p._cached_rs_stream = None
+        p.ag_event = torch.cuda.Event(external=True)
+        p.rs_event = torch.cuda.Event(external=True)
+        p.rs_event.record()             # so rs_event.wait() in fallback doesn't block
+        p._already_finalized = False
+        p.grad_added_to_main_grad = False
+
+        # Place a known wgrad in the cache for the fallback to read.
+        cache = gtp_module.get_global_GTP_cache()
+        p._rs_ticket = cache.reserve(p, dtype, fwd=False, reduce_scatter=True)
+        cache.get(p._rs_ticket).fill_(2.0)
+
+        # Save and replace _inflight_comm_params so the wait_async_comms loop
+        # only sees `p` and doesn't trip over params left by earlier tests.
+        saved = set(gtp_module._inflight_comm_params)
+        gtp_module._inflight_comm_params.clear()
+        gtp_module._inflight_comm_params.add(p)
+        try:
+            gtp_module.wait_async_comms(
+                chain_id=p.chain_id,
+                skip_rs=False,
+                finalize_after_drain=True,
+            )
+        finally:
+            gtp_module._inflight_comm_params.clear()
+            gtp_module._inflight_comm_params.update(saved)
+
+        torch.cuda.synchronize()
+        assert torch.all(p.main_grad == 2.0), \
+            f"main_grad should be 2.0 after fallback accumulation; got {p.main_grad}"
+        assert p._already_finalized is True, "_already_finalized must be set"
+        assert p.grad_added_to_main_grad is True, "grad_added_to_main_grad must be set"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+    def test_fallback_skipped_when_already_finalized(self):
+        """When _already_finalized=True, the fallback must NOT re-accumulate."""
+        dtype = torch.bfloat16
+        p = GTPShardedParam(torch.zeros(8, 4, dtype=dtype, device="cuda"))
+        p.group = self._FakeGroup()
+        p.expert_idx = None
+        p.pad_length = 0
+        p.chain_id = gtp_module.GTPChain.UNGRAPHED.value
+        p._quantizer = None
+        p.is_routed_expert = False      # ⇒ self._weights property returns [self]
+        # Pre-existing main_grad with a value the fallback must NOT overwrite.
+        p.main_grad = torch.full((8, 4), 5.0, dtype=dtype, device="cuda")
+        p._prefetch_handle = None
+        p._wgrad_rs_handle = None
+        p._cached_ag_stream = None
+        p._cached_rs_stream = None
+        p.ag_event = torch.cuda.Event(external=True)
+        p.rs_event = torch.cuda.Event(external=True)
+        p.rs_event.record()
+        p._already_finalized = True     # ← short-circuits the fallback
+
+        # No _rs_ticket: if the fallback ran, it would raise AttributeError on
+        # cache.get(None).  The skip path must not touch the cache at all.
+        p._rs_ticket = None
+
+        saved = set(gtp_module._inflight_comm_params)
+        gtp_module._inflight_comm_params.clear()
+        gtp_module._inflight_comm_params.add(p)
+        try:
+            gtp_module.wait_async_comms(
+                chain_id=p.chain_id,
+                skip_rs=False,
+                finalize_after_drain=True,
+            )
+        finally:
+            gtp_module._inflight_comm_params.clear()
+            gtp_module._inflight_comm_params.update(saved)
+
+        torch.cuda.synchronize()
+        assert torch.all(p.main_grad == 5.0), \
+            "main_grad must be untouched when _already_finalized=True"
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
 
@@ -17,7 +17,6 @@
 2.  TestTPGTPColumnParallelLinear – column-parallel Linear: weight shape + fwd/bwd correctness
 3.  TestTPGTPRowParallelLinear    – row-parallel Linear: weight shape + fwd/bwd smoke test
 4.  TestTPGTPLayerNormLinear      – LayerNormLinear column-parallel smoke test
-5.  TestTPGTPLayerNormMLP         – LayerNormMLP (column FC1 + row FC2) smoke test
 
 Tests use (tp_size, gtp_size) = (2, 2) → world_size = 4 (runs on 4-GPU machines).
 
@@ -53,8 +52,6 @@ def reset_fp8_state():
 def reset_gtp_globals():
     """Reset GTP mutable class/module-level state between tests."""
     yield
-    GTPShardedParam._first_weight_flag = True
-    GTPShardedParam._pending_rs_weight = None
     GTPShardedParam._chain_state = {}
 
 
 
@@ -1284,15 +1284,13 @@ def _post_process_nvfp4_gather(
         handle.wait()
         handle = None
 
-    # TODO
-    # # Fix the interleaved transposed data from gathering along first dim.
-    # out._columnwise_scale_inv = _swap_first_dims(columnwise_scale_inv_interleaved, world_size)
-    # out._columnwise_data = _swap_first_dims(columnwise_data_interleaved, world_size)
+    # Fix the interleaved transposed data from gathering along first dim.
+    # In-place .copy_() (not `=` rebind) to keep the storage address stable
+    # for CUDA graph capture — replays see the same pointer they captured.
     out._columnwise_scale_inv.copy_(_swap_first_dims(columnwise_scale_inv_interleaved, world_size))
     out._columnwise_data.copy_(_swap_first_dims(columnwise_data_interleaved, world_size))
 
-    # # Optionally pad the scaling inverse if needed.
-    # out._columnwise_scale_inv = pad_columnwise_scale_inv(out._columnwise_scale_inv)
+    # Optionally pad the scaling inverse if needed (same in-place pattern).
     out._columnwise_scale_inv.copy_(pad_columnwise_scale_inv(out._columnwise_scale_inv))
 
 
@@ -1308,6 +1306,10 @@ class _NVFP4AllGatherAsyncHandle:
     _synchronized: bool = False
 
     def post_process_nvfp4_gather(self) -> None:
+        """Fix interleaved transposed data + pad scale_inv after the async AG completes.
+
+        Idempotent: gated by ``_synchronized`` in :meth:`wait`.
+        """
         _post_process_nvfp4_gather(
             self.output,
             self.columnwise_data_interleaved,
@@ -1454,9 +1456,8 @@ def _all_gather_nvfp4(
                 group=process_group,
             )
 
-            # Transfer amax to output.
-            # TODO: jiemingz
-            # out._amax_rowwise = inp._amax_rowwise
+            # Transfer amax to output via in-place .copy_() so the storage
+            # address stays stable for CUDA graph capture.
             out._amax_rowwise.copy_(inp._amax_rowwise)
 
         # Gather the transposed NVFP4 data along first dimension. Fix format later.