[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 8743a2346b25 · 2026-05-18T07:32:27.000Z
for more information, see https://pre-commit.ci
diff --git a/tests/pytorch/distributed/test_gtp.py b/tests/pytorch/distributed/test_gtp.py
@@ -1507,8 +1507,11 @@ class TestWaitAsyncCommsFallback:
     """
 
     class _FakeGroup:
-        def size(self): return 1
-        def rank(self): return 0
+        def size(self):
+            return 1
+
+        def rank(self):
+            return 0
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
     def test_fallback_accumulates_when_no_rs_handle(self):
@@ -1519,15 +1522,15 @@ def test_fallback_accumulates_when_no_rs_handle(self):
         p.pad_length = 0
         p.chain_id = gtp_module.GTPChain.UNGRAPHED.value
         p._quantizer = None
-        p.is_routed_expert = False      # ⇒ self._weights property returns [self]
+        p.is_routed_expert = False  # ⇒ self._weights property returns [self]
         p.main_grad = torch.zeros(8, 4, dtype=dtype, device="cuda")
-        p._prefetch_handle = None       # _wait_param_gather is no-op
-        p._wgrad_rs_handle = None       # _wait_reduce_scatter is no-op → fallback fires
+        p._prefetch_handle = None  # _wait_param_gather is no-op
+        p._wgrad_rs_handle = None  # _wait_reduce_scatter is no-op → fallback fires
         p._cached_ag_stream = None
         p._cached_rs_stream = None
         p.ag_event = torch.cuda.Event(external=True)
         p.rs_event = torch.cuda.Event(external=True)
-        p.rs_event.record()             # so rs_event.wait() in fallback doesn't block
+        p.rs_event.record()  # so rs_event.wait() in fallback doesn't block
         p._already_finalized = False
         p.grad_added_to_main_grad = False
 
@@ -1552,8 +1555,9 @@ def test_fallback_accumulates_when_no_rs_handle(self):
             gtp_module._inflight_comm_params.update(saved)
 
         torch.cuda.synchronize()
-        assert torch.all(p.main_grad == 2.0), \
-            f"main_grad should be 2.0 after fallback accumulation; got {p.main_grad}"
+        assert torch.all(
+            p.main_grad == 2.0
+        ), f"main_grad should be 2.0 after fallback accumulation; got {p.main_grad}"
         assert p._already_finalized is True, "_already_finalized must be set"
         assert p.grad_added_to_main_grad is True, "grad_added_to_main_grad must be set"
 
@@ -1567,7 +1571,7 @@ def test_fallback_skipped_when_already_finalized(self):
         p.pad_length = 0
         p.chain_id = gtp_module.GTPChain.UNGRAPHED.value
         p._quantizer = None
-        p.is_routed_expert = False      # ⇒ self._weights property returns [self]
+        p.is_routed_expert = False  # ⇒ self._weights property returns [self]
         # Pre-existing main_grad with a value the fallback must NOT overwrite.
         p.main_grad = torch.full((8, 4), 5.0, dtype=dtype, device="cuda")
         p._prefetch_handle = None
@@ -1577,7 +1581,7 @@ def test_fallback_skipped_when_already_finalized(self):
         p.ag_event = torch.cuda.Event(external=True)
         p.rs_event = torch.cuda.Event(external=True)
         p.rs_event.record()
-        p._already_finalized = True     # ← short-circuits the fallback
+        p._already_finalized = True  # ← short-circuits the fallback
 
         # No _rs_ticket: if the fallback ran it would AttributeError on
         # cache.get(None).  The skip path must not touch the cache at all.
@@ -1597,5 +1601,6 @@ def test_fallback_skipped_when_already_finalized(self):
             gtp_module._inflight_comm_params.update(saved)
 
         torch.cuda.synchronize()
-        assert torch.all(p.main_grad == 5.0), \
-            "main_grad must be untouched when _already_finalized=True"
+        assert torch.all(
+            p.main_grad == 5.0
+        ), "main_grad must be untouched when _already_finalized=True"
diff --git a/transformer_engine/pytorch/module/generalized_tensor_parallelism.py b/transformer_engine/pytorch/module/generalized_tensor_parallelism.py
@@ -143,6 +143,7 @@ def classify_gtp_chains(model) -> None:
 
 class GTPWeightState(Enum):
     """State of a GTPShardedParam's AG / RS lifecycle (debug / stale-read guard)."""
+
     NONE = "NONE"  # Sharded, no pending operation
     ASYNC_WAIT = "ASYNC_WAIT"  # Async all-gather in progress
     DATA_READY = "DATA_READY"  # Async all-gather complete, result in cache
@@ -1304,9 +1305,7 @@ def _reduce_scatter(self, wgrads, async_op, nvtx_label=None):
                 async_ops=async_op,
             ) as cm:
                 for out_buffer, tensor in zip(out_buffers, wgrads):
-                    out, _ = reduce_scatter_along_first_dim(
-                        tensor, self.group, output=out_buffer
-                    )
+                    out, _ = reduce_scatter_along_first_dim(tensor, self.group, output=out_buffer)
                     outputs.append(out)
             nvtx_range_pop(f"{nvtx_label}.batched_gtp_rs")