NVIDIA
diff --git a/tests/pytorch/distributed/test_gtp.py b/tests/pytorch/distributed/test_gtp.py
Lines changed: 282 additions & 129 deletions
diff --git a/tests/pytorch/distributed/test_tp_gtp.py b/tests/pytorch/distributed/test_tp_gtp.py
Lines changed: 105 additions & 59 deletions
diff --git a/transformer_engine/common/include/transformer_engine/recipe.h b/transformer_engine/common/include/transformer_engine/recipe.h
Lines changed: 1 addition & 1 deletion
diff --git a/transformer_engine/common/recipe/multi_amax.cu b/transformer_engine/common/recipe/multi_amax.cu
Lines changed: 9 additions & 15 deletions
diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h
Lines changed: 1 addition & 2 deletions
@@ -42,6 +42,7 @@
 # Fixtures
 # ---------------------------------------------------------------------------
 
+
 @pytest.fixture(autouse=True)
 def reset_fp8_state():
     yield
@@ -61,6 +62,7 @@ def reset_gtp_globals():
 # Helpers
 # ---------------------------------------------------------------------------
 
+
 def _free_port() -> int:
     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
         s.bind(("", 0))
@@ -125,18 +127,21 @@ def _build_groups(rank: int, world_size: int, tp_size: int, gtp_size: int):
 # 1. TestTPGTPProcessGroups – group sizes and rank membership
 # ---------------------------------------------------------------------------
 
+
 def _worker_groups(rank, world_size, port, tp_size, gtp_size):
     _dist_init(rank, world_size, port)
     tp_group, gtp_group, tp_rank, gtp_rank = _build_groups(rank, world_size, tp_size, gtp_size)
 
-    assert tp_group.size() == tp_size, \
-        f"rank {rank}: TP group size {tp_group.size()} != {tp_size}"
-    assert gtp_group.size() == gtp_size, \
-        f"rank {rank}: GTP group size {gtp_group.size()} != {gtp_size}"
-    assert dist.get_rank(tp_group) == tp_rank, \
-        f"rank {rank}: TP rank {dist.get_rank(tp_group)} != expected {tp_rank}"
-    assert dist.get_rank(gtp_group) == gtp_rank, \
-        f"rank {rank}: GTP rank {dist.get_rank(gtp_group)} != expected {gtp_rank}"
+    assert tp_group.size() == tp_size, f"rank {rank}: TP group size {tp_group.size()} != {tp_size}"
+    assert (
+        gtp_group.size() == gtp_size
+    ), f"rank {rank}: GTP group size {gtp_group.size()} != {gtp_size}"
+    assert (
+        dist.get_rank(tp_group) == tp_rank
+    ), f"rank {rank}: TP rank {dist.get_rank(tp_group)} != expected {tp_rank}"
+    assert (
+        dist.get_rank(gtp_group) == gtp_rank
+    ), f"rank {rank}: GTP rank {dist.get_rank(gtp_group)} != expected {gtp_rank}"
 
     dist.destroy_process_group()
 
@@ -153,25 +158,34 @@ def test_group_sizes_and_ranks(self, tp_size, gtp_size):
 # 2. TestTPGTPColumnParallelLinear
 # ---------------------------------------------------------------------------
 
+
 def _worker_column_shape(rank, world_size, port, tp_size, gtp_size):
     """Column-parallel: weight shape must be [out_f/(tp_size*gtp_size), in_f]."""
     _dist_init(rank, world_size, port)
     tp_group, gtp_group, _, _ = _build_groups(rank, world_size, tp_size, gtp_size)
 
     in_f = 64
-    out_f = tp_size * gtp_size * 32   # per-rank shard = 32 rows
+    out_f = tp_size * gtp_size * 32  # per-rank shard = 32 rows
 
     layer = te.Linear(
-        in_features=in_f, out_features=out_f,
-        parallel_mode="column", bias=False, params_dtype=torch.bfloat16,
-        device="cuda", tp_group=tp_group, gtp_group=gtp_group,
+        in_features=in_f,
+        out_features=out_f,
+        parallel_mode="column",
+        bias=False,
+        params_dtype=torch.bfloat16,
+        device="cuda",
+        tp_group=tp_group,
+        gtp_group=gtp_group,
     )
 
     expected_rows = out_f // (tp_size * gtp_size)
-    assert isinstance(layer.weight, GTPShardedParam), \
-        f"rank {rank}: weight should be GTPShardedParam"
-    assert layer.weight.shape == (expected_rows, in_f), \
-        f"rank {rank}: expected ({expected_rows}, {in_f}), got {layer.weight.shape}"
+    assert isinstance(
+        layer.weight, GTPShardedParam
+    ), f"rank {rank}: weight should be GTPShardedParam"
+    assert layer.weight.shape == (
+        expected_rows,
+        in_f,
+    ), f"rank {rank}: expected ({expected_rows}, {in_f}), got {layer.weight.shape}"
 
     dist.destroy_process_group()
 
@@ -183,21 +197,26 @@ def _worker_column_correctness(rank, world_size, port, tp_size, gtp_size):
     tp_group, gtp_group, tp_rank, gtp_rank = _build_groups(rank, world_size, tp_size, gtp_size)
 
     batch, in_f = 16, 64
-    out_f = tp_size * gtp_size * 32   # per-rank shard = 32 rows
+    out_f = tp_size * gtp_size * 32  # per-rank shard = 32 rows
     dtype = torch.bfloat16
 
     layer = te.Linear(
-        in_features=in_f, out_features=out_f,
-        parallel_mode="column", bias=False, params_dtype=dtype,
-        device="cuda", tp_group=tp_group, gtp_group=gtp_group,
+        in_features=in_f,
+        out_features=out_f,
+        parallel_mode="column",
+        bias=False,
+        params_dtype=dtype,
+        device="cuda",
+        tp_group=tp_group,
+        gtp_group=gtp_group,
     )
 
     # All-gather GTP shards → TP-local full weight [out_f/tp_size, in_f]
     shard = layer.weight.data.clone()
     all_gtp_shards = [torch.zeros_like(shard) for _ in range(gtp_size)]
     dist.all_gather(all_gtp_shards, shard, group=gtp_group)
     tp_local_weight = torch.cat(all_gtp_shards, dim=0).float()  # strip padding
-    tp_local_weight = tp_local_weight[:out_f // tp_size]
+    tp_local_weight = tp_local_weight[: out_f // tp_size]
 
     # Same full input on all ranks (column-parallel: each rank processes full input)
     inp = torch.randn(batch, in_f, dtype=dtype, device="cuda")
@@ -206,16 +225,17 @@ def _worker_column_correctness(rank, world_size, port, tp_size, gtp_size):
 
     # TE forward: GTP all-gathers weight internally; no TP comm in column-parallel fwd
     out = layer(inp_te, is_first_microbatch=True)
-    assert out.shape == (batch, out_f // tp_size), \
-        f"rank {rank}: output shape {out.shape} != ({batch}, {out_f // tp_size})"
+    assert out.shape == (
+        batch,
+        out_f // tp_size,
+    ), f"rank {rank}: output shape {out.shape} != ({batch}, {out_f // tp_size})"
 
     # Reference: this TP rank's output = inp @ tp_local_weight^T
     ref = inp.float() @ tp_local_weight.T
     ref = ref.to(dtype)
-    assert torch.allclose(out.float(), ref.float(), atol=0.1, rtol=0.1), (
-        f"rank {rank}: output mismatch, "
-        f"max_diff={(out.float() - ref.float()).abs().max():.4f}"
-    )
+    assert torch.allclose(
+        out.float(), ref.float(), atol=0.1, rtol=0.1
+    ), f"rank {rank}: output mismatch, max_diff={(out.float() - ref.float()).abs().max():.4f}"
 
     # Backward: dX is all-reduced across TP group internally by TE
     grad = torch.randn_like(out)
@@ -247,25 +267,33 @@ def test_forward_backward_correctness(self, tp_size, gtp_size):
 # 3. TestTPGTPRowParallelLinear
 # ---------------------------------------------------------------------------
 
+
 def _worker_row_shape(rank, world_size, port, tp_size, gtp_size):
     """Row-parallel: weight shape must be [out_f/gtp_size, in_f/tp_size]."""
     _dist_init(rank, world_size, port)
     tp_group, gtp_group, _, _ = _build_groups(rank, world_size, tp_size, gtp_size)
 
-    in_f = tp_size * 64   # TE divides by tp_size → local in_f = 64
+    in_f = tp_size * 64  # TE divides by tp_size → local in_f = 64
     out_f = gtp_size * 64  # GTP divides by gtp_size → local out_f = 64
 
     layer = te.Linear(
-        in_features=in_f, out_features=out_f,
-        parallel_mode="row", bias=False, params_dtype=torch.bfloat16,
-        device="cuda", tp_group=tp_group, gtp_group=gtp_group,
+        in_features=in_f,
+        out_features=out_f,
+        parallel_mode="row",
+        bias=False,
+        params_dtype=torch.bfloat16,
+        device="cuda",
+        tp_group=tp_group,
+        gtp_group=gtp_group,
     )
 
     expected_shape = (out_f // gtp_size, in_f // tp_size)
-    assert isinstance(layer.weight, GTPShardedParam), \
-        f"rank {rank}: weight should be GTPShardedParam"
-    assert layer.weight.shape == expected_shape, \
-        f"rank {rank}: expected {expected_shape}, got {layer.weight.shape}"
+    assert isinstance(
+        layer.weight, GTPShardedParam
+    ), f"rank {rank}: weight should be GTPShardedParam"
+    assert (
+        layer.weight.shape == expected_shape
+    ), f"rank {rank}: expected {expected_shape}, got {layer.weight.shape}"
 
     dist.destroy_process_group()
 
@@ -277,14 +305,19 @@ def _worker_row_forward_backward(rank, world_size, port, tp_size, gtp_size):
     tp_group, gtp_group, tp_rank, _ = _build_groups(rank, world_size, tp_size, gtp_size)
 
     batch = 16
-    in_f = tp_size * 64    # full in_features
+    in_f = tp_size * 64  # full in_features
     out_f = gtp_size * 64  # full out_features
     dtype = torch.bfloat16
 
     layer = te.Linear(
-        in_features=in_f, out_features=out_f,
-        parallel_mode="row", bias=False, params_dtype=dtype,
-        device="cuda", tp_group=tp_group, gtp_group=gtp_group,
+        in_features=in_f,
+        out_features=out_f,
+        parallel_mode="row",
+        bias=False,
+        params_dtype=dtype,
+        device="cuda",
+        tp_group=tp_group,
+        gtp_group=gtp_group,
     )
 
     # Row-parallel: each TP rank takes the corresponding slice of in_f
@@ -296,8 +329,10 @@ def _worker_row_forward_backward(rank, world_size, port, tp_size, gtp_size):
 
     # TE forward: GTP all-gathers weight, row-parallel all-reduces output across TP
     out = layer(inp, is_first_microbatch=True)
-    assert out.shape == (batch, out_f), \
-        f"rank {rank}: output shape {out.shape} != ({batch}, {out_f})"
+    assert out.shape == (
+        batch,
+        out_f,
+    ), f"rank {rank}: output shape {out.shape} != ({batch}, {out_f})"
     assert torch.isfinite(out).all(), f"rank {rank}: non-finite output"
 
     # wgrad RS path always accumulates into main_grad; allocate before backward.
@@ -321,20 +356,25 @@ def _worker_row_correctness(rank, world_size, port, tp_size, gtp_size):
     dtype = torch.bfloat16
 
     layer = te.Linear(
-        in_features=in_f, out_features=out_f,
-        parallel_mode="row", bias=False, params_dtype=dtype,
-        device="cuda", tp_group=tp_group, gtp_group=gtp_group,
+        in_features=in_f,
+        out_features=out_f,
+        parallel_mode="row",
+        bias=False,
+        params_dtype=dtype,
+        device="cuda",
+        tp_group=tp_group,
+        gtp_group=gtp_group,
     )
 
     # Reconstruct full weight: all-gather GTP shards → TP-local, then all-gather TP shards
     shard = layer.weight.data.clone()
     all_gtp_shards = [torch.zeros_like(shard) for _ in range(gtp_size)]
     dist.all_gather(all_gtp_shards, shard, group=gtp_group)
-    tp_local_weight = torch.cat(all_gtp_shards, dim=0).float()   # [out_f, in_f/tp_size]
+    tp_local_weight = torch.cat(all_gtp_shards, dim=0).float()  # [out_f, in_f/tp_size]
 
     all_tp_weights = [torch.zeros_like(tp_local_weight) for _ in range(tp_size)]
     dist.all_gather(all_tp_weights, tp_local_weight, group=tp_group)
-    full_weight = torch.cat(all_tp_weights, dim=1).float()        # [out_f, in_f]
+    full_weight = torch.cat(all_tp_weights, dim=1).float()  # [out_f, in_f]
 
     # Full input (same on all ranks; we slice below to simulate row-parallel)
     full_inp = torch.randn(batch, in_f, dtype=dtype, device="cuda")
@@ -348,10 +388,9 @@ def _worker_row_correctness(rank, world_size, port, tp_size, gtp_size):
     # Reference: full input @ full weight^T — all ranks should see the same output
     ref = full_inp.float() @ full_weight.T
     ref = ref.to(dtype)
-    assert torch.allclose(out.float(), ref.float(), atol=0.1, rtol=0.1), (
-        f"rank {rank}: output mismatch, "
-        f"max_diff={(out.float() - ref.float()).abs().max():.4f}"
-    )
+    assert torch.allclose(
+        out.float(), ref.float(), atol=0.1, rtol=0.1
+    ), f"rank {rank}: output mismatch, max_diff={(out.float() - ref.float()).abs().max():.4f}"
 
     dist.destroy_process_group()
 
@@ -380,6 +419,7 @@ def test_forward_correctness(self, tp_size, gtp_size):
 # 4. TestTPGTPLayerNormLinear – column-parallel smoke test
 # ---------------------------------------------------------------------------
 
+
 def _worker_layernorm_linear(rank, world_size, port, tp_size, gtp_size):
     _dist_init(rank, world_size, port)
     torch.manual_seed(0)
@@ -391,23 +431,29 @@ def _worker_layernorm_linear(rank, world_size, port, tp_size, gtp_size):
     dtype = torch.bfloat16
 
     layer = te.LayerNormLinear(
-        in_features=in_f, out_features=out_f,
-        bias=False, params_dtype=dtype,
+        in_features=in_f,
+        out_features=out_f,
+        bias=False,
+        params_dtype=dtype,
         parallel_mode="column",
-        device="cuda", tp_group=tp_group, gtp_group=gtp_group,
+        device="cuda",
+        tp_group=tp_group,
+        gtp_group=gtp_group,
     )
-    assert isinstance(layer.weight, GTPShardedParam), \
-        f"rank {rank}: LayerNormLinear.weight should be GTPShardedParam"
+    assert isinstance(
+        layer.weight, GTPShardedParam
+    ), f"rank {rank}: LayerNormLinear.weight should be GTPShardedParam"
     expected_rows = out_f // (tp_size * gtp_size)
-    assert layer.weight.shape == (expected_rows, in_f), \
-        f"rank {rank}: unexpected weight shape {layer.weight.shape}"
+    assert layer.weight.shape == (
+        expected_rows,
+        in_f,
+    ), f"rank {rank}: unexpected weight shape {layer.weight.shape}"
 
     inp = torch.randn(seq, batch, in_f, dtype=dtype, device="cuda", requires_grad=True)
     dist.broadcast(inp, src=0)
 
     out = layer(inp, is_first_microbatch=True)
-    assert out.shape == (seq, batch, out_f // tp_size), \
-        f"rank {rank}: output shape {out.shape}"
+    assert out.shape == (seq, batch, out_f // tp_size), f"rank {rank}: output shape {out.shape}"
     assert torch.isfinite(out).all(), f"rank {rank}: non-finite output"
 
     # wgrad RS path always accumulates into main_grad; allocate before backward.
 
@@ -116,7 +116,7 @@ void nvte_compute_amax_with_config(const NVTETensor input, NVTETensor output,
  *  \param[in]      config        Quantization configuration (for noop_tensor).  May be NULL.
  *  \param[in]      stream        CUDA stream used for the operation.
  */
-void nvte_multi_compute_amax(const NVTETensor *inputs, NVTETensor *outputs, size_t num_tensors,
+void nvte_multi_compute_amax(const NVTETensor* inputs, NVTETensor* outputs, size_t num_tensors,
                              const NVTEQuantizationConfig config, cudaStream_t stream);
 
 /*! \brief Update an FP8 tensor's scale based on its amax.
 
@@ -81,8 +81,7 @@ __launch_bounds__(multi_amax_kernel_threads) __global__
   InputType max = InputType{0.f};
   const int warp_id = threadIdx.x / THREADS_PER_WARP;
 
-  for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < M;
-       tid += gridDim.x * blockDim.x) {
+  for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < M; tid += gridDim.x * blockDim.x) {
     loader.load(tid, N);
 #pragma unroll
     for (int i = 0; i < nvec; ++i) {
@@ -146,18 +145,15 @@ void launch_multi_amax_batch(const MultiAmaxArgs &args, size_t max_numel, Alignm
 
   switch (align) {
     case Alignment::SAME_ALIGNED:
-      MultiAmaxKernel<nvec, true, InputType>
-          <<<grid, threads, 0, stream>>>(args, noop_ptr);
+      MultiAmaxKernel<nvec, true, InputType><<<grid, threads, 0, stream>>>(args, noop_ptr);
       break;
     case Alignment::SAME_UNALIGNED:
-      MultiAmaxKernel<nvec, false, InputType>
-          <<<grid, threads, 0, stream>>>(args, noop_ptr);
+      MultiAmaxKernel<nvec, false, InputType><<<grid, threads, 0, stream>>>(args, noop_ptr);
       break;
     case Alignment::DIFFERENT:
       // Heterogeneous alignment across tensors — fall back to nvec=1, aligned=true path
       // which is safe for any pointer alignment.
-      MultiAmaxKernel<1, true, InputType>
-          <<<grid, threads, 0, stream>>>(args, noop_ptr);
+      MultiAmaxKernel<1, true, InputType><<<grid, threads, 0, stream>>>(args, noop_ptr);
       break;
   }
   NVTE_CHECK_CUDA(cudaGetLastError());
@@ -186,8 +182,8 @@ std::pair<size_t, Alignment> build_batch_args(const std::vector<Tensor *> &input
     args.output_rowwise_amax_list[i] = rw_ptr;
     args.output_columnwise_amax_list[i] = cw_ptr;
     args.input_numel[i] = N;
-    args.num_aligned_elements[i] = get_num_aligned_elements(inp.data.dptr, N, nvec,
-                                                            sizeof(InputType));
+    args.num_aligned_elements[i] =
+        get_num_aligned_elements(inp.data.dptr, N, nvec, sizeof(InputType));
     max_numel = std::max(max_numel, N);
 
     // Fold this tensor's alignment into the batch decision.  CheckAlignment on a
@@ -225,11 +221,9 @@ void multi_compute_amax_impl(const NVTETensor *inputs_, NVTETensor *outputs_, si
     outputs[i] = convertNVTETensorCheck(outputs_[i]);
     const auto &inp = *inputs[i];
     auto &out = *outputs[i];
-    NVTE_CHECK(inp.scaling_mode == NVTE_DELAYED_TENSOR_SCALING,
-               "nvte_multi_compute_amax: input[", i,
-               "] must be unquantized, got scaling_mode=", to_string(inp.scaling_mode));
-    NVTE_CHECK(!is_fp8_dtype(inp.data.dtype),
-               "nvte_multi_compute_amax: input[", i,
+    NVTE_CHECK(inp.scaling_mode == NVTE_DELAYED_TENSOR_SCALING, "nvte_multi_compute_amax: input[",
+               i, "] must be unquantized, got scaling_mode=", to_string(inp.scaling_mode));
+    NVTE_CHECK(!is_fp8_dtype(inp.data.dtype), "nvte_multi_compute_amax: input[", i,
                "] must be unquantized, got dtype=", to_string(inp.data.dtype));
     if (i == 0) {
       input_dtype = inp.data.dtype;
 
@@ -333,8 +333,7 @@ py::object quantize(const at::Tensor &tensor, py::handle quantizer, const py::ob
 py::object compute_amax_nvfp4(const at::Tensor &tensor, py::handle quantizer,
                               const py::object &output);
 py::object quantize_cast_only_nvfp4(const at::Tensor &tensor, py::handle quantizer,
-                                    const py::object &output,
-                                    std::optional<at::Tensor> noop_flag);
+                                    const py::object &output, std::optional<at::Tensor> noop_flag);
 
 // NVFP4-only multi-tensor amax: fuses N per-expert (zero_amax + amax + D2D replicate)
 // chains into a single pair of kernel launches (one multi-zero + one multi-amax) that