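Fixes for E2E run: accept main_grad wgrad buffers whose dtype differs from the compute dtype (e.g. FP32) and validate per-expert buffer shapes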

zhongbozhu · zhongbozhu · commit c9d56c43bb5a · 2026-06-11T10:46:55.000-07:00
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
diff --git a/tests/pytorch/megacpp/test_grouped_mlp.py b/tests/pytorch/megacpp/test_grouped_mlp.py
@@ -98,20 +98,20 @@ def _copy_grouped_mlp_params(dst: te_ops.Sequential, src: te_ops.Sequential) ->
                         )
 
 
-def _init_main_grads(module: te_ops.Sequential) -> None:
+def _init_main_grads(module: te_ops.Sequential, dtype: torch.dtype) -> None:
     for linear in (module[0], module[2]):
         if linear.single_grouped_weight:
             linear.weight.main_grad = torch.zeros(
                 linear.num_groups,
                 linear.out_features,
                 linear.in_features,
                 device="cuda",
-                dtype=torch.bfloat16,
+                dtype=dtype,
             )
         else:
             for group_idx in range(linear.num_groups):
                 weight = getattr(linear, f"weight{group_idx}")
-                weight.main_grad = torch.zeros_like(weight)
+                weight.main_grad = torch.zeros_like(weight, dtype=dtype)
 
 
 def _run_grouped_mlp(
@@ -241,6 +241,7 @@ def _run_megacpp_against_python(
     activation_kind: str = "scaled_swiglu",
     single_grouped_param: bool = False,
     accumulate_into_main_grad: bool = False,
+    main_grad_dtype: torch.dtype | None = None,
     compare_zero_expert_grads: bool = True,
     monkeypatch,
 ) -> None:
@@ -274,8 +275,10 @@ def _run_megacpp_against_python(
     )
     _copy_grouped_mlp_params(test, ref)
     if accumulate_into_main_grad:
-        _init_main_grads(ref)
-        _init_main_grads(test)
+        if main_grad_dtype is None:
+            raise ValueError("main_grad_dtype must be set when using Megatron-owned main_grad.")
+        _init_main_grads(ref, main_grad_dtype)
+        _init_main_grads(test, main_grad_dtype)
 
     # Paged stashing passes a static physical buffer to the op while m_splits
     # describe only the valid prefix. Rows after sum(m_splits) are garbage and
@@ -332,13 +335,17 @@ def _run_megacpp_against_python(
     ids=["discrete_weight", "packed_weight"],
 )
 @pytest.mark.parametrize(
-    "accumulate_into_main_grad",
-    [False, True],
-    ids=["cpp_allocated_wgrad", "megatron_main_grad"],
+    "accumulate_into_main_grad,main_grad_dtype",
+    [
+        pytest.param(False, None, id="cpp_allocated_wgrad"),
+        pytest.param(True, torch.bfloat16, id="megatron_main_grad_bf16"),
+        pytest.param(True, torch.float32, id="megatron_main_grad_fp32"),
+    ],
 )
 def test_megacpp_grouped_mlp_wgrad_storage_matches_python(
     single_grouped_param,
     accumulate_into_main_grad,
+    main_grad_dtype,
     monkeypatch,
 ):
     torch.manual_seed(1234)
@@ -349,6 +356,7 @@ def test_megacpp_grouped_mlp_wgrad_storage_matches_python(
         split_device="cuda",
         single_grouped_param=single_grouped_param,
         accumulate_into_main_grad=accumulate_into_main_grad,
+        main_grad_dtype=main_grad_dtype,
         monkeypatch=monkeypatch,
     )
 
diff --git a/transformer_engine/pytorch/csrc/megacpp/grouped_mlp.cpp b/transformer_engine/pytorch/csrc/megacpp/grouped_mlp.cpp
@@ -243,36 +243,33 @@ void grouped_gemm(GroupedTensorWrapper *A, bool transa, GroupedTensorWrapper *B,
 }
 
 std::vector<at::Tensor> output_tensor_list_from_arg(py::handle arg, size_t num_groups,
-                                                    at::ScalarType dtype, const std::string &name) {
+                                                    int64_t rows, int64_t cols,
+                                                    const std::string &name) {
   std::vector<at::Tensor> out;
   if (is_none(arg)) {
     return out;
   }
   out.reserve(num_groups);
-  if (py::isinstance<py::list>(arg) || py::isinstance<py::tuple>(arg)) {
-    auto seq = py::reinterpret_borrow<py::sequence>(arg);
-    NVTE_CHECK(static_cast<size_t>(seq.size()) == num_groups, name, " must have ", num_groups,
-               " tensors.");
-    for (size_t i = 0; i < num_groups; ++i) {
-      auto tensor = seq[i].cast<at::Tensor>();
-      NVTE_CHECK(tensor.is_cuda(), name, " tensors must be CUDA tensors.");
-      NVTE_CHECK(tensor.scalar_type() == dtype, name, " tensors must have the requested dtype.");
-      NVTE_CHECK(tensor.dim() == 2, name, " tensors must be rank-2 wgrad buffers.");
-      check_contiguous(tensor, name);
-      out.emplace_back(tensor);
-    }
-    return out;
-  }
-
-  auto packed = arg.cast<at::Tensor>();
-  NVTE_CHECK(packed.is_cuda(), name, " must be a CUDA tensor.");
-  NVTE_CHECK(packed.scalar_type() == dtype, name, " must have the requested dtype.");
-  NVTE_CHECK(packed.dim() == 3, name, " must have shape [num_groups, rows, cols].");
-  NVTE_CHECK(static_cast<size_t>(packed.size(0)) == num_groups, name, " first dimension must be ",
-             num_groups, ".");
-  check_contiguous(packed, name);
+  // This helper is intentionally only for the discrete-weight external wgrad
+  // path, where Megatron provides one main_grad tensor per expert. The packed
+  // [G, rows, cols] external buffer used by single grouped weight is handled in
+  // wgrad_output_from_arg so it can stay packed and use grouped-tensor GEMM.
+  NVTE_CHECK(py::isinstance<py::list>(arg) || py::isinstance<py::tuple>(arg), name,
+             " must be a list or tuple of wgrad output tensors.");
+  auto seq = py::reinterpret_borrow<py::sequence>(arg);
+  NVTE_CHECK(static_cast<size_t>(seq.size()) == num_groups, name, " must have ", num_groups,
+             " tensors.");
   for (size_t i = 0; i < num_groups; ++i) {
-    out.emplace_back(packed.select(0, static_cast<int64_t>(i)));
+    auto tensor = seq[i].cast<at::Tensor>();
+    NVTE_CHECK(tensor.is_cuda(), name, " tensors must be CUDA tensors.");
+    // Do not require tensor.scalar_type() == compute dtype. Caller-owned
+    // main_grad buffers are allocated by Megatron and may be FP32 even when TE
+    // grouped MLP compute is BF16.
+    NVTE_CHECK(tensor.dim() == 2, name, " tensors must be rank-2 wgrad buffers.");
+    NVTE_CHECK(tensor.size(0) == rows && tensor.size(1) == cols, name,
+               " tensors must have shape [rows, cols].");
+    check_contiguous(tensor, name);
+    out.emplace_back(tensor);
   }
   return out;
 }
@@ -315,7 +312,8 @@ WgradOutput wgrad_output_from_arg(py::handle arg, bool compute_wgrad, size_t num
     // should not receive a newly allocated grad tensor from this helper.
     out.packed = arg.cast<at::Tensor>();
     NVTE_CHECK(out.packed.is_cuda(), name, " must be a CUDA tensor.");
-    NVTE_CHECK(out.packed.scalar_type() == dtype, name, " must have the requested dtype.");
+    // Do not require out.packed.scalar_type() == compute dtype. Caller-owned
+    // main_grad buffers keep the dtype chosen by Megatron's grad-buffer config.
     NVTE_CHECK(out.packed.dim() == 3, name, " must have shape [num_groups, rows, cols].");
     NVTE_CHECK(static_cast<size_t>(out.packed.size(0)) == num_groups, name,
                " first dimension must be ", num_groups, ".");
@@ -328,7 +326,7 @@ WgradOutput wgrad_output_from_arg(py::handle arg, bool compute_wgrad, size_t num
   // Case 4: discrete weights with externally-owned per-expert buffers, e.g.
   // Megatron main_grad list. GEMM writes each tensor in-place and returns no
   // allocated grad list to Python.
-  out.tensors = output_tensor_list_from_arg(arg, num_groups, dtype, name);
+  out.tensors = output_tensor_list_from_arg(arg, num_groups, rows, cols, name);
   return out;
 }