[PyTorch] Allocate grouped linear wgrads as tensor views (#3049)

timmoon10 · pre-commit-ci[bot] · web-flow · commit ace2a9653a2d · 2026-05-28T15:06:01.000-07:00
* Allocate grouped linear wgrads as views

  Signed-off-by: Tim Moon <tmoon@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/transformer_engine/pytorch/csrc/extensions/allocate.cpp b/transformer_engine/pytorch/csrc/extensions/allocate.cpp
@@ -12,6 +12,16 @@
 namespace transformer_engine {
 namespace pytorch {
 
+/* Allocate multiple PyTorch tensors backed by the same buffer.
+ *
+ * Use with caution and avoid exposing externally.
+ *
+ * In order to reduce CPU overhead, we compute pointer offsets
+ * manually and construct PyTorch tensors with raw pointers. The
+ * backing buffer is deallocated once the final tensor is destroyed.
+ * Stream usage is not recorded, so there may be race conditions if
+ * compute is performed on multiple streams.
+ */
 std::vector<at::Tensor> bulk_allocate(const std::vector<std::vector<size_t>> &shapes,
                                       const std::vector<at::ScalarType> &dtypes,
                                       std::optional<c10::Device> device,
diff --git a/transformer_engine/pytorch/module/grouped_linear.py b/transformer_engine/pytorch/module/grouped_linear.py
@@ -496,13 +496,13 @@ def backward(
                 if ctx.fuse_wgrad_accumulation:
                     wgrad_list = main_grads
                 else:
-                    weight_shape = list(weights[0].size())
-                    wgrad_list = tex.bulk_allocate(
-                        [weight_shape] * ctx.num_gemms,
-                        [ctx.activation_dtype] * ctx.num_gemms,
-                        ctx.device,
-                        [256] * ctx.num_gemms,  # alignment
+                    wgrad_packed = torch.empty(
+                        ctx.num_gemms,
+                        *weights[0].size(),
+                        dtype=ctx.activation_dtype,
+                        device=ctx.device,
                     )
+                    wgrad_list = [wgrad_packed[i] for i in range(ctx.num_gemms)]
 
                 if ctx.save_original_input:
                     inp = inputmats[0]
diff --git a/transformer_engine/pytorch/ops/basic/grouped_linear.py b/transformer_engine/pytorch/ops/basic/grouped_linear.py
@@ -1393,12 +1393,12 @@ def _fuser_backward_split_quantize(
                     ]
                     accumulate_into_main_grad = get_accumulate_flag_in_param(weights[0])
                 else:
-                    grad_weights = tex.bulk_allocate(
-                        [weight_shape] * num_groups,
-                        [ctx.dtype] * num_groups,
-                        device,
-                        [256] * num_groups,  # alignment
+                    grad_weights_packed = torch.empty(
+                        grouped_shape,
+                        dtype=ctx.dtype,
+                        device=device,
                     )
+                    grad_weights = [grad_weights_packed[i] for i in range(num_groups)]
                 final_weight_grads = list(grad_weights)
 
         # Perform dgrad GEMMs
diff --git a/transformer_engine/pytorch/ops/fused/backward_grouped_mlp.py b/transformer_engine/pytorch/ops/fused/backward_grouped_mlp.py
@@ -197,12 +197,13 @@ def _compute_grad_params(
                 w_list = [get_main_grad_from_param(w, op_label=op_label) for w in weights]
                 accumulate_into_main_grad = get_accumulate_flag_in_param(weights[0])
             else:
-                w_list = tex.bulk_allocate(
-                    [weight_shape] * num_groups,
-                    [dtype] * num_groups,
-                    device,
-                    [256] * num_groups,  # alignment
+                wgrad_packed = torch.empty(
+                    num_groups,
+                    *weight_shape,
+                    dtype=dtype,
+                    device=device,
                 )
+                w_list = [wgrad_packed[i] for i in range(num_groups)]
             wgrad_output = w_list
 
     if ctx.weight_requires_grad: