fix(te-plugin): handle TE 2.15+ tuple return from _Linear / _GroupedLinear

kevalmorabia97 · kevalmorabia97 · commit c897fbeaaff6 · 2026-05-13T10:31:53.000-07:00
TE 2.15+ changed `_Linear.forward` and `_GroupedLinear.forward` to return
`(out, new_workspace)` tuples instead of a single tensor. ModelOpt's
patched `te_quantized_linear_fn` / `te_grouped_quantized_linear_fn` still
passed the whole tuple into `self.output_quantizer`, crashing inside
`TensorQuantizer.forward` on `tuple.numel()`:

  AttributeError: 'tuple' object has no attribute 'numel'

Mirror the existing pattern from `_QuantTELayerNormLinear.forward`:
quantize only `output[0]` (activation) and pass auxiliary workspace
metadata through verbatim. TE <= 2.14 returns a single tensor, which skips
the `isinstance` branch and is quantized exactly as before.

This unblocks Megatron-Bridge's TE 2.15 path; the local
`patch_modelopt_te_linear_tuple_output` shim can be removed once this
ships in a tagged release.

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/modelopt/torch/quantization/plugins/transformer_engine.py b/modelopt/torch/quantization/plugins/transformer_engine.py
@@ -93,6 +93,10 @@ def te_quantized_linear_fn(package, func_name, self, *args, **kwargs):
         new_args[weight_pos] = self.weight_quantizer(args[weight_pos])
         new_args[inp_pos] = self.input_quantizer(args[inp_pos])
         output = getattr(package, func_name)(*new_args, **kwargs)
+        # TE 2.15+ returns `(out, new_weight_workspace)`; TE <= 2.14 returns just `out`.
+        # Only the activation tensor participates in output quantization.
+        if isinstance(output, tuple):
+            return (self.output_quantizer(output[0]), *output[1:])
         return self.output_quantizer(output)
 
     # Override the quantized linear function
@@ -181,6 +185,10 @@ def te_grouped_quantized_linear_fn(package, func_name, self, *args):
         for i in range(weights_start, weights_start + num_gemms):
             new_args[i] = self.weight_quantizer(args[i])
         output = getattr(package, func_name)(*new_args)
+        # TE 2.15+ returns `(out, new_workspaces)`; TE <= 2.14 returns just `out`.
+        # Only the activation tensor participates in output quantization.
+        if isinstance(output, tuple):
+            return (self.output_quantizer(output[0]), *output[1:])
         return self.output_quantizer(output)
 
     # Override the quantized linear function