Merge branch 'main' into dmoodie/bugfix/trtexec_safe

dthienan-nv · web-flow · commit fff917eed9fb · 2026-05-13T17:07:03.000-04:00
diff --git a/modelopt/torch/quantization/algorithms.py b/modelopt/torch/quantization/algorithms.py
@@ -1094,10 +1094,8 @@ def run_search_with_stats(self, max_weight_size, verbose=False):
         return best_recipes, is_satisfied
 
 
-# TODO: Enable torch compile for this function
-# Currently modelopt.onnx is breaking this
+@torch.compile(dynamic=True)
 def _get_log_softmax_dist(logits: torch.Tensor, tp_group) -> torch.Tensor:
-    # TODO: test this
     dtype = logits.dtype
     max_logits = torch.amax(logits, dim=-1, keepdim=True)
     torch.distributed.all_reduce(max_logits, op=torch.distributed.ReduceOp.MAX, group=tp_group)
diff --git a/modelopt/torch/quantization/plugins/transformer_engine.py b/modelopt/torch/quantization/plugins/transformer_engine.py
@@ -93,6 +93,10 @@ def te_quantized_linear_fn(package, func_name, self, *args, **kwargs):
         new_args[weight_pos] = self.weight_quantizer(args[weight_pos])
         new_args[inp_pos] = self.input_quantizer(args[inp_pos])
         output = getattr(package, func_name)(*new_args, **kwargs)
+        # TE 2.15+ returns `(out, new_weight_workspace)`; TE <= 2.14 returns just `out`.
+        # Only the activation tensor participates in output quantization.
+        if isinstance(output, tuple):
+            return (self.output_quantizer(output[0]), *output[1:])
         return self.output_quantizer(output)
 
     # Override the quantized linear function
@@ -181,6 +185,10 @@ def te_grouped_quantized_linear_fn(package, func_name, self, *args):
         for i in range(weights_start, weights_start + num_gemms):
             new_args[i] = self.weight_quantizer(args[i])
         output = getattr(package, func_name)(*new_args)
+        # TE 2.15+ returns `(out, new_workspaces)`; TE <= 2.14 returns just `out`.
+        # Only the activation tensor participates in output quantization.
+        if isinstance(output, tuple):
+            return (self.output_quantizer(output[0]), *output[1:])
         return self.output_quantizer(output)
 
     # Override the quantized linear function