Commit 0b2aabc
Fix PP inference correctness in megatron_generate and megatron_importer
Two bugs caused pipeline-parallel inference to produce garbage output:

1. megatron_generate/megatron_prefill used get_forward_backward_func() (the training pipeline scheduler), which is not designed for inference. Rewrote both functions to use explicit P2P communication via recv_from_prev_pipeline_rank_ / send_to_next_pipeline_rank, matching the pattern from run_mcore_inference.
2. import_mcore_gpt_from_hf loads HF weights into stage 0's embedding but never updates the output_layer on the last PP stage when share_embeddings_and_output_weights=True. After import, call model.setup_embeddings_and_output_layer() to re-run the all-reduce that syncs the output layer from stage 0 to the last stage.

Also parametrize the megatron_generate test to cover both TP and PP.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 67b8313 commit 0b2aabc
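The P2P schedule described in point 1 hands each stage's hidden state directly to the next rank instead of going through the training scheduler. A minimal single-process sketch of that handoff, using queues as stand-ins for the real Megatron primitives recv_from_prev_pipeline_rank_ / send_to_next_pipeline_rank (the stage functions and run_pipeline_inference are hypothetical stand-ins, not the actual API):

```python
from queue import Queue


def run_pipeline_inference(stage_fns, tokens):
    """Run `tokens` through pipeline stages sequentially via P2P handoff."""
    # One link per rank boundary; the first link carries the input tokens.
    links = [Queue() for _ in range(len(stage_fns) + 1)]
    links[0].put(tokens)
    for rank, fn in enumerate(stage_fns):
        hidden = links[rank].get()       # stand-in for recv_from_prev_pipeline_rank_
        links[rank + 1].put(fn(hidden))  # stand-in for send_to_next_pipeline_rank
    return links[-1].get()               # the last stage holds the final output


# Example: each "stage" just adds 1, mimicking a 3-stage pipeline.
out = run_pipeline_inference([lambda h: h + 1] * 3, 0)
```

In the real fix, each rank runs only its own stage and the queues are replaced by blocking point-to-point sends/receives between adjacent pipeline ranks.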

File tree

6 files changed: +208 −177 lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ Changelog

 **Bug Fixes**

+- Fix Megatron utility functions for generation (with pipeline parallelism) and MMLU score evaluation (10x speedup).
 - Fix Minitron pruning (``mcore_minitron``) for MoE models. Importance estimation hooks were incorrectly registered for MoE modules and NAS step was hanging before this.
 - Fix TRT support for remote autotuning in ONNX Autotune from 10.16+ to 10.15+ and fix TRT versioning check to the ``trtexec`` version instead of the TRT Python API when using ``trtexec`` backend.

modelopt/torch/export/plugins/megatron_importer.py

Lines changed: 11 additions & 0 deletions
@@ -747,6 +747,17 @@ def _import_state_dict(self):
         if hasattr(model, "output_layer") and not model.share_embeddings_and_output_weights:
             self.rules["output_layer"](model.output_layer)

+        # For PP with shared embedding/output weights, re-sync the output layer on the last
+        # pipeline stage from stage 0's (now HF-loaded) embedding. At model init,
+        # setup_embeddings_and_output_layer() zeros out the last stage's weight and all-reduces
+        # from stage 0. After importing HF weights into stage 0's embedding, that sync is stale,
+        # so we re-run it here.
+        if (
+            model.share_embeddings_and_output_weights
+            and model.config.pipeline_model_parallel_size > 1
+        ):
+            model.setup_embeddings_and_output_layer()
+
         # MTP
         if hasattr(model, "mtp"):
             layer_pbar.set_description("Importing MTP")
modelopt/torch/utils/logging.py

Lines changed: 2 additions & 1 deletion
@@ -105,8 +105,9 @@ def no_stdout():


 def print_rank_0(*args, **kwargs):
     """Prints only on the master process."""
+    kwargs.setdefault("flush", True)
     if dist.is_master():
-        print(*args, **kwargs, flush=True)
+        print(*args, **kwargs)


 def warn_rank_0(message, *args, **kwargs):
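Moving the flush default into kwargs.setdefault keeps flushing on by default while letting callers pass flush=False; the old hard-coded print(*args, **kwargs, flush=True) would raise TypeError ("got multiple values for keyword argument 'flush'") if a caller supplied flush themselves. A self-contained sketch with a fake dist module standing in for the real one:

```python
import io


class _FakeDist:
    """Stand-in for the real distributed helper; always the master rank here."""

    @staticmethod
    def is_master():
        return True


dist = _FakeDist()


def print_rank_0(*args, **kwargs):
    """Prints only on the master process."""
    kwargs.setdefault("flush", True)  # default on, but callers may override
    if dist.is_master():
        print(*args, **kwargs)


buf = io.StringIO()
print_rank_0("hello", file=buf)               # flush defaults to True
print_rank_0("quiet", file=buf, flush=False)  # now legal; old code would raise
```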
