Skip to content

Commit c593f82

Browse files
Use nemo:26.04 for gpu_megatron tests and fix GPTModelExporter.save_pretrained race condition hang
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 2c0e9ad commit c593f82

5 files changed

Lines changed: 10 additions & 7 deletions

File tree

.github/workflows/gpu_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ jobs:
8080
container_image: pytorch:26.01-py3
8181
- example: gpu_megatron
8282
timeout: 45
83-
container_image: nemo:26.02
83+
container_image: nemo:26.04
8484
- example: gpu_trtllm
8585
timeout: 30
8686
container_image: tensorrt-llm/release:1.3.0rc10

modelopt/torch/export/unified_export_megatron.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
from megatron.core.parallel_state import (
7777
get_pipeline_model_parallel_rank,
7878
get_pipeline_model_parallel_world_size,
79+
get_tensor_model_parallel_rank,
7980
)
8081
from megatron.core.ssm.mamba_layer import MambaLayer
8182
from megatron.core.transformer.identity_op import IdentityOp
@@ -258,13 +259,14 @@ def save_pretrained(
258259
"""
259260
pp_rank = get_pipeline_model_parallel_rank()
260261
pp_size = get_pipeline_model_parallel_world_size()
262+
tp_rank = get_tensor_model_parallel_rank()
261263

262264
# We use the 1st PP rank to handle VLM because vision_models
263265
# and vision_proj only exist in the first stage.
264-
is_first_stage_main_rank = pp_rank == 0
266+
is_first_stage_main_rank = pp_rank == 0 and tp_rank == 0
265267
# We use the last PP rank to write the config because
266268
# medusa_heads and eagle_module only exist in the last stage.
267-
is_last_stage_main_rank = pp_rank == pp_size - 1
269+
is_last_stage_main_rank = pp_rank == pp_size - 1 and tp_rank == 0
268270

269271
# Main export process
270272
layer_state_dicts = self.layer_state_dicts

noxfile.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,11 @@ def gpu(session):
119119
session.run("python", "-m", "pytest", "tests/gpu", *_cov_args())
120120

121121

122-
# Container: nvcr.io/nvidia/nemo:26.02 or later
122+
# Container: nvcr.io/nvidia/nemo:26.04 or later
123123
@nox.session(venv_backend="none")
124124
def gpu_megatron(session):
125+
# nemo:26.04 ships transformers 5.x, but tensorrt_llm 1.2.0 does not support it, causing import errors
126+
session.run("python", "-m", "pip", "uninstall", "-y", "tensorrt_llm")
125127
session.run("python", "-m", "pip", "install", "-e", ".[hf,dev-test]")
126128
session.run("python", "-m", "pytest", "tests/gpu_megatron", *_cov_args())
127129

tests/gpu_megatron/torch/export/test_unified_export_megatron.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from _test_utils.torch.megatron.models import get_mcore_gpt_model
2525
from _test_utils.torch.megatron.utils import get_forward
2626
from _test_utils.torch.transformers_models import create_tiny_llama_dir
27+
from safetensors import safe_open
2728
from safetensors.torch import save_file
2829

2930
import modelopt.torch.quantization as mtq
@@ -275,8 +276,6 @@ def _test_qkv_slicing_gqa_tp2(tmp_path, rank, size):
275276

276277
# Verify Q/K/V projections were exported (collect keys from all shard files)
277278
if rank == 0:
278-
from safetensors import safe_open
279-
280279
safetensors_files = list(export_dir.glob("*.safetensors"))
281280
assert safetensors_files, "no safetensors files found in export dir"
282281
keys = []

tools/launcher/tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
uv run python3 -m pytest tests/ -v
2222
2323
Or via nox from Model-Optimizer root:
24-
nox -s "unit-3.12(torch_211-tf_latest)"
24+
nox -s "unit-3.12(torch_211, tf_latest)"
2525
"""
2626

2727
import os

0 commit comments

Comments (0)