Skip to content

Commit c593f82

Browse files
Use nemo:26.04 for gpu_megatron tests and fix GPTModelExporter.save_pretrained race condition hang
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 2c0e9ad commit c593f82

5 files changed

Lines changed: 10 additions & 7 deletions

File tree

.github/workflows/gpu_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ jobs:
8080
container_image: pytorch:26.01-py3
8181
- example: gpu_megatron
8282
timeout: 45
83-
container_image: nemo:26.02
83+
container_image: nemo:26.04
8484
- example: gpu_trtllm
8585
timeout: 30
8686
container_image: tensorrt-llm/release:1.3.0rc10

modelopt/torch/export/unified_export_megatron.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
from megatron.core.parallel_state import (
7777
get_pipeline_model_parallel_rank,
7878
get_pipeline_model_parallel_world_size,
79+
get_tensor_model_parallel_rank,
7980
)
8081
from megatron.core.ssm.mamba_layer import MambaLayer
8182
from megatron.core.transformer.identity_op import IdentityOp
@@ -258,13 +259,14 @@ def save_pretrained(
258259
"""
259260
pp_rank = get_pipeline_model_parallel_rank()
260261
pp_size = get_pipeline_model_parallel_world_size()
262+
tp_rank = get_tensor_model_parallel_rank()
261263

262264
# We use the 1st PP rank to handle VLM because vision_models
263265
# and vision_proj only exist in the first stage.
264-
is_first_stage_main_rank = pp_rank == 0
266+
is_first_stage_main_rank = pp_rank == 0 and tp_rank == 0
265267
# We use the last PP rank to write the config because
266268
# medusa_heads and eagle_module only exist in the last stage.
267-
is_last_stage_main_rank = pp_rank == pp_size - 1
269+
is_last_stage_main_rank = pp_rank == pp_size - 1 and tp_rank == 0
268270

269271
# Main export process
270272
layer_state_dicts = self.layer_state_dicts

noxfile.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,11 @@ def gpu(session):
119119
session.run("python", "-m", "pytest", "tests/gpu", *_cov_args())
120120

121121

122-
# Container: nvcr.io/nvidia/nemo:26.02 or later
122+
# Container: nvcr.io/nvidia/nemo:26.04 or later
123123
@nox.session(venv_backend="none")
124124
def gpu_megatron(session):
125+
# nemo:26.04 ships transformers 5.x, but tensorrt_llm 1.2.0 does not support it, causing import errors
126+
session.run("python", "-m", "pip", "uninstall", "-y", "tensorrt_llm")
125127
session.run("python", "-m", "pip", "install", "-e", ".[hf,dev-test]")
126128
session.run("python", "-m", "pytest", "tests/gpu_megatron", *_cov_args())
127129

tests/gpu_megatron/torch/export/test_unified_export_megatron.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from _test_utils.torch.megatron.models import get_mcore_gpt_model
2525
from _test_utils.torch.megatron.utils import get_forward
2626
from _test_utils.torch.transformers_models import create_tiny_llama_dir
27+
from safetensors import safe_open
2728
from safetensors.torch import save_file
2829

2930
import modelopt.torch.quantization as mtq
@@ -275,8 +276,6 @@ def _test_qkv_slicing_gqa_tp2(tmp_path, rank, size):
275276

276277
# Verify Q/K/V projections were exported (collect keys from all shard files)
277278
if rank == 0:
278-
from safetensors import safe_open
279-
280279
safetensors_files = list(export_dir.glob("*.safetensors"))
281280
assert safetensors_files, "no safetensors files found in export dir"
282281
keys = []

tools/launcher/tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
uv run python3 -m pytest tests/ -v
2222
2323
Or via nox from Model-Optimizer root:
24-
nox -s "unit-3.12(torch_211-tf_latest)"
24+
nox -s "unit-3.12(torch_211, tf_latest)"
2525
"""
2626

2727
import os

0 commit comments

Comments (0)