Skip to content

Commit 6c09a6f

Browse files
Use nemo:26.02 container for megatron gpu tests
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 4e33368 commit 6c09a6f

5 files changed

Lines changed: 50 additions & 56 deletions

File tree

.github/workflows/_example_tests_runner.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ jobs:
4848
- name: Install dependencies
4949
run: |
5050
# use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
51-
pip uninstall -y nvidia-modelopt
5251
python -m pip install ".${{ inputs.pip_install_extras }}"
5352
5453
if [[ "${{ inputs.example }}" == *"diffusers"* ]]; then

.github/workflows/example_tests.yml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ jobs:
118118
pip_install_extras: "[hf,dev-test]"
119119
runner: linux-amd64-gpu-rtxpro6000-latest-2
120120

121-
##### NeMo Example Tests #####
122-
nemo-pr:
121+
##### Megatron Example Tests #####
122+
megatron-pr:
123123
needs: [check-file-changes, wait-checks]
124124
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
125125
strategy: &nemo_strategy
@@ -135,7 +135,7 @@ jobs:
135135
pip_install_extras: "[hf,puzzletron,dev-test]"
136136
runner: linux-amd64-gpu-rtxpro6000-latest-1
137137

138-
nemo-non-pr:
138+
megatron-non-pr:
139139
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
140140
strategy: *nemo_strategy
141141
uses: ./.github/workflows/_example_tests_runner.yml
@@ -160,7 +160,7 @@ jobs:
160160
with:
161161
docker_image: "nvcr.io/nvidia/tensorrt:26.02-py3"
162162
example: ${{ matrix.example }}
163-
pip_install_extras: "[all,dev-test]"
163+
pip_install_extras: "[onnx,hf,dev-test]"
164164
runner: linux-amd64-gpu-rtxpro6000-latest-1
165165

166166
onnx-non-pr:
@@ -171,14 +171,14 @@ jobs:
171171
with:
172172
docker_image: "nvcr.io/nvidia/tensorrt:26.02-py3"
173173
example: ${{ matrix.example }}
174-
pip_install_extras: "[all,dev-test]"
174+
pip_install_extras: "[onnx,hf,dev-test]"
175175
runner: linux-amd64-gpu-rtxpro6000-latest-2
176176

177177
##### Required Check for PR #####
178178
example-pr-required-check:
179179
# Run even if example tests are skipped
180180
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
181-
needs: [check-file-changes, torch-pr, trtllm-pr, nemo-pr, onnx-pr]
181+
needs: [check-file-changes, torch-pr, trtllm-pr, megatron-pr, onnx-pr]
182182
runs-on: ubuntu-latest
183183
steps:
184184
- name: Required GPU tests did not succeed
@@ -187,7 +187,7 @@ jobs:
187187
(needs.check-file-changes.outputs.any_changed == 'true' && (
188188
needs.torch-pr.result != 'success' ||
189189
needs.trtllm-pr.result != 'success' ||
190-
needs.nemo-pr.result != 'success' ||
190+
needs.megatron-pr.result != 'success' ||
191191
needs.onnx-pr.result != 'success'
192192
))
193193
run: exit 1

.github/workflows/gpu_tests.yml

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ on:
88
- cron: "0 0 * * *" # Nightly
99
workflow_dispatch: # On-demand
1010

11-
# Cancel previous runs if new commit is pushed to the same PR
11+
# Cancel previous runs if new commit is pushed to the same PR
1212
concurrency:
1313
group: ${{ github.workflow }}-${{ startsWith(github.ref, 'refs/heads/pull-request/') && github.ref || github.sha }}
1414
cancel-in-progress: true
@@ -42,7 +42,9 @@ jobs:
4242
.github/workflows/gpu_tests.yml
4343
modelopt/**
4444
tests/gpu/**
45+
tests/gpu_megatron/**
4546
tests/gpu_regression/**
47+
tests/gpu_trtllm/**
4648
examples/speculative_decoding/**
4749
examples/dataset/**
4850
modelopt_recipes/general/speculative_decoding/**
@@ -71,13 +73,13 @@ jobs:
7173
timeout: 60
7274
container_image: pytorch:26.01-py3
7375
# tests/gpu/_extensions/test_onnx_extensions.py fails for newer containers until https://github.com/tbenthompson/cppimport/pull/98
74-
- example: gpu-regression
76+
- example: gpu_regression
7577
timeout: 15
7678
container_image: pytorch:26.01-py3
77-
- example: gpu-megatron
79+
- example: gpu_megatron
7880
timeout: 45
79-
container_image: pytorch:26.01-py3
80-
- example: gpu-trtllm
81+
container_image: nemo:26.02
82+
- example: gpu_trtllm
8183
timeout: 30
8284
container_image: tensorrt-llm/release:1.3.0rc10
8385
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
@@ -99,8 +101,14 @@ jobs:
99101
COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
100102
COVERAGE_FILE: ${{ github.workspace }}/.coverage
101103
run: |
102-
pip install tox-current-env
103-
COV_ARGS="--cov" tox -e cuda13-${{ matrix.example }} --current-env
104+
# nemo containers use uv venvs which are not compatible with tox-current-env, so run tests directly
105+
if [[ "${{ matrix.example }}" == "gpu_megatron" ]]; then
106+
python -m pip install -e .[hf,dev-test]
107+
python -m pytest tests/gpu_megatron --cov
108+
else
109+
python -m pip install tox tox-current-env
110+
COV_ARGS="--cov" python -m tox -e cuda13-${{ matrix.example }} --current-env
111+
fi
104112
- name: Upload GPU coverage to Codecov
105113
uses: codecov/codecov-action@v5
106114
with:

tests/gpu_megatron/torch/peft/plugins/test_megatron_peft.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,11 @@
2020
import torch
2121
import torch.nn.init as init
2222
from _test_utils.torch.megatron.models import get_mcore_gpt_model
23-
from _test_utils.torch.megatron.utils import initialize_for_megatron
24-
from megatron.core import dist_checkpointing
23+
from _test_utils.torch.megatron.utils import (
24+
initialize_for_megatron,
25+
load_distributed_checkpoint,
26+
save_distributed_checkpoint,
27+
)
2528

2629
import modelopt.torch.peft as mtpeft
2730
import modelopt.torch.quantization as mtq
@@ -148,20 +151,6 @@
148151
}
149152

150153

151-
def save_distributed_checkpoint(checkpoint_path, gpt_model):
152-
sharded_state_dict = gpt_model.sharded_state_dict(prefix="")
153-
dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
154-
155-
156-
def load_distributed_checkpoint(checkpoint_path, gpt_model):
157-
sharded_state_dict = gpt_model.sharded_state_dict(prefix="")
158-
checkpoint = dist_checkpointing.load(
159-
sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path
160-
)
161-
gpt_model.load_state_dict(checkpoint)
162-
return gpt_model
163-
164-
165154
def _gpt_model_provider(tp_size: int, hidden_size=256, vocab_size=64, meta_device=False):
166155
"""Build the model."""
167156

tox.ini

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
[tox]
22
envlist=
33
pre-commit-all
4-
py312-torch210-tf_latest-unit
4+
py312-torch211-tf_latest-unit
55
cuda13-gpu
6-
cuda13-gpu-regression
7-
cuda13-gpu-megatron
6+
cuda13-gpu_regression
87
skipsdist = True
98
toxworkdir = /tmp/{env:USER}-modelopt-tox
109
passenv =
@@ -57,44 +56,43 @@ commands =
5756
###########################################################
5857
# GPU test environments (Should be used with --current-env)
5958
###########################################################
59+
# Container: nvcr.io/nvidia/pytorch:26.01-py3 or later
6060
[testenv:cuda13-gpu]
6161
commands_pre =
6262
# Install deps here so that it gets installed even in --current-env
63-
pip install --no-build-isolation git+https://github.com/Dao-AILab/fast-hadamard-transform.git
64-
pip install -e .[all,dev-test]
63+
python -m pip install --no-build-isolation git+https://github.com/Dao-AILab/fast-hadamard-transform.git
64+
python -m pip install -e .[all,dev-test]
6565

6666
# Install cupy-cuda13x for INT4 ONNX quantization (default is cupy-cuda12x)
67-
pip uninstall -y cupy-cuda12x
68-
pip install cupy-cuda13x
67+
python -m pip uninstall -y cupy-cuda12x
68+
python -m pip install cupy-cuda13x
6969

7070
# Install mamba and causal-conv1d for Nemotron tests
71-
pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
72-
pip install --no-build-isolation git+https://github.com/Dao-AILab/causal-conv1d.git
71+
python -m pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
72+
python -m pip install --no-build-isolation git+https://github.com/Dao-AILab/causal-conv1d.git
7373
commands =
7474
python -m pytest tests/gpu {env:COV_ARGS:}
7575

76-
[testenv:cuda13-gpu-regression]
76+
[testenv:cuda13-gpu_regression]
7777
commands_pre =
78-
pip install -e .[hf,dev-test]
78+
python -m pip install -e .[hf,dev-test]
7979
commands =
8080
python -m pytest tests/gpu_regression {env:COV_ARGS:}
8181

82-
[testenv:cuda13-gpu-megatron]
82+
# Container: nvcr.io/nvidia/nemo:26.02 or later
83+
# NOTE: tox is bypassed for this env in CI (see gpu_tests.yml) because tox-current-env is
84+
# incompatible with uv venvs, and any new tox env would lack NeMo/Megatron packages from /opt/venv.
85+
# [testenv:cuda13-gpu_megatron]
86+
# commands_pre =
87+
# python -m pip install -e .[hf,dev-test]
88+
# commands =
89+
# python -m pytest tests/gpu_megatron {env:COV_ARGS:}
90+
91+
# Container: nvcr.io/nvidia/tensorrt-llm/release:1.2.0 or later
92+
[testenv:cuda13-gpu_trtllm]
8393
commands_pre =
8494
# Install deps here so that it gets installed even in --current-env
85-
# Temporarily disable latest mcore until we fix its nvidia-resiliency-ext dependency
86-
pip install 'megatron-core<0.17.0'
87-
pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
88-
pip install --no-build-isolation git+https://github.com/Dao-AILab/causal-conv1d.git
89-
pip install -e .[hf,dev-test]
90-
commands =
91-
python -m pytest tests/gpu_megatron {env:COV_ARGS:}
92-
93-
[testenv:cuda13-gpu-trtllm]
94-
# Expected to be run in TRT-LLM container
95-
commands_pre =
96-
# Install deps here so that it gets installed even in --current-env
97-
pip install -e .[hf,dev-test]
95+
python -m pip install -e .[hf,dev-test]
9896
commands =
9997
python -m pytest tests/gpu_trtllm {env:COV_ARGS:}
10098

0 commit comments

Comments
 (0)