|
17 | 17 | # limitations under the License. |
18 | 18 | import argparse |
19 | 19 | import io |
20 | | -import os |
21 | 20 | import shlex |
22 | 21 | from contextlib import redirect_stderr, redirect_stdout |
23 | 22 | from typing import Tuple |
24 | 23 |
|
25 | 24 | import pytest |
26 | | -from lightning.fabric.plugins.environments.lightning import find_free_network_port |
27 | 25 | from nemo import lightning as nl |
28 | 26 | from transformer_engine.pytorch.fp8 import check_fp8_support |
29 | 27 |
|
@@ -152,98 +150,6 @@ def test_train_evo2_stops(tmp_path): |
152 | 150 | assert "train_step_timing in s" in trainer.logged_metrics |
153 | 151 |
|
154 | 152 |
|
155 | | -@pytest.mark.slow |
156 | | -@pytest.mark.parametrize("model_size", ["7b_nv", "7b_arc_longcontext"]) |
157 | | -def test_train_single_gpu(tmp_path, model_size: str): |
158 | | - """ |
159 | | - This test runs them single gpu evo2 training command with sample data in a temporary directory. |
160 | | - """ |
161 | | - num_steps = 7 |
162 | | - open_port = find_free_network_port() |
163 | | - # a local copy of the environment |
164 | | - env = dict(**os.environ) |
165 | | - env["MASTER_PORT"] = str(open_port) |
166 | | - # Part 1: Make sure training runs for only --early-stop-on-step steps |
167 | | - additional_args1 = [ |
168 | | - "--result-dir", |
169 | | - str(tmp_path), |
170 | | - "--model-size", |
171 | | - model_size, |
172 | | - "--num-layers", |
173 | | - str(4), |
174 | | - "--hybrid-override-pattern", |
175 | | - "SDH*", |
176 | | - "--no-activation-checkpointing", |
177 | | - "--use-precision-aware-optimizer", |
178 | | - "--add-bias-output", |
179 | | - "--bf16-main-grads", |
180 | | - "--val-check-interval", |
181 | | - str(5), |
182 | | - "--max-steps", |
183 | | - str(num_steps), |
184 | | - "--early-stop-on-step", |
185 | | - str(num_steps - 2), |
186 | | - "--warmup-steps", |
187 | | - str(1), |
188 | | - "--seq-length", |
189 | | - str(128), |
190 | | - "--wandb-offline", |
191 | | - "--wandb-anonymous", |
192 | | - "--mock-data", |
193 | | - ] |
194 | | - args1 = parse_args(args=additional_args1) |
195 | | - stdout_buf, stderr_buf = io.StringIO(), io.StringIO() |
196 | | - with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf), distributed_model_parallel_state(): |
197 | | - train(args=args1) |
198 | | - train_stdout = stdout_buf.getvalue() |
199 | | - train_lines = train_stdout.split("\n") |
200 | | - iteration_lines = [line for line in train_lines if "Training epoch" in line] |
201 | | - assert len(iteration_lines) == 5 |
202 | | - iteration_line_1 = iteration_lines[0] |
203 | | - # No strong opinion on how the total should be computed in the case of early stopping. We allow either for now |
204 | | - # unless there is an issue, such as with the LR scheduler... |
205 | | - # TODO: Add a test somewhere that covers that early stopping callback has no impact on the LR scheduler |
206 | | - assert "iteration 0/4" in iteration_line_1 or "iteration 0/6" in iteration_line_1 |
207 | | - iteration_line_final = iteration_lines[-1] |
208 | | - assert "iteration 4/4" in iteration_line_final or "iteration 4/6" in iteration_line_final |
209 | | - |
210 | | - # Part 2: Make sure training picks up where it left off |
211 | | - additional_args2 = [ |
212 | | - "--result-dir", |
213 | | - str(tmp_path), |
214 | | - "--model-size", |
215 | | - model_size, |
216 | | - "--num-layers", |
217 | | - str(4), |
218 | | - "--hybrid-override-pattern", |
219 | | - "SDH*", |
220 | | - "--no-activation-checkpointing", |
221 | | - "--use-precision-aware-optimizer", |
222 | | - "--add-bias-output", |
223 | | - "--max-steps", |
224 | | - str(num_steps), |
225 | | - "--warmup-steps", |
226 | | - str(1), |
227 | | - "--seq-length", |
228 | | - str(128), |
229 | | - "--wandb-offline", |
230 | | - "--wandb-anonymous", |
231 | | - "--mock-data", |
232 | | - ] |
233 | | - args2 = parse_args(args=additional_args2) |
234 | | - stdout_buf, stderr_buf = io.StringIO(), io.StringIO() |
235 | | - with redirect_stdout(stdout_buf), redirect_stderr(stderr_buf), distributed_model_parallel_state(): |
236 | | - train(args=args2) |
237 | | - train_stdout = stdout_buf.getvalue() |
238 | | - train_lines = train_stdout.split("\n") |
239 | | - iteration_lines = [line for line in train_lines if "Training epoch" in line] |
240 | | - assert len(iteration_lines) == 2 |
241 | | - iteration_line_1 = iteration_lines[0] |
242 | | - assert "iteration 5/6" in iteration_line_1 |
243 | | - iteration_line_2 = iteration_lines[1] |
244 | | - assert "iteration 6/6" in iteration_line_2 |
245 | | - |
246 | | - |
247 | 153 | @pytest.mark.parametrize( |
248 | 154 | "additional_args", |
249 | 155 | [ |
|
0 commit comments