|
34 | 34 |
|
35 | 35 | _fp8_available, _fp8_reason = check_fp8_support() |
36 | 36 |
|
# Skip marker for tests that launch more than one process: the test is skipped
# unless CUDA is available and at least two devices are visible.
requires_multi_gpu = pytest.mark.skipif(
    not (torch.cuda.is_available() and torch.cuda.device_count() >= 2),
    reason="Test requires at least 2 GPUs",
)
| 41 | + |
37 | 42 |
|
38 | 43 | @pytest.fixture(scope="session") |
39 | 44 | def session_temp_dir(tmp_path_factory): |
@@ -175,26 +180,70 @@ def test_accelerate_launch(accelerate_config, tmp_path): |
175 | 180 | assert train_py.exists(), f"train.py not found at {train_py}" |
176 | 181 | assert accelerate_config_path.exists(), f"{accelerate_config} not found at {accelerate_config_path}" |
177 | 182 |
|
178 | | - # Run 'accelerate launch train.py' as a subprocess |
179 | | - env = os.environ.copy() |
180 | | - |
181 | | - subprocess.run( |
182 | | - [ |
183 | | - sys.executable, |
184 | | - "-m", |
185 | | - "accelerate.commands.launch", |
186 | | - "--config_file", |
187 | | - str(accelerate_config_path), |
188 | | - str(train_py), |
189 | | - "--config-name", |
190 | | - "L0_sanity", |
191 | | - f"trainer.output_dir={tmp_path}", |
192 | | - ], |
193 | | - cwd=recipe_dir, |
| 183 | + cmd = [ |
| 184 | + sys.executable, |
| 185 | + "-m", |
| 186 | + "accelerate.commands.launch", |
| 187 | + "--config_file", |
| 188 | + str(accelerate_config_path), |
| 189 | + str(train_py), |
| 190 | + "--config-name", |
| 191 | + "L0_sanity", |
| 192 | + f"trainer.output_dir={tmp_path}", |
| 193 | + "trainer.save_steps=1000", |
| 194 | + "trainer.eval_steps=1000", |
| 195 | + "trainer.do_eval=false", |
| 196 | + ] |
| 197 | + |
| 198 | + result = subprocess.run( |
| 199 | + cmd, |
| 200 | + check=False, |
| 201 | + text=True, |
194 | 202 | stdout=subprocess.PIPE, |
195 | 203 | stderr=subprocess.PIPE, |
| 204 | + timeout=240, |
| 205 | + ) |
| 206 | + |
| 207 | + if result.returncode != 0: |
| 208 | + print(f"STDOUT:\n{result.stdout}") |
| 209 | + print(f"STDERR:\n{result.stderr}") |
| 210 | + pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}") |
| 211 | + |
| 212 | + |
@requires_multi_gpu
def test_accelerate_launch_multi_gpu(tmp_path):
    """Test that ``accelerate launch`` runs train.py successfully with two processes.

    Launches ``train.py`` through ``accelerate.commands.launch`` with
    ``--num_processes 2`` and the ``bf16_config.yaml`` accelerate config, using
    the ``L0_sanity`` config name. Checkpoint saving and evaluation are pushed
    out to step 1000 / disabled via the ``trainer.*`` overrides below.

    Args:
        tmp_path: pytest-provided temporary directory, passed to the training
            script as ``trainer.output_dir``.
    """
    # Find the recipe directory, train.py and the accelerate config.
    recipe_dir = Path(__file__).parent
    train_py = recipe_dir / "train.py"
    accelerate_config_path = recipe_dir / "accelerate_config" / "bf16_config.yaml"

    # Fail early with a readable message instead of an opaque launcher error,
    # mirroring the checks in the single-process launch test.
    assert train_py.exists(), f"train.py not found at {train_py}"
    assert accelerate_config_path.exists(), f"bf16_config.yaml not found at {accelerate_config_path}"

    cmd = [
        sys.executable,
        "-m",
        "accelerate.commands.launch",
        "--config_file",
        str(accelerate_config_path),
        "--num_processes",
        "2",
        str(train_py),
        "--config-name",
        "L0_sanity",
        f"trainer.output_dir={tmp_path}",
        "trainer.save_steps=1000",
        "trainer.eval_steps=1000",
        "trainer.do_eval=false",
    ]

    try:
        result = subprocess.run(
            cmd,
            check=False,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=240,
        )
    except subprocess.TimeoutExpired as exc:
        # Surface whatever the child produced before it was killed; without this
        # a hung distributed launch fails with no diagnostics at all.
        for label, captured in (("STDOUT", exc.stdout), ("STDERR", exc.stderr)):
            # TimeoutExpired output can be bytes even when text=True was requested.
            if isinstance(captured, bytes):
                captured = captured.decode(errors="replace")
            print(f"{label}:\n{captured or ''}")
        pytest.fail(f"Command:\n{' '.join(cmd)}\ntimed out after {exc.timeout} seconds")

    if result.returncode != 0:
        print(f"STDOUT:\n{result.stdout}")
        print(f"STDERR:\n{result.stderr}")
        pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
0 commit comments