Skip to content

Commit a3577ac

Browse files
committed
fix(esm2_accelerate_te): skip GPU-dependent tests when CUDA is unavailable
Tests using FP8 (TransformerEngine), HuggingFace models with bf16=True, and the resume-from-checkpoint test all require CUDA. When the CI runner has a broken or unavailable GPU, these tests now skip gracefully instead of failing with CUDA errors.

Add the requires_gpu marker to:
- test_te_with_fp8_config (TE asserts CUDA availability for FP8)
- test_hf_with_default_config (TrainingArguments bf16 validation)
- test_hf_with_fsdp2_config (same TrainingArguments bf16 validation)
- test_train_can_resume_from_checkpoint (calls cuda_setDevice)

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
1 parent 2ebccb1 commit a3577ac

3 files changed

Lines changed: 18 additions & 1 deletion

File tree

bionemo-recipes/recipes/esm2_accelerate_te/tests/launch.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
import train
2626

2727

28+
requires_gpu = pytest.mark.skipif(
29+
not torch.cuda.is_available(),
30+
reason="Test requires a GPU",
31+
)
32+
2833
requires_multi_gpu = pytest.mark.skipif(
2934
not torch.cuda.is_available() or torch.cuda.device_count() < 2,
3035
reason="Test requires at least 2 GPUs",

bionemo-recipes/recipes/esm2_accelerate_te/tests/test_accelerate_esm2.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# limitations under the License.
1515

1616
# Local helper function import, resolved in conftest.py
17-
from launch import launch_accelerate, requires_multi_gpu
17+
from launch import launch_accelerate, requires_gpu, requires_multi_gpu
1818

1919

2020
def test_te_with_default_config(tmp_path):
@@ -37,16 +37,19 @@ def test_te_with_dynamo_config(tmp_path):
3737
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
3838

3939

40+
@requires_gpu
4041
def test_te_with_fp8_config(tmp_path):
4142
train_loss = launch_accelerate("fp8.yaml", tmp_path, 1, "L0_sanity", "model_tag=./example_8m_checkpoint")
4243
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
4344

4445

46+
@requires_gpu
4547
def test_hf_with_default_config(tmp_path):
4648
train_loss = launch_accelerate("default.yaml", tmp_path, 1, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
4749
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
4850

4951

52+
@requires_gpu
5053
def test_hf_with_fsdp2_config(tmp_path):
5154
train_loss = launch_accelerate("fsdp2_hf.yaml", tmp_path, 1, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
5255
assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"

bionemo-recipes/recipes/esm2_accelerate_te/tests/test_train_resume.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,20 @@
1818
import shutil
1919
from pathlib import Path
2020

21+
import pytest
22+
import torch
2123
from hydra import compose, initialize_config_dir
2224

2325
import train
2426

2527

28+
requires_gpu = pytest.mark.skipif(
29+
not torch.cuda.is_available(),
30+
reason="Test requires a GPU",
31+
)
32+
33+
34+
@requires_gpu
2635
def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
2736
"""Test that train.py runs successfully with sanity config and creates expected outputs."""
2837

0 commit comments

Comments
 (0)