fix(esm2_accelerate_te): skip GPU-dependent tests when CUDA is unavailable

svc-bionemo · svc-bionemo · commit a3577ace6887 · 2026-04-30T05:04:07.000-07:00
Tests using FP8 (TransformerEngine), HuggingFace models with bf16=True,
and the resume-from-checkpoint test all require CUDA. When the CI runner
has a broken or unavailable GPU, these tests now skip gracefully instead
of failing with CUDA errors.

Add requires_gpu marker to:
- test_te_with_fp8_config (TE asserts CUDA availability for FP8)
- test_hf_with_default_config (TrainingArguments bf16 validation)
- test_hf_with_fsdp2_config (same)
- test_train_can_resume_from_checkpoint (calls cuda_setDevice)

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/tests/launch.py b/bionemo-recipes/recipes/esm2_accelerate_te/tests/launch.py
@@ -25,6 +25,11 @@
 import train
 
 
+requires_gpu = pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="Test requires a GPU",
+)
+
 requires_multi_gpu = pytest.mark.skipif(
     not torch.cuda.is_available() or torch.cuda.device_count() < 2,
     reason="Test requires at least 2 GPUs",
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/tests/test_accelerate_esm2.py b/bionemo-recipes/recipes/esm2_accelerate_te/tests/test_accelerate_esm2.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 # Local helper function import, resolved in conftest.py
-from launch import launch_accelerate, requires_multi_gpu
+from launch import launch_accelerate, requires_gpu, requires_multi_gpu
 
 
 def test_te_with_default_config(tmp_path):
@@ -37,16 +37,19 @@ def test_te_with_dynamo_config(tmp_path):
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
 
 
+@requires_gpu
 def test_te_with_fp8_config(tmp_path):
     train_loss = launch_accelerate("fp8.yaml", tmp_path, 1, "L0_sanity", "model_tag=./example_8m_checkpoint")
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
 
 
+@requires_gpu
 def test_hf_with_default_config(tmp_path):
     train_loss = launch_accelerate("default.yaml", tmp_path, 1, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
 
 
+@requires_gpu
 def test_hf_with_fsdp2_config(tmp_path):
     train_loss = launch_accelerate("fsdp2_hf.yaml", tmp_path, 1, "L0_sanity", "model_tag=facebook/esm2_t6_8M_UR50D")
     assert train_loss < 3.0, f"Final train_loss {train_loss} should be less than 3.0"
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/tests/test_train_resume.py b/bionemo-recipes/recipes/esm2_accelerate_te/tests/test_train_resume.py
@@ -18,11 +18,20 @@
 import shutil
 from pathlib import Path
 
+import pytest
+import torch
 from hydra import compose, initialize_config_dir
 
 import train
 
 
+requires_gpu = pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="Test requires a GPU",
+)
+
+
+@requires_gpu
 def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
     """Test that train.py runs successfully with sanity config and creates expected outputs."""