Clean up param names, tune skips, and try to speed up resumption

jstjohn · jstjohn · commit b4cb0dea90a6 · 2025-09-10T20:54:53.000Z
Signed-off-by: John St John <jstjohn@nvidia.com>
diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/predict.py
@@ -507,12 +507,8 @@ def predict(
         resume_if_exists=True,
         resume_ignore_no_checkpoint=False,
         resume_past_end=False,
-        restore_config=nl.RestoreConfig(
-            path=str(ckpt_dir),  # NeMo expects a string path.
-            load_model_state=True,
-            load_optim_state=False,
-            load_artifacts=False,
-        ),
+        resume_from_path=str(ckpt_dir),
+        restore_config=None,
     )
     tokenizer = get_nmt_tokenizer("byte-level")
 
diff --git a/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_predict.py b/sub-packages/bionemo-evo2/tests/bionemo/evo2/run/test_predict.py
@@ -30,6 +30,7 @@
 from bionemo.core.data.load import load
 from bionemo.llm.lightning import batch_collator
 from bionemo.testing.data.fasta import ALU_SEQUENCE, create_fasta_file
+from bionemo.testing.torch import check_fp8_support
 
 
 def is_a6000_gpu() -> bool:
@@ -74,11 +75,18 @@ def checkpoint_7b_1m_path() -> Path:
 @pytest.mark.parametrize(
     "ddp,pp,tp,wi",
     [
-        (1, 1, 1, "epoch"),
-        (2, 1, 1, "epoch"),
-        (2, 1, 1, "batch"),
-        (1, 2, 1, "epoch"),
-        (1, 1, 2, "epoch"),
+        pytest.param(1, 1, 1, "epoch", id="ddp=1,pp=1,tp=1,wi=epoch"),
+        pytest.param(2, 1, 1, "epoch", id="ddp=2,pp=1,tp=1,wi=epoch"),
+        pytest.param(2, 1, 1, "batch", id="ddp=2,pp=1,tp=1,wi=batch"),
+        pytest.param(
+            1,
+            2,
+            1,
+            "epoch",
+            id="ddp=1,pp=2,tp=1,wi=epoch",
+            marks=pytest.mark.skip("Pipeline parallelism test currently hangs."),
+        ),
+        pytest.param(1, 1, 2, "epoch", id="ddp=1,pp=1,tp=2,wi=epoch"),
     ],
 )
 def test_predict_evo2_runs(
@@ -177,15 +185,29 @@ def test_predict_evo2_runs(
 @pytest.mark.parametrize(
     "ddp,cp,pp,tp,fp8,wi",
     [
-        (1, 1, 1, 1, False, "epoch"),
-        (2, 1, 1, 1, False, "epoch"),
-        (2, 1, 1, 1, False, "batch"),  # simulate a large prediction run with dp parallelism
-        (1, 2, 1, 1, False, "epoch"),
-        (1, 2, 1, 1, False, "batch"),
-        (1, 1, 2, 1, False, "epoch"),
-        (1, 1, 2, 1, True, "epoch"),  # Cover case where FP8 was not supported with TP=2
-        (1, 1, 1, 2, False, "epoch"),
+        pytest.param(1, 1, 1, 1, False, "epoch", id="ddp=1,cp=1,pp=1,tp=1,fp8=False,wi=epoch"),
+        pytest.param(2, 1, 1, 1, False, "epoch", id="ddp=2,cp=1,pp=1,tp=1,fp8=False,wi=epoch"),
+        pytest.param(
+            2, 1, 1, 1, False, "batch", id="ddp=2,cp=1,pp=1,tp=1,fp8=False,wi=batch"
+        ),  # simulate a large prediction run with dp parallelism
+        pytest.param(1, 2, 1, 1, False, "epoch", id="ddp=1,cp=2,pp=1,tp=1,fp8=False,wi=epoch"),
+        pytest.param(1, 2, 1, 1, False, "batch", id="ddp=1,cp=2,pp=1,tp=1,fp8=False,wi=batch"),
+        pytest.param(
+            1,
+            1,
+            2,
+            1,
+            False,
+            "epoch",
+            id="ddp=1,cp=1,pp=2,tp=1,fp8=False,wi=epoch",
+            marks=pytest.mark.skip("Pipeline parallelism test currently hangs."),
+        ),
+        pytest.param(
+            1, 1, 1, 2, True, "epoch", id="ddp=1,cp=1,pp=1,tp=2,fp8=True,wi=epoch"
+        ),  # Cover case where FP8 was not supported with TP=2
+        pytest.param(1, 1, 1, 2, False, "epoch", id="ddp=1,cp=1,pp=1,tp=2,fp8=False,wi=epoch"),
     ],
+    ids=lambda x: f"ddp={x[0]},cp={x[1]},pp={x[2]},tp={x[3]},fp8={x[4]},wi={x[5]}",
 )
 def test_predict_evo2_runs_with_log_probs(
     tmp_path,
@@ -210,6 +232,9 @@ def test_predict_evo2_runs_with_log_probs(
     world_size = ddp * cp * pp * tp
     if world_size > torch.cuda.device_count():
         pytest.skip(f"World size {world_size} is less than the number of GPUs {torch.cuda.device_count()}")
+    is_fp8_supported, _, _ = check_fp8_support(torch.cuda.current_device())
+    if not is_fp8_supported and fp8:
+        pytest.skip("FP8 is not supported on this GPU.")
 
     fasta_file_path = tmp_path / "test.fasta"
     create_fasta_file(
@@ -221,6 +246,7 @@ def test_predict_evo2_runs_with_log_probs(
     if is_a6000_gpu():
         # Fix hanging issue on A6000 GPUs with multi-gpu tests
         env["NCCL_P2P_DISABLE"] = "1"
+
     fp8_option = "--fp8" if fp8 else ""
     # Build the command string.
     # Note: The command assumes that `train_evo2` is in your PATH.
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/utils/callbacks.py b/sub-packages/bionemo-llm/src/bionemo/llm/utils/callbacks.py
@@ -16,6 +16,8 @@
 
 import os
 from typing import Any, Literal, Sequence
+
+
 try:  # Python 3.12+
     from typing import override
 except ImportError:  # Python < 3.12