add partial conv tests to esm2_accelerate recipe (#1122)

pstjohn · web-flow · commit 0d306524b107 · 2025-09-08T17:26:05.000Z
Adds partial convergence tests to the esm2_accelerate recipe, similar to
those used in the mfsdp recipe.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- New Features
  - Added configurable warmup steps (default 0) to training.
- Chores
  - Increased default training duration (more steps).
  - Reduced frequency of saving, evaluation, and logging to lower
    overhead.
- Tests
  - Improved distributed run stability by using randomly chosen ports,
    which reduces port collisions between runs.
  - Added parsing of final training loss from output and assertions to
    ensure expected convergence.
  - Streamlined test overrides for faster, deterministic sanity runs.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/.devcontainer/recipes/requirements.txt b/.devcontainer/recipes/requirements.txt
@@ -7,6 +7,6 @@ megatron-fsdp==0.1.0rc0
 torchmetrics
 tqdm
 transformer_engine
-transformers @ git+https://github.com/huggingface/transformers.git
+transformers
 typer
 wandb
diff --git a/recipes/esm2_accelerate/hydra_config/L0_sanity.yaml b/recipes/esm2_accelerate/hydra_config/L0_sanity.yaml
@@ -1,14 +1,17 @@
 defaults:
   - defaults
+  - _self_
 
 model_tag: "nvidia/esm2_t6_8M_UR50D"
-stop_after_n_steps: 4
+stop_after_n_steps: 250
+
 trainer:
   run_name: "esm2_t6_8M_UR50D_sanity"
   per_device_train_batch_size: 2
   per_device_eval_batch_size: 2
-  save_steps: 2
-  eval_steps: 2
-  logging_steps: 1
+  save_steps: 1000
+  eval_steps: 1000
+  logging_steps: 10
   report_to: "none"
   dataloader_num_workers: 0
+  warmup_steps: 0
diff --git a/recipes/esm2_accelerate/requirements.txt b/recipes/esm2_accelerate/requirements.txt
@@ -3,5 +3,5 @@ datasets
 deepspeed
 hydra-core
 torchmetrics
-transformers @ git+https://github.com/huggingface/transformers.git
+transformers
 wandb
diff --git a/recipes/esm2_accelerate/test_train.py b/recipes/esm2_accelerate/test_train.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import os
+import random
 import re
 import shutil
 import subprocess
@@ -40,6 +41,38 @@
 )
 
 
+def extract_final_train_loss(output_text: str) -> float:
+    """Extract the last reported ``train_loss`` value from training output.
+
+    Dictionary-style log entries (``{..., 'train_loss': 2.5, ...}``) are searched first;
+    if none are present, any quoted ``train_loss`` key followed by a number is accepted.
+
+    Args:
+        output_text: Combined stdout and stderr from the training process.
+
+    Returns:
+        The last ``train_loss`` value found, as a float.
+
+    Raises:
+        ValueError: If no ``train_loss`` entry with a numeric value is found.
+    """
+    # Accept signs, exponents (e.g. 1.5e-05) and nan/inf so a tiny or diverged loss is not
+    # truncated (``1.5e-05`` read as ``1.5``) or mistaken for a missing log line.
+    number = r"[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf(?:inity)?)"
+    key = r"['\"]train_loss['\"]:\s*"
+
+    # Prefer matches that sit inside a flat ``{...}`` summary dictionary.
+    matches = re.findall(r"\{[^{}]*" + key + "(" + number + r")[^{}]*\}", output_text)
+    if not matches:
+        # Fallback: a quoted train_loss key in any context.
+        matches = re.findall(key + "(" + number + ")", output_text)
+    if not matches:
+        raise ValueError("No train_loss found in training output")
+
+    # Later log lines supersede earlier ones, so the last match is the final loss.
+    return float(matches[-1])
+
+
 def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
     """Test that train.py runs successfully with sanity config and creates expected outputs."""
 
@@ -51,11 +84,20 @@ def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
     monkeypatch.setenv("RANK", "0")
     monkeypatch.setenv("WORLD_SIZE", "1")
     monkeypatch.setenv("MASTER_ADDR", "localhost")
-    monkeypatch.setenv("MASTER_PORT", "29500")
+    monkeypatch.setenv("MASTER_PORT", f"{random.randint(20000, 40000)}")
     monkeypatch.setenv("WANDB_MODE", "disabled")
 
     with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
-        sanity_config = compose(config_name="L0_sanity", overrides=[f"trainer.output_dir={tmp_path}"])
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"trainer.output_dir={tmp_path}",
+                "stop_after_n_steps=4",
+                "trainer.do_eval=False",
+                "trainer.save_steps=2",
+                f"hydra.run.dir={tmp_path}/outputs",
+            ],
+        )
 
     main(sanity_config)
 
@@ -155,11 +197,15 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
         str(accelerate_config_path),
         "--num_processes",
         "1",
+        "--main_process_port",
+        f"{random.randint(20000, 40000)}",
         str(train_py),
         "--config-name",
         "L0_sanity.yaml",
         f"model_tag={model_tag}",
         f"trainer.output_dir={tmp_path}",
+        f"hydra.run.dir={tmp_path}/outputs",
+        "trainer.do_eval=False",
     ]
 
     result = subprocess.run(
@@ -176,6 +222,17 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
         print(f"STDERR:\n{result.stderr}")
         pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
 
+    # Parse the training output to check final train_loss
+    combined_output = result.stdout + result.stderr
+    try:
+        final_train_loss = extract_final_train_loss(combined_output)
+        print(f"Final train_loss: {final_train_loss}")
+        assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
+    except ValueError as e:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Failed to extract train_loss from output: {e}")
+
 
 @requires_multi_gpu
 @pytest.mark.parametrize(
@@ -186,9 +243,11 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
         # modeling_esm_te import seems to fix it.
         # ("fsdp1_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
         ("fsdp2_te.yaml", "nvidia/esm2_t6_8M_UR50D"),
-        ("default.yaml", "facebook/esm2_t6_8M_UR50D"),
-        ("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
-        ("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
+        # TODO: (BIONEMO-2761). These tests were broken by https://github.com/huggingface/transformers/pull/40370, but
+        # oddly the single-GPU tests still seem to pass. Changing the attention_backend doesn't seem to help.
+        # ("default.yaml", "facebook/esm2_t6_8M_UR50D"),
+        # ("fsdp1_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
+        # ("fsdp2_hf.yaml", "facebook/esm2_t6_8M_UR50D"),
     ],
 )
 def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
@@ -211,11 +270,15 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
         str(accelerate_config_path),
         "--num_processes",
         "2",
+        "--main_process_port",
+        f"{random.randint(20000, 40000)}",
         str(train_py),
         "--config-name",
         "L0_sanity.yaml",
         f"model_tag={model_tag}",
         f"trainer.output_dir={tmp_path}",
+        f"hydra.run.dir={tmp_path}/outputs",
+        "trainer.do_eval=False",
     ]
 
     result = subprocess.run(
@@ -231,3 +294,14 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
         print(f"STDOUT:\n{result.stdout}")
         print(f"STDERR:\n{result.stderr}")
         pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
+
+    # Parse the training output to check final train_loss
+    combined_output = result.stdout + result.stderr
+    try:
+        final_train_loss = extract_final_train_loss(combined_output)
+        print(f"Final train_loss: {final_train_loss}")
+        assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
+    except ValueError as e:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Failed to extract train_loss from output: {e}")