Add partial convergence tests to esm2_accelerate recipe

pstjohn · pstjohn · commit 51c8a67a51dc · 2025-09-08T06:36:06.000-07:00
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
diff --git a/recipes/esm2_accelerate/hydra_config/L0_sanity.yaml b/recipes/esm2_accelerate/hydra_config/L0_sanity.yaml
@@ -1,14 +1,17 @@
 defaults:
   - defaults
+  - _self_
 
 model_tag: "nvidia/esm2_t6_8M_UR50D"
-stop_after_n_steps: 4
+stop_after_n_steps: 250
+
 trainer:
   run_name: "esm2_t6_8M_UR50D_sanity"
   per_device_train_batch_size: 2
   per_device_eval_batch_size: 2
-  save_steps: 2
-  eval_steps: 2
-  logging_steps: 1
+  save_steps: 1000
+  eval_steps: 1000
+  logging_steps: 10
   report_to: "none"
   dataloader_num_workers: 0
+  warmup_steps: 0
diff --git a/recipes/esm2_accelerate/test_train.py b/recipes/esm2_accelerate/test_train.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import os
+import random
 import re
 import shutil
 import subprocess
@@ -40,6 +41,38 @@
 )
 
 
+def extract_final_train_loss(output_text: str) -> float:
+    """Parse the training output and return the final ``train_loss`` value.
+
+    Dict-style log lines (``{..., 'train_loss': 2.5, ...}``) are preferred; if none
+    are present, any quoted ``train_loss`` key is accepted. Values may be signed
+    and may use scientific notation (e.g. ``9.5e-05``).
+
+    Args:
+        output_text: Combined stdout and stderr from training process
+
+    Returns:
+        Final train_loss value as float
+
+    Raises:
+        ValueError: If no train_loss found or parsing fails
+    """
+    # Full float literal: a digits-and-dots pattern would read 9.5e-05 as 9.5.
+    # The trailing lookahead rejects malformed tokens such as 1.2.3 outright.
+    number = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?(?![\w.])"
+    keyed_value = r'[\'"]train_loss[\'"]:\s*(' + number + ")"
+
+    # Prefer a flat {...} dict containing the key, else the key in any context.
+    matches = re.findall(r"\{[^{}]*" + keyed_value + r"[^{}]*\}", output_text)
+    if not matches:
+        matches = re.findall(keyed_value, output_text)
+    if not matches:
+        raise ValueError("No train_loss found in training output")
+
+    # Return the last (final) train_loss value found
+    return float(matches[-1])
+
+
 def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
     """Test that train.py runs successfully with sanity config and creates expected outputs."""
 
@@ -51,11 +84,19 @@ def test_train_can_resume_from_checkpoint(monkeypatch, tmp_path: Path):
     monkeypatch.setenv("RANK", "0")
     monkeypatch.setenv("WORLD_SIZE", "1")
     monkeypatch.setenv("MASTER_ADDR", "localhost")
-    monkeypatch.setenv("MASTER_PORT", "29500")
+    monkeypatch.setenv("MASTER_PORT", f"{random.randint(20000, 40000)}")
     monkeypatch.setenv("WANDB_MODE", "disabled")
 
     with initialize_config_dir(config_dir=str(recipe_dir / "hydra_config"), version_base="1.2"):
-        sanity_config = compose(config_name="L0_sanity", overrides=[f"trainer.output_dir={tmp_path}"])
+        sanity_config = compose(
+            config_name="L0_sanity",
+            overrides=[
+                f"trainer.output_dir={tmp_path}",
+                "stop_after_n_steps=4",
+                "trainer.do_eval=False",
+                "trainer.save_steps=2",
+            ],
+        )
 
     main(sanity_config)
 
@@ -155,11 +196,14 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
         str(accelerate_config_path),
         "--num_processes",
         "1",
+        "--main_process_port",
+        f"{random.randint(20000, 40000)}",
         str(train_py),
         "--config-name",
         "L0_sanity.yaml",
         f"model_tag={model_tag}",
         f"trainer.output_dir={tmp_path}",
+        "trainer.do_eval=False",
     ]
 
     result = subprocess.run(
@@ -176,6 +220,17 @@ def test_accelerate_launch(accelerate_config, model_tag, tmp_path):
         print(f"STDERR:\n{result.stderr}")
         pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
 
+    # Parse the training output to check final train_loss
+    combined_output = result.stdout + result.stderr
+    try:
+        final_train_loss = extract_final_train_loss(combined_output)
+        print(f"Final train_loss: {final_train_loss}")
+        assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
+    except ValueError as e:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Failed to extract train_loss from output: {e}")
+
 
 @requires_multi_gpu
 @pytest.mark.parametrize(
@@ -211,11 +266,14 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
         str(accelerate_config_path),
         "--num_processes",
         "2",
+        "--main_process_port",
+        f"{random.randint(20000, 40000)}",
         str(train_py),
         "--config-name",
         "L0_sanity.yaml",
         f"model_tag={model_tag}",
         f"trainer.output_dir={tmp_path}",
+        "trainer.do_eval=False",
     ]
 
     result = subprocess.run(
@@ -231,3 +289,14 @@ def test_accelerate_launch_multi_gpu(accelerate_config, model_tag, tmp_path):
         print(f"STDOUT:\n{result.stdout}")
         print(f"STDERR:\n{result.stderr}")
         pytest.fail(f"Command:\n{' '.join(cmd)}\nfailed with exit code {result.returncode}")
+
+    # Parse the training output to check final train_loss
+    combined_output = result.stdout + result.stderr
+    try:
+        final_train_loss = extract_final_train_loss(combined_output)
+        print(f"Final train_loss: {final_train_loss}")
+        assert final_train_loss < 3.0, f"Final train_loss {final_train_loss} should be less than 3.0"
+    except ValueError as e:
+        print(f"STDOUT:\n{result.stdout}")
+        print(f"STDERR:\n{result.stderr}")
+        pytest.fail(f"Failed to extract train_loss from output: {e}")
diff --git a/recipes/esm2_accelerate/train.py b/recipes/esm2_accelerate/train.py
@@ -38,7 +38,12 @@ def main(args: DictConfig):
     config = AutoConfig.from_pretrained(args.model_tag, trust_remote_code=True)
     config.max_seq_length = args.max_seq_length
     config.micro_batch_size = args.trainer.per_device_train_batch_size
-    model = AutoModelForMaskedLM.from_config(config, trust_remote_code=True, torch_dtype=torch.bfloat16)
+
+    model = AutoModelForMaskedLM.from_config(
+        config,
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+    )
 
     train_dataset, eval_dataset, data_collator = create_datasets_and_collator(
         tokenizer_name=args.model_tag,
@@ -57,7 +62,7 @@ def main(args: DictConfig):
         callbacks=[StopAfterNStepsCallback(args.stop_after_n_steps)],
     )
 
-    logger.info("ACCELERATE STATE:\n%s\n", trainer.accelerator.state)
+    train_result, eval_result = None, None
 
     if training_args.do_train:
         Path(training_args.output_dir).mkdir(parents=True, exist_ok=True)
@@ -72,7 +77,11 @@ def main(args: DictConfig):
         trainer.save_model(str(Path(training_args.output_dir) / "checkpoint-last"))
 
     if training_args.do_eval:
-        trainer.evaluate()
+        eval_result = trainer.evaluate()
+        logger.info("Evaluation complete. Metrics: %s", eval_result)
+        trainer.save_metrics("eval", eval_result)
+
+    return train_result, eval_result
 
 
 if __name__ == "__main__":