fix(modal): apply fixes from first successful Modal training run

abrichr · claude · abrichr · commit 120c903065b0 · 2026-02-24T14:03:14.000-05:00
- Add `serialized=True` to `@app.function` so the function can be defined outside global scope
- Auto-create volume before upload, add `--force` for overwrites
- Fix variable scoping (`vol = training_volume`) inside remote function
- Add `openadapt-ml[training]` to container image dependencies
- Use `--jsonl` flag in train subprocess for correct data path
- Add `modal` to project dependencies
- Update test to verify create+put two-call pattern

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/openadapt_ml/cloud/modal_cloud.py b/openadapt_ml/cloud/modal_cloud.py
@@ -87,6 +87,7 @@ def _build_app():
         "accelerate",
         "pyyaml",
         "pillow",
+        "openadapt-ml[training]",
     )
 
     return app, training_image, training_volume
@@ -120,11 +121,15 @@ def _register_train_function():
     """
     app, training_image, training_volume = _ensure_app()
 
+    # Capture volume reference for use inside remote function
+    vol = training_volume
+
     @app.function(
         gpu="A10G",
         image=training_image,
-        volumes={VOLUME_MOUNT: training_volume},
+        volumes={VOLUME_MOUNT: vol},
         timeout=3600,
+        serialized=True,
     )
     def train_model(
         config_yaml: str,
@@ -141,50 +146,52 @@ def train_model(
         """
         import json as _json
         import os as _os
+        import subprocess as _subprocess
+        import sys as _sys
         import time
 
         import yaml
 
         results_dir = RESULTS_REMOTE_PATH
-        os.makedirs(results_dir, exist_ok=True)
+        _os.makedirs(results_dir, exist_ok=True)
 
         config = yaml.safe_load(config_yaml)
 
-        # Point config at volume paths
-        config["dataset_path"] = f"{bundle_path}/training_data.jsonl"
-        config["image_dir"] = f"{bundle_path}/images"
-        config["output_dir"] = results_dir
-
         # Write config to disk for the trainer
         config_path = f"{VOLUME_MOUNT}/train_config.yaml"
         with open(config_path, "w") as f:
             yaml.dump(config, f)
 
+        # Paths inside the volume
+        jsonl_path = f"{bundle_path}/training_data.jsonl"
+
         # Log start
         training_log = {
             "status": "running",
             "start_time": time.time(),
-            "config": config,
             "losses": [],
         }
         log_path = f"{results_dir}/training_log.json"
         with open(log_path, "w") as f:
             _json.dump(training_log, f, indent=2)
+        vol.commit()
 
-        # Commit volume so logs are visible during training
-        training_volume.commit()
-
-        # Run training via subprocess (same pattern as Lambda)
+        # Run training via subprocess using --jsonl flag
         cmd = [
-            sys.executable,
+            _sys.executable,
             "-m",
             "openadapt_ml.scripts.train",
             "--config",
             config_path,
+            "--jsonl",
+            jsonl_path,
+            "--output-dir",
+            results_dir,
         ]
 
+        print(f"Running: {' '.join(cmd)}")
         try:
-            result = subprocess.run(
+            result = _subprocess.run(
                 cmd,
                 capture_output=True,
                 text=True,
@@ -199,11 +206,13 @@ def train_model(
             )
 
             if result.stdout:
+                print(result.stdout[-2000:])
                 training_log["stdout_tail"] = result.stdout[-2000:]
             if result.stderr:
+                print(result.stderr[-2000:])
                 training_log["stderr_tail"] = result.stderr[-2000:]
 
-        except subprocess.TimeoutExpired:
+        except _subprocess.TimeoutExpired:
             training_log["status"] = "timeout"
             training_log["end_time"] = time.time()
             training_log["elapsed_time"] = (
@@ -218,10 +227,10 @@ def train_model(
             )
 
         # Read losses from the trainer's own log if it exists
-        trainer_log = f"{results_dir}/training_log.json"
-        if _os.path.exists(trainer_log):
+        trainer_log_path = f"{results_dir}/training_log.json"
+        if _os.path.exists(trainer_log_path):
             try:
-                with open(trainer_log) as f:
+                with open(trainer_log_path) as f:
                     trainer_data = _json.load(f)
                 if "losses" in trainer_data:
                     training_log["losses"] = trainer_data["losses"]
@@ -233,8 +242,7 @@ def train_model(
         # Save final log and commit volume
         with open(log_path, "w") as f:
             _json.dump(training_log, f, indent=2)
-
-        training_volume.commit()
+        vol.commit()
 
         return _json.dumps(
             {
@@ -271,13 +279,21 @@ def upload_bundle_to_volume(local_bundle: str | Path) -> None:
 
     print(f"Uploading bundle to Modal volume '{VOLUME_NAME}'...")
 
+    # Create volume if it doesn't exist
+    create_cmd = ["modal", "volume", "create", VOLUME_NAME]
+    create_result = subprocess.run(create_cmd, capture_output=True, text=True)
+    if create_result.returncode == 0:
+        print(f"  Created volume '{VOLUME_NAME}'")
+    # Ignore errors (volume may already exist)
+
     cmd = [
         "modal",
         "volume",
         "put",
         VOLUME_NAME,
         str(local_bundle),
         "/bundle",
+        "--force",
     ]
     result = subprocess.run(cmd, capture_output=True, text=True)
     if result.returncode != 0:
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "click>=8.1.0", # CLI framework
     "google-generativeai>=0.8.5",
     "matplotlib>=3.10.7",
+    "modal>=1.3.4",
     "openadapt-capture>=0.3.0",
     "peft>=0.18.0",
     "pillow>=12.0.0",
diff --git a/tests/test_modal_cloud.py b/tests/test_modal_cloud.py
@@ -97,7 +97,7 @@ def test_upload_bundle_without_jsonl_raises(self):
                 upload_bundle_to_volume(tmpdir)
 
     def test_upload_bundle_calls_modal_volume_put(self):
-        """Test that upload invokes 'modal volume put' with correct args."""
+        """Test that upload invokes 'modal volume create' then 'modal volume put'."""
         from openadapt_ml.cloud.modal_cloud import upload_bundle_to_volume, VOLUME_NAME
 
         with tempfile.TemporaryDirectory() as tmpdir:
@@ -115,14 +115,16 @@ def test_upload_bundle_calls_modal_volume_put(self):
             ) as mock_run:
                 upload_bundle_to_volume(tmpdir)
 
-                mock_run.assert_called_once()
-                cmd = mock_run.call_args[0][0]
-                assert cmd[0] == "modal"
-                assert cmd[1] == "volume"
-                assert cmd[2] == "put"
-                assert cmd[3] == VOLUME_NAME
-                assert cmd[4] == tmpdir
-                assert cmd[5] == "/bundle"
+                # Two calls: create volume + put
+                assert mock_run.call_count == 2
+                # Second call is the put
+                put_cmd = mock_run.call_args_list[1][0][0]
+                assert put_cmd[0] == "modal"
+                assert put_cmd[1] == "volume"
+                assert put_cmd[2] == "put"
+                assert put_cmd[3] == VOLUME_NAME
+                assert put_cmd[4] == tmpdir
+                assert put_cmd[5] == "/bundle"
 
     def test_upload_bundle_failure_raises(self):
         """Test that a failed volume put raises RuntimeError."""
diff --git a/uv.lock b/uv.lock