Skip to content

Commit 120c903

Browse files
abrichr and claude committed
fix(modal): apply fixes from first successful Modal training run
- Add `serialized=True` to @app.function for non-global-scope support
- Auto-create volume before upload, add `--force` for overwrites
- Fix variable scoping (`vol = training_volume`) inside remote function
- Add `openadapt-ml[training]` to container image dependencies
- Use `--jsonl` flag in train subprocess for correct data path
- Add `modal` to project dependencies
- Update test to verify create+put two-call pattern

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 25481ea commit 120c903

4 files changed

Lines changed: 383 additions & 35 deletions

File tree

openadapt_ml/cloud/modal_cloud.py

Lines changed: 36 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,7 @@ def _build_app():
8787
"accelerate",
8888
"pyyaml",
8989
"pillow",
90+
"openadapt-ml[training]",
9091
)
9192

9293
return app, training_image, training_volume
@@ -120,11 +121,15 @@ def _register_train_function():
120121
"""
121122
app, training_image, training_volume = _ensure_app()
122123

124+
# Capture volume reference for use inside remote function
125+
vol = training_volume
126+
123127
@app.function(
124128
gpu="A10G",
125129
image=training_image,
126-
volumes={VOLUME_MOUNT: training_volume},
130+
volumes={VOLUME_MOUNT: vol},
127131
timeout=3600,
132+
serialized=True,
128133
)
129134
def train_model(
130135
config_yaml: str,
@@ -141,50 +146,52 @@ def train_model(
141146
"""
142147
import json as _json
143148
import os as _os
149+
import subprocess as _subprocess
150+
import sys as _sys
144151
import time
145152

146153
import yaml
147154

148155
results_dir = RESULTS_REMOTE_PATH
149-
os.makedirs(results_dir, exist_ok=True)
156+
_os.makedirs(results_dir, exist_ok=True)
150157

151158
config = yaml.safe_load(config_yaml)
152159

153-
# Point config at volume paths
154-
config["dataset_path"] = f"{bundle_path}/training_data.jsonl"
155-
config["image_dir"] = f"{bundle_path}/images"
156-
config["output_dir"] = results_dir
157-
158160
# Write config to disk for the trainer
159161
config_path = f"{VOLUME_MOUNT}/train_config.yaml"
160162
with open(config_path, "w") as f:
161163
yaml.dump(config, f)
162164

165+
# Paths inside the volume
166+
jsonl_path = f"{bundle_path}/training_data.jsonl"
167+
163168
# Log start
164169
training_log = {
165170
"status": "running",
166171
"start_time": time.time(),
167-
"config": config,
168172
"losses": [],
169173
}
170174
log_path = f"{results_dir}/training_log.json"
171175
with open(log_path, "w") as f:
172176
_json.dump(training_log, f, indent=2)
177+
vol.commit()
173178

174-
# Commit volume so logs are visible during training
175-
training_volume.commit()
176-
177-
# Run training via subprocess (same pattern as Lambda)
179+
# Run training via subprocess using --jsonl flag
178180
cmd = [
179-
sys.executable,
181+
_sys.executable,
180182
"-m",
181183
"openadapt_ml.scripts.train",
182184
"--config",
183185
config_path,
186+
"--jsonl",
187+
jsonl_path,
188+
"--output-dir",
189+
results_dir,
184190
]
185191

192+
print(f"Running: {' '.join(cmd)}")
186193
try:
187-
result = subprocess.run(
194+
result = _subprocess.run(
188195
cmd,
189196
capture_output=True,
190197
text=True,
@@ -199,11 +206,13 @@ def train_model(
199206
)
200207

201208
if result.stdout:
209+
print(result.stdout[-2000:])
202210
training_log["stdout_tail"] = result.stdout[-2000:]
203211
if result.stderr:
212+
print(result.stderr[-2000:])
204213
training_log["stderr_tail"] = result.stderr[-2000:]
205214

206-
except subprocess.TimeoutExpired:
215+
except _subprocess.TimeoutExpired:
207216
training_log["status"] = "timeout"
208217
training_log["end_time"] = time.time()
209218
training_log["elapsed_time"] = (
@@ -218,10 +227,10 @@ def train_model(
218227
)
219228

220229
# Read losses from the trainer's own log if it exists
221-
trainer_log = f"{results_dir}/training_log.json"
222-
if _os.path.exists(trainer_log):
230+
trainer_log_path = f"{results_dir}/training_log.json"
231+
if _os.path.exists(trainer_log_path):
223232
try:
224-
with open(trainer_log) as f:
233+
with open(trainer_log_path) as f:
225234
trainer_data = _json.load(f)
226235
if "losses" in trainer_data:
227236
training_log["losses"] = trainer_data["losses"]
@@ -233,8 +242,7 @@ def train_model(
233242
# Save final log and commit volume
234243
with open(log_path, "w") as f:
235244
_json.dump(training_log, f, indent=2)
236-
237-
training_volume.commit()
245+
vol.commit()
238246

239247
return _json.dumps(
240248
{
@@ -271,13 +279,21 @@ def upload_bundle_to_volume(local_bundle: str | Path) -> None:
271279

272280
print(f"Uploading bundle to Modal volume '{VOLUME_NAME}'...")
273281

282+
# Create volume if it doesn't exist
283+
create_cmd = ["modal", "volume", "create", VOLUME_NAME]
284+
create_result = subprocess.run(create_cmd, capture_output=True, text=True)
285+
if create_result.returncode == 0:
286+
print(f" Created volume '{VOLUME_NAME}'")
287+
# Ignore errors (volume may already exist)
288+
274289
cmd = [
275290
"modal",
276291
"volume",
277292
"put",
278293
VOLUME_NAME,
279294
str(local_bundle),
280295
"/bundle",
296+
"--force",
281297
]
282298
result = subprocess.run(cmd, capture_output=True, text=True)
283299
if result.returncode != 0:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ dependencies = [
2727
"click>=8.1.0", # CLI framework
2828
"google-generativeai>=0.8.5",
2929
"matplotlib>=3.10.7",
30+
"modal>=1.3.4",
3031
"openadapt-capture>=0.3.0",
3132
"peft>=0.18.0",
3233
"pillow>=12.0.0",

tests/test_modal_cloud.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ def test_upload_bundle_without_jsonl_raises(self):
9797
upload_bundle_to_volume(tmpdir)
9898

9999
def test_upload_bundle_calls_modal_volume_put(self):
100-
"""Test that upload invokes 'modal volume put' with correct args."""
100+
"""Test that upload invokes 'modal volume create' then 'modal volume put'."""
101101
from openadapt_ml.cloud.modal_cloud import upload_bundle_to_volume, VOLUME_NAME
102102

103103
with tempfile.TemporaryDirectory() as tmpdir:
@@ -115,14 +115,16 @@ def test_upload_bundle_calls_modal_volume_put(self):
115115
) as mock_run:
116116
upload_bundle_to_volume(tmpdir)
117117

118-
mock_run.assert_called_once()
119-
cmd = mock_run.call_args[0][0]
120-
assert cmd[0] == "modal"
121-
assert cmd[1] == "volume"
122-
assert cmd[2] == "put"
123-
assert cmd[3] == VOLUME_NAME
124-
assert cmd[4] == tmpdir
125-
assert cmd[5] == "/bundle"
118+
# Two calls: create volume + put
119+
assert mock_run.call_count == 2
120+
# Second call is the put
121+
put_cmd = mock_run.call_args_list[1][0][0]
122+
assert put_cmd[0] == "modal"
123+
assert put_cmd[1] == "volume"
124+
assert put_cmd[2] == "put"
125+
assert put_cmd[3] == VOLUME_NAME
126+
assert put_cmd[4] == tmpdir
127+
assert put_cmd[5] == "/bundle"
126128

127129
def test_upload_bundle_failure_raises(self):
128130
"""Test that a failed volume put raises RuntimeError."""

0 commit comments

Comments
 (0)