@@ -87,6 +87,7 @@ def _build_app():
8787 "accelerate" ,
8888 "pyyaml" ,
8989 "pillow" ,
90+ "openadapt-ml[training]" ,
9091 )
9192
9293 return app , training_image , training_volume
@@ -120,11 +121,15 @@ def _register_train_function():
120121 """
121122 app , training_image , training_volume = _ensure_app ()
122123
124+ # Capture volume reference for use inside remote function
125+ vol = training_volume
126+
123127 @app .function (
124128 gpu = "A10G" ,
125129 image = training_image ,
126- volumes = {VOLUME_MOUNT : training_volume },
130+ volumes = {VOLUME_MOUNT : vol },
127131 timeout = 3600 ,
132+ serialized = True ,
128133 )
129134 def train_model (
130135 config_yaml : str ,
@@ -141,50 +146,52 @@ def train_model(
141146 """
142147 import json as _json
143148 import os as _os
149+ import subprocess as _subprocess
150+ import sys as _sys
144151 import time
145152
146153 import yaml
147154
148155 results_dir = RESULTS_REMOTE_PATH
149- os .makedirs (results_dir , exist_ok = True )
156+ _os .makedirs (results_dir , exist_ok = True )
150157
151158 config = yaml .safe_load (config_yaml )
152159
153- # Point config at volume paths
154- config ["dataset_path" ] = f"{ bundle_path } /training_data.jsonl"
155- config ["image_dir" ] = f"{ bundle_path } /images"
156- config ["output_dir" ] = results_dir
157-
158160 # Write config to disk for the trainer
159161 config_path = f"{ VOLUME_MOUNT } /train_config.yaml"
160162 with open (config_path , "w" ) as f :
161163 yaml .dump (config , f )
162164
165+ # Paths inside the volume
166+ jsonl_path = f"{ bundle_path } /training_data.jsonl"
167+
163168 # Log start
164169 training_log = {
165170 "status" : "running" ,
166171 "start_time" : time .time (),
167- "config" : config ,
168172 "losses" : [],
169173 }
170174 log_path = f"{ results_dir } /training_log.json"
171175 with open (log_path , "w" ) as f :
172176 _json .dump (training_log , f , indent = 2 )
177+ vol .commit ()
173178
174- # Commit volume so logs are visible during training
175- training_volume .commit ()
176-
177- # Run training via subprocess (same pattern as Lambda)
179+ # Run training via subprocess using --jsonl flag
178180 cmd = [
179- sys .executable ,
181+ _sys .executable ,
180182 "-m" ,
181183 "openadapt_ml.scripts.train" ,
182184 "--config" ,
183185 config_path ,
186+ "--jsonl" ,
187+ jsonl_path ,
188+ "--output-dir" ,
189+ results_dir ,
184190 ]
185191
192+ print (f"Running: { ' ' .join (cmd )} " )
186193 try :
187- result = subprocess .run (
194+ result = _subprocess .run (
188195 cmd ,
189196 capture_output = True ,
190197 text = True ,
@@ -199,11 +206,13 @@ def train_model(
199206 )
200207
201208 if result .stdout :
209+ print (result .stdout [- 2000 :])
202210 training_log ["stdout_tail" ] = result .stdout [- 2000 :]
203211 if result .stderr :
212+ print (result .stderr [- 2000 :])
204213 training_log ["stderr_tail" ] = result .stderr [- 2000 :]
205214
206- except subprocess .TimeoutExpired :
215+ except _subprocess .TimeoutExpired :
207216 training_log ["status" ] = "timeout"
208217 training_log ["end_time" ] = time .time ()
209218 training_log ["elapsed_time" ] = (
@@ -218,10 +227,10 @@ def train_model(
218227 )
219228
220229 # Read losses from the trainer's own log if it exists
221- trainer_log = f"{ results_dir } /training_log.json"
222- if _os .path .exists (trainer_log ):
230+ trainer_log_path = f"{ results_dir } /training_log.json"
231+ if _os .path .exists (trainer_log_path ):
223232 try :
224- with open (trainer_log ) as f :
233+ with open (trainer_log_path ) as f :
225234 trainer_data = _json .load (f )
226235 if "losses" in trainer_data :
227236 training_log ["losses" ] = trainer_data ["losses" ]
@@ -233,8 +242,7 @@ def train_model(
233242 # Save final log and commit volume
234243 with open (log_path , "w" ) as f :
235244 _json .dump (training_log , f , indent = 2 )
236-
237- training_volume .commit ()
245+ vol .commit ()
238246
239247 return _json .dumps (
240248 {
@@ -271,13 +279,21 @@ def upload_bundle_to_volume(local_bundle: str | Path) -> None:
271279
272280 print (f"Uploading bundle to Modal volume '{ VOLUME_NAME } '..." )
273281
282+ # Create volume if it doesn't exist
283+ create_cmd = ["modal" , "volume" , "create" , VOLUME_NAME ]
284+ create_result = subprocess .run (create_cmd , capture_output = True , text = True )
285+ if create_result .returncode == 0 :
286+ print (f" Created volume '{ VOLUME_NAME } '" )
287+ # Ignore errors (volume may already exist)
288+
274289 cmd = [
275290 "modal" ,
276291 "volume" ,
277292 "put" ,
278293 VOLUME_NAME ,
279294 str (local_bundle ),
280295 "/bundle" ,
296+ "--force" ,
281297 ]
282298 result = subprocess .run (cmd , capture_output = True , text = True )
283299 if result .returncode != 0 :
0 commit comments