File tree Expand file tree Collapse file tree
bionemo-recipes/recipes/codonfm_native_te Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -60,6 +60,7 @@ lr_scheduler_kwargs:
60 60   checkpoint:
61 61     ckpt_dir: ???
62 62     save_final_model: true
   63 +   save_final_model_with_checkpoint: false
63 64     resume_from_checkpoint: true
64 65     save_every_n_steps: 1_000
65 66     max_checkpoints: 5
Original file line number Diff line number Diff line change @@ -241,6 +241,13 @@ def main(args: DictConfig) -> float | None:
241 241           dist_config=dist_config,
242 242           max_checkpoints=args.checkpoint.max_checkpoints,
243 243       )
    244 +     if args.checkpoint.save_final_model_with_checkpoint:
    245 +         save_final_model_ddp(
    246 +             model=model,
    247 +             config=config,
    248 +             save_directory=ckpt_path / f"step_{step}" / "final_model",
    249 +             dist_config=dist_config,
    250 +         )
244 251
245 252   if val_dataloader is not None and step > 0 and step % args.validation.eval_interval == 0:
246 253       model.eval()
Original file line number Diff line number Diff line change @@ -273,6 +273,13 @@ def main(args: DictConfig) -> float | None:
273 273           dist_config=dist_config,
274 274           max_checkpoints=args.checkpoint.max_checkpoints,
275 275       )
    276 +     if args.checkpoint.save_final_model_with_checkpoint:
    277 +         save_final_model_fsdp2(
    278 +             model=model,
    279 +             config=config,
    280 +             save_directory=ckpt_path / f"step_{step}" / "final_model",
    281 +             dist_config=dist_config,
    282 +         )
276 283
277 284   if val_dataloader is not None and step > 0 and step % args.validation.eval_interval == 0:
278 285       model.eval()
You can’t perform that action at this time.
0 commit comments