From e3f0b8a3947b026181569bde319112f79426298d Mon Sep 17 00:00:00 2001
From: yashasvi
Date: Mon, 3 Nov 2025 07:07:50 +0530
Subject: [PATCH] fix: directly save final ckpt in save_model_dir

Signed-off-by: yashasvi
---
 .../config/acceleration_configs/fast_moe.py | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py
index 37602daf15..39eed4f5de 100644
--- a/tuning/config/acceleration_configs/fast_moe.py
+++ b/tuning/config/acceleration_configs/fast_moe.py
@@ -96,16 +96,21 @@ def on_save(
                 Also saves the final model in save_model_dir if provided.
                 """
 
-                def checkpoint(checkpoint_dir, save_dir):
-                    hf_converted_output_dir = os.path.join(
-                        save_dir, "hf_converted_checkpoint"
-                    )
-                    if os.path.exists(hf_converted_output_dir):
+                def checkpoint(checkpoint_dir, save_dir, is_intermediate: bool = True):
+                    if is_intermediate:
+                        hf_converted_output_dir = os.path.join(
+                            save_dir, "hf_converted_checkpoint"
+                        )
+                    else:
+                        hf_converted_output_dir = save_dir
+
+                    if os.path.exists(hf_converted_output_dir) and is_intermediate:
                         # If the folder already exists
                         # we return, since this is possible to happen
                         # saving the checkpointing at the end of the training
                         return
-                    os.mkdir(hf_converted_output_dir)
+
+                    os.makedirs(hf_converted_output_dir, exist_ok=True)
                     try:
                         recover_safetensors_from_dcp(
                             checkpoint_dir,
@@ -165,8 +170,10 @@ def checkpoint(checkpoint_dir, save_dir):
                         and state.global_step == state.max_steps
                     ):
                         if not os.path.exists(self.save_model_dir):
-                            os.mkdir(self.save_model_dir)
-                        checkpoint(checkpoint_dir, self.save_model_dir)
+                            os.makedirs(self.save_model_dir, exist_ok=True)
+                        checkpoint(
+                            checkpoint_dir, self.save_model_dir, is_intermediate=False
+                        )
 
         callbacks.append(
             ConvertAndSaveHFCheckpointAtEverySave(