Checkpoint: drop the "bin" file format and callback options; always save/load native .ckpt files

JYMiracle305 · JYMiracle305 · commit 85cddfd28bb0 · 2026-05-29T12:44:36.000+08:00
diff --git a/example/common/checkpoint_loader.cc b/example/common/checkpoint_loader.cc
@@ -35,7 +35,7 @@ ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &
         }
     }
 
-    Checkpoint::Load(resume_dir, args.model.get(), args.optimizer.get(), &args.state, args.load_options);
+    Checkpoint::Load(resume_dir, *args.model, args.optimizer.get(), args.state);
 
     result.global_step = static_cast<int>(args.state.global_step);
     if (args.state.data_batch_stride != static_cast<int64_t>(ddp_world_size)) {
@@ -70,17 +70,12 @@ void SaveCheckpoint(const SaveCheckpointArgs &args) {
     state.data_batch_stride = args.ddp_size;
     state.last_lr = args.last_lr;
     state.optimizer_type = args.optimizer_type;
-    state.checkpoint_file_format = args.checkpoint_file_format;
     state.ddp_size = args.ddp_size;
     state.tp_size = args.tp_size;
     state.sp_size = args.sp_size;
     state.pp_size = args.pp_size;
 
-    CheckpointOptions options;
-    options.format = args.checkpoint_file_format;
-    options.no_save_optim = args.no_save_optim;
-    options.model_bin_writer = args.model_bin_writer;
-    Checkpoint::Save(args.save_dir, args.model, args.optimizer, state, options);
+    Checkpoint::Save(args.save_dir, args.model, &args.optimizer, state, args.no_save_optim); // honor --no_save_optim
 
     const auto ckpt_end = std::chrono::high_resolution_clock::now();
     const double ckpt_ms = std::chrono::duration<double, std::milli>(ckpt_end - ckpt_start).count();
diff --git a/example/common/checkpoint_loader.h b/example/common/checkpoint_loader.h
@@ -26,7 +26,6 @@ struct ResumeFromCheckpointArgs {
     DistributedDataLoader &train_loader;
     TrainerState &state;
     DataLoaderIterator &train_iter;
-    CheckpointLoadOptions load_options;
 };
 
 struct ResumeFromCheckpointResult {
@@ -40,7 +39,6 @@ struct SaveCheckpointArgs {
     size_t data_batch_idx = 0;
     double last_lr = 0.0;
     std::string optimizer_type;
-    std::string checkpoint_file_format = "bin";
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
@@ -52,7 +50,6 @@ struct SaveCheckpointArgs {
     const nn::parallel::Rank &rank;
     const nn::Module &model;
     const Optimizer &optimizer;
-    std::function<void(const nn::Module &, const std::filesystem::path &)> model_bin_writer;
 };
 
 ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args);
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -88,10 +88,6 @@ DEFINE_string(load, "", "checkpoint directory to resume from");
 DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints");
 DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
 DEFINE_bool(no_save_optim, false, "whether optimizer state is persisted in checkpoints");
-DEFINE_string(checkpoint_file_format, "ckpt",
-              "checkpoint format: bin|ckpt. "
-              "'bin' generates model.bin/optimizer.bin (bin supports LLMC model format via callbacks); "
-              "'ckpt' generates model.ckpt/optimizer.ckpt (native StateDict binary).");
 // precision check
 DEFINE_string(
     precision_check, "",
@@ -344,22 +340,13 @@ void Train(const nn::parallel::Rank &rank) {
 
     int start_step = 0;
     TrainerState state;
-    CheckpointLoadOptions load_options;
-    load_options.load_optimizer_state = true;
-    load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
-        auto loaded_model = gpt2::LoadFromLLMC(model_path.string());
-        target_model->LoadStateDict(loaded_model->StateDict());
-    };
-    const auto resume_result = ResumeFromCheckpoint({
-        .resume_root = FLAGS_load,
-        .rank = rank,
-        .model = model,
-        .optimizer = optimizer,
-        .train_loader = train_loader,
-        .state = state,
-        .train_iter = train_iter,
-        .load_options = load_options,
-    });
+    const auto resume_result = ResumeFromCheckpoint({.resume_root = FLAGS_load,
+                                                     .rank = rank,
+                                                     .model = model,
+                                                     .optimizer = optimizer,
+                                                     .train_loader = train_loader,
+                                                     .state = state,
+                                                     .train_iter = train_iter});
     start_step = resume_result.global_step;
     saved_data_batch_idx = resume_result.data_batch_idx;
 
@@ -371,7 +358,6 @@ void Train(const nn::parallel::Rank &rank) {
                   .data_batch_idx = saved_data_batch_idx,
                   .last_lr = FLAGS_learning_rate,
                   .optimizer_type = "SGD",
-                  .checkpoint_file_format = FLAGS_checkpoint_file_format,
                   .ddp_size = ddp_world_size,
                   .tp_size = tp_world_size,
                   .sp_size = sp_world_size,
@@ -383,9 +369,6 @@ void Train(const nn::parallel::Rank &rank) {
                   .rank = rank,
                   .model = *model,
                   .optimizer = *optimizer,
-                  .model_bin_writer
-                  = [&](const nn::Module &,
-                        const std::filesystem::path &model_path) { gpt2::SaveAsLLMC(llmc_model, model_path.string()); },
               });
           };
 
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -86,10 +86,7 @@ DEFINE_string(load, "", "checkpoint directory to resume from");
 DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints");
 DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
 DEFINE_bool(no_save_optim, true, "whether optimizer state is persisted in checkpoints");
-DEFINE_string(checkpoint_file_format, "ckpt",
-              "checkpoint format: bin|ckpt. "
-              "'bin' generates model.bin/optimizer.bin (bin supports LLMC model format via callbacks); "
-              "'ckpt' generates model.ckpt/optimizer.ckpt (native StateDict binary).");
+
 // precision check
 DEFINE_string(
     precision_check, "",
@@ -322,12 +319,6 @@ void Train(const nn::parallel::Rank &rank) {
 
     int start_step = 0;
     TrainerState state;
-    CheckpointLoadOptions load_options;
-    load_options.load_optimizer_state = true;
-    load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
-        auto loaded_model = llama3::LoadFromLLMC(model_path.string());
-        target_model->LoadStateDict(loaded_model->StateDict());
-    };
     const auto resume_result = ResumeFromCheckpoint({
         .resume_root = FLAGS_load,
         .rank = rank,
@@ -336,36 +327,31 @@ void Train(const nn::parallel::Rank &rank) {
         .train_loader = train_loader,
         .state = state,
         .train_iter = train_iter,
-        .load_options = load_options,
     });
     start_step = resume_result.global_step;
     saved_data_batch_idx = resume_result.data_batch_idx;
 
-    auto save_checkpoint = [&](const std::filesystem::path &save_dir, int64_t global_step,
-                               bool prune_step_checkpoints) {
-        SaveCheckpoint({
-            .save_dir = save_dir,
-            .global_step = global_step,
-            .data_batch_idx = saved_data_batch_idx,
-            .last_lr = FLAGS_learning_rate,
-            .optimizer_type = "Adam",
-            .checkpoint_file_format = FLAGS_checkpoint_file_format,
-            .ddp_size = ddp_world_size,
-            .tp_size = tp_world_size,
-            .sp_size = sp_world_size,
-            .pp_size = pp_world_size,
-            .no_save_optim = FLAGS_no_save_optim,
-            .prune_step_checkpoints = prune_step_checkpoints,
-            .checkpoint_root_dir = FLAGS_save,
-            .max_checkpoint_keep = FLAGS_max_checkpoint_keep,
-            .rank = rank,
-            .model = *model,
-            .optimizer = *optimizer,
-            .model_bin_writer
-            = [&](const nn::Module &,
-                  const std::filesystem::path &model_path) { llama3::SaveAsLLMC(llmc_model, model_path.string()); },
-        });
-    };
+    auto save_checkpoint
+        = [&](const std::filesystem::path &save_dir, int64_t global_step, bool prune_step_checkpoints) {
+              SaveCheckpoint({
+                  .save_dir = save_dir,
+                  .global_step = global_step,
+                  .data_batch_idx = saved_data_batch_idx,
+                  .last_lr = FLAGS_learning_rate,
+                  .optimizer_type = "Adam",
+                  .ddp_size = ddp_world_size,
+                  .tp_size = tp_world_size,
+                  .sp_size = sp_world_size,
+                  .pp_size = pp_world_size,
+                  .no_save_optim = FLAGS_no_save_optim,
+                  .prune_step_checkpoints = prune_step_checkpoints,
+                  .checkpoint_root_dir = FLAGS_save,
+                  .max_checkpoint_keep = FLAGS_max_checkpoint_keep,
+                  .rank = rank,
+                  .model = *model,
+                  .optimizer = *optimizer,
+              });
+          };
 
     for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) {
         // Reset precision check counters at start of each iteration for file overwrite
diff --git a/infini_train/include/checkpoint.h b/infini_train/include/checkpoint.h
@@ -18,34 +18,24 @@ struct TrainerState {
     int64_t global_step = 0;
     int64_t data_batch_idx = 0;
     int64_t data_batch_stride = 1;
+    // FIXME(zbl): learning_rate should be restored from scheduler state, move `last_lr` from TrainerState to
+    // SchedulerState later
     double last_lr = 0.0;
     std::string optimizer_type = "unknown";
-    std::string checkpoint_file_format = "bin";
 
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
     int pp_size = 1;
 };
 
-struct CheckpointOptions {
-    std::string format = "bin";
-    bool no_save_optim = false;
-    std::function<void(const nn::Module &, const std::filesystem::path &)> model_bin_writer;
-};
-
-struct CheckpointLoadOptions {
-    bool load_optimizer_state = true;
-    std::function<void(nn::Module *, const std::filesystem::path &)> model_bin_loader;
-};
-
 class Checkpoint {
 public:
-    static void Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer &optimizer,
-                     const TrainerState &state, const CheckpointOptions &options = {});
+    static void Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer *optimizer,
+                     const TrainerState &state, bool no_save_optim = false);
 
-    static void Load(const std::filesystem::path &checkpoint_dir, nn::Module *model, Optimizer *optimizer,
-                     TrainerState *state, const CheckpointLoadOptions &options = {});
+    static void Load(const std::filesystem::path &checkpoint_dir, nn::Module &model, Optimizer *optimizer,
+                     TrainerState &state, bool load_optimizer_state = true);
 
 private:
     static void SaveStateDictBinary(const std::filesystem::path &path,
@@ -56,7 +46,6 @@ class Checkpoint {
 
     static void SaveTrainerState(const std::filesystem::path &path, const TrainerState &state);
     static TrainerState LoadTrainerState(const std::filesystem::path &path);
-    static std::string InferFormat(const std::filesystem::path &checkpoint_dir);
 };
 
 } // namespace infini_train
diff --git a/infini_train/src/checkpoint.cc b/infini_train/src/checkpoint.cc
@@ -84,24 +84,20 @@ template <typename T> T ExtractNumberField(const std::string &content, const std
 }
 } // namespace
 
-void Checkpoint::Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer &optimizer,
-                      const TrainerState &state, const CheckpointOptions &options) {
-    CHECK(options.format == "bin" || options.format == "ckpt") << "Unsupported checkpoint format: " << options.format;
+void Checkpoint::Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer *optimizer,
+                      const TrainerState &state, bool no_save_optim) {
     std::filesystem::create_directories(checkpoint_dir);
-    LOG(ERROR) << "[CKPT] Save begin: dir=" << checkpoint_dir << ", format=" << options.format
-               << ", global_step=" << state.global_step;
+    LOG(ERROR) << "[CKPT] Save begin: dir=" << checkpoint_dir << ", global_step=" << state.global_step;
 
-    const auto model_path = checkpoint_dir / (options.format == "ckpt" ? "model.ckpt" : "model.bin");
-    if (options.format == "bin" && options.model_bin_writer) {
-        options.model_bin_writer(model, model_path);
-    } else {
-        SaveStateDictBinary(model_path, model.StateDict());
-    }
+    const auto model_path = checkpoint_dir / "model.ckpt";
 
-    if (options.no_save_optim) {
-        auto opt_state = optimizer.StateDict();
+    SaveStateDictBinary(model_path, model.StateDict());
+
+    if (!no_save_optim) {
+        CHECK(optimizer != nullptr) << "Optimizer pointer is null, cannot save optimizer state.";
+        auto opt_state = optimizer->StateDict();
         if (!opt_state.empty()) {
-            const auto opt_path = checkpoint_dir / (options.format == "ckpt" ? "optimizer.ckpt" : "optimizer.bin");
+            const auto opt_path = checkpoint_dir / "optimizer.ckpt";
             SaveStateDictBinary(opt_path, opt_state);
         }
     }
@@ -110,48 +106,32 @@ void Checkpoint::Save(const std::filesystem::path &checkpoint_dir, const nn::Mod
     LOG(ERROR) << "[CKPT] Save done: dir=" << checkpoint_dir;
 }
 
-void Checkpoint::Load(const std::filesystem::path &checkpoint_dir, nn::Module *model, Optimizer *optimizer,
-                      TrainerState *state, const CheckpointLoadOptions &options) {
-    CHECK(model != nullptr);
-    CHECK(state != nullptr);
-
-    const std::string format = InferFormat(checkpoint_dir);
-    const auto model_path = checkpoint_dir / (format == "ckpt" ? "model.ckpt" : "model.bin");
-    LOG(ERROR) << "[CKPT] Load begin: dir=" << checkpoint_dir << ", format=" << format;
+void Checkpoint::Load(const std::filesystem::path &checkpoint_dir, nn::Module &model, Optimizer *optimizer,
+                      TrainerState &state, bool load_optimizer_state) {
+    const auto model_path = checkpoint_dir / "model.ckpt";
     LOG(ERROR) << "[CKPT] Loading model: " << model_path;
-    if (format == "bin" && options.model_bin_loader) {
-        const uint32_t magic = PeekMagic(model_path);
-        if (magic == kCkptMagic) {
-            LOG(ERROR) << "[CKPT] Model format detected: native checkpoint binary.";
-            model->LoadStateDict(LoadStateDictBinary(model_path));
-        } else {
-            LOG(ERROR) << "[CKPT] Model format detected: external model.bin (magic=" << magic
-                       << "), use model_bin_loader callback.";
-            options.model_bin_loader(model, model_path);
-        }
-    } else {
-        model->LoadStateDict(LoadStateDictBinary(model_path));
-    }
 
-    if (optimizer != nullptr && options.load_optimizer_state) {
-        const auto opt_path = checkpoint_dir / (format == "ckpt" ? "optimizer.ckpt" : "optimizer.bin");
+    model.LoadStateDict(LoadStateDictBinary(model_path));
+
+    if (optimizer == nullptr) {
+        LOG(ERROR) << "[CKPT] No optimizer instance, skip optimizer state loading.";
+    } else if (load_optimizer_state) {
+        const auto opt_path = checkpoint_dir / "optimizer.ckpt";
         if (std::filesystem::exists(opt_path)) {
             LOG(ERROR) << "[CKPT] Loading optimizer: " << opt_path;
             optimizer->LoadStateDict(LoadStateDictBinary(opt_path));
         } else {
             LOG(ERROR) << "[CKPT] Optimizer state not found, skip: " << opt_path;
         }
-    } else if (optimizer == nullptr) {
-        LOG(ERROR) << "[CKPT] No optimizer instance, skip optimizer state loading.";
     } else {
         LOG(ERROR) << "[CKPT] load_optimizer_state=false, skip optimizer state loading.";
     }
 
-    *state = LoadTrainerState(checkpoint_dir / "trainer_state.json");
-    LOG(ERROR) << "[CKPT] Load done: global_step=" << state->global_step << ", data_batch_idx=" << state->data_batch_idx
-               << ", data_batch_stride=" << state->data_batch_stride << ", last_lr=" << state->last_lr
-               << ", optimizer_type=" << state->optimizer_type << ", topology(ddp,tp,sp,pp)=(" << state->ddp_size << ","
-               << state->tp_size << "," << state->sp_size << "," << state->pp_size << ")";
+    state = LoadTrainerState(checkpoint_dir / "trainer_state.json");
+    LOG(ERROR) << "[CKPT] Load done: global_step=" << state.global_step << ", data_batch_idx=" << state.data_batch_idx
+               << ", data_batch_stride=" << state.data_batch_stride << ", last_lr=" << state.last_lr
+               << ", optimizer_type=" << state.optimizer_type << ", topology(ddp,tp,sp,pp)=(" << state.ddp_size << ","
+               << state.tp_size << "," << state.sp_size << "," << state.pp_size << ")";
 }
 
 void Checkpoint::SaveStateDictBinary(const std::filesystem::path &path,
@@ -233,7 +213,6 @@ void Checkpoint::SaveTrainerState(const std::filesystem::path &path, const Train
     ofs << "  \"data_batch_stride\": " << state.data_batch_stride << ",\n";
     ofs << "  \"last_lr\": " << state.last_lr << ",\n";
     ofs << "  \"optimizer_type\": \"" << state.optimizer_type << "\",\n";
-    ofs << "  \"checkpoint_file_format\": \"" << state.checkpoint_file_format << "\",\n";
     ofs << "  \"ddp_size\": " << state.ddp_size << ",\n";
     ofs << "  \"tp_size\": " << state.tp_size << ",\n";
     ofs << "  \"sp_size\": " << state.sp_size << ",\n";
@@ -252,23 +231,10 @@ TrainerState Checkpoint::LoadTrainerState(const std::filesystem::path &path) {
     state.data_batch_stride = ExtractNumberField<int64_t>(content, "data_batch_stride", 1);
     state.last_lr = ExtractNumberField<double>(content, "last_lr", 0.0);
     state.optimizer_type = ExtractStringField(content, "optimizer_type", "unknown");
-    state.checkpoint_file_format = ExtractStringField(content, "checkpoint_file_format", "bin");
     state.ddp_size = ExtractNumberField<int>(content, "ddp_size", 1);
     state.tp_size = ExtractNumberField<int>(content, "tp_size", 1);
     state.sp_size = ExtractNumberField<int>(content, "sp_size", 1);
     state.pp_size = ExtractNumberField<int>(content, "pp_size", 1);
     return state;
 }
-
-std::string Checkpoint::InferFormat(const std::filesystem::path &checkpoint_dir) {
-    if (std::filesystem::exists(checkpoint_dir / "model.ckpt")) {
-        return "ckpt";
-    }
-    if (std::filesystem::exists(checkpoint_dir / "model.bin")) {
-        return "bin";
-    }
-    LOG(FATAL) << "Failed to infer checkpoint format from path: " << checkpoint_dir;
-    return "bin";
-}
-
 } // namespace infini_train
diff --git a/tests/checkpoint/test_checkpoint_serialization.cc b/tests/checkpoint/test_checkpoint_serialization.cc
diff --git a/tests/checkpoint/test_trainer_state.cc b/tests/checkpoint/test_trainer_state.cc