InfiniTensor
diff --git a/‎example/common/checkpoint_loader.cc‎
Lines changed: 13 additions & 24 deletions b/‎example/common/checkpoint_loader.cc‎
Lines changed: 13 additions & 24 deletions
diff --git a/‎example/common/checkpoint_loader.h‎
Lines changed: 2 additions & 7 deletions b/‎example/common/checkpoint_loader.h‎
Lines changed: 2 additions & 7 deletions
diff --git a/‎example/gpt2/main.cc‎
Lines changed: 21 additions & 30 deletions b/‎example/gpt2/main.cc‎
Lines changed: 21 additions & 30 deletions
diff --git a/‎example/llama3/main.cc‎
Lines changed: 41 additions & 44 deletions b/‎example/llama3/main.cc‎
Lines changed: 41 additions & 44 deletions
diff --git a/‎infini_train/include/checkpoint.h‎
Lines changed: 7 additions & 20 deletions b/‎infini_train/include/checkpoint.h‎
Lines changed: 7 additions & 20 deletions
@@ -17,16 +17,16 @@ namespace nn = infini_train::nn;
 
 ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args) {
     ResumeFromCheckpointResult result;
-    int ddp_world_size = nn::parallel::global::GetDataParallelSize();
-    int tp_world_size = nn::parallel::global::GetTensorParallelSize();
-    int sp_world_size = nn::parallel::global::GetSequenceParallelEnabled() ? tp_world_size : 1;
-    int pp_world_size = nn::parallel::global::GetPipelineParallelSize();
-
     if (args.resume_root.empty()) {
         LOG(INFO) << "No checkpoint specified for resume. Starting training from scratch.";
         return result;
     }
 
+    int ddp_world_size = nn::parallel::global::GetDataParallelSize();
+    int tp_world_size = nn::parallel::global::GetTensorParallelSize();
+    int sp_world_size = nn::parallel::global::GetSequenceParallelEnabled() ? tp_world_size : 1;
+    int pp_world_size = nn::parallel::global::GetPipelineParallelSize();
+
     std::filesystem::path resume_dir = args.resume_root;
     if (args.rank.IsParallel()) {
         const auto rank_dir = resume_dir / std::format("rank_{:06d}", args.rank.GlobalRank());
@@ -35,27 +35,23 @@ ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &
         }
     }
 
-    Checkpoint::Load(resume_dir, args.model.get(), args.optimizer.get(), &args.state, args.load_options);
+    Checkpoint::Load(resume_dir, *args.model, args.optimizer.get(), args.state);
 
     result.global_step = static_cast<int>(args.state.global_step);
-    if (args.state.data_batch_stride != static_cast<int64_t>(ddp_world_size)) {
-        LOG(FATAL) << std::format("Checkpoint data_batch_stride {} mismatches current ddp_world_size {}. "
-                                  "Proceeding with recorded data_batch_idx {}.",
-                                  args.state.data_batch_stride, ddp_world_size, args.state.data_batch_idx);
-    }
 
+    CHECK_EQ(args.state.ddp_size, ddp_world_size) << "DDP size mismatch: checkpoint has DDP=" << args.state.ddp_size
+                                                  << ", but current run has DDP=" << ddp_world_size;
     CHECK_EQ(args.state.tp_size, tp_world_size)
         << "TP size mismatch: checkpoint has TP=" << args.state.tp_size << ", but current run has TP=" << tp_world_size;
     CHECK_EQ(args.state.sp_size, sp_world_size)
         << "SP size mismatch: checkpoint has SP=" << args.state.sp_size << ", but current run has SP=" << sp_world_size;
     CHECK_EQ(args.state.pp_size, pp_world_size)
         << "PP size mismatch: checkpoint has PP=" << args.state.pp_size << ", but current run has PP=" << pp_world_size;
 
-    result.data_batch_idx = static_cast<size_t>(std::max<int64_t>(args.state.data_batch_idx, 0));
-    args.train_iter = args.train_loader.IteratorAtBatchIndex(result.data_batch_idx);
+    result.consumed_batches = static_cast<size_t>(std::max<int64_t>(args.state.consumed_batches, 0));
     if (args.rank.IsMainRank()) {
-        LOG(INFO) << std::format("Resume training from step {}, last_lr {:.3e}, data_batch_idx {}",
-                                 args.state.global_step, args.state.last_lr, args.state.data_batch_idx);
+        LOG(INFO) << std::format("Resume training from step {}, last_lr {:.3e}, consumed_batches  {}",
+                                 args.state.global_step, args.state.last_lr, args.state.consumed_batches);
     }
 
     return result;
@@ -66,21 +62,15 @@ void SaveCheckpoint(const SaveCheckpointArgs &args) {
 
     TrainerState state;
     state.global_step = args.global_step;
-    state.data_batch_idx = static_cast<int64_t>(args.data_batch_idx);
-    state.data_batch_stride = args.ddp_size;
+    state.consumed_batches = static_cast<int64_t>(args.consumed_batches);
     state.last_lr = args.last_lr;
-    state.optimizer_type = args.optimizer_type;
-    state.checkpoint_file_format = args.checkpoint_file_format;
     state.ddp_size = args.ddp_size;
     state.tp_size = args.tp_size;
     state.sp_size = args.sp_size;
     state.pp_size = args.pp_size;
 
-    CheckpointOptions options;
-    options.format = args.checkpoint_file_format;
-    options.no_save_optim = args.no_save_optim;
-    options.model_bin_writer = args.model_bin_writer;
-    Checkpoint::Save(args.save_dir, args.model, args.optimizer, state, options);
+    // Forward no_save_optim: Checkpoint::Save defaults it to false and would otherwise ignore the caller's choice.
+    Checkpoint::Save(args.save_dir, args.model, &args.optimizer, state, args.no_save_optim);
 
     const auto ckpt_end = std::chrono::high_resolution_clock::now();
     const double ckpt_ms = std::chrono::duration<double, std::milli>(ckpt_end - ckpt_start).count();
 
@@ -25,22 +25,18 @@ struct ResumeFromCheckpointArgs {
     std::shared_ptr<Optimizer> optimizer;
     DistributedDataLoader &train_loader;
     TrainerState &state;
-    DataLoaderIterator &train_iter;
-    CheckpointLoadOptions load_options;
 };
 
 struct ResumeFromCheckpointResult {
     int global_step = 0;
-    size_t data_batch_idx = 0;
+    size_t consumed_batches = 0;
 };
 
 struct SaveCheckpointArgs {
     std::filesystem::path save_dir;
     int64_t global_step = 0;
-    size_t data_batch_idx = 0;
+    size_t consumed_batches = 0;
     double last_lr = 0.0;
-    std::string optimizer_type;
-    std::string checkpoint_file_format = "bin";
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
@@ -52,7 +48,6 @@ struct SaveCheckpointArgs {
     const nn::parallel::Rank &rank;
     const nn::Module &model;
     const Optimizer &optimizer;
-    std::function<void(const nn::Module &, const std::filesystem::path &)> model_bin_writer;
 };
 
 ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args);
 
@@ -88,10 +88,6 @@ DEFINE_string(load, "", "checkpoint directory to resume from");
 DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints");
 DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
 DEFINE_bool(no_save_optim, false, "whether optimizer state is persisted in checkpoints");
-DEFINE_string(checkpoint_file_format, "ckpt",
-              "checkpoint format: bin|ckpt. "
-              "'bin' generates model.bin/optimizer.bin (bin supports LLMC model format via callbacks); "
-              "'ckpt' generates model.ckpt/optimizer.ckpt (native StateDict binary).");
 // precision check
 DEFINE_string(
     precision_check, "",
@@ -332,7 +328,7 @@ void Train(const nn::parallel::Rank &rank) {
     }
 
     auto train_iter = train_loader.begin();
-    size_t saved_data_batch_idx = train_iter.BatchIndex();
+
     std::shared_ptr<nn::Module> loss_fn
         = (tp_world_size > 1) ? std::static_pointer_cast<nn::Module>(
               std::make_shared<VocabParallelCrossEntropyLoss>(model_config.original_vocab_size))
@@ -344,34 +340,32 @@ void Train(const nn::parallel::Rank &rank) {
 
     int start_step = 0;
     TrainerState state;
-    CheckpointLoadOptions load_options;
-    load_options.load_optimizer_state = true;
-    load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
-        auto loaded_model = gpt2::LoadFromLLMC(model_path.string());
-        target_model->LoadStateDict(loaded_model->StateDict());
-    };
-    const auto resume_result = ResumeFromCheckpoint({
-        .resume_root = FLAGS_load,
-        .rank = rank,
-        .model = model,
-        .optimizer = optimizer,
-        .train_loader = train_loader,
-        .state = state,
-        .train_iter = train_iter,
-        .load_options = load_options,
-    });
+    const auto resume_result = ResumeFromCheckpoint({.resume_root = FLAGS_load,
+                                                     .rank = rank,
+                                                     .model = model,
+                                                     .optimizer = optimizer,
+                                                     .train_loader = train_loader,
+                                                     .state = state});
     start_step = resume_result.global_step;
-    saved_data_batch_idx = resume_result.data_batch_idx;
+    size_t consumed_batches = resume_result.consumed_batches;
+
+    // TODO(jym): Replace with Sampler abstraction when available.
+    // Fast-forward the dataloader iterator so training resumes at the checkpointed batch position.
+    if (consumed_batches > 0) {
+        const size_t start = train_iter.BatchIndex();
+        // Each rank is expected to advance BatchIndex() by ddp_world_size per ++ (starting from its own rank).
+        // Clamp to zero when the saved position is behind `start`, so the unsigned subtraction cannot wrap around.
+        const size_t num_skips = consumed_batches > start ? (consumed_batches - start) / ddp_world_size : 0;
+        for (size_t i = 0; i < num_skips; ++i) { ++train_iter; }
+    }
 
     auto save_checkpoint
         = [&](const std::filesystem::path &save_dir, int64_t global_step, bool prune_step_checkpoints) {
               SaveCheckpoint({
                   .save_dir = save_dir,
                   .global_step = global_step,
-                  .data_batch_idx = saved_data_batch_idx,
+                  .consumed_batches = consumed_batches,
                   .last_lr = FLAGS_learning_rate,
-                  .optimizer_type = "SGD",
-                  .checkpoint_file_format = FLAGS_checkpoint_file_format,
                   .ddp_size = ddp_world_size,
                   .tp_size = tp_world_size,
                   .sp_size = sp_world_size,
@@ -383,9 +377,6 @@ void Train(const nn::parallel::Rank &rank) {
                   .rank = rank,
                   .model = *model,
                   .optimizer = *optimizer,
-                  .model_bin_writer
-                  = [&](const nn::Module &,
-                        const std::filesystem::path &model_path) { gpt2::SaveAsLLMC(llmc_model, model_path.string()); },
               });
           };
 
@@ -439,7 +430,7 @@ void Train(const nn::parallel::Rank &rank) {
                 // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
                 // TODO(dcj): support dataloader.reset() later
                 ++train_iter;
-                saved_data_batch_idx = train_iter.BatchIndex();
+                consumed_batches = train_iter.BatchIndex();
                 x = std::make_shared<Tensor>(x->To(device));
                 y = std::make_shared<Tensor>(y->To(device));
 
@@ -470,7 +461,7 @@ void Train(const nn::parallel::Rank &rank) {
             // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
             // TODO(dcj): support dataloader.reset() later
             ++train_iter;
-            saved_data_batch_idx = train_iter.BatchIndex();
+            consumed_batches = train_iter.BatchIndex();
             x = std::make_shared<Tensor>(x->To(device));
             y = std::make_shared<Tensor>(y->To(device));
 
 
@@ -86,10 +86,7 @@ DEFINE_string(load, "", "checkpoint directory to resume from");
 DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints");
 DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
 DEFINE_bool(no_save_optim, true, "whether optimizer state is persisted in checkpoints");
-DEFINE_string(checkpoint_file_format, "ckpt",
-              "checkpoint format: bin|ckpt. "
-              "'bin' generates model.bin/optimizer.bin (bin supports LLMC model format via callbacks); "
-              "'ckpt' generates model.ckpt/optimizer.ckpt (native StateDict binary).");
+
 // precision check
 DEFINE_string(
     precision_check, "",
@@ -311,7 +308,7 @@ void Train(const nn::parallel::Rank &rank) {
     }
 
     auto train_iter = train_loader.begin();
-    size_t saved_data_batch_idx = train_iter.BatchIndex();
+
     std::shared_ptr<nn::Module> loss_fn
         = (tp_world_size > 1) ? std::static_pointer_cast<nn::Module>(std::make_shared<VocabParallelCrossEntropyLoss>())
                               : std::static_pointer_cast<nn::Module>(std::make_shared<nn::CrossEntropyLoss>());
@@ -322,50 +319,48 @@ void Train(const nn::parallel::Rank &rank) {
 
     int start_step = 0;
     TrainerState state;
-    CheckpointLoadOptions load_options;
-    load_options.load_optimizer_state = true;
-    load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
-        auto loaded_model = llama3::LoadFromLLMC(model_path.string());
-        target_model->LoadStateDict(loaded_model->StateDict());
-    };
     const auto resume_result = ResumeFromCheckpoint({
         .resume_root = FLAGS_load,
         .rank = rank,
         .model = model,
         .optimizer = optimizer,
         .train_loader = train_loader,
         .state = state,
-        .train_iter = train_iter,
-        .load_options = load_options,
     });
+
     start_step = resume_result.global_step;
-    saved_data_batch_idx = resume_result.data_batch_idx;
-
-    auto save_checkpoint = [&](const std::filesystem::path &save_dir, int64_t global_step,
-                               bool prune_step_checkpoints) {
-        SaveCheckpoint({
-            .save_dir = save_dir,
-            .global_step = global_step,
-            .data_batch_idx = saved_data_batch_idx,
-            .last_lr = FLAGS_learning_rate,
-            .optimizer_type = "Adam",
-            .checkpoint_file_format = FLAGS_checkpoint_file_format,
-            .ddp_size = ddp_world_size,
-            .tp_size = tp_world_size,
-            .sp_size = sp_world_size,
-            .pp_size = pp_world_size,
-            .no_save_optim = FLAGS_no_save_optim,
-            .prune_step_checkpoints = prune_step_checkpoints,
-            .checkpoint_root_dir = FLAGS_save,
-            .max_checkpoint_keep = FLAGS_max_checkpoint_keep,
-            .rank = rank,
-            .model = *model,
-            .optimizer = *optimizer,
-            .model_bin_writer
-            = [&](const nn::Module &,
-                  const std::filesystem::path &model_path) { llama3::SaveAsLLMC(llmc_model, model_path.string()); },
-        });
-    };
+    size_t consumed_batches = resume_result.consumed_batches;
+
+    // TODO(jym): Replace with Sampler abstraction when available.
+    // Fast-forward the dataloader iterator so training resumes at the checkpointed batch position.
+    if (consumed_batches > 0) {
+        const size_t start = train_iter.BatchIndex();
+        // Each rank is expected to advance BatchIndex() by ddp_world_size per ++ (starting from its own rank).
+        // Clamp to zero when the saved position is behind `start`, so the unsigned subtraction cannot wrap around.
+        const size_t num_skips = consumed_batches > start ? (consumed_batches - start) / ddp_world_size : 0;
+        for (size_t i = 0; i < num_skips; ++i) { ++train_iter; }
+    }
+
+    auto save_checkpoint
+        = [&](const std::filesystem::path &save_dir, int64_t global_step, bool prune_step_checkpoints) {
+              SaveCheckpoint({
+                  .save_dir = save_dir,
+                  .global_step = global_step,
+                  .consumed_batches = consumed_batches,
+                  .last_lr = FLAGS_learning_rate,
+                  .ddp_size = ddp_world_size,
+                  .tp_size = tp_world_size,
+                  .sp_size = sp_world_size,
+                  .pp_size = pp_world_size,
+                  .no_save_optim = FLAGS_no_save_optim,
+                  .prune_step_checkpoints = prune_step_checkpoints,
+                  .checkpoint_root_dir = FLAGS_save,
+                  .max_checkpoint_keep = FLAGS_max_checkpoint_keep,
+                  .rank = rank,
+                  .model = *model,
+                  .optimizer = *optimizer,
+              });
+          };
 
     for (int step = start_step; step < FLAGS_num_iteration + 1; ++step) {
         // Reset precision check counters at start of each iteration for file overwrite
@@ -414,10 +409,11 @@ void Train(const nn::parallel::Rank &rank) {
 
                 // (bs, seq_len), (bs, seq_len)
                 auto [x, y] = *train_iter;
-                // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
+                // if we are trying to overfit a single batch, we reset the loader here by commenting out the
+                // line below
                 // TODO(dcj): support dataloader.reset() later
                 ++train_iter;
-                saved_data_batch_idx = train_iter.BatchIndex();
+                consumed_batches = train_iter.BatchIndex();
                 x = std::make_shared<Tensor>(x->To(device));
                 y = std::make_shared<Tensor>(y->To(device));
 
@@ -444,10 +440,11 @@ void Train(const nn::parallel::Rank &rank) {
             optimizer->Step();
         } else {
             auto [x, y] = *train_iter;
-            // if we are trying to overfit a single batch, we reset the loader here by commenting out the line below
+            // if we are trying to overfit a single batch, we reset the loader here by commenting out the line
+            // below
             // TODO(dcj): support dataloader.reset() later
             ++train_iter;
-            saved_data_batch_idx = train_iter.BatchIndex();
+            consumed_batches = train_iter.BatchIndex();
             x = std::make_shared<Tensor>(x->To(device));
             y = std::make_shared<Tensor>(y->To(device));
 
 
@@ -16,36 +16,24 @@ class Module;
 
 struct TrainerState {
     int64_t global_step = 0;
-    int64_t data_batch_idx = 0;
-    int64_t data_batch_stride = 1;
+    int64_t consumed_batches = 0; // dataloader position to resume from (example trainers store BatchIndex() here)
+    // FIXME(jym): the learning rate should be restored from scheduler state rather than from this struct; move
+    // `last_lr` from TrainerState to a SchedulerState later.
     double last_lr = 0.0;
-    std::string optimizer_type = "unknown";
-    std::string checkpoint_file_format = "bin";
 
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
     int pp_size = 1;
 };
 
-struct CheckpointOptions {
-    std::string format = "bin";
-    bool no_save_optim = false;
-    std::function<void(const nn::Module &, const std::filesystem::path &)> model_bin_writer;
-};
-
-struct CheckpointLoadOptions {
-    bool load_optimizer_state = true;
-    std::function<void(nn::Module *, const std::filesystem::path &)> model_bin_loader;
-};
-
 class Checkpoint {
 public:
-    static void Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer &optimizer,
-                     const TrainerState &state, const CheckpointOptions &options = {});
+    static void Save(const std::filesystem::path &checkpoint_dir, const nn::Module &model, const Optimizer *optimizer,
+                     const TrainerState &state, bool no_save_optim = false);
 
-    static void Load(const std::filesystem::path &checkpoint_dir, nn::Module *model, Optimizer *optimizer,
-                     TrainerState *state, const CheckpointLoadOptions &options = {});
+    static void Load(const std::filesystem::path &checkpoint_dir, nn::Module &model, Optimizer *optimizer,
+                     TrainerState &state, bool load_optimizer_state = true);
 
 private:
     static void SaveStateDictBinary(const std::filesystem::path &path,
@@ -56,7 +44,6 @@ class Checkpoint {
 
     static void SaveTrainerState(const std::filesystem::path &path, const TrainerState &state);
     static TrainerState LoadTrainerState(const std::filesystem::path &path);
-    static std::string InferFormat(const std::filesystem::path &checkpoint_dir);
 };
 
 } // namespace infini_train