InfiniTensor
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎example/gpt2/checkpoint_loader.cc‎ ‎example/common/checkpoint_loader.cc‎example/gpt2/checkpoint_loader.cc renamed to example/common/checkpoint_loader.cc
Lines changed: 517 additions & 11 deletions b/‎example/gpt2/checkpoint_loader.cc‎ ‎example/common/checkpoint_loader.cc‎example/gpt2/checkpoint_loader.cc renamed to example/common/checkpoint_loader.cc
Lines changed: 517 additions & 11 deletions
diff --git a/‎example/common/checkpoint_loader.h‎
Lines changed: 76 additions & 0 deletions b/‎example/common/checkpoint_loader.h‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎example/common/utils.cc‎
Lines changed: 0 additions & 88 deletions b/‎example/common/utils.cc‎
Lines changed: 0 additions & 88 deletions
diff --git a/‎example/common/utils.h‎
Lines changed: 0 additions & 43 deletions b/‎example/common/utils.h‎
Lines changed: 0 additions & 43 deletions
diff --git a/‎example/gpt2/checkpoint_loader.h‎
Lines changed: 0 additions & 13 deletions b/‎example/gpt2/checkpoint_loader.h‎
Lines changed: 0 additions & 13 deletions
diff --git a/‎example/gpt2/config.h‎
Lines changed: 2 additions & 2 deletions b/‎example/gpt2/config.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎example/gpt2/main.cc‎
Lines changed: 5 additions & 3 deletions b/‎example/gpt2/main.cc‎
Lines changed: 5 additions & 3 deletions
@@ -176,7 +176,7 @@ add_executable(gpt2
   example/gpt2/main.cc
   example/common/tiny_shakespeare_dataset.cc
   example/common/utils.cc
-  example/gpt2/checkpoint_loader.cc
+  example/common/checkpoint_loader.cc
   example/common/tokenizer.cc
 )
 link_infini_train_exe(gpt2)
@@ -185,7 +185,7 @@ add_executable(llama3
   example/llama3/main.cc
   example/common/tiny_shakespeare_dataset.cc
   example/common/utils.cc
-  example/llama3/checkpoint_loader.cc
+  example/common/checkpoint_loader.cc
   example/common/tokenizer.cc
 )
 link_infini_train_exe(llama3)
 
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "infini_train/include/checkpoint.h"
+#include "infini_train/include/dataloader.h"
+#include "infini_train/include/nn/modules/module.h"
+#include "infini_train/include/nn/parallel/rank.h"
+#include "infini_train/include/optimizer.h"
+
+#include "gflags/gflags.h"
+
+#include <cstdint>
+#include <cstring>
+#include <filesystem>
+
+#include <functional>
+#include <limits>
+#include <string>
+
+namespace infini_train {
+namespace nn {
+class TransformerModel; // forward declaration only; this header uses the type solely through std::shared_ptr
+}
+
+namespace gpt2 { // NOTE(review): std::shared_ptr is used below but <memory> is not included directly by this header
+std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath); // presumably builds a model from the LLMC file at `filepath`; definition not visible here, confirm
+void SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::string &filepath); // presumably writes `model` to `filepath` in LLMC format; confirm
+} // namespace gpt2
+namespace llama3 { // same two signatures as gpt2::, kept apart only by the enclosing namespace
+std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath); // presumably the LLaMA-3 counterpart of gpt2::LoadFromLLMC; confirm
+void SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::string &filepath); // presumably the LLaMA-3 counterpart of gpt2::SaveAsLLMC; confirm
+} // namespace llama3
+
+struct ResumeFromCheckpointArgs { // argument bundle for ResumeFromCheckpoint(); several members are references, so their referents must outlive this struct
+    fLS::clstring resume_root; // gflags string type; presumably the checkpoint root to resume from, confirm
+    const nn::parallel::Rank &rank; // read-only reference to the caller's rank object
+    std::shared_ptr<nn::Module> model; // presumably the model that checkpoint state is restored into; confirm
+    std::shared_ptr<Optimizer> optimizer; // presumably the optimizer that checkpoint state is restored into; confirm
+    DistributedDataLoader &train_loader; // non-const reference: remains usable for non-const calls even through a const args object
+    TrainerState &state; // non-const reference: can be written through `const ResumeFromCheckpointArgs &`
+    DataLoaderIterator &train_iter; // non-const reference: can be reassigned through `const ResumeFromCheckpointArgs &`
+    CheckpointLoadOptions load_options; // held by value (copied into the struct)
+};
+
+struct ResumeFromCheckpointResult { // value returned by ResumeFromCheckpoint(); defaults are step 0, +inf loss, batch index 0
+    int global_step = 0; // NOTE(review): int here but int64_t in SaveCheckpointArgs::global_step; possible narrowing, confirm
+    float best_loss = std::numeric_limits<float>::infinity(); // defaults to +infinity
+    size_t data_batch_idx = 0; // defaults to 0
+};
+
+struct SaveCheckpointArgs { // argument bundle for SaveCheckpoint(); rank/model/optimizer are const references whose referents must outlive this struct
+    std::filesystem::path save_dir; // presumably the directory this checkpoint is written to; confirm
+    int64_t global_step = 0;
+    size_t data_batch_idx = 0;
+    float best_loss = std::numeric_limits<float>::infinity(); // defaults to +infinity
+    double last_lr = 0.0;
+    std::string optimizer_type;
+    std::string checkpoint_format = "bin"; // NOTE(review): defaults to "bin" while gpt2's --checkpoint_format flag defaults to "pth"; confirm callers always set this
+    int ddp_size = 1; // the four parallel sizes below all default to 1
+    int tp_size = 1;
+    int sp_size = 1;
+    int pp_size = 1;
+    bool save_optimizer_state = true;
+    bool prune_step_checkpoints = false; // off by default
+    std::filesystem::path checkpoint_root_dir; // presumably the directory scanned when pruning old checkpoints; confirm
+    size_t max_checkpoint_keep = 0; // NOTE(review): 0 by default; verify pruning with 0 does not remove every step checkpoint
+    const nn::parallel::Rank &rank;
+    const nn::Module &model;
+    const Optimizer &optimizer;
+    std::function<void(const nn::Module &, const std::filesystem::path &)> model_bin_writer; // callback taking (model, path); empty when default-constructed. TODO confirm how an empty writer is handled
+};
+
+ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args); // args is const, but its non-const reference members (train_loader, state, train_iter) can still be modified through it
+
+void SaveCheckpoint(const SaveCheckpointArgs &args); // returns nothing; model and optimizer are reached as const references inside args
+
+} // namespace infini_train
@@ -68,92 +68,4 @@ void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t s
     ifs.read(reinterpret_cast<char *>(dst), static_cast<std::streamsize>(cnt * sizeof(float)));
     ifs.seekg(base + std::streamoff(len * sizeof(float)));
 }
-
-ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args) {
-    ResumeFromCheckpointResult result;
-    int ddp_world_size = nn::parallel::global::GetDataParallelSize();
-
-    if (args.resume_root.empty()) {
-        LOG(INFO) << "No checkpoint specified for resume. Starting training from scratch.";
-        return result;
-    }
-
-    std::filesystem::path resume_dir = args.resume_root;
-    if (args.rank.IsParallel()) {
-        const auto rank_dir = resume_dir / std::format("rank_{:06d}", args.rank.GlobalRank());
-        if (std::filesystem::exists(rank_dir)) {
-            resume_dir = rank_dir;
-        }
-    }
-
-    Checkpoint::Load(resume_dir, args.model.get(), args.optimizer.get(), &args.state, args.load_options);
-
-    result.global_step = static_cast<int>(args.state.global_step);
-    result.best_loss = args.state.best_loss;
-    if (args.state.data_batch_stride != static_cast<int64_t>(ddp_world_size) && args.rank.IsMainRank()) {
-        LOG(WARNING) << std::format("Checkpoint data_batch_stride {} mismatches current ddp_world_size {}. "
-                                    "Proceeding with recorded data_batch_idx {}.",
-                                    args.state.data_batch_stride, ddp_world_size, args.state.data_batch_idx);
-    }
-    result.data_batch_idx = static_cast<size_t>(std::max<int64_t>(args.state.data_batch_idx, 0));
-    args.train_iter = args.train_loader.IteratorAtBatchIndex(result.data_batch_idx);
-    if (args.rank.IsMainRank()) {
-        LOG(INFO) << std::format(
-            "Resume training from step {} with best_loss {:.6f}, last_lr {:.3e}, data_batch_idx {}",
-            args.state.global_step, args.state.best_loss, args.state.last_lr, args.state.data_batch_idx);
-    }
-
-    return result;
-}
-
-void SaveCheckpoint(const SaveCheckpointArgs &args) {
-    const auto ckpt_start = std::chrono::high_resolution_clock::now();
-
-    TrainerState state;
-    state.global_step = args.global_step;
-    state.data_batch_idx = static_cast<int64_t>(args.data_batch_idx);
-    state.data_batch_stride = args.ddp_size;
-    state.best_loss = args.best_loss;
-    state.last_lr = args.last_lr;
-    state.optimizer_type = args.optimizer_type;
-    state.checkpoint_format = args.checkpoint_format;
-    state.ddp_size = args.ddp_size;
-    state.tp_size = args.tp_size;
-    state.sp_size = args.sp_size;
-    state.pp_size = args.pp_size;
-
-    CheckpointOptions options;
-    options.format = args.checkpoint_format;
-    options.save_optimizer_state = args.save_optimizer_state;
-    options.model_bin_writer = args.model_bin_writer;
-    Checkpoint::Save(args.save_dir, args.model, args.optimizer, state, options);
-
-    const auto ckpt_end = std::chrono::high_resolution_clock::now();
-    const double ckpt_ms = std::chrono::duration<double, std::milli>(ckpt_end - ckpt_start).count();
-
-    if (!args.rank.IsMainRank()) {
-        return;
-    }
-
-    LOG(INFO) << std::format("Checkpoint saved at: {} ({:.2f} ms)", args.save_dir.string(), ckpt_ms);
-
-    if (!args.prune_step_checkpoints) {
-        return;
-    }
-
-    std::vector<std::filesystem::path> ckpts;
-    if (std::filesystem::exists(args.checkpoint_root_dir)) {
-        for (const auto &entry : std::filesystem::directory_iterator(args.checkpoint_root_dir)) {
-            if (entry.is_directory() && entry.path().filename().string().starts_with("checkpoint_step_")) {
-                ckpts.push_back(entry.path());
-            }
-        }
-        std::sort(ckpts.begin(), ckpts.end());
-        while (ckpts.size() > args.max_checkpoint_keep) {
-            std::filesystem::remove_all(ckpts.front());
-            ckpts.erase(ckpts.begin());
-        }
-    }
-}
-
 } // namespace infini_train
@@ -42,47 +42,4 @@ void ReadVectorAllFloat(std::ifstream &ifs, float *dst, int64_t len);
 
 void ReadVectorShardFloat(std::ifstream &ifs, float *dst, int64_t len, int64_t start, int64_t cnt);
 
-struct ResumeFromCheckpointArgs {
-    fLS::clstring resume_root;
-    const nn::parallel::Rank &rank;
-    std::shared_ptr<nn::Module> model;
-    std::shared_ptr<Optimizer> optimizer;
-    DistributedDataLoader &train_loader;
-    TrainerState &state;
-    DataLoaderIterator &train_iter;
-    CheckpointLoadOptions load_options;
-};
-
-struct ResumeFromCheckpointResult {
-    int global_step = 0;
-    float best_loss = std::numeric_limits<float>::infinity();
-    size_t data_batch_idx = 0;
-};
-
-struct SaveCheckpointArgs {
-    std::filesystem::path save_dir;
-    int64_t global_step = 0;
-    size_t data_batch_idx = 0;
-    float best_loss = std::numeric_limits<float>::infinity();
-    double last_lr = 0.0;
-    std::string optimizer_type;
-    std::string checkpoint_format = "bin";
-    int ddp_size = 1;
-    int tp_size = 1;
-    int sp_size = 1;
-    int pp_size = 1;
-    bool save_optimizer_state = true;
-    bool prune_step_checkpoints = false;
-    std::filesystem::path checkpoint_root_dir;
-    size_t max_checkpoint_keep = 0;
-    const nn::parallel::Rank &rank;
-    const nn::Module &model;
-    const Optimizer &optimizer;
-    std::function<void(const nn::Module &, const std::filesystem::path &)> model_bin_writer;
-};
-
-ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args);
-
-void SaveCheckpoint(const SaveCheckpointArgs &args);
-
 } // namespace infini_train
@@ -2,7 +2,7 @@
 
 #include "infini_train/include/nn/modules/transformer/transformer_config.h"
 
-namespace nn = infini_train::nn;
+namespace infini_train {
 namespace gpt2 {
 inline nn::TransformerConfig GPT2Config() {
     return {.block_size = 1024,
@@ -22,5 +22,5 @@ inline nn::TransformerConfig GPT2Config() {
             .ffn_dim_multiplier = std::nullopt,
             .multiple_of = 1};
 }
-
 } // namespace gpt2
+} // namespace infini_train
@@ -9,7 +9,6 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include "example/common/utils.h"
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 
@@ -39,9 +38,9 @@
 #include "infini_train/include/utils/precision_check_config.h"
 #include "infini_train/include/utils/precision_checker.h"
 
+#include "example/common/checkpoint_loader.h"
 #include "example/common/tiny_shakespeare_dataset.h"
 #include "example/common/tokenizer.h"
-#include "example/gpt2/checkpoint_loader.h"
 #include "example/gpt2/config.h"
 
 // I/O
@@ -87,7 +86,10 @@ DEFINE_string(resume_from, "", "checkpoint directory to resume from");
 DEFINE_string(checkpoint_dir, "./checkpoints", "root directory used to store checkpoints");
 DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
 DEFINE_bool(save_optimizer_state, true, "whether optimizer state is persisted in checkpoints");
-DEFINE_string(checkpoint_format, "bin", "checkpoint format: bin|pth");
+DEFINE_string(checkpoint_format, "pth", // default "pth": per the help text below, writes model.pth/optimizer.pth
+              "checkpoint format: bin|pth. "
+              "'bin' generates model.bin/optimizer.bin (bin supports LLMC model format via callbacks); "
+              "'pth' generates model.pth/optimizer.pth (native StateDict binary).");
 // precision check
 DEFINE_string(
     precision_check, "",