InfiniTensor
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎example/common/checkpoint_loader.cc‎
Lines changed: 5 additions & 967 deletions b/‎example/common/checkpoint_loader.cc‎
Lines changed: 5 additions & 967 deletions
diff --git a/‎example/common/checkpoint_loader.h‎
Lines changed: 5 additions & 20 deletions b/‎example/common/checkpoint_loader.h‎
Lines changed: 5 additions & 20 deletions
diff --git a/‎example/gpt2/checkpoint_loader.cc‎
Lines changed: 544 additions & 0 deletions b/‎example/gpt2/checkpoint_loader.cc‎
Lines changed: 544 additions & 0 deletions
diff --git a/‎example/gpt2/checkpoint_loader.h‎
Lines changed: 14 additions & 0 deletions b/‎example/gpt2/checkpoint_loader.h‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎example/gpt2/config.h‎
Lines changed: 8 additions & 11 deletions b/‎example/gpt2/config.h‎
Lines changed: 8 additions & 11 deletions
diff --git a/‎example/gpt2/main.cc‎
Lines changed: 17 additions & 20 deletions b/‎example/gpt2/main.cc‎
Lines changed: 17 additions & 20 deletions
diff --git a/‎example/llama3/checkpoint_loader.cc‎
Lines changed: 5 additions & 15 deletions b/‎example/llama3/checkpoint_loader.cc‎
Lines changed: 5 additions & 15 deletions
diff --git a/‎example/llama3/checkpoint_loader.h‎
Lines changed: 14 additions & 0 deletions b/‎example/llama3/checkpoint_loader.h‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎example/llama3/config.h‎
Lines changed: 8 additions & 10 deletions b/‎example/llama3/config.h‎
Lines changed: 8 additions & 10 deletions
@@ -196,6 +196,7 @@ add_executable(gpt2
   example/common/utils.cc
   example/common/checkpoint_loader.cc
   example/common/tokenizer.cc
+  example/gpt2/checkpoint_loader.cc
 )
 link_infini_train_exe(gpt2)
 
@@ -205,6 +206,7 @@ add_executable(llama3
   example/common/utils.cc
   example/common/checkpoint_loader.cc
   example/common/tokenizer.cc
+  example/llama3/checkpoint_loader.cc
 )
 link_infini_train_exe(llama3)
 
 
@@ -15,22 +15,11 @@
 #include "infini_train/include/nn/parallel/rank.h"
 #include "infini_train/include/optimizer.h"
 
-namespace infini_train {
-namespace nn {
-class TransformerModel;
-}
-
-namespace gpt2 {
-std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath);
-void SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::string &filepath);
-} // namespace gpt2
-namespace llama3 {
-std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath);
-void SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::string &filepath);
-} // namespace llama3
+using namespace infini_train;
+namespace nn = infini_train::nn;
 
 struct ResumeFromCheckpointArgs {
-    fLS::clstring resume_root;
+    std::filesystem::path resume_root;
     const nn::parallel::Rank &rank;
     std::shared_ptr<nn::Module> model;
     std::shared_ptr<Optimizer> optimizer;
@@ -42,23 +31,21 @@ struct ResumeFromCheckpointArgs {
 
 struct ResumeFromCheckpointResult {
     int global_step = 0;
-    float best_loss = std::numeric_limits<float>::infinity();
     size_t data_batch_idx = 0;
 };
 
 struct SaveCheckpointArgs {
     std::filesystem::path save_dir;
     int64_t global_step = 0;
     size_t data_batch_idx = 0;
-    float best_loss = std::numeric_limits<float>::infinity();
     double last_lr = 0.0;
     std::string optimizer_type;
-    std::string checkpoint_format = "bin";
+    std::string checkpoint_file_format = "bin";
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
     int pp_size = 1;
-    bool save_optimizer_state = true;
+    bool no_save_optim = false;
     bool prune_step_checkpoints = false;
     std::filesystem::path checkpoint_root_dir;
     size_t max_checkpoint_keep = 0;
@@ -71,5 +58,3 @@ struct SaveCheckpointArgs {
 ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &args);
 
 void SaveCheckpoint(const SaveCheckpointArgs &args);
-
-} // namespace infini_train
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstring>
+#include <memory>
+#include <string>
+
+namespace infini_train::nn {
+class TransformerModel;
+}
+
+namespace gpt2 {
+std::shared_ptr<infini_train::nn::TransformerModel> LoadFromLLMC(const std::string &filepath);
+void SaveAsLLMC(const std::shared_ptr<infini_train::nn::TransformerModel> &model, const std::string &filepath);
+} // namespace gpt2
@@ -4,19 +4,18 @@
 
 #include "infini_train/include/nn/modules/transformer/transformer_config.h"
 
-namespace infini_train {
 namespace gpt2 {
-inline nn::TransformerConfig GPT2Config() {
+inline infini_train::nn::TransformerConfig GPT2Config() {
     return {.block_size = 1024,
             .vocab_size = 50304,
             .original_vocab_size = 50257,
             .n_layer = 12,
             .n_head = 12,
             .n_kv_head = 12,
             .n_embd = 768,
-            .attention_type = nn::AttentionType::kStandard,
-            .activation_type = nn::MLPType::kGELU,
-            .norm_type = nn::NormType::kLayerNorm,
+            .attention_type = infini_train::nn::AttentionType::kStandard,
+            .activation_type = infini_train::nn::MLPType::kGELU,
+            .norm_type = infini_train::nn::NormType::kLayerNorm,
             .add_bias_linear = true,
             .add_bias_lm_head = false,
             .tie_weights = true,
@@ -25,7 +24,7 @@ inline nn::TransformerConfig GPT2Config() {
             .multiple_of = 1};
 }
 
-inline void SanitizeGPT2Config(const nn::TransformerConfig &c) {
+inline void SanitizeGPT2Config(const infini_train::nn::TransformerConfig &c) {
     CHECK_GT(c.block_size, 0);
     CHECK_GT(c.vocab_size, 0);
     CHECK_GE(c.vocab_size, c.original_vocab_size);
@@ -34,10 +33,8 @@ inline void SanitizeGPT2Config(const nn::TransformerConfig &c) {
     CHECK_GT(c.n_embd, 0);
     CHECK_EQ(c.n_embd % c.n_head, 0) << "n_embd must be divisible by n_head";
     CHECK_EQ(c.n_kv_head, c.n_head) << "GPT-2 does not use GQA; n_kv_head must equal n_head";
-    CHECK(c.attention_type == nn::AttentionType::kStandard) << "GPT-2 requires standard attention";
-    CHECK(c.activation_type == nn::MLPType::kGELU) << "GPT-2 requires GELU activation";
-    CHECK(c.norm_type == nn::NormType::kLayerNorm) << "GPT-2 requires LayerNorm";
+    CHECK(c.attention_type == infini_train::nn::AttentionType::kStandard) << "GPT-2 requires standard attention";
+    CHECK(c.activation_type == infini_train::nn::MLPType::kGELU) << "GPT-2 requires GELU activation";
+    CHECK(c.norm_type == infini_train::nn::NormType::kLayerNorm) << "GPT-2 requires LayerNorm";
 }
-
 } // namespace gpt2
-} // namespace infini_train
@@ -41,8 +41,10 @@
 #include "example/common/checkpoint_loader.h"
 #include "example/common/tiny_shakespeare_dataset.h"
 #include "example/common/tokenizer.h"
+#include "example/gpt2/checkpoint_loader.h"
 #include "example/gpt2/config.h"
 
+// TODO(jym): Reorganize CLI flags into categories for better readability and maintainability.
 // I/O
 DEFINE_string(input_bin, "", "input .bin to train on");
 DEFINE_string(input_val_bin, "", "input .bin to eval validation loss on");
@@ -81,12 +83,12 @@ DEFINE_uint32(virtual_pipeline_parallel, 1, "Number of chunks in PP stage.");
 
 // precision
 DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)");
-DEFINE_uint32(save_steps, 0, "save checkpoint every N steps; 0 disables saving");
-DEFINE_string(resume_from, "", "checkpoint directory to resume from");
-DEFINE_string(checkpoint_dir, "./checkpoints", "root directory used to store checkpoints");
+DEFINE_uint32(save_interval, 0, "save checkpoint every N steps; 0 disables saving");
+DEFINE_string(load, "", "checkpoint directory to resume from");
+DEFINE_string(save, "./checkpoints", "root directory used to store checkpoints");
 DEFINE_uint32(max_checkpoint_keep, 3, "max number of checkpoint steps to keep");
-DEFINE_bool(save_optimizer_state, true, "whether optimizer state is persisted in checkpoints");
-DEFINE_string(checkpoint_format, "ckpt",
+DEFINE_bool(no_save_optim, false, "if set, do not persist optimizer state in checkpoints (saved by default)");
+DEFINE_string(checkpoint_file_format, "ckpt",
               "checkpoint format: bin|ckpt. "
               "'bin' generates model.bin/optimizer.bin (bin supports LLMC model format via callbacks); "
               "'ckpt' generates model.ckpt/optimizer.ckpt (native StateDict binary).");
@@ -317,7 +319,7 @@ void Train(const nn::parallel::Rank &rank) {
     // TODO(dcj): support more complex optimizer later
     // auto optimizer = optimizers::SGD(model->Parameters(), FLAGS_learning_rate);
     std::shared_ptr<Optimizer> optimizer = nullptr;
-    auto optimizer_creator = optimizers::SGD::Create(FLAGS_learning_rate);
+    auto optimizer_creator = optimizers::SGD::CreateNamed(FLAGS_learning_rate);
 
     if (FLAGS_use_distributed_optimizer) {
         auto model_chunks = (pp_world_size > 1)
@@ -341,16 +343,15 @@ void Train(const nn::parallel::Rank &rank) {
     auto impl = core::GetDeviceGuardImpl(device.type());
 
     int start_step = 0;
-    float best_loss = std::numeric_limits<float>::infinity();
     TrainerState state;
     CheckpointLoadOptions load_options;
     load_options.load_optimizer_state = true;
     load_options.model_bin_loader = [](nn::Module *target_model, const std::filesystem::path &model_path) {
         auto loaded_model = gpt2::LoadFromLLMC(model_path.string());
         target_model->LoadStateDict(loaded_model->StateDict());
     };
-    const auto resume_result = infini_train::ResumeFromCheckpoint({
-        .resume_root = FLAGS_resume_from,
+    const auto resume_result = ResumeFromCheckpoint({
+        .resume_root = FLAGS_load,
         .rank = rank,
         .model = model,
         .optimizer = optimizer,
@@ -360,26 +361,24 @@ void Train(const nn::parallel::Rank &rank) {
         .load_options = load_options,
     });
     start_step = resume_result.global_step;
-    best_loss = resume_result.best_loss;
     saved_data_batch_idx = resume_result.data_batch_idx;
 
     auto save_checkpoint
         = [&](const std::filesystem::path &save_dir, int64_t global_step, bool prune_step_checkpoints) {
-              infini_train::SaveCheckpoint({
+              SaveCheckpoint({
                   .save_dir = save_dir,
                   .global_step = global_step,
                   .data_batch_idx = saved_data_batch_idx,
-                  .best_loss = best_loss,
                   .last_lr = FLAGS_learning_rate,
                   .optimizer_type = "SGD",
-                  .checkpoint_format = FLAGS_checkpoint_format,
+                  .checkpoint_file_format = FLAGS_checkpoint_file_format,
                   .ddp_size = ddp_world_size,
                   .tp_size = tp_world_size,
                   .sp_size = sp_world_size,
                   .pp_size = pp_world_size,
-                  .save_optimizer_state = FLAGS_save_optimizer_state,
+                  .no_save_optim = FLAGS_no_save_optim,
                   .prune_step_checkpoints = prune_step_checkpoints,
-                  .checkpoint_root_dir = FLAGS_checkpoint_dir,
+                  .checkpoint_root_dir = FLAGS_save,
                   .max_checkpoint_keep = FLAGS_max_checkpoint_keep,
                   .rank = rank,
                   .model = *model,
@@ -484,8 +483,6 @@ void Train(const nn::parallel::Rank &rank) {
             lossf = static_cast<const float *>(lossf_tensor->To(Device()).DataPtr())[0];
         }
 
-        best_loss = std::min(best_loss, lossf);
-
         const auto iter_end = std::chrono::high_resolution_clock::now();
         const double duration_us = std::chrono::duration<double, std::micro>(iter_end - iter_start).count();
         const double tps = FLAGS_total_batch_size / (duration_us / 1e6);
@@ -509,9 +506,9 @@ void Train(const nn::parallel::Rank &rank) {
             }
         }
 
-        if (FLAGS_save_steps > 0 && (step + 1) % FLAGS_save_steps == 0) {
+        if (FLAGS_save_interval > 0 && (step + 1) % FLAGS_save_interval == 0) {
             std::filesystem::path step_dir
-                = std::filesystem::path(FLAGS_checkpoint_dir) / std::format("checkpoint_step_{:06d}", step + 1);
+                = std::filesystem::path(FLAGS_save) / std::format("checkpoint_step_{:06d}", step + 1);
             if (rank.IsParallel()) {
                 step_dir /= std::format("rank_{:06d}", rank.GlobalRank());
             }
@@ -525,7 +522,7 @@ void Train(const nn::parallel::Rank &rank) {
         nn::lora::SaveLoRAWeights(model, FLAGS_lora_save_path);
     }
 
-    std::filesystem::path final_dir = std::filesystem::path(FLAGS_checkpoint_dir) / "checkpoint_final";
+    std::filesystem::path final_dir = std::filesystem::path(FLAGS_save) / "checkpoint_final";
     if (rank.IsParallel()) {
         final_dir /= std::format("rank_{:06d}", rank.GlobalRank());
     }
 
@@ -5,15 +5,12 @@
 #include <filesystem>
 #include <fstream>
 #include <memory>
-#include <random>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 #include "glog/logging.h"
 
-#include "example/common/utils.h"
-#include "example/llama3/config.h"
 #include "infini_train/include/nn/modules/normalization.h"
 #include "infini_train/include/nn/modules/transformer/causal_self_attention.h"
 #include "infini_train/include/nn/modules/transformer/mlp.h"
@@ -22,24 +19,18 @@
 #include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/tensor.h"
 
+#include "example/common/utils.h"
+#include "example/llama3/config.h"
+
 using namespace infini_train;
 namespace nn = infini_train::nn;
 
-namespace {
-constexpr int kRandomSeed = 42;
-
-// TODO(zbl): make this rng generator compatible with torch later
-static std::mt19937 gen{kRandomSeed};
-} // namespace
-
 namespace {
 constexpr int32_t kLLaMA3Magic = 20240803;
 constexpr int32_t kLLaMA3FP32Version = 3;
 } // namespace
 
-namespace llama3 {
-
-std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath) {
+std::shared_ptr<nn::TransformerModel> llama3::LoadFromLLMC(const std::string &filepath) {
     if (!std::filesystem::exists(filepath)) {
         LOG(FATAL) << "File not found: " << filepath;
     }
@@ -346,7 +337,7 @@ std::shared_ptr<nn::TransformerModel> LoadFromLLMC(const std::string &filepath)
     return llama3;
 }
 
-void SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::string &filepath) {
+void llama3::SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::string &filepath) {
     CHECK_EQ(nn::parallel::global::GetTensorParallelSize(), 1) << "SaveAsLLMC currently supports TP=1 only.";
     CHECK_EQ(nn::parallel::global::GetPipelineParallelSize(), 1) << "SaveAsLLMC currently supports PP=1 only.";
 
@@ -448,4 +439,3 @@ void SaveAsLLMC(const std::shared_ptr<nn::TransformerModel> &model, const std::s
     ofs.flush();
     CHECK(ofs.good()) << "Failed to flush model file: " << filepath;
 }
-} // namespace llama3
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstring>
+#include <memory>
+#include <string>
+
+namespace infini_train::nn {
+class TransformerModel;
+}
+
+namespace llama3 {
+std::shared_ptr<infini_train::nn::TransformerModel> LoadFromLLMC(const std::string &filepath);
+void SaveAsLLMC(const std::shared_ptr<infini_train::nn::TransformerModel> &model, const std::string &filepath);
+} // namespace llama3
@@ -4,19 +4,18 @@
 
 #include "infini_train/include/nn/modules/transformer/transformer_config.h"
 
-namespace infini_train {
 namespace llama3 {
-inline nn::TransformerConfig LLaMA3Config() {
+inline infini_train::nn::TransformerConfig LLaMA3Config() {
     return {.block_size = 8192,
             .vocab_size = 128256,
             .original_vocab_size = 128256,
             .n_layer = 16,
             .n_head = 32,
             .n_kv_head = 8,
             .n_embd = 2048,
-            .attention_type = nn::AttentionType::kRoPE,
-            .activation_type = nn::MLPType::kSwiGLU,
-            .norm_type = nn::NormType::kRMSNorm,
+            .attention_type = infini_train::nn::AttentionType::kRoPE,
+            .activation_type = infini_train::nn::MLPType::kSwiGLU,
+            .norm_type = infini_train::nn::NormType::kRMSNorm,
             .add_bias_linear = false,
             .add_bias_lm_head = false,
             .tie_weights = false,
@@ -25,7 +24,7 @@ inline nn::TransformerConfig LLaMA3Config() {
             .multiple_of = 256};
 }
 
-inline void SanitizeLLaMA3Config(const nn::TransformerConfig &c) {
+inline void SanitizeLLaMA3Config(const infini_train::nn::TransformerConfig &c) {
     CHECK_GT(c.block_size, 0);
     CHECK_GT(c.vocab_size, 0);
     CHECK_GE(c.vocab_size, c.original_vocab_size);
@@ -36,13 +35,12 @@ inline void SanitizeLLaMA3Config(const nn::TransformerConfig &c) {
     CHECK_EQ(c.n_head % c.n_kv_head, 0) << "n_head must be divisible by n_kv_head for GQA";
     CHECK_GT(c.n_embd, 0);
     CHECK_EQ(c.n_embd % c.n_head, 0) << "n_embd must be divisible by n_head";
-    CHECK(c.attention_type == nn::AttentionType::kRoPE) << "LLaMA-3 requires RoPE attention";
-    CHECK(c.activation_type == nn::MLPType::kSwiGLU) << "LLaMA-3 requires SwiGLU activation";
-    CHECK(c.norm_type == nn::NormType::kRMSNorm) << "LLaMA-3 requires RMSNorm";
+    CHECK(c.attention_type == infini_train::nn::AttentionType::kRoPE) << "LLaMA-3 requires RoPE attention";
+    CHECK(c.activation_type == infini_train::nn::MLPType::kSwiGLU) << "LLaMA-3 requires SwiGLU activation";
+    CHECK(c.norm_type == infini_train::nn::NormType::kRMSNorm) << "LLaMA-3 requires RMSNorm";
     CHECK(!c.add_bias_linear) << "LLaMA-3 has no bias in linear layers";
     CHECK(!c.tie_weights) << "LLaMA-3 does not tie embedding and lm_head weights";
     CHECK(c.ffn_dim_multiplier.has_value()) << "LLaMA-3 requires ffn_dim_multiplier";
     CHECK_GT(c.multiple_of, 0);
 }
 } // namespace llama3
-} // namespace infini_train
Original file line number	Diff line number	Diff line change
`@@ -196,6 +196,7 @@ add_executable(gpt2`
`196`	`196`	`example/common/utils.cc`
`197`	`197`	`example/common/checkpoint_loader.cc`
`198`	`198`	`example/common/tokenizer.cc`
	`199`	`+ example/gpt2/checkpoint_loader.cc`
`199`	`200`	`)`
`200`	`201`	`link_infini_train_exe(gpt2)`
`201`	`202`
`@@ -205,6 +206,7 @@ add_executable(llama3`
`205`	`206`	`example/common/utils.cc`
`206`	`207`	`example/common/checkpoint_loader.cc`
`207`	`208`	`example/common/tokenizer.cc`
	`209`	`+ example/llama3/checkpoint_loader.cc`
`208`	`210`	`)`
`209`	`211`	`link_infini_train_exe(llama3)`
`210`	`212`