feat: add model config validation on resume

JYMiracle305 · kilinchange · commit b550d3587451 · 2026-06-16T16:04:53.000+08:00
diff --git a/example/common/checkpoint_loader.cc b/example/common/checkpoint_loader.cc
@@ -9,6 +9,7 @@
 
 #include "glog/logging.h"
 
+#include "infini_train/include/nn/modules/transformer/transformer_config.h"
 #include "infini_train/include/nn/parallel/global.h"
 #include "infini_train/include/tensor.h"
 
@@ -39,6 +40,17 @@ ResumeFromCheckpointResult ResumeFromCheckpoint(const ResumeFromCheckpointArgs &
 
     result.global_step = static_cast<int>(args.state.global_step);
 
+    CHECK_EQ(args.state.n_layer, args.model_config.n_layer) // NOTE(review): fields absent from a ckpt load as 0 -> abort
+        << "n_layer mismatch: ckpt=" << args.state.n_layer << ", config=" << args.model_config.n_layer;
+    CHECK_EQ(args.state.n_head, args.model_config.n_head)
+        << "n_head mismatch: ckpt=" << args.state.n_head << ", config=" << args.model_config.n_head;
+    CHECK_EQ(args.state.n_kv_head, args.model_config.n_kv_head)
+        << "n_kv_head mismatch: ckpt=" << args.state.n_kv_head << ", config=" << args.model_config.n_kv_head;
+    CHECK_EQ(args.state.n_embd, args.model_config.n_embd)
+        << "n_embd mismatch: ckpt=" << args.state.n_embd << ", config=" << args.model_config.n_embd;
+    CHECK_EQ(args.state.vocab_size, args.model_config.vocab_size)
+        << "vocab_size mismatch: ckpt=" << args.state.vocab_size << ", config=" << args.model_config.vocab_size;
+
     CHECK_EQ(args.state.ddp_size, ddp_world_size) << "DDP size mismatch: checkpoint has DDP=" << args.state.ddp_size
                                                   << ", but current run has DDP=" << ddp_world_size;
     CHECK_EQ(args.state.tp_size, tp_world_size)
@@ -64,6 +76,11 @@ void SaveCheckpoint(const SaveCheckpointArgs &args) {
     state.global_step = args.global_step;
     state.consumed_batches = static_cast<int64_t>(args.consumed_batches);
     state.last_lr = args.last_lr;
+    state.n_layer = args.n_layer;
+    state.n_head = args.n_head;
+    state.n_kv_head = args.n_kv_head;
+    state.n_embd = args.n_embd;
+    state.vocab_size = args.vocab_size;
     state.ddp_size = args.ddp_size;
     state.tp_size = args.tp_size;
     state.sp_size = args.sp_size;
diff --git a/example/common/checkpoint_loader.h b/example/common/checkpoint_loader.h
@@ -1,13 +1,8 @@
 #pragma once
 
-#include "gflags/gflags.h"
-
 #include <cstdint>
 #include <cstring>
 #include <filesystem>
-#include <functional>
-#include <limits>
-#include <string>
 
 #include "infini_train/include/checkpoint.h"
 #include "infini_train/include/dataloader.h"
@@ -18,12 +13,17 @@
 using namespace infini_train;
 namespace nn = infini_train::nn;
 
+namespace infini_train::nn { // forward declaration only; checkpoint_loader.cc includes transformer_config.h
+class TransformerConfig;     // NOTE(review): confirm the definition uses `class`, not `struct` (-Wmismatched-tags)
+}
+
 struct ResumeFromCheckpointArgs {
     std::filesystem::path resume_root;
     const nn::parallel::Rank &rank;
     std::shared_ptr<nn::Module> model;
     std::shared_ptr<Optimizer> optimizer;
     DistributedDataLoader &train_loader;
+    const nn::TransformerConfig &model_config; // current model dims; compared with checkpointed dims on resume
     TrainerState &state;
 };
 
@@ -37,6 +37,11 @@ struct SaveCheckpointArgs {
     int64_t global_step = 0;
     size_t consumed_batches = 0;
     double last_lr = 0.0;
+    int64_t n_layer = 0;
+    int64_t n_head = 0;
+    int64_t n_kv_head = 0;
+    int64_t n_embd = 0;
+    int64_t vocab_size = 0;
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -336,6 +336,7 @@ void Train(const nn::parallel::Rank &rank) {
                                                      .model = model,
                                                      .optimizer = optimizer,
                                                      .train_loader = train_loader,
+                                                     .model_config = model_config,
                                                      .state = state});
     start_step = resume_result.global_step;
     size_t consumed_batches = resume_result.consumed_batches;
@@ -357,6 +358,11 @@ void Train(const nn::parallel::Rank &rank) {
                   .global_step = global_step,
                   .consumed_batches = consumed_batches,
                   .last_lr = FLAGS_learning_rate,
+                  .n_layer = model_config.n_layer,
+                  .n_head = model_config.n_head,
+                  .n_kv_head = model_config.n_kv_head,
+                  .n_embd = model_config.n_embd,
+                  .vocab_size = model_config.vocab_size,
                   .ddp_size = ddp_world_size,
                   .tp_size = tp_world_size,
                   .sp_size = sp_world_size,
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -316,6 +316,7 @@ void Train(const nn::parallel::Rank &rank) {
         .model = model,
         .optimizer = optimizer,
         .train_loader = train_loader,
+        .model_config = model_config,
         .state = state,
     });
 
@@ -339,6 +340,11 @@ void Train(const nn::parallel::Rank &rank) {
                   .global_step = global_step,
                   .consumed_batches = consumed_batches,
                   .last_lr = FLAGS_learning_rate,
+                  .n_layer = model_config.n_layer,
+                  .n_head = model_config.n_head,
+                  .n_kv_head = model_config.n_kv_head,
+                  .n_embd = model_config.n_embd,
+                  .vocab_size = model_config.vocab_size,
                   .ddp_size = ddp_world_size,
                   .tp_size = tp_world_size,
                   .sp_size = sp_world_size,
diff --git a/infini_train/include/checkpoint.h b/infini_train/include/checkpoint.h
@@ -20,7 +20,11 @@ struct TrainerState {
     // FIXME(jym): learning_rate should be restored from scheduler state, move `last_lr` from TrainerState to
     // SchedulerState later
     double last_lr = 0.0;
-
+    int64_t n_layer = 0;
+    int64_t n_head = 0;
+    int64_t n_kv_head = 0;
+    int64_t n_embd = 0;
+    int64_t vocab_size = 0;
     int ddp_size = 1;
     int tp_size = 1;
     int sp_size = 1;
diff --git a/infini_train/src/checkpoint.cc b/infini_train/src/checkpoint.cc
@@ -208,6 +208,11 @@ void Checkpoint::SaveTrainerState(const std::filesystem::path &path, const Train
     std::ofstream ofs(path);
     CHECK(ofs.is_open()) << "Failed to open trainer state file: " << path;
     ofs << "{\n";
+    ofs << "  \"n_layer\": " << state.n_layer << ",\n"; // model dimensions, stored ahead of the training counters
+    ofs << "  \"n_head\": " << state.n_head << ",\n";
+    ofs << "  \"n_kv_head\": " << state.n_kv_head << ",\n";
+    ofs << "  \"n_embd\": " << state.n_embd << ",\n";
+    ofs << "  \"vocab_size\": " << state.vocab_size << ",\n"; // not the last member: comma needed for valid JSON
     ofs << "  \"global_step\": " << state.global_step << ",\n";
     ofs << "  \"consumed_batches \": " << state.consumed_batches << ",\n";
     ofs << "  \"last_lr\": " << state.last_lr << ",\n";
@@ -226,6 +231,11 @@ TrainerState Checkpoint::LoadTrainerState(const std::filesystem::path &path) {
     const std::string content((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
 
     TrainerState state;
+    state.n_layer = ExtractNumberField<int64_t>(content, "n_layer", 0);
+    state.n_head = ExtractNumberField<int64_t>(content, "n_head", 0);
+    state.n_kv_head = ExtractNumberField<int64_t>(content, "n_kv_head", 0);
+    state.n_embd = ExtractNumberField<int64_t>(content, "n_embd", 0);
+    state.vocab_size = ExtractNumberField<int64_t>(content, "vocab_size", 0);
     state.global_step = ExtractNumberField<int64_t>(content, "global_step", 0);
     state.consumed_batches = ExtractNumberField<int64_t>(content, "consumed_batches ", 0);
     state.last_lr = ExtractNumberField<double>(content, "last_lr", 0.0);
diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash
@@ -72,6 +72,10 @@ PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)";  : "${PROFILE_LOG_DIR:=./profile_
 COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)";  : "${COMPARE_LOG_DIR:=}"
 RUN_CTEST="$(read_var RUN_CTEST)";              : "${RUN_CTEST:=true}"
 CTEST_CMD="$(read_var CTEST_CMD)";              : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}"
+CKPT_CLEAN_DIRS=(  # directories that clean_checkpoints() deletes recursively; hard-coded, not read via read_var
+    "/data1/ckpt"
+    "./checkpoints"
+)
 
 mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"
 
@@ -114,6 +118,17 @@ clean_build_dir() {
     rm -rf "${BUILD_DIR:?}/"*
 }
 
+# Remove every existing directory listed in CKPT_CLEAN_DIRS (invoked from inside the build/test loop).
+clean_checkpoints() {
+    local dir  # keep the loop variable function-local instead of leaking it into the script's globals
+    echo -e "\033[1;31m[CLEAN] Removing checkpoint directories from previous run\033[0m"
+    for dir in "${CKPT_CLEAN_DIRS[@]}"; do
+        [[ -d "$dir" ]] || continue  # skip entries that do not exist
+        echo -e "\033[1;31m[CLEAN] Removing: ${dir}\033[0m"
+        rm -rf -- "${dir:?}"  # :? aborts on an empty path; -- guards against a path starting with '-'
+    done
+}
+
 # Run a command and log output
 run_and_log() {
     local cmd="$1"
@@ -298,6 +313,9 @@ for ((id=0; id<num_builds; ++id)); do
             llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${llama3_arg_str}"
             run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
         done
+
+        # Clean checkpoints from previous run to avoid disk overflow and stale state
+        clean_checkpoints
     done
 done
 
diff --git a/tests/checkpoint/test_optimizer_state.cc b/tests/checkpoint/test_optimizer_state.cc
@@ -17,17 +17,16 @@ TEST_P(OptimizerStateTest, AdamStateDictKeys) {
     param->set_requires_grad(true);
     param->Fill(1.0f);
 
-    auto adam = std::make_shared<optimizers::Adam>(
-        std::vector<std::pair<std::string, std::shared_ptr<Tensor>>>{{"weight", param}}, 0.001);
+    auto adam = std::make_shared<optimizers::Adam>(std::vector<std::shared_ptr<Tensor>>{{param}}, 0.001);
 
     adam->ZeroGrad();
     adam->Step(); // t=1
     adam->Step(); // t=2
 
     auto state = adam->StateDict();
     EXPECT_GT(state.size(), 0);
-    EXPECT_TRUE(state.count("adam.m.weight"));
-    EXPECT_TRUE(state.count("adam.v.weight"));
+    EXPECT_TRUE(state.count("adam.m.0"));
+    EXPECT_TRUE(state.count("adam.v.0"));
     EXPECT_TRUE(state.count("adam.t"));
 
     auto t_cpu = state["adam.t"]->To(Device());
@@ -41,8 +40,7 @@ TEST_P(OptimizerStateTest, AdamStateDictRoundTrip) {
     param1->set_requires_grad(true);
     param1->Fill(1.0f);
 
-    auto adam1 = std::make_shared<optimizers::Adam>(
-        std::vector<std::pair<std::string, std::shared_ptr<Tensor>>>{{"w", param1}}, 0.001);
+    auto adam1 = std::make_shared<optimizers::Adam>(std::vector<std::shared_ptr<Tensor>>{{param1}}, 0.001);
     adam1->ZeroGrad();
     adam1->Step();
     adam1->Step();
@@ -54,8 +52,7 @@ TEST_P(OptimizerStateTest, AdamStateDictRoundTrip) {
     param2->set_requires_grad(true);
     param2->Fill(1.0f);
 
-    auto adam2 = std::make_shared<optimizers::Adam>(
-        std::vector<std::pair<std::string, std::shared_ptr<Tensor>>>{{"w", param2}}, 0.001);
+    auto adam2 = std::make_shared<optimizers::Adam>(std::vector<std::shared_ptr<Tensor>>{{param2}}, 0.001);
     adam2->LoadStateDict(saved);
 
     adam2->ZeroGrad();
diff --git a/tests/checkpoint/test_trainer_state.cc b/tests/checkpoint/test_trainer_state.cc
@@ -21,6 +21,11 @@ TEST_P(TrainerStateTest, DefaultValues) {
     TrainerState state;
     EXPECT_EQ(state.global_step, 0);
     EXPECT_EQ(state.consumed_batches, 0);
+    EXPECT_EQ(state.n_layer, 0);
+    EXPECT_EQ(state.n_head, 0);
+    EXPECT_EQ(state.n_kv_head, 0);
+    EXPECT_EQ(state.n_embd, 0);
+    EXPECT_EQ(state.vocab_size, 0);
     EXPECT_EQ(state.ddp_size, 1);
     EXPECT_EQ(state.tp_size, 1);
     EXPECT_EQ(state.sp_size, 1);
@@ -48,7 +53,6 @@ TEST_P(TrainerStateTest, TrainerStateFileCreated) {
     std::string content((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
     EXPECT_NE(content.find("\"global_step\""), std::string::npos);
     EXPECT_NE(content.find("\"consumed_batches \""), std::string::npos);
-    EXPECT_NE(content.find("\"Adam\""), std::string::npos);
 
     std::filesystem::remove_all(dir);
 }
@@ -61,6 +65,11 @@ TEST_P(TrainerStateTest, RoundTrip) {
         .global_step = 99,
         .consumed_batches = 5000,
         .last_lr = 3e-4,
+        .n_layer = 24,
+        .n_head = 16,
+        .n_kv_head = 8,
+        .n_embd = 1024,
+        .vocab_size = 128256,
         .ddp_size = 2,
         .tp_size = 1,
         .sp_size = 1,
@@ -87,6 +96,11 @@ TEST_P(TrainerStateTest, RoundTrip) {
     EXPECT_EQ(loaded.global_step, 99);
     EXPECT_EQ(loaded.consumed_batches, 5000);
     EXPECT_NEAR(loaded.last_lr, 3e-4, 1e-10);
+    EXPECT_EQ(loaded.n_layer, 24);
+    EXPECT_EQ(loaded.n_head, 16);
+    EXPECT_EQ(loaded.n_kv_head, 8);
+    EXPECT_EQ(loaded.n_embd, 1024);
+    EXPECT_EQ(loaded.vocab_size, 128256);
     EXPECT_EQ(loaded.ddp_size, 2);
     EXPECT_EQ(loaded.pp_size, 2);