Skip to content

Commit 327d263

Browse files
kinorwlittleotherut
authored and committed
refactor: rename current_lr_ to recover_lr_ and update related methods, add validation tests for learning rate schedulers
- it is now only used for learning-rate recovery when calling LoadState
1 parent f0012be commit 327d263

File tree

12 files changed

+243
-51
lines changed

12 files changed

+243
-51
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,6 @@ link_infini_train_exe(test_sequential_lr)
224224

225225
add_executable(test_chained_lr test/lr_scheduler/test_chained_lr.cc)
226226
link_infini_train_exe(test_chained_lr)
227+
228+
add_executable(test_lr_scheduler_validation test/lr_scheduler/test_lr_scheduler_validation.cc)
229+
link_infini_train_exe(test_lr_scheduler_validation)

infini_train/include/lr_scheduler.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,16 @@ class LRScheduler {
6767
virtual StateDict State() const;
6868
virtual void LoadState(const StateDict &state);
6969

70+
bool SharesOptimizerWith(const std::shared_ptr<Optimizer> &opt) const;
71+
7072
protected:
7173
virtual float GetClosedFormLR() const = 0;
7274
virtual float GetChainedFormLR() const;
7375
void ApplyLR(float lr);
7476

7577
std::shared_ptr<Optimizer> optimizer_;
7678
int64_t last_step_;
77-
float current_lr_;
79+
float recover_lr_;
7880
float base_lr_;
7981
bool is_initial_ = false;
8082
};
@@ -155,7 +157,9 @@ class SequentialLR : public LRScheduler {
155157
void LoadState(const StateDict &state) override;
156158

157159
protected:
158-
float GetClosedFormLR() const override { return current_lr_; }
160+
float GetClosedFormLR() const override {
161+
return base_lr_;
162+
} // FIXME: SequentialLR should not have a closed-form LR, but we need to implement this pure virtual function.
159163
void UndoChildInitialSteps();
160164

161165
private:
@@ -176,11 +180,13 @@ class ChainedScheduler : public LRScheduler {
176180
void LoadState(const StateDict &state) override;
177181

178182
protected:
179-
float GetClosedFormLR() const override { return current_lr_; }
183+
float GetClosedFormLR() const override {
184+
return base_lr_;
185+
} // FIXME: ChainedScheduler should not have a closed-form LR, but we need to implement this pure virtual function.
180186

181187
private:
182188
std::vector<std::shared_ptr<LRScheduler>> schedulers_;
183189
};
184190

185191
} // namespace lr_schedulers
186-
} // namespace infini_train
192+
} // namespace infini_train

infini_train/src/lr_scheduler.cc

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,10 @@ std::shared_ptr<LRScheduler> CreateLRScheduler(std::shared_ptr<Optimizer> optimi
6868
};
6969

7070
LRScheduler::LRScheduler(std::shared_ptr<Optimizer> optimizer, int64_t last_step)
71-
: optimizer_(std::move(optimizer)), last_step_(last_step), current_lr_(0.0f), base_lr_(0.0f) {
71+
: optimizer_(std::move(optimizer)), last_step_(last_step), base_lr_(0.0f) {
7272
CHECK(optimizer_) << "LRScheduler: optimizer must not be null.";
7373
optimizer_->SetInitialLearningRate(optimizer_->GetLearningRate());
7474
base_lr_ = optimizer_->GetInitialLearningRate();
75-
current_lr_ = base_lr_;
7675
}
7776

7877
void LRScheduler::Step() {
@@ -91,34 +90,33 @@ void LRScheduler::InitialStep() {
9190
is_initial_ = false;
9291
}
9392

94-
void LRScheduler::ApplyLR(float lr) {
95-
current_lr_ = lr;
96-
optimizer_->SetLearningRate(current_lr_);
97-
}
93+
void LRScheduler::ApplyLR(float lr) { optimizer_->SetLearningRate(lr); }
9894

9995
float LRScheduler::GetChainedFormLR() const { return GetClosedFormLR(); }
10096

101-
float LRScheduler::GetLR() const { return current_lr_; }
97+
float LRScheduler::GetLR() const { return optimizer_->GetLearningRate(); }
10298

10399
float LRScheduler::BaseLR() const { return base_lr_; }
104100

105101
int64_t LRScheduler::LastStep() const { return last_step_; }
106102

103+
bool LRScheduler::SharesOptimizerWith(const std::shared_ptr<Optimizer> &opt) const { return optimizer_ == opt; }
104+
107105
void LRScheduler::ResetStep(int64_t step) { last_step_ = step; }
108106

109107
StateDict LRScheduler::State() const {
110108
return {
111109
{"last_step", last_step_},
112-
{"current_lr", current_lr_},
110+
{"recover_lr", optimizer_->GetLearningRate()},
113111
{"base_lr", base_lr_},
114112
};
115113
}
116114

117115
void LRScheduler::LoadState(const StateDict &state) {
118116
last_step_ = std::get<int64_t>(state.at("last_step"));
119-
current_lr_ = std::get<float>(state.at("current_lr"));
117+
recover_lr_ = std::get<float>(state.at("recover_lr"));
120118
base_lr_ = std::get<float>(state.at("base_lr"));
121-
optimizer_->SetLearningRate(current_lr_);
119+
optimizer_->SetLearningRate(recover_lr_);
122120
}
123121

124122
// Concrete LR Schedulers
@@ -128,7 +126,10 @@ namespace lr_schedulers {
128126
// --- ConstantLR ---
129127

130128
ConstantLR::ConstantLR(std::shared_ptr<Optimizer> optimizer, float factor, int total_iters, int64_t last_step)
131-
: LRScheduler(std::move(optimizer), last_step), factor_(factor), total_iters_(total_iters) {}
129+
: LRScheduler(std::move(optimizer), last_step), factor_(factor), total_iters_(total_iters) {
130+
CHECK_GE(factor_, 0.0f) << "ConstantLR: factor must be >= 0.";
131+
CHECK_LE(factor_, 1.0f) << "ConstantLR: factor must be <= 1.";
132+
}
132133

133134
float ConstantLR::GetClosedFormLR() const { return last_step_ < total_iters_ ? base_lr_ * factor_ : base_lr_; }
134135

@@ -147,7 +148,10 @@ float ConstantLR::GetChainedFormLR() const {
147148
// --- StepLR ---
148149

149150
StepLR::StepLR(std::shared_ptr<Optimizer> optimizer, int64_t step_size, float gamma, int64_t last_step)
150-
: LRScheduler(std::move(optimizer), last_step), step_size_(step_size), gamma_(gamma) {}
151+
: LRScheduler(std::move(optimizer), last_step), step_size_(step_size), gamma_(gamma) {
152+
CHECK_GT(step_size_, 0) << "StepLR: step_size must be > 0.";
153+
CHECK_GT(gamma_, 0.0f) << "StepLR: gamma must be > 0.";
154+
}
151155

152156
float StepLR::GetClosedFormLR() const {
153157
return base_lr_
@@ -165,7 +169,13 @@ float StepLR::GetChainedFormLR() const {
165169
LinearLR::LinearLR(std::shared_ptr<Optimizer> optimizer, float start_factor, float end_factor, int64_t total_iters,
166170
int64_t last_step)
167171
: LRScheduler(std::move(optimizer), last_step), start_factor_(start_factor), end_factor_(end_factor),
168-
total_iters_(total_iters) {}
172+
total_iters_(total_iters) {
173+
CHECK_GT(start_factor_, 0.0f) << "LinearLR: start_factor must be > 0.";
174+
CHECK_LE(start_factor_, 1.0f) << "LinearLR: start_factor must be <= 1.";
175+
CHECK_GE(end_factor_, 0.0f) << "LinearLR: end_factor must be >= 0.";
176+
CHECK_LE(end_factor_, 1.0f) << "LinearLR: end_factor must be <= 1.";
177+
CHECK_GT(total_iters_, 0) << "LinearLR: total_iters must be > 0.";
178+
}
169179

170180
float LinearLR::GetClosedFormLR() const {
171181
if (last_step_ >= total_iters_) {
@@ -198,31 +208,40 @@ float LinearLR::GetChainedFormLR() const {
198208
}
199209

200210
LambdaLR::LambdaLR(std::shared_ptr<Optimizer> optimizer, std::function<float(int64_t)> lr_lambda, int64_t last_step)
201-
: LRScheduler(std::move(optimizer), last_step), lr_lambda_(std::move(lr_lambda)) {}
211+
: LRScheduler(std::move(optimizer), last_step), lr_lambda_(std::move(lr_lambda)) {
212+
CHECK(lr_lambda_) << "LambdaLR: lr_lambda must not be null.";
213+
}
202214

203215
float LambdaLR::GetClosedFormLR() const { return base_lr_ * lr_lambda_(last_step_); }
204216

205217
SequentialLR::SequentialLR(std::shared_ptr<Optimizer> optimizer, std::vector<std::shared_ptr<LRScheduler>> schedulers,
206218
std::vector<int64_t> milestones, int64_t last_step)
207219
: LRScheduler(std::move(optimizer), last_step), schedulers_(std::move(schedulers)),
208-
milestones_(std::move(milestones)) {}
209-
210-
void SequentialLR::InitialStep() {
220+
milestones_(std::move(milestones)) {
211221
CHECK(!schedulers_.empty()) << "SequentialLR requires at least one scheduler.";
222+
223+
for (size_t i = 0; i < schedulers_.size(); ++i) {
224+
CHECK(schedulers_[i]) << "SequentialLR: scheduler at index " << i << " must not be null.";
225+
CHECK(schedulers_[i]->SharesOptimizerWith(optimizer_))
226+
<< "SequentialLR: scheduler at index " << i << " must share the same optimizer.";
227+
}
228+
212229
CHECK_EQ(milestones_.size(), schedulers_.size() - 1)
213230
<< "SequentialLR: milestones count must be schedulers count - 1.";
214231

215232
for (size_t i = 1; i < milestones_.size(); ++i) {
216233
CHECK_GT(milestones_[i], milestones_[i - 1]) << "Milestones must be strictly increasing.";
217234
}
235+
}
236+
237+
void SequentialLR::InitialStep() {
218238

219239
optimizer_->SetLearningRate(schedulers_[0]->BaseLR());
220240

221241
UndoChildInitialSteps();
222242

223243
++last_step_;
224244
schedulers_[0]->InitialStep();
225-
current_lr_ = schedulers_[0]->GetLR();
226245
}
227246

228247
void SequentialLR::UndoChildInitialSteps() {
@@ -245,14 +264,12 @@ void SequentialLR::Step() {
245264
} else {
246265
scheduler->Step();
247266
}
248-
249-
current_lr_ = optimizer_->GetLearningRate();
250267
}
251268

252269
StateDict SequentialLR::State() const {
253270
StateDict state;
254271
state["last_step"] = last_step_;
255-
state["current_lr"] = current_lr_;
272+
state["recover_lr"] = optimizer_->GetLearningRate();
256273
state["base_lr"] = base_lr_;
257274
for (size_t i = 0; i < schedulers_.size(); ++i) {
258275
auto sub_state = schedulers_[i]->State();
@@ -263,7 +280,7 @@ StateDict SequentialLR::State() const {
263280

264281
void SequentialLR::LoadState(const StateDict &state) {
265282
last_step_ = std::get<int64_t>(state.at("last_step"));
266-
current_lr_ = std::get<float>(state.at("current_lr"));
283+
recover_lr_ = std::get<float>(state.at("recover_lr"));
267284
base_lr_ = std::get<float>(state.at("base_lr"));
268285

269286
for (size_t i = 0; i < schedulers_.size(); ++i) {
@@ -278,23 +295,28 @@ void SequentialLR::LoadState(const StateDict &state) {
278295
schedulers_[i]->LoadState(sub_state);
279296
}
280297
}
281-
optimizer_->SetLearningRate(current_lr_);
298+
optimizer_->SetLearningRate(recover_lr_);
282299
}
283300

284301
ChainedScheduler::ChainedScheduler(std::shared_ptr<Optimizer> optimizer,
285302
std::vector<std::shared_ptr<LRScheduler>> schedulers, int64_t last_step)
286-
: LRScheduler(std::move(optimizer), last_step), schedulers_(std::move(schedulers)) {}
287-
288-
void ChainedScheduler::InitialStep() {
303+
: LRScheduler(std::move(optimizer), last_step), schedulers_(std::move(schedulers)) {
289304
CHECK(!schedulers_.empty()) << "ChainedScheduler requires at least one scheduler.";
290305

291-
current_lr_ = optimizer_->GetLearningRate();
306+
for (size_t i = 0; i < schedulers_.size(); ++i) {
307+
CHECK(schedulers_[i]) << "ChainedScheduler: scheduler at index " << i << " must not be null.";
308+
CHECK(schedulers_[i]->SharesOptimizerWith(optimizer_))
309+
<< "ChainedScheduler: scheduler at index " << i << " must share the same optimizer.";
310+
}
311+
}
312+
313+
void ChainedScheduler::InitialStep() {
314+
last_step_ = 0;
292315
}
293316

294317
void ChainedScheduler::Step() {
295318
++last_step_;
296319
for (auto &sched : schedulers_) { sched->Step(); }
297-
current_lr_ = optimizer_->GetLearningRate();
298320
}
299321

300322
StateDict ChainedScheduler::State() const {
@@ -323,4 +345,4 @@ void ChainedScheduler::LoadState(const StateDict &state) {
323345
}
324346

325347
} // namespace lr_schedulers
326-
} // namespace infini_train
348+
} // namespace infini_train

test/lr_scheduler/test_chained_lr.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ using namespace infini_train::lr_schedulers;
77
namespace {
88
constexpr float kBaseLR = 0.1f;
99
}
10-
// TC1: 单子调度器退化
10+
1111
void TestSingleScheduler() {
1212
std::cout << "[TC1] TestSingleScheduler" << std::endl;
1313
auto opt = MakeDummyOptimizer(kBaseLR);
@@ -23,7 +23,7 @@ void TestSingleScheduler() {
2323
ASSERT_FLOAT_NEAR(sched->GetLR(), 0.1f, kEps);
2424
}
2525

26-
// TC2: StepLR + LambdaLR 乘法叠加
26+
// TC2: StepLR + LambdaLR
2727
void TestMultiplicativeChain() {
2828
std::cout << "[TC2] TestMultiplicativeChain" << std::endl;
2929
auto opt = MakeDummyOptimizer(kBaseLR);
@@ -53,7 +53,7 @@ void TestMultiplicativeChain() {
5353
ASSERT_FLOAT_NEAR(sched->GetLR(), 0.07f, kEps);
5454
}
5555

56-
// TC3: ConstantLR + StepLR 叠加 (无穿插声明)
56+
// TC3: ConstantLR + StepLR
5757
void TestConstantPlusStep() {
5858
std::cout << "[TC3] TestConstantPlusStep" << std::endl;
5959
auto opt = MakeDummyOptimizer(kBaseLR);
@@ -86,7 +86,7 @@ void TestConstantPlusStep() {
8686
ASSERT_FLOAT_NEAR(sched->GetLR(), 0.01f, kEps);
8787
}
8888

89-
// TC4: ConstantLR + StepLR 叠加(有穿插声明
89+
// TC4: ConstantLR + StepLR (with extra unused scheduler
9090
void TestConstantPlusStepDLC() {
9191
std::cout << "[TC4] TestConstantPlusStepDLC" << std::endl;
9292
auto opt = MakeDummyOptimizer(kBaseLR);
@@ -129,7 +129,7 @@ void TestConstantPlusStepDLC() {
129129
ASSERT_FLOAT_NEAR(sched->GetLR(), 0.02f, kEps);
130130
}
131131

132-
// TC5: State/LoadState 往返
132+
// TC5: State/LoadState
133133
void TestStateRoundTrip() {
134134
std::cout << "[TC5] TestStateRoundTrip" << std::endl;
135135
auto opt = MakeDummyOptimizer(kBaseLR);
@@ -152,7 +152,7 @@ void TestStateRoundTrip() {
152152
ASSERT_FLOAT_NEAR(sched2->GetLR(), sched->GetLR(), kEps);
153153
}
154154

155-
// TC6: resume 一致性
155+
// TC6: resume consistency (load state at step K, then step N-K, should match directly stepping to N)
156156
void TestResumeConsistency() {
157157
std::cout << "[TC6] TestResumeConsistency" << std::endl;
158158
constexpr int kN = 10, kK = 4;
@@ -199,4 +199,4 @@ int main(int argc, char *argv[]) {
199199
std::cout << g_fail_count << " test(s) FAILED" << std::endl;
200200
}
201201
return g_fail_count > 0 ? 1 : 0;
202-
}
202+
}

test/lr_scheduler/test_constant_lr.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,4 +181,4 @@ int main(int argc, char *argv[]) {
181181
std::cout << g_fail_count << " test(s) FAILED" << std::endl;
182182
}
183183
return g_fail_count > 0 ? 1 : 0;
184-
}
184+
}

test/lr_scheduler/test_helpers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ void Check(bool cond, const char *expr, int line) {
3232
#define ASSERT_FLOAT_EQ(a, b) Check(FloatNear((a), (b)), #a " == " #b, __LINE__)
3333
#define ASSERT_FLOAT_NEAR(a, b, eps) Check(FloatNear((a), (b), (eps)), #a "" #b, __LINE__)
3434

35-
} // namespace
35+
} // namespace

test/lr_scheduler/test_lambda_lr.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ void TestIdentityLambda() {
1414
.type = "lambda",
1515
.lambda_fn = [](int64_t) { return 1.0f; },
1616
});
17-
// 构造器内 Step() → last_step_=0, lr = 0.1 * 1.0 = 0.1
17+
// Step() → last_step_=0, lr = 0.1 * 1.0 = 0.1
1818
ASSERT_TRUE(sched->LastStep() == 0);
1919
ASSERT_FLOAT_NEAR(sched->GetLR(), kBaseLR, kEps);
2020
ASSERT_FLOAT_NEAR(opt->GetLearningRate(), kBaseLR, kEps);
@@ -124,4 +124,4 @@ int main(int argc, char *argv[]) {
124124
std::cout << g_fail_count << " test(s) FAILED" << std::endl;
125125
}
126126
return g_fail_count > 0 ? 1 : 0;
127-
}
127+
}

test/lr_scheduler/test_linear_lr.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,4 @@ int main(int argc, char *argv[]) {
137137
std::cout << g_fail_count << " test(s) FAILED" << std::endl;
138138
}
139139
return g_fail_count > 0 ? 1 : 0;
140-
}
140+
}

test/lr_scheduler/test_lr_scheduler.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ void TestLinearDecay() {
103103
ASSERT_FLOAT_EQ(opt->GetLearningRate(), 0.05f);
104104
}
105105

106-
// T4: State → LoadState 往返一致性。
106+
// T4: State → LoadState
107107
void TestStateRoundTrip() {
108108
std::cout << "[T4] TestStateRoundTrip" << std::endl;
109109
constexpr int64_t kTotalSteps = 20;
@@ -115,7 +115,7 @@ void TestStateRoundTrip() {
115115
StateDict saved = sched->State();
116116

117117
ASSERT_TRUE(saved.count("last_step") == 1);
118-
ASSERT_TRUE(saved.count("current_lr") == 1);
118+
ASSERT_TRUE(saved.count("recover_lr") == 1);
119119
ASSERT_TRUE(saved.count("base_lr") == 1);
120120

121121
auto opt2 = MakeDummyOptimizer(kBaseLR);
@@ -175,4 +175,4 @@ int main(int argc, char *argv[]) {
175175
std::cout << "========================================" << std::endl;
176176

177177
return g_fail_count > 0 ? 1 : 0;
178-
}
178+
}

0 commit comments

Comments
 (0)