InfiniTensor
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 3 additions & 0 deletions
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
Lines changed: 19 additions & 23 deletions
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
Lines changed: 19 additions & 23 deletions
diff --git a/infini_train/include/lr_scheduler.h b/infini_train/include/lr_scheduler.h
Lines changed: 9 additions & 24 deletions
diff --git a/infini_train/src/lr_scheduler.cc b/infini_train/src/lr_scheduler.cc
Lines changed: 68 additions & 54 deletions
@@ -225,5 +225,8 @@ link_infini_train_exe(test_sequential_lr)
 add_executable(test_chained_lr test/lr_scheduler/test_chained_lr.cc)
 link_infini_train_exe(test_chained_lr)
 
+add_executable(test_training_lr_scheduler test/lr_scheduler/test_training_lr_scheduler.cc)
+link_infini_train_exe(test_training_lr_scheduler)
+
 add_executable(test_lr_scheduler_validation test/lr_scheduler/test_lr_scheduler_validation.cc)
 link_infini_train_exe(test_lr_scheduler_validation)
@@ -13,8 +13,8 @@
 #include "infini_train/include/core/runtime/device_guard.h"
 #include "infini_train/include/dataloader.h"
 #include "infini_train/include/device.h"
-#include "infini_train/include/nn/lora/lora_utils.h"
 #include "infini_train/include/lr_scheduler.h"
+#include "infini_train/include/nn/lora/lora_utils.h"
 #include "infini_train/include/nn/modules/loss.h"
 #include "infini_train/include/nn/modules/module.h"
 #include "infini_train/include/nn/parallel/ddp/distributed_data_parallel.h"
@@ -55,18 +55,14 @@ DEFINE_uint32(num_iteration, 10, "number of iterations to run");
 DEFINE_uint32(freq_generate_txt, 10, "frequency of text generation");
 DEFINE_uint32(text_length, 64, "the length of the generated text");
 // optimization
-DEFINE_double(learning_rate, 1e-4, "learning rate warmup iterations");
+DEFINE_double(learning_rate, 1e-4, "Peak learning rate."); // passed to the scheduler as TrainingLRSchedulerConfig::lr
 DEFINE_bool(use_distributed_optimizer, false, "Whether to enable DistributedOptimizer(only take effects when DP>1)");
 // lr scheduler
-DEFINE_string(lr_scheduler, "none", "Learning rate scheduler type: none|constant|step|linear");
-DEFINE_int64(warmup_steps, 0, "Number of linear warmup steps (0 = no warmup)");
-DEFINE_double(warmup_start_factor, 0.333333, "Starting learning rate factor for linear warmup (multiplied by base LR)");
-DEFINE_double(warmup_end_factor, 1.0, "Ending learning rate factor for linear warmup (multiplied by base LR)");
-DEFINE_int64(step_size, 30, "StepLR: period of learning rate decay");
-DEFINE_double(gamma, 0.1, "StepLR: multiplicative factor of lr decay");
-DEFINE_double(start_factor, 0.333333, "LinearLR: starting multiplicative factor");
-DEFINE_double(end_factor, 1.0, "LinearLR: ending multiplicative factor");
-DEFINE_int64(lr_total_iters, 5, "ConstantLR/LinearLR: total iterations for the scheduler");
+DEFINE_double(min_lr, 0.0, "Minimum learning rate."); // must be >= 0 and <= the peak LR (checked in CreateLRScheduler)
+DEFINE_string(lr_decay_style, "constant", "LR decay style: none|constant|linear|cosine|inverse-square-root"); // none => CreateLRScheduler returns nullptr
+DEFINE_int64(lr_warmup_iters, 0, "Number of linear warmup iterations."); // 0 skips warmup; must be < the effective decay iterations
+DEFINE_double(lr_warmup_init, 0.0, "Initial learning rate at the start of warmup."); // absolute LR (not a factor); must lie in [0, peak LR]
+DEFINE_int64(lr_decay_iters, 0, "Number of iterations to decay LR over (0 = num_iteration)."); // any value <= 0 falls back to num_iteration
 // evaluation
 DEFINE_uint32(val_loss_every, 0, "every how many steps to evaluate val loss?");
 DEFINE_uint32(sample_every, 0, "how often to sample from the model?");
@@ -109,6 +105,8 @@ constexpr char kDeviceCPU[] = "cpu";
 constexpr char kDeviceCUDA[] = "cuda";
 constexpr char kDtypeFP32[] = "float32";
 constexpr char kDtypeBF16[] = "bfloat16";
+const std::unordered_set<std::string> kSupportedLRDecayStyles // values accepted for the lr_decay_style flag
+    = {"none", "constant", "linear", "cosine", "inverse-square-root"}; // same names CreateLRScheduler handles
 
 //
 const std::unordered_map<std::string, GPT2Config> kModelToConfigs = {
@@ -129,6 +127,8 @@ const std::unordered_map<std::string, GPT2::ModelType> kStrToModelType = {
 DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
 DEFINE_validator(device,
                  [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
+DEFINE_validator(lr_decay_style,
+                 [](const char *, const std::string &value) { return kSupportedLRDecayStyles.contains(value); }); // accept only names listed in kSupportedLRDecayStyles
 
 void Train(const nn::parallel::Rank &rank) {
     using namespace nn::parallel;
@@ -321,18 +321,14 @@ void Train(const nn::parallel::Rank &rank) {
         optimizer = optimizer_creator(params_to_optimize);
     }
 
-    LRSchedulerConfig sched_config;
-    sched_config.type = FLAGS_lr_scheduler;
-    sched_config.warmup_steps = FLAGS_warmup_steps;
-    sched_config.warmup_start_factor = static_cast<float>(FLAGS_warmup_start_factor);
-    sched_config.warmup_end_factor = static_cast<float>(FLAGS_warmup_end_factor);
-    sched_config.step_size = FLAGS_step_size;
-    sched_config.step_gamma = static_cast<float>(FLAGS_gamma);
-    sched_config.linear_start_factor = static_cast<float>(FLAGS_start_factor);
-    sched_config.linear_end_factor = static_cast<float>(FLAGS_end_factor);
-    sched_config.constant_factor = static_cast<float>(FLAGS_start_factor); // reuses the LinearLR start-factor flag
-    sched_config.constant_total_iters = FLAGS_lr_total_iters;
-    sched_config.linear_total_iters = FLAGS_lr_total_iters;
+    const int64_t lr_decay_iters = FLAGS_lr_decay_iters > 0 ? FLAGS_lr_decay_iters : FLAGS_num_iteration; // non-positive flag => decay over num_iteration
+    TrainingLRSchedulerConfig sched_config;
+    sched_config.lr = static_cast<float>(FLAGS_learning_rate); // peak LR
+    sched_config.min_lr = static_cast<float>(FLAGS_min_lr);
+    sched_config.lr_decay_style = FLAGS_lr_decay_style;
+    sched_config.lr_decay_iters = lr_decay_iters; // resolved value, not the raw flag
+    sched_config.lr_warmup_iters = FLAGS_lr_warmup_iters;
+    sched_config.lr_warmup_init = static_cast<float>(FLAGS_lr_warmup_init);
     auto scheduler = CreateLRScheduler(optimizer, sched_config); // nullptr when lr_decay_style is none
 
     auto train_iter = train_loader.begin();
 
@@ -11,8 +11,8 @@
 #include "infini_train/include/core/runtime/device_guard.h"
 #include "infini_train/include/dataloader.h"
 #include "infini_train/include/device.h"
-#include "infini_train/include/nn/lora/lora_utils.h"
 #include "infini_train/include/lr_scheduler.h"
+#include "infini_train/include/nn/lora/lora_utils.h"
 #include "infini_train/include/nn/modules/loss.h"
 #include "infini_train/include/nn/modules/module.h"
 #include "infini_train/include/nn/parallel/ddp/distributed_data_parallel.h"
@@ -54,18 +54,14 @@ DEFINE_uint32(num_iteration, 10, "number of iterations to run");
 DEFINE_uint32(freq_generate_txt, 10, "frequency of text generation");
 DEFINE_uint32(text_length, 64, "the length of the generated text");
 // optimization
-DEFINE_double(learning_rate, 1e-5, "learning rate warmup iterations");
+DEFINE_double(learning_rate, 1e-5, "Peak learning rate."); // passed to the scheduler as TrainingLRSchedulerConfig::lr
 DEFINE_bool(use_distributed_optimizer, false, "Whether to enable DistributedOptimizer(only take effects when DP>1)");
 // lr scheduler
-DEFINE_string(lr_scheduler, "none", "Learning rate scheduler type: none|constant|step|linear");
-DEFINE_int64(warmup_steps, 0, "Number of linear warmup steps (0 = no warmup)");
-DEFINE_double(warmup_start_factor, 0.333333, "Starting learning rate factor for linear warmup (multiplied by base LR)");
-DEFINE_double(warmup_end_factor, 1.0, "Ending learning rate factor for linear warmup (multiplied by base LR)");
-DEFINE_int64(step_size, 30, "StepLR: period of learning rate decay");
-DEFINE_double(gamma, 0.1, "StepLR: multiplicative factor of lr decay");
-DEFINE_double(start_factor, 0.333333, "LinearLR: starting multiplicative factor");
-DEFINE_double(end_factor, 1.0, "LinearLR: ending multiplicative factor");
-DEFINE_int64(lr_total_iters, 5, "ConstantLR/LinearLR: total iterations for the scheduler");
+DEFINE_double(min_lr, 0.0, "Minimum learning rate."); // must be >= 0 and <= the peak LR (checked in CreateLRScheduler)
+DEFINE_string(lr_decay_style, "constant", "LR decay style: none|constant|linear|cosine|inverse-square-root"); // none => CreateLRScheduler returns nullptr
+DEFINE_int64(lr_warmup_iters, 0, "Number of linear warmup iterations."); // 0 skips warmup; must be < the effective decay iterations
+DEFINE_double(lr_warmup_init, 0.0, "Initial learning rate at the start of warmup."); // absolute LR (not a factor); must lie in [0, peak LR]
+DEFINE_int64(lr_decay_iters, 0, "Number of iterations to decay LR over (0 = num_iteration)."); // any value <= 0 falls back to num_iteration
 // evaluation
 DEFINE_uint32(val_loss_every, 0, "every how many steps to evaluate val loss?");
 DEFINE_uint32(sample_every, 0, "how often to sample from the model?");
@@ -104,11 +100,15 @@ constexpr char kDeviceCPU[] = "cpu";
 constexpr char kDeviceCUDA[] = "cuda";
 constexpr char kDtypeFP32[] = "float32";
 constexpr char kDtypeBF16[] = "bfloat16";
+const std::unordered_set<std::string> kSupportedLRDecayStyles // values accepted for the lr_decay_style flag
+    = {"none", "constant", "linear", "cosine", "inverse-square-root"}; // same names CreateLRScheduler handles
 } // namespace
 
 DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); });
 DEFINE_validator(device,
                  [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; });
+DEFINE_validator(lr_decay_style,
+                 [](const char *, const std::string &value) { return kSupportedLRDecayStyles.contains(value); }); // accept only names listed in kSupportedLRDecayStyles
 
 void Train(const nn::parallel::Rank &rank) {
     using namespace nn::parallel;
@@ -293,18 +293,14 @@ void Train(const nn::parallel::Rank &rank) {
         optimizer = optimizer_creator(params_to_optimize);
     }
 
-    LRSchedulerConfig sched_config;
-    sched_config.type = FLAGS_lr_scheduler;
-    sched_config.warmup_steps = FLAGS_warmup_steps;
-    sched_config.warmup_start_factor = static_cast<float>(FLAGS_warmup_start_factor);
-    sched_config.warmup_end_factor = static_cast<float>(FLAGS_warmup_end_factor);
-    sched_config.step_size = FLAGS_step_size;
-    sched_config.step_gamma = static_cast<float>(FLAGS_gamma);
-    sched_config.linear_start_factor = static_cast<float>(FLAGS_start_factor);
-    sched_config.linear_end_factor = static_cast<float>(FLAGS_end_factor);
-    sched_config.constant_factor = static_cast<float>(FLAGS_start_factor); // reuses the LinearLR start-factor flag
-    sched_config.constant_total_iters = FLAGS_lr_total_iters;
-    sched_config.linear_total_iters = FLAGS_lr_total_iters;
+    const int64_t lr_decay_iters = FLAGS_lr_decay_iters > 0 ? FLAGS_lr_decay_iters : FLAGS_num_iteration; // non-positive flag => decay over num_iteration
+    TrainingLRSchedulerConfig sched_config;
+    sched_config.lr = static_cast<float>(FLAGS_learning_rate); // peak LR
+    sched_config.min_lr = static_cast<float>(FLAGS_min_lr);
+    sched_config.lr_decay_style = FLAGS_lr_decay_style;
+    sched_config.lr_decay_iters = lr_decay_iters; // resolved value, not the raw flag
+    sched_config.lr_warmup_iters = FLAGS_lr_warmup_iters;
+    sched_config.lr_warmup_init = static_cast<float>(FLAGS_lr_warmup_init);
     auto scheduler = CreateLRScheduler(optimizer, sched_config); // nullptr when lr_decay_style is none
 
     auto train_iter = train_loader.begin();
 
@@ -16,29 +16,13 @@ class Optimizer;
 using StateValue = std::variant<int64_t, float, double, std::string, std::vector<float>>;
 using StateDict = std::unordered_map<std::string, StateValue>;
 
-struct LRSchedulerConfig {
-    std::string type = "none";
-    // ConstantLR
-    float constant_factor = 1.0f / 3.0f;
-    int constant_total_iters = 5;
-    // StepLR
-    int64_t step_size = 10;
-    float step_gamma = 0.1f;
-    // LinearLR
-    float linear_start_factor = 1.0f / 3.0f;
-    float linear_end_factor = 1.0f;
-    int linear_total_iters = 5;
-    // LambdaLR
-    std::function<float(int64_t)> lambda_fn = nullptr;
-    // SequentialLR
-    std::vector<LRSchedulerConfig> sequential_configs;
-    std::vector<int64_t> sequential_milestones;
-    // ChainedScheduler
-    std::vector<LRSchedulerConfig> chained_configs;
-    // warmup
-    int64_t warmup_steps = 0;
-    float warmup_start_factor = 1.0f / 3.0f;
-    float warmup_end_factor = 1.0f;
+struct TrainingLRSchedulerConfig { // Inputs consumed by CreateLRScheduler; the constraints below are enforced there with CHECKs.
+    std::string lr_decay_style = "constant"; // one of: none, constant, linear, cosine, inverse-square-root
+    float lr = 0.0f; // peak LR; 0 makes CreateLRScheduler use optimizer->GetLearningRate() instead
+    float min_lr = 0.0f; // LR floor; must satisfy 0 <= min_lr <= peak LR
+    int64_t lr_decay_iters = 1; // decay horizon counted with warmup included; must be > 0 and > lr_warmup_iters
+    int64_t lr_warmup_iters = 0; // linear warmup length; 0 disables warmup
+    float lr_warmup_init = 0.0f; // absolute LR at the first warmup step; must lie in [0, peak LR]
 };
 
 class LRScheduler {
@@ -81,7 +65,8 @@ class LRScheduler {
     bool is_initial_ = false;
 };
 
-std::shared_ptr<LRScheduler> CreateLRScheduler(std::shared_ptr<Optimizer> optimizer, const LRSchedulerConfig &config);
+std::shared_ptr<LRScheduler> CreateLRScheduler(std::shared_ptr<Optimizer> optimizer,
+                                               const TrainingLRSchedulerConfig &config); // returns nullptr when config.lr_decay_style is none
 
 namespace lr_schedulers {
 
 
@@ -1,71 +1,87 @@
 #include "infini_train/include/lr_scheduler.h"
 
+#include <algorithm>
+#include <cmath>
+#include <numbers>
+#include <utility>
+
 #include "glog/logging.h"
 
 #include "infini_train/include/optimizer.h"
 
 namespace infini_train {
 
-std::shared_ptr<LRScheduler> CreateLRScheduler(std::shared_ptr<Optimizer> optimizer, const LRSchedulerConfig &config) {
-    if (config.type == "none") {
+std::shared_ptr<LRScheduler> CreateLRScheduler(std::shared_ptr<Optimizer> optimizer,
+                                               const TrainingLRSchedulerConfig &config) { // builds an optional linear warmup followed by the configured decay style
+    if (config.lr_decay_style == "none") { // no scheduler requested; none of the checks below run
         return nullptr;
     }
 
-    auto create_main = [&](std::shared_ptr<Optimizer> opt) -> std::shared_ptr<LRScheduler> {
-        if (config.type == "constant") {
-            return LRScheduler::Create<lr_schedulers::ConstantLR>(opt, config.constant_factor,
-                                                                  config.constant_total_iters);
-        }
-        if (config.type == "step") {
-            return LRScheduler::Create<lr_schedulers::StepLR>(opt, config.step_size, config.step_gamma);
-        }
-        if (config.type == "linear") {
-            return LRScheduler::Create<lr_schedulers::LinearLR>(opt, config.linear_start_factor,
-                                                                config.linear_end_factor, config.linear_total_iters);
-        }
-        if (config.type == "lambda") {
-            return LRScheduler::Create<lr_schedulers::LambdaLR>(opt, config.lambda_fn);
-        }
-        if (config.type == "sequential") {
-            std::vector<std::shared_ptr<LRScheduler>> schedulers;
-            std::vector<int64_t> milestones = config.sequential_milestones;
-            for (const auto &sub_config : config.sequential_configs) {
-                auto sub_sched = CreateLRScheduler(opt, sub_config);
-                if (sub_sched) {
-                    schedulers.push_back(sub_sched);
+    CHECK(optimizer) << "CreateLRScheduler: optimizer must not be null.";
+    const float max_lr = config.lr != 0.0f ? config.lr : optimizer->GetLearningRate(); // lr == 0 => take the peak LR from the optimizer
+    CHECK_GT(max_lr, 0.0f) << "CreateLRScheduler: max_lr must be > 0.";
+    CHECK_GE(config.lr_warmup_init, 0.0f) << "CreateLRScheduler: lr_warmup_init must be >= 0.";
+    CHECK_GE(config.min_lr, 0.0f) << "CreateLRScheduler: min_lr must be >= 0.";
+    CHECK_GE(max_lr, config.min_lr) << "CreateLRScheduler: max_lr must be >= min_lr.";
+    CHECK_LE(config.lr_warmup_init, max_lr) << "CreateLRScheduler: lr_warmup_init must be <= max_lr.";
+    CHECK_GE(config.lr_warmup_iters, 0) << "CreateLRScheduler: lr_warmup_iters must be >= 0.";
+    CHECK_GT(config.lr_decay_iters, 0) << "CreateLRScheduler: lr_decay_iters must be > 0.";
+    CHECK_LT(config.lr_warmup_iters, config.lr_decay_iters)
+        << "CreateLRScheduler: lr_warmup_iters must be < lr_decay_iters.";
+    CHECK(config.lr_decay_style == "constant" || config.lr_decay_style == "linear" || config.lr_decay_style == "cosine"
+          || config.lr_decay_style == "inverse-square-root")
+        << "CreateLRScheduler: unsupported lr_decay_style: " << config.lr_decay_style;
+
+    std::shared_ptr<LRScheduler> main_scheduler; // NOTE(review): every factor below is a fraction of max_lr; confirm the schedulers' base LR equals max_lr when config.lr differs from the optimizer's LR
+    const int64_t decay_iters_after_warmup = config.lr_decay_iters - config.lr_warmup_iters; // length of the post-warmup phase; > 0 because of the CHECK_LT above
+    if (config.lr_decay_style == "constant") {
+        main_scheduler = LRScheduler::Create<lr_schedulers::LambdaLR>(optimizer, [](int64_t) { return 1.0f; }); // multiplier is always 1
+    } else if (config.lr_decay_style == "linear") {
+        main_scheduler = LRScheduler::Create<lr_schedulers::LinearLR>(optimizer, 1.0f, config.min_lr / max_lr, // start factor 1, end factor min_lr / max_lr
+                                                                      decay_iters_after_warmup);
+    } else if (config.lr_decay_style == "cosine") {
+        main_scheduler = LRScheduler::Create<lr_schedulers::LambdaLR>(
+            optimizer, [max_lr, min_lr = config.min_lr, decay_iters_after_warmup](int64_t step) { // step presumably counts from the end of warmup - TODO confirm against SequentialLR
+                if (step > decay_iters_after_warmup) { // past the decay horizon: hold at the min_lr factor
+                    return min_lr / max_lr;
                 }
-            }
-            return LRScheduler::Create<lr_schedulers::SequentialLR>(opt, schedulers, milestones);
-        }
-        if (config.type == "chained") {
-            std::vector<std::shared_ptr<LRScheduler>> schedulers;
-            for (const auto &sub_config : config.chained_configs) {
-                auto sub_sched = CreateLRScheduler(opt, sub_config);
-                if (sub_sched) {
-                    schedulers.push_back(sub_sched);
+                const float decay_ratio = static_cast<float>(step) / static_cast<float>(decay_iters_after_warmup);
+                CHECK_GE(decay_ratio, 0.0f) << "CreateLRScheduler: decay "
+                                               "ratio must be >= 0.";
+                CHECK_LE(decay_ratio, 1.0f) << "CreateLRScheduler: decay "
+                                               "ratio must be <= 1.";
+                const float coeff = 0.5f * (std::cos(std::numbers::pi_v<float> * decay_ratio) + 1.0f); // 1 at ratio 0, 0 at ratio 1
+                return (min_lr + coeff * (max_lr - min_lr)) / max_lr; // target LR expressed as a fraction of max_lr
+            });
+    } else if (config.lr_decay_style == "inverse-square-root") {
+        main_scheduler = LRScheduler::Create<lr_schedulers::LambdaLR>(
+            optimizer, [max_lr, min_lr = config.min_lr, lr_warmup_iters = config.lr_warmup_iters,
+                        lr_decay_iters = config.lr_decay_iters](int64_t step) {
+                const int64_t global_step = step + lr_warmup_iters; // presumably converts the post-warmup step back to a global iteration index - TODO confirm
+                if (global_step > lr_decay_iters) { // past the decay horizon: hold at the min_lr factor
+                    return min_lr / max_lr;
                 }
-            }
-            return LRScheduler::Create<lr_schedulers::ChainedScheduler>(opt, schedulers);
-        }
-        LOG(FATAL) << "Unsupported LR scheduler type: " << config.type;
-        return nullptr;
-    };
-
-    if (config.warmup_steps <= 0) {
-        return create_main(optimizer);
+                const auto warmup = static_cast<float>(std::max<int64_t>(lr_warmup_iters, 1)); // clamp to 1 so the no-warmup case still scales by sqrt(1)
+                const auto current = static_cast<float>(std::max<int64_t>(global_step, 1)); // clamp to 1 to avoid dividing by sqrt(0)
+                return std::max(min_lr, max_lr * std::sqrt(warmup) / std::sqrt(current)) / max_lr; // inverse-sqrt decay floored at min_lr, as a fraction of max_lr
+            });
     }
 
-    auto warmup_scheduler = LRScheduler::Create<lr_schedulers::LinearLR>(optimizer,
-                                                                         /*start_factor=*/config.warmup_start_factor,
-                                                                         /*end_factor=*/config.warmup_end_factor,
-                                                                         /*total_iters=*/config.warmup_steps);
-
-    auto main_scheduler = create_main(optimizer);
+    CHECK(main_scheduler) << "CreateLRScheduler: failed to create scheduler.";
+    if (config.lr_warmup_iters == 0) { // no warmup: the decay scheduler is returned on its own
+        return main_scheduler;
+    }
 
+    auto warmup_scheduler = LRScheduler::Create<lr_schedulers::LambdaLR>(
+        optimizer,
+        [lr_warmup_init = config.lr_warmup_init, max_lr, lr_warmup_iters = config.lr_warmup_iters](int64_t step) {
+            const float warmup_ratio = static_cast<float>(step) / static_cast<float>(lr_warmup_iters); // lr_warmup_iters > 0 here: the zero case returned above
+            return (lr_warmup_init + (max_lr - lr_warmup_init) * warmup_ratio) / max_lr; // linear ramp from lr_warmup_init to max_lr, as a fraction of max_lr
+        });
     return LRScheduler::Create<lr_schedulers::SequentialLR>( // warmup scheduler first, then the decay scheduler
-        optimizer, std::vector<std::shared_ptr<LRScheduler>>{warmup_scheduler, main_scheduler},
-        std::vector<int64_t>{config.warmup_steps});
-};
+        std::move(optimizer), std::vector<std::shared_ptr<LRScheduler>>{warmup_scheduler, main_scheduler}, // last use of optimizer, so it is moved
+        std::vector<int64_t>{config.lr_warmup_iters}); // single milestone at lr_warmup_iters
+}
 
 LRScheduler::LRScheduler(std::shared_ptr<Optimizer> optimizer, int64_t last_step)
     : optimizer_(std::move(optimizer)), last_step_(last_step), base_lr_(0.0f) {
@@ -310,9 +326,7 @@ ChainedScheduler::ChainedScheduler(std::shared_ptr<Optimizer> optimizer,
     }
 }
 
-void ChainedScheduler::InitialStep() {
-    last_step_ = 0;
-}
+void ChainedScheduler::InitialStep() { last_step_ = 0; } // only resets the step counter; this hunk is a formatting-only change
 
 void ChainedScheduler::Step() {
     ++last_step_;