
Commit c614ec6 (parent: 98dc840)

fix: broadcast lora_A init from TP rank 0 to ensure consistent replicated weights

3 files changed: 53 additions & 23 deletions

infini_train/src/nn/lora/lora_parallel_linear.cc
Lines changed: 26 additions & 9 deletions

@@ -11,6 +11,7 @@
 #include "infini_train/include/nn/init.h"
 #include "infini_train/include/nn/modules/linear.h"
 #include "infini_train/include/nn/parallel/global.h"
+#include "infini_train/include/nn/parallel/process_group.h"
 #include "infini_train/include/nn/parallel/tensor_parallel.h"
 #include "infini_train/include/nn/parallel/utils.h"
 #include "infini_train/include/tensor.h"
@@ -89,22 +90,38 @@ LoRAColumnParallelLinear::LoRAColumnParallelLinear(std::shared_ptr<parallel::Col
 }
 
 void LoRAColumnParallelLinear::InitLoRAWeights() {
-    // LoRA weights stored directly in parameters_
-    // Following PEFT pattern conceptually:
-    // lora_A: [rank, in_features] - replicated
+    // lora_A: [rank, in_features] - replicated across TP ranks
     // lora_B: [out_features_per_partition, rank] - sharded like base weight
-
-    // lora_A: [rank, in_features]
     parameters_[kParamLoraAName]
         = std::make_shared<Tensor>(std::vector<int64_t>{config_.rank, in_features_}, DataType::kFLOAT32, device_)
               ->RequiresGrad();
-    if (config_.use_kaiming_a) {
-        init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+
+    if (parallel::global::GetTensorParallelSize() > 1) {
+        const auto global_rank = device_.Rank().GlobalRank();
+        auto *tp_group = parallel::ProcessGroupFactory::Instance(device_.type())
+                             ->Get(parallel::GetTensorParallelProcessGroupName(global_rank));
+        const int tp_rank = tp_group->GetGroupRank(global_rank);
+
+        // Only TP rank 0 generates random values; others zero-init.
+        // AllReduce(sum) then broadcasts rank-0's values to all TP ranks.
+        if (tp_rank == 0) {
+            if (config_.use_kaiming_a) {
+                init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+            } else {
+                init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+            }
+        } else {
+            init::Zeros(parameters_[kParamLoraAName]);
+        }
+        tp_group->AllReduce(parameters_[kParamLoraAName]);
     } else {
-        init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+        if (config_.use_kaiming_a) {
+            init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+        } else {
+            init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+        }
    }
 
-    // lora_B: [out_per_partition, rank] - sharded like base weight
     parameters_[kParamLoraBName]
         = std::make_shared<Tensor>(std::vector<int64_t>{out_features_per_partition_, config_.rank}, DataType::kFLOAT32,
                                    device_)
scripts/run_models_and_profile.bash
Lines changed: 5 additions & 2 deletions

@@ -154,8 +154,9 @@ run_and_log() {
             > "$log_path"
     fi
 
-    # Write the current run command to the log
-    echo "[COMMAND] $cmd" >> "$log_path"
+    # Write the current run command to the log (expand $LORA_WEIGHTS_DIR)
+    local expanded_cmd="${cmd//\$LORA_WEIGHTS_DIR/$LORA_WEIGHTS_DIR}"
+    echo "[COMMAND] $expanded_cmd" >> "$log_path"
 
     # Run the command and append both stdout and stderr to the log file
    if ! eval "$cmd" >> "$log_path" 2>&1; then
@@ -267,10 +268,12 @@ for ((id=0; id<num_builds; ++id)); do
        arg_str="$(args_string_for_test "$gi" "$ti")"
 
        # gpt2
+       LORA_WEIGHTS_DIR="$GPT2_LORA_WEIGHTS_DIR"
        gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
        run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
 
        # llama3
+       LORA_WEIGHTS_DIR="$LLAMA3_LORA_WEIGHTS_DIR"
        llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
        run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
 done
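
The expansion `${cmd//\$LORA_WEIGHTS_DIR/$LORA_WEIGHTS_DIR}` is bash pattern substitution: every literal occurrence of the string `$LORA_WEIGHTS_DIR` inside `$cmd` is replaced with the variable's current value. This matters because the config below keeps `$LORA_WEIGHTS_DIR` as an unexpanded placeholder (presumably threaded into `$cmd` via `args_string_for_test`): `eval "$cmd"` resolves it at run time against whichever per-model value was just assigned, but a plain `echo "$cmd"` would log the placeholder instead of the actual weights path used by that run.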

scripts/test_config.json
Lines changed: 22 additions & 12 deletions

@@ -5,6 +5,8 @@
     "GPT2_LLMC_FILEPATH": "/data/shared/InfiniTrain-dev/data/llmc/gpt2/gpt2_124M.bin",
     "LLAMA3_INPUT_BIN": "/data/shared/InfiniTrain-dev/data/llmc/llama3/tinyshakespeare/tiny_shakespeare_train.bin",
     "LLAMA3_LLMC_FILEPATH": "/data/shared/InfiniTrain-dev/data/llmc/llama3/llama3.2_1B_fp32.bin",
+    "LLAMA3_LORA_WEIGHTS_DIR": "/data/shared/InfiniTrain-dev/data/llmc/llama3/llama3.2_1B_lora_weights_rank8_alpha16.bin",
+    "GPT2_LORA_WEIGHTS_DIR": "/data/shared/InfiniTrain-dev/data/llmc/gpt2/gpt2_124M_lora_weights_rank8_alpha16.bin",
     "PROFILE_LOG_DIR": "./profile_logs",
     "LOG_DIR": "./logs",
     "COMPARE_LOG_DIR": ""
@@ -313,7 +315,8 @@
             "dtype": "float32",
             "lora_rank": 8,
             "lora_alpha": 16.0,
-            "lora_target_modules": "c_attn,attn.c_proj"
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
         }
     },
     {
@@ -322,7 +325,8 @@
             "dtype": "bfloat16",
             "lora_rank": 8,
             "lora_alpha": 16.0,
-            "lora_target_modules": "c_attn,attn.c_proj"
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
         }
     },
     {
@@ -334,7 +338,8 @@
            "total_batch_size": 5120,
            "lora_rank": 8,
            "lora_alpha": 16.0,
-            "lora_target_modules": "c_attn,attn.c_proj"
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
        }
    },
    {
@@ -346,7 +351,8 @@
            "total_batch_size": 5120,
            "lora_rank": 8,
            "lora_alpha": 16.0,
-            "lora_target_modules": "c_attn,attn.c_proj"
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
        }
    },
    {
@@ -359,7 +365,8 @@
            "total_batch_size": 5120,
            "lora_rank": 8,
            "lora_alpha": 16.0,
-            "lora_target_modules": "c_attn,attn.c_proj"
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
        }
    },
    {
@@ -372,7 +379,8 @@
            "total_batch_size": 5120,
            "lora_rank": 8,
            "lora_alpha": 16.0,
-            "lora_target_modules": "c_attn,attn.c_proj"
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
        }
    },
    {
@@ -384,9 +392,10 @@
            "batch_size": 40,
            "total_batch_size": 5120,
            "tensor_parallel": 4,
-            "lora_rank": 4,
-            "lora_alpha": 8.0,
-            "lora_target_modules": "c_attn,c_fc,c_proj"
+            "lora_rank": 8,
+            "lora_alpha": 16.0,
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
        }
    },
    {
@@ -398,9 +407,10 @@
            "batch_size": 40,
            "total_batch_size": 5120,
            "tensor_parallel": 4,
-            "lora_rank": 16,
-            "lora_alpha": 32.0,
-            "lora_target_modules": "c_attn,c_fc,c_proj"
+            "lora_rank": 8,
+            "lora_alpha": 16.0,
+            "lora_target_modules": "c_attn,attn.c_proj",
+            "lora_load_path": "$LORA_WEIGHTS_DIR"
        }
    },
    {
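
Note that the last two hunks also normalize the tensor-parallel tests from rank 4 / alpha 8 and rank 16 / alpha 32 down to rank 8 / alpha 16, with the same target modules as the other tests. This is presumably forced by the new `lora_load_path`: the pre-generated weights files are named `...rank8_alpha16.bin`, and loading fixed LoRA weights only makes sense when each test's rank and target modules match the saved tensors.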
