Commit f918ae3

refactor: in-place BroadCast + ScatterFromRank for TP-aware init
Replace BroadCast's allocate-then-return signature with an in-place form (void return) that takes tensors pre-grouped per local device. This lets the root rank broadcast directly out of the source tensor, with no self-copy and no extra allocation. Add ScatterFromRank as the multi-process counterpart to Scatter, for the same reason. Use both in LoRA*ParallelLinear so that TP rank-0 initialization no longer pays a tp_size-fold communication or scratch-allocation cost.
1 parent c614ec6 commit f918ae3

4 files changed

Lines changed: 154 additions & 41 deletions
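
At the call sites, the change boils down to two new collective calls, condensed here from the LoRA diffs below (the surrounding setup, process-group lookup and parameter allocation, is exactly what the diff shows and is not repeated):

// LoRAColumnParallelLinear: replicate rank 0's random lora_A to all TP ranks.
// Before: zero-init on ranks != 0, then tp_group->AllReduce(lora_A) to sum-broadcast.
tp_group->BroadCast({parameters_[kParamLoraAName]}, /*root_group_rank=*/0);

// LoRARowParallelLinear: rank 0 fills full_lora_A [rank, in_features]; every rank
// receives only its dim=1 shard into its pre-allocated per-partition parameter.
tp_group->ScatterFromRank({parameters_[kParamLoraAName]}, full_lora_A, /*dim=*/1, /*src_group_rank=*/0);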

infini_train/include/nn/parallel/process_group.h

Lines changed: 8 additions & 2 deletions
@@ -59,15 +59,21 @@ class ProcessGroup {
                                           bool async_op = false) const;

     // Legacy communication APIs (Single-stream)
-    virtual std::vector<std::shared_ptr<Tensor>>
-    BroadCast(const std::vector<std::shared_ptr<Tensor>> &input_tensors) const;
+    // In-place broadcast of tensors grouped as [device0 tensors..., device1 tensors...].
+    // Pass root_group_rank in multi-process mode; -1 infers it from tensors[0].
+    virtual void BroadCast(const std::vector<std::shared_ptr<Tensor>> &tensors, int root_group_rank = -1) const;

     virtual std::vector<std::shared_ptr<Tensor>>
     ReduceAddCoalesced(const std::vector<std::vector<std::shared_ptr<Tensor>>> &grads, Device destination) const;

     virtual std::vector<std::shared_ptr<Tensor>> Scatter(const std::shared_ptr<Tensor> &tensor,
                                                          std::vector<Device> devices, int64_t dim) const;

+    // Multi-process-friendly in-place scatter. Outputs are this process's local shard(s);
+    // full tensor data is read only on src_group_rank.
+    virtual void ScatterFromRank(const std::vector<std::shared_ptr<Tensor>> &outputs,
+                                 const std::shared_ptr<Tensor> &tensor, int64_t dim, int src_group_rank) const;
+
     virtual std::shared_ptr<Tensor> Gather(const std::vector<std::shared_ptr<Tensor>> &tensors, Device destination,
                                            int64_t dim) const;
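
The grouping convention in the new BroadCast signature matters for callers with more than one local device: tensors are laid out device-major, so tensor j for local device i sits at flat index i * num_tensors_per_device + j. A minimal stand-alone sketch of that indexing (plain C++, no infini_train types; the device and tensor names are illustrative only):

#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Hypothetical example: 2 local devices, 3 tensors to broadcast per device.
    const std::vector<std::string> devices = {"cuda:0", "cuda:1"};
    const std::vector<std::string> names = {"lora_A", "lora_B", "bias"};
    const size_t num_tensors_per_device = names.size();

    // Device-major flattening: [device0 tensors..., device1 tensors...],
    // mirroring the layout the in-place BroadCast expects.
    for (size_t i = 0; i < devices.size(); ++i) {
        for (size_t j = 0; j < num_tensors_per_device; ++j) {
            const size_t flat = i * num_tensors_per_device + j;
            std::printf("flat %zu -> %s on %s\n", flat, names[j].c_str(), devices[i].c_str());
        }
    }
    return 0;
}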

infini_train/src/autograd/comm.cc

Lines changed: 14 additions & 1 deletion
@@ -82,8 +82,21 @@ std::vector<std::shared_ptr<Tensor>> Broadcast::Forward(const std::vector<std::s
             << "Broadcast function not implemented for tensors on different device type";
     }

+    std::vector<std::shared_ptr<Tensor>> outputs;
+    outputs.reserve(target_gpus_.size() * input_tensors.size());
+    for (const auto &device : target_gpus_) {
+        for (const auto &tensor : input_tensors) {
+            if (device == input_device_) {
+                outputs.push_back(tensor);
+            } else {
+                outputs.push_back(std::make_shared<Tensor>(tensor->Dims(), tensor->Dtype(), device));
+            }
+        }
+    }
+
     // TODO(dcj): mark non differentiable
-    return pg_->BroadCast(input_tensors);
+    pg_->BroadCast(outputs, pg_->GetGroupRank(input_device_.Rank().GlobalRank()));
+    return outputs;
 }

 void Broadcast::SetupContext(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
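
The pre-allocation loop above is what makes the root's broadcast truly in-place: for the source device the function pushes the input shared_ptr itself rather than a fresh tensor, so that output entry aliases the input's storage. A small stand-alone illustration of the aliasing (plain C++; Buffer is a made-up stand-in for Tensor):

#include <cassert>
#include <memory>
#include <vector>

struct Buffer {
    std::vector<float> data;
};

int main() {
    auto input = std::make_shared<Buffer>(Buffer{{1.0f, 2.0f, 3.0f}});

    std::vector<std::shared_ptr<Buffer>> outputs;
    const bool is_source_device = true;  // pretend this iteration handles the input's own device
    if (is_source_device) {
        outputs.push_back(input);  // alias the input: no copy, no allocation
    } else {
        outputs.push_back(std::make_shared<Buffer>(Buffer{std::vector<float>(input->data.size())}));
    }

    // The source entry shares storage with the input, so broadcasting "into" it touches no new memory.
    assert(outputs[0].get() == input.get());
    return 0;
}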

infini_train/src/nn/lora/lora_parallel_linear.cc

Lines changed: 30 additions & 9 deletions
@@ -102,18 +102,15 @@ void LoRAColumnParallelLinear::InitLoRAWeights() {
                               ->Get(parallel::GetTensorParallelProcessGroupName(global_rank));
         const int tp_rank = tp_group->GetGroupRank(global_rank);

-        // Only TP rank 0 generates random values; others zero-init.
-        // AllReduce(sum) then broadcasts rank-0's values to all TP ranks.
+        // TP rank 0 generates random values; broadcast replicates to other ranks in-place.
         if (tp_rank == 0) {
             if (config_.use_kaiming_a) {
                 init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
             } else {
                 init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
             }
-        } else {
-            init::Zeros(parameters_[kParamLoraAName]);
         }
-        tp_group->AllReduce(parameters_[kParamLoraAName]);
+        tp_group->BroadCast({parameters_[kParamLoraAName]}, /*root_group_rank=*/0);
     } else {
         if (config_.use_kaiming_a) {
             init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
@@ -303,17 +300,41 @@ void LoRARowParallelLinear::InitLoRAWeights() {
     // lora_B: [out_features, rank] - replicated

     // lora_A: [rank, in_features_per_partition]
+    // TP rank 0 generates the full [lora_rank, in_features] tensor and scatters shards along dim=1,
+    // so each TP rank ends up holding only its own partition.
     parameters_[kParamLoraAName]
         = std::make_shared<Tensor>(std::vector<int64_t>{config_.rank, in_features_per_partition_}, DataType::kFLOAT32,
                                    device_)
               ->RequiresGrad();
-    if (config_.use_kaiming_a) {
-        init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+
+    if (parallel::global::GetTensorParallelSize() > 1) {
+        const auto global_rank = device_.Rank().GlobalRank();
+        auto *tp_group = parallel::ProcessGroupFactory::Instance(device_.type())
+                             ->Get(parallel::GetTensorParallelProcessGroupName(global_rank));
+        const int tp_rank = tp_group->GetGroupRank(global_rank);
+        const int tp_size = parallel::global::GetTensorParallelSize();
+
+        // TP rank 0 generates full [lora_rank, in_features]; scatter shards along dim=1 to all ranks.
+        // Non-src processes pass a tensor carrying only shape/dtype (contents unread).
+        auto full_lora_A = std::make_shared<Tensor>(
+            std::vector<int64_t>{config_.rank, in_features_per_partition_ * tp_size}, DataType::kFLOAT32, device_);
+        if (tp_rank == 0) {
+            if (config_.use_kaiming_a) {
+                init::KaimingUniform(full_lora_A, config_.kaiming_a_param);
+            } else {
+                init::Normal(full_lora_A, 0.0f, 0.02f);
+            }
+        }
+        tp_group->ScatterFromRank({parameters_[kParamLoraAName]}, full_lora_A, /*dim=*/1, /*src_group_rank=*/0);
     } else {
-        init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+        if (config_.use_kaiming_a) {
+            init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+        } else {
+            init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+        }
     }

-    // lora_B: [out_features, rank]
+    // lora_B: [out_features, rank] - replicated, zeros
     parameters_[kParamLoraBName]
         = std::make_shared<Tensor>(std::vector<int64_t>{out_features_, config_.rank}, DataType::kFLOAT32, device_)
               ->RequiresGrad();
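
To make the shapes concrete: the full tensor built on every rank is [config_.rank, in_features_per_partition_ * tp_size], and ScatterFromRank hands each TP rank one dim=1 slice of width in_features_per_partition_. A stand-alone check of that arithmetic with hypothetical sizes (lora rank 16, per-partition width 1024, tp_size 4 are made up for illustration):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical configuration, not taken from the repo.
    const int64_t lora_rank = 16;
    const int64_t in_features_per_partition = 1024;
    const int64_t tp_size = 4;

    // Full lora_A generated on TP rank 0, and the shard each rank keeps after the scatter.
    const std::vector<int64_t> full_dims = {lora_rank, in_features_per_partition * tp_size};
    std::vector<int64_t> shard_dims = full_dims;
    const int64_t dim = 1;
    assert(full_dims[dim] % tp_size == 0);  // mirrors the divisibility CHECK in ScatterFromRank
    shard_dims[dim] = full_dims[dim] / tp_size;

    std::printf("full lora_A   : [%lld, %lld]\n", (long long)full_dims[0], (long long)full_dims[1]);
    std::printf("per-rank shard: [%lld, %lld]\n", (long long)shard_dims[0], (long long)shard_dims[1]);
    // Each non-source rank now receives 1/tp_size of the data instead of the full tensor.
    return 0;
}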

infini_train/src/nn/parallel/process_group.cc

Lines changed: 102 additions & 29 deletions
@@ -248,45 +248,52 @@ std::shared_ptr<Work> ProcessGroup::Recv(std::vector<std::shared_ptr<Tensor>> te
     }
 }

-std::vector<std::shared_ptr<Tensor>>
-ProcessGroup::BroadCast(const std::vector<std::shared_ptr<Tensor>> &input_tensors) const {
-    std::vector<std::shared_ptr<Tensor>> outputs;
+void ProcessGroup::BroadCast(const std::vector<std::shared_ptr<Tensor>> &tensors, int root_group_rank) const {
+    CHECK_GT(tensors.size(), 0);
+    CHECK_GT(devices_.size(), 0);
+    CHECK_EQ(tensors.size() % devices_.size(), 0)
+        << "BroadCast: tensors must be grouped by local device with the same tensor count per device";
+    const size_t num_tensors_per_device = tensors.size() / devices_.size();
+
     std::vector<core::Stream *> streams;
     std::vector<core::CclComm *> comms;
-    std::vector<Device> devices;
+    std::vector<int> local_group_ranks;
+    streams.reserve(devices_.size());
+    comms.reserve(devices_.size());
+    local_group_ranks.reserve(devices_.size());

-    CHECK_EQ(world_size_, comms_.size());
-    for (size_t i = 0; i < world_size_; ++i) {
-        auto device = devices_[i];
-        for (const auto &input_tensor : input_tensors) {
-            outputs.push_back(std::make_shared<Tensor>(input_tensor->Dims(), input_tensor->Dtype(), device));
-        }
-        devices.push_back(device);
+    for (const auto &device : devices_) {
         streams.push_back(runtime_impl_->GetStream(device));
         comms.push_back(device_comm_map_.at(device.index()));
+        local_group_ranks.push_back(global_group_rank_map_.at(device.Rank().GlobalRank()));
     }

-    int root = -1;
-    for (size_t i = 0; i < devices.size(); ++i) {
-        if (devices[i] == input_tensors[0]->GetDevice()) {
-            root = static_cast<int>(i);
-            break;
-        }
+    // Determine NCCL root (= group rank of the source). In single-process mode the caller may
+    // omit it and we infer from tensors[0]->GetDevice(); in multi-process mode the source
+    // may not be on this process, so the caller must provide the group rank explicitly.
+    int root = root_group_rank;
+    if (root < 0) {
+        auto it = global_group_rank_map_.find(tensors[0]->GetDevice().Rank().GlobalRank());
+        CHECK(it != global_group_rank_map_.end())
+            << "BroadCast: root device not found in group and root_group_rank was not provided";
+        root = it->second;
     }
-    CHECK_NE(root, -1) << "Root not found in input devices";
-
-    core::CclGroupGuard ccl_group_guard(devices[0].type());
-    for (size_t i = 0; i < devices.size(); ++i) {
-        core::DeviceGuard guard(devices[i]);
-        for (size_t j = 0; j < input_tensors.size(); ++j) {
-            const auto &input_tensor = input_tensors[j];
-            const void *send_buffer = (static_cast<int>(i) == root ? input_tensor->DataPtr() : nullptr);
-            ccl_impl_->Broadcast(send_buffer, outputs[i * input_tensors.size() + j]->DataPtr(),
-                                 input_tensor->NumElements(), input_tensor->Dtype(), root, comms[i], streams[i]);
+    CHECK_GE(root, 0);
+    CHECK_LT(root, world_size_);
+
+    core::CclGroupGuard ccl_group_guard(devices_[0].type());
+    for (size_t i = 0; i < devices_.size(); ++i) {
+        core::DeviceGuard guard(devices_[i]);
+        const int local_group_rank = local_group_ranks[i];
+        for (size_t j = 0; j < num_tensors_per_device; ++j) {
+            const auto &tensor = tensors[i * num_tensors_per_device + j];
+            CHECK(tensor != nullptr) << "BroadCast: null tensor";
+            CHECK_EQ(tensor->GetDevice(), devices_[i]) << "BroadCast: tensors must match local device grouping";
+            const void *send_buffer = (local_group_rank == root ? tensor->DataPtr() : nullptr);
+            ccl_impl_->Broadcast(send_buffer, tensor->DataPtr(), tensor->NumElements(), tensor->Dtype(), root, comms[i],
+                                 streams[i]);
         }
     }
-
-    return outputs;
 }

 std::vector<std::shared_ptr<Tensor>>
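
The root handling above has two paths: an explicit root_group_rank from the caller, or (for single-process use) a fallback that maps the global rank of tensors[0]'s device through global_group_rank_map_. A small stand-alone sketch of that fallback; ResolveRoot is a hypothetical helper and the ranks are made up for illustration:

#include <cassert>
#include <unordered_map>

// Resolve the group rank to use as broadcast root, mirroring the fallback above:
// a non-negative argument wins; otherwise look the source's global rank up in the group map.
int ResolveRoot(int root_group_rank, int source_global_rank,
                const std::unordered_map<int, int> &global_group_rank_map) {
    if (root_group_rank >= 0) {
        return root_group_rank;
    }
    const auto it = global_group_rank_map.find(source_global_rank);
    assert(it != global_group_rank_map.end());  // the source must belong to this group
    return it->second;
}

int main() {
    // Hypothetical TP group: global ranks 4..7 map to group ranks 0..3.
    const std::unordered_map<int, int> group_map = {{4, 0}, {5, 1}, {6, 2}, {7, 3}};
    assert(ResolveRoot(2, /*source_global_rank=*/6, group_map) == 2);   // explicit root wins
    assert(ResolveRoot(-1, /*source_global_rank=*/6, group_map) == 2);  // inferred from the source device
    return 0;
}
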
@@ -358,6 +365,72 @@ std::vector<std::shared_ptr<Tensor>> ProcessGroup::Scatter(const std::shared_ptr
     return outputs;
 }

+void ProcessGroup::ScatterFromRank(const std::vector<std::shared_ptr<Tensor>> &outputs,
+                                   const std::shared_ptr<Tensor> &tensor, int64_t dim, int src_group_rank) const {
+    CHECK(tensor != nullptr) << "Scatter: tensor carrying full shape/dtype must be provided on every process";
+    CHECK_GE(src_group_rank, 0);
+    CHECK_LT(src_group_rank, world_size_);
+    CHECK_GT(devices_.size(), 0);
+    CHECK_EQ(outputs.size(), devices_.size()) << "ScatterFromRank: expects one output per local group device";
+    const int src_rank = src_group_rank;
+
+    // Identify local group ranks (in the same order as devices_).
+    std::vector<int> local_group_ranks;
+    local_group_ranks.reserve(devices_.size());
+    for (const auto &d : devices_) { local_group_ranks.push_back(global_group_rank_map_.at(d.Rank().GlobalRank())); }
+    const auto src_local_it = std::find(local_group_ranks.begin(), local_group_ranks.end(), src_rank);
+    const bool src_is_local = src_local_it != local_group_ranks.end();
+
+    CHECK_EQ(tensor->Dims()[dim] % static_cast<int64_t>(world_size_), 0)
+        << "Scatter: dim size must be divisible by world size";
+    const int64_t shard_size = tensor->Dims()[dim] / static_cast<int64_t>(world_size_);
+    std::vector<std::shared_ptr<Tensor>> split_tensors;
+    if (src_is_local) {
+        split_tensors = tensor->Split(shard_size, dim);
+        CHECK_EQ(split_tensors.size(), static_cast<size_t>(world_size_));
+    }
+
+    std::vector<int64_t> shard_dims = tensor->Dims();
+    shard_dims[dim] = shard_size;
+    const DataType shard_dtype = tensor->Dtype();
+    for (size_t i = 0; i < outputs.size(); ++i) {
+        CHECK(outputs[i] != nullptr) << "ScatterFromRank: null output tensor";
+        CHECK_EQ(outputs[i]->GetDevice(), devices_[i]) << "ScatterFromRank: output device mismatch";
+        CHECK(outputs[i]->Dims() == shard_dims) << "ScatterFromRank: output shape mismatch";
+        CHECK(outputs[i]->Dtype() == shard_dtype) << "ScatterFromRank: output dtype mismatch";
+    }
+
+    core::CclGroupGuard ccl_group_guard(devices_[0].type());
+
+    if (src_is_local) {
+        const size_t src_local_idx = static_cast<size_t>(src_local_it - local_group_ranks.begin());
+        const auto &src_device = devices_[src_local_idx];
+        core::DeviceGuard guard(src_device);
+        auto *stream = runtime_impl_->GetStream(src_device);
+        auto *comm = device_comm_map_.at(src_device.index());
+        for (int dst = 0; dst < world_size_; ++dst) {
+            if (dst == src_rank) {
+                continue;
+            }
+            ccl_impl_->Send(split_tensors[dst]->DataPtr(), split_tensors[dst]->NumElements(), shard_dtype, dst, comm,
+                            stream);
+        }
+    }
+
+    for (size_t i = 0; i < devices_.size(); ++i) {
+        const auto &local_device = devices_[i];
+        const int local_rank = local_group_ranks[i];
+        if (src_is_local && local_rank == src_rank) {
+            outputs[i]->CopyFrom(split_tensors[src_rank]);
+            continue;
+        }
+        core::DeviceGuard guard(local_device);
+        auto *stream = runtime_impl_->GetStream(local_device);
+        auto *comm = device_comm_map_.at(local_device.index());
+        ccl_impl_->Recv(outputs[i]->DataPtr(), outputs[i]->NumElements(), shard_dtype, src_rank, comm, stream);
+    }
+}
+
 std::shared_ptr<Tensor> ProcessGroup::Gather(const std::vector<std::shared_ptr<Tensor>> &tensors, Device destination,
                                              int64_t dim) const {
     int64_t num_devices = static_cast<int64_t>(tensors.size());
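
Communication-wise, ScatterFromRank is point-to-point: the source rank splits the full tensor along dim and sends shard d to group rank d (skipping itself, which just copies locally), while every other rank posts a single Recv for its own shard. A stand-alone simulation of that pattern over plain vectors (no CCL, no infini_train types; world size 4 and the shard size are arbitrary):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    const int world_size = 4;
    const int src_rank = 0;
    const int shard_size = 3;

    // Full data exists only on the source rank.
    std::vector<float> full(static_cast<size_t>(world_size) * shard_size);
    for (size_t i = 0; i < full.size(); ++i) { full[i] = static_cast<float>(i); }

    // Each rank's pre-allocated output shard (stand-in for the `outputs` tensors).
    std::vector<std::vector<float>> outputs(world_size, std::vector<float>(shard_size));

    // mailbox[d] stands in for the Send/Recv pair between src_rank and rank d.
    std::vector<std::vector<float>> mailbox(world_size);
    for (int dst = 0; dst < world_size; ++dst) {
        std::vector<float> shard(full.begin() + dst * shard_size, full.begin() + (dst + 1) * shard_size);
        if (dst == src_rank) {
            outputs[dst] = shard;  // local copy on the source, no communication
        } else {
            mailbox[dst] = shard;  // Send(shard, dst) issued by the source
        }
    }
    for (int r = 0; r < world_size; ++r) {
        if (r == src_rank) { continue; }
        outputs[r] = mailbox[r];   // Recv(own shard, src_rank) on every other rank
    }

    // Every rank ends up holding exactly its contiguous slice of the full tensor.
    for (int r = 0; r < world_size; ++r) {
        for (int j = 0; j < shard_size; ++j) { assert(outputs[r][j] == full[r * shard_size + j]); }
    }
    return 0;
}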
