refactor: add multi-process Scatter overload and use it for LoRA lora_A init

chen2021673 · chen2021673 · commit 845915632f17 · 2026-05-13T12:14:47.000Z
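Add ProcessGroup::Scatter(tensor, dim, src_group_rank) overload where each
process only materializes shards for its own local devices. Use it in
LoRARowParallelLinear to replace broadcast+slice, avoiding tp_size-fold
communication volume during init.

Also add an optional root_group_rank parameter to ProcessGroup::BroadCast and
use it in LoRAColumnParallelLinear to replicate rank 0's lora_A, replacing the
previous zero-init + AllReduce.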
diff --git a/infini_train/include/nn/parallel/process_group.h b/infini_train/include/nn/parallel/process_group.h
@@ -59,14 +59,25 @@ class ProcessGroup {
                                        bool async_op = false) const;
 
     // Legacy communication APIs (Single-stream)
-    virtual std::vector<std::shared_ptr<Tensor>>
-    BroadCast(const std::vector<std::shared_ptr<Tensor>> &input_tensors) const;
+    // If root_group_rank is -1, infer root from input_tensors[0]'s device (single-process mode).
+    // In multi-process mode, the caller must pass the source's group rank on every rank.
+    virtual std::vector<std::shared_ptr<Tensor>> BroadCast(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
+                                                           int root_group_rank = -1) const;
 
     virtual std::vector<std::shared_ptr<Tensor>>
     ReduceAddCoalesced(const std::vector<std::vector<std::shared_ptr<Tensor>>> &grads, Device destination) const;
 
+    // Single-process / DataParallel form: `devices` enumerates all target devices (must be local
+    // to this process). Source is inferred from `tensor->GetDevice()` when `src_group_rank` is -1.
     virtual std::vector<std::shared_ptr<Tensor>> Scatter(const std::shared_ptr<Tensor> &tensor,
-                                                         std::vector<Device> devices, int64_t dim) const;
+                                                         std::vector<Device> devices, int64_t dim,
+                                                         int src_group_rank = -1) const;
+
+    // Multi-process-friendly form (TP init etc.): each process only materializes shard(s) for
+    // its own local device(s) in this group. `tensor` must carry the full shape/dtype on every
+    // process; data is only read on the src process.
+    virtual std::vector<std::shared_ptr<Tensor>> Scatter(const std::shared_ptr<Tensor> &tensor, int64_t dim,
+                                                         int src_group_rank) const;
 
     virtual std::shared_ptr<Tensor> Gather(const std::vector<std::shared_ptr<Tensor>> &tensors, Device destination,
                                            int64_t dim) const;
diff --git a/infini_train/src/nn/lora/lora_parallel_linear.cc b/infini_train/src/nn/lora/lora_parallel_linear.cc
@@ -102,18 +102,16 @@ void LoRAColumnParallelLinear::InitLoRAWeights() {
                              ->Get(parallel::GetTensorParallelProcessGroupName(global_rank));
         const int tp_rank = tp_group->GetGroupRank(global_rank);
 
-        // Only TP rank 0 generates random values; others zero-init.
-        // AllReduce(sum) then broadcasts rank-0's values to all TP ranks.
+        // TP rank 0 generates random values; Broadcast replicates to other ranks.
         if (tp_rank == 0) {
             if (config_.use_kaiming_a) {
                 init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
             } else {
                 init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
             }
-        } else {
-            init::Zeros(parameters_[kParamLoraAName]);
         }
-        tp_group->AllReduce(parameters_[kParamLoraAName]);
+        auto broadcasted = tp_group->BroadCast({parameters_[kParamLoraAName]}, /*root_group_rank=*/0);
+        parameters_[kParamLoraAName]->CopyFrom(broadcasted[0]);
     } else {
         if (config_.use_kaiming_a) {
             init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
@@ -303,17 +301,42 @@ void LoRARowParallelLinear::InitLoRAWeights() {
     // lora_B: [out_features, rank] - replicated
 
     // lora_A: [rank, in_features_per_partition]
+    // With TP enabled, TP rank 0 generates the full [lora_rank, in_features] tensor and
+    // scatters it along dim=1, so each TP rank receives only its own shard.
     parameters_[kParamLoraAName]
         = std::make_shared<Tensor>(std::vector<int64_t>{config_.rank, in_features_per_partition_}, DataType::kFLOAT32,
                                    device_)
               ->RequiresGrad();
-    if (config_.use_kaiming_a) {
-        init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+
+    if (parallel::global::GetTensorParallelSize() > 1) {
+        const auto global_rank = device_.Rank().GlobalRank();
+        auto *tp_group = parallel::ProcessGroupFactory::Instance(device_.type())
+                             ->Get(parallel::GetTensorParallelProcessGroupName(global_rank));
+        const int tp_rank = tp_group->GetGroupRank(global_rank);
+        const int tp_size = parallel::global::GetTensorParallelSize();
+
+        // TP rank 0 generates full [lora_rank, in_features]; scatter shards along dim=1 to all ranks.
+        // Non-src processes pass a tensor carrying only shape/dtype (contents unread).
+        auto full_lora_A = std::make_shared<Tensor>(
+            std::vector<int64_t>{config_.rank, in_features_per_partition_ * tp_size}, DataType::kFLOAT32, device_);
+        if (tp_rank == 0) {
+            if (config_.use_kaiming_a) {
+                init::KaimingUniform(full_lora_A, config_.kaiming_a_param);
+            } else {
+                init::Normal(full_lora_A, 0.0f, 0.02f);
+            }
+        }
+        auto shards = tp_group->Scatter(full_lora_A, /*dim=*/1, /*src_group_rank=*/0);
+        parameters_[kParamLoraAName]->CopyFrom(shards[0]);
     } else {
-        init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+        if (config_.use_kaiming_a) {
+            init::KaimingUniform(parameters_[kParamLoraAName], config_.kaiming_a_param);
+        } else {
+            init::Normal(parameters_[kParamLoraAName], 0.0f, 0.02f);
+        }
     }
 
-    // lora_B: [out_features, rank]
+    // lora_B: [out_features, rank] - replicated, zeros
     parameters_[kParamLoraBName]
         = std::make_shared<Tensor>(std::vector<int64_t>{out_features_, config_.rank}, DataType::kFLOAT32, device_)
               ->RequiresGrad();
diff --git a/infini_train/src/nn/parallel/process_group.cc b/infini_train/src/nn/parallel/process_group.cc
@@ -248,39 +248,40 @@ std::shared_ptr<Work> ProcessGroup::Recv(std::vector<std::shared_ptr<Tensor>> te
     }
 }
 
-std::vector<std::shared_ptr<Tensor>>
-ProcessGroup::BroadCast(const std::vector<std::shared_ptr<Tensor>> &input_tensors) const {
+std::vector<std::shared_ptr<Tensor>> ProcessGroup::BroadCast(const std::vector<std::shared_ptr<Tensor>> &input_tensors,
+                                                             int root_group_rank) const {
     std::vector<std::shared_ptr<Tensor>> outputs;
     std::vector<core::Stream *> streams;
     std::vector<core::CclComm *> comms;
-    std::vector<Device> devices;
 
-    CHECK_EQ(world_size_, comms_.size());
-    for (size_t i = 0; i < world_size_; ++i) {
-        auto device = devices_[i];
+    // Only iterate over this process's devices (in single-process mode there are world_size_ of
+    // them; in multi-process mode they are a strict subset of the group).
+    for (const auto &device : devices_) {
         for (const auto &input_tensor : input_tensors) {
             outputs.push_back(std::make_shared<Tensor>(input_tensor->Dims(), input_tensor->Dtype(), device));
         }
-        devices.push_back(device);
         streams.push_back(runtime_impl_->GetStream(device));
         comms.push_back(device_comm_map_.at(device.index()));
     }
 
-    int root = -1;
-    for (size_t i = 0; i < devices.size(); ++i) {
-        if (devices[i] == input_tensors[0]->GetDevice()) {
-            root = static_cast<int>(i);
-            break;
-        }
+    // Determine NCCL root (= group rank of the source). In single-process mode the caller may
+    // omit it and we infer from input_tensors[0]->GetDevice(); in multi-process mode the source
+    // may not be on this process, so the caller must provide the group rank explicitly.
+    int root = root_group_rank;
+    if (root < 0) {
+        auto it = global_group_rank_map_.find(input_tensors[0]->GetDevice().Rank().GlobalRank());
+        CHECK(it != global_group_rank_map_.end())
+            << "BroadCast: root device not found in group and root_group_rank was not provided";
+        root = it->second;
     }
-    CHECK_NE(root, -1) << "Root not found in input devices";
 
-    core::CclGroupGuard ccl_group_guard(devices[0].type());
-    for (size_t i = 0; i < devices.size(); ++i) {
-        core::DeviceGuard guard(devices[i]);
+    core::CclGroupGuard ccl_group_guard(devices_[0].type());
+    for (size_t i = 0; i < devices_.size(); ++i) {
+        core::DeviceGuard guard(devices_[i]);
+        const int local_group_rank = global_group_rank_map_.at(devices_[i].Rank().GlobalRank());
         for (size_t j = 0; j < input_tensors.size(); ++j) {
             const auto &input_tensor = input_tensors[j];
-            const void *send_buffer = (static_cast<int>(i) == root ? input_tensor->DataPtr() : nullptr);
+            const void *send_buffer = (local_group_rank == root ? input_tensor->DataPtr() : nullptr);
             ccl_impl_->Broadcast(send_buffer, outputs[i * input_tensors.size() + j]->DataPtr(),
                                  input_tensor->NumElements(), input_tensor->Dtype(), root, comms[i], streams[i]);
         }
@@ -330,30 +331,169 @@ ProcessGroup::ReduceAddCoalesced(const std::vector<std::vector<std::shared_ptr<T
 }
 
 std::vector<std::shared_ptr<Tensor>> ProcessGroup::Scatter(const std::shared_ptr<Tensor> &tensor,
-                                                           std::vector<Device> devices, int64_t dim) const {
+                                                           std::vector<Device> devices, int64_t dim,
+                                                           int src_group_rank) const {
+    CHECK_EQ(devices.size(), static_cast<size_t>(world_size_)) << "Scatter expects one device per group rank";
+    CHECK_GT(devices.size(), 0);
+    CHECK(tensor != nullptr) << "Scatter: tensor carrying full shape/dtype must be provided on every process";
+
+    // Resolve src rank: explicit overrides inference from tensor device.
+    int src_rank = src_group_rank;
+    if (src_rank < 0) {
+        for (size_t i = 0; i < devices.size(); ++i) {
+            if (tensor->GetDevice() == devices[i]) {
+                src_rank = static_cast<int>(i);
+                break;
+            }
+        }
+        CHECK_NE(src_rank, -1) << "Source device not found in input devices";
+    }
+    CHECK_GE(src_rank, 0);
+    CHECK_LT(src_rank, world_size_);
+
+    // Identify local group ranks (in the same order as devices_).
+    std::vector<int> local_group_ranks;
+    local_group_ranks.reserve(devices_.size());
+    for (const auto &d : devices_) { local_group_ranks.push_back(global_group_rank_map_.at(d.Rank().GlobalRank())); }
+    const auto src_local_it = std::find(local_group_ranks.begin(), local_group_ranks.end(), src_rank);
+    const bool src_is_local = src_local_it != local_group_ranks.end();
+
+    // Source splits only when it owns the full tensor. Shard shape is identical for all ranks
+    // when the dim is evenly divisible; we rely on that for preallocation on non-src processes.
+    CHECK_EQ(tensor->Dims()[dim] % static_cast<int64_t>(devices.size()), 0)
+        << "Scatter: dim size must be divisible by world size";
+    const int64_t shard_size = tensor->Dims()[dim] / static_cast<int64_t>(devices.size());
+    std::vector<std::shared_ptr<Tensor>> split_tensors;
+    if (src_is_local) {
+        split_tensors = tensor->Split(shard_size, dim);
+        CHECK_EQ(split_tensors.size(), devices.size());
+    }
+
+    std::vector<int64_t> shard_dims = tensor->Dims();
+    shard_dims[dim] = shard_size;
+    const DataType shard_dtype = tensor->Dtype();
+
+    // Preallocate output shards for this process's local devices.
     std::vector<std::shared_ptr<Tensor>> outputs;
-    auto split_tensors = tensor->Split(tensor->Dims()[dim] / devices.size(), dim);
-    std::vector<core::Stream *> streams;
-    std::vector<core::CclComm *> comms;
-    int src_rank = -1;
+    outputs.reserve(devices_.size());
+    for (const auto &d : devices_) { outputs.push_back(std::make_shared<Tensor>(shard_dims, shard_dtype, d)); }
 
-    for (size_t i = 0; i < devices.size(); ++i) {
-        if (tensor->GetDevice() == devices[i]) {
-            src_rank = static_cast<int>(i);
+    // Single-process mode: all devices live here, keep the symmetric Send/Recv loop for clarity.
+    if (global::GetNnodes() == 1 && global::GetNprocPerNode() == 1) {
+        std::vector<core::Stream *> streams;
+        std::vector<core::CclComm *> comms;
+        streams.reserve(devices.size());
+        comms.reserve(devices.size());
+        for (const auto &d : devices) {
+            streams.push_back(runtime_impl_->GetStream(d));
+            comms.push_back(device_comm_map_.at(d.index()));
         }
-        outputs.push_back(std::make_shared<Tensor>(split_tensors[i]->Dims(), split_tensors[i]->Dtype(), devices[i]));
-        streams.push_back(runtime_impl_->GetStream(devices[i]));
-        comms.push_back(device_comm_map_.at(devices[i].index()));
+        core::CclGroupGuard ccl_group_guard(devices[0].type());
+        for (size_t i = 0; i < devices.size(); ++i) {
+            core::DeviceGuard guard(devices[i]);
+            ccl_impl_->Send(split_tensors[i]->DataPtr(), split_tensors[i]->NumElements(), shard_dtype,
+                            static_cast<int>(i), comms[src_rank], streams[src_rank]);
+            ccl_impl_->Recv(outputs[i]->DataPtr(), outputs[i]->NumElements(), shard_dtype, src_rank, comms[i],
+                            streams[i]);
+        }
+        return outputs;
     }
-    CHECK_NE(src_rank, -1) << "Source device not found in input devices";
 
-    core::CclGroupGuard ccl_group_guard(devices[0].type());
-    for (size_t i = 0; i < devices.size(); ++i) {
-        core::DeviceGuard guard(devices[i]);
-        ccl_impl_->Send(split_tensors[i]->DataPtr(), split_tensors[i]->NumElements(), tensor->Dtype(), i,
-                        comms[src_rank], streams[src_rank]);
-        ccl_impl_->Recv(outputs[i]->DataPtr(), outputs[i]->NumElements(), tensor->Dtype(), src_rank, comms[i],
-                        streams[i]);
+    // Multi-process mode: each process handles only its local device(s).
+    core::CclGroupGuard ccl_group_guard(devices_[0].type());
+
+    // Src issues a Send to every non-src group rank (including group ranks hosted in other processes).
+    if (src_is_local) {
+        const size_t src_local_idx = static_cast<size_t>(src_local_it - local_group_ranks.begin());
+        const auto &src_device = devices_[src_local_idx];
+        core::DeviceGuard guard(src_device);
+        auto *stream = runtime_impl_->GetStream(src_device);
+        auto *comm = device_comm_map_.at(src_device.index());
+        for (int dst = 0; dst < world_size_; ++dst) {
+            if (dst == src_rank) {
+                continue;
+            }
+            ccl_impl_->Send(split_tensors[dst]->DataPtr(), split_tensors[dst]->NumElements(), shard_dtype, dst, comm,
+                            stream);
+        }
+    }
+
+    // Every local device posts either a local copy (if it is src) or a Recv from src.
+    for (size_t i = 0; i < devices_.size(); ++i) {
+        const auto &local_device = devices_[i];
+        const int local_rank = local_group_ranks[i];
+        if (src_is_local && local_rank == src_rank) {
+            outputs[i]->CopyFrom(split_tensors[src_rank]);
+            continue;
+        }
+        core::DeviceGuard guard(local_device);
+        auto *stream = runtime_impl_->GetStream(local_device);
+        auto *comm = device_comm_map_.at(local_device.index());
+        ccl_impl_->Recv(outputs[i]->DataPtr(), outputs[i]->NumElements(), shard_dtype, src_rank, comm, stream);
+    }
+    return outputs;
+}
+
+std::vector<std::shared_ptr<Tensor>> ProcessGroup::Scatter(const std::shared_ptr<Tensor> &tensor, int64_t dim,
+                                                           int src_group_rank) const {
+    CHECK(tensor != nullptr) << "Scatter: tensor carrying full shape/dtype must be provided on every process";
+    CHECK_GE(src_group_rank, 0);
+    CHECK_LT(src_group_rank, world_size_);
+    CHECK_GT(devices_.size(), 0);
+    const int src_rank = src_group_rank;
+
+    // Identify local group ranks (in the same order as devices_).
+    std::vector<int> local_group_ranks;
+    local_group_ranks.reserve(devices_.size());
+    for (const auto &d : devices_) { local_group_ranks.push_back(global_group_rank_map_.at(d.Rank().GlobalRank())); }
+    const auto src_local_it = std::find(local_group_ranks.begin(), local_group_ranks.end(), src_rank);
+    const bool src_is_local = src_local_it != local_group_ranks.end();
+
+    CHECK_EQ(tensor->Dims()[dim] % static_cast<int64_t>(world_size_), 0)
+        << "Scatter: dim size must be divisible by world size";
+    const int64_t shard_size = tensor->Dims()[dim] / static_cast<int64_t>(world_size_);
+    std::vector<std::shared_ptr<Tensor>> split_tensors;
+    if (src_is_local) {
+        split_tensors = tensor->Split(shard_size, dim);
+        CHECK_EQ(split_tensors.size(), static_cast<size_t>(world_size_));
+    }
+
+    std::vector<int64_t> shard_dims = tensor->Dims();
+    shard_dims[dim] = shard_size;
+    const DataType shard_dtype = tensor->Dtype();
+
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    outputs.reserve(devices_.size());
+    for (const auto &d : devices_) { outputs.push_back(std::make_shared<Tensor>(shard_dims, shard_dtype, d)); }
+
+    core::CclGroupGuard ccl_group_guard(devices_[0].type());
+
+    if (src_is_local) {
+        const size_t src_local_idx = static_cast<size_t>(src_local_it - local_group_ranks.begin());
+        const auto &src_device = devices_[src_local_idx];
+        core::DeviceGuard guard(src_device);
+        auto *stream = runtime_impl_->GetStream(src_device);
+        auto *comm = device_comm_map_.at(src_device.index());
+        for (int dst = 0; dst < world_size_; ++dst) {
+            if (dst == src_rank) {
+                continue;
+            }
+            ccl_impl_->Send(split_tensors[dst]->DataPtr(), split_tensors[dst]->NumElements(), shard_dtype, dst, comm,
+                            stream);
+        }
+    }
+
+    for (size_t i = 0; i < devices_.size(); ++i) {
+        const auto &local_device = devices_[i];
+        const int local_rank = local_group_ranks[i];
+        if (src_is_local && local_rank == src_rank) {
+            outputs[i]->CopyFrom(split_tensors[src_rank]);
+            continue;
+        }
+        core::DeviceGuard guard(local_device);
+        auto *stream = runtime_impl_->GetStream(local_device);
+        auto *comm = device_comm_map_.at(local_device.index());
+        ccl_impl_->Recv(outputs[i]->DataPtr(), outputs[i]->NumElements(), shard_dtype, src_rank, comm, stream);
     }
     return outputs;
 }