fix(maca): harden multi-thread DDP+TP init on gpt2

kilinchange · kilinchange · commit 1f10a97d734e · 2026-04-22T10:00:29.000Z
- Move MACA/MCCL P2P_DISABLE setenv into MacaGuardImpl ctor and parse
  --tensor_parallel from /proc/self/cmdline, so both flags land before
  mcInit(0) (setenv from main() was too late at static init).
- Also set MCCL_P2P_DISABLE=1 when TP>1: MACA_P2P_DISABLE alone still
  lets MCCL establish its own P2P buffers, which deadlocks multi-PG
  init on TP+SP / TP+SP+PP+VPP.
- gpt2 main: defer ProcessGroup creation until after model->To(device),
  serialize the upload under a mutex + barrier across DP threads. MCCL
  init otherwise leaves stale read-only P2P mappings in the VA ranges
  mcMalloc later returns, racing with concurrent model uploads.
- Drop the now-redundant setenv blocks from gpt2/llama3 main().
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -1,8 +1,13 @@
+#include <algorithm>
+#include <barrier>
 #include <chrono>
 #include <cstdlib>
 #include <format>
+#include <iterator>
 #include <memory>
+#include <mutex>
 #include <optional>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 
@@ -148,31 +153,33 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
 
+    auto rank_in_group = [&](const std::vector<int> &group_ranks) {
+        auto it = std::find(group_ranks.begin(), group_ranks.end(), rank.GlobalRank());
+        CHECK(it != group_ranks.end());
+        return static_cast<int>(std::distance(group_ranks.begin(), it));
+    };
+
     if (rank.IsParallel()) {
         auto parallel_device_type
             = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
         device = Device(parallel_device_type, rank.thread_rank());
-        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
 
+        // NOTE(dcj): On MACA, defer ProcessGroup creation until AFTER the model
+        // has been uploaded to the device. MCCL init registers internal P2P
+        // buffers that leave stale read-only mappings in the address ranges
+        // mcMalloc later hands out; allocating the model first keeps it in a
+        // P2P-clean region of the VA space and avoids the init-time race on
+        // multi-thread DDP+TP. Mirrors the llama3 fix combo.
         if (ddp_world_size > 1) {
-            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
-                                             GetDataParallelGroupRanks(rank.GlobalRank()));
-            ddp_rank = ddp_pg->GetGroupRank(rank.GlobalRank());
+            ddp_rank = rank_in_group(GetDataParallelGroupRanks(rank.GlobalRank()));
         }
-
         if (tp_world_size > 1) {
-            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
-                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
-            tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
+            tp_rank = rank_in_group(GetTensorParallelGroupRanks(rank.GlobalRank()));
             // NOTE(zbl): Reserved for VocabParallelEmbedding
             nn::parallel::tp_rank = tp_rank;
         }
-
         if (pp_world_size > 1) {
-            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
-                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
-            pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
+            pp_rank = rank_in_group(GetPipelineParallelGroupRanks(rank.GlobalRank()));
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
@@ -206,7 +213,46 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
-    model->To(device);
+    // On MACA, parallel mcMalloc/mcMemcpy across threads still races even with
+    // an mcMalloc mutex, because the runtime auto-maps allocations P2P-readonly
+    // between sibling devices. Serialize the entire model upload so each
+    // thread's allocations land before any peer thread starts touching the
+    // address space.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        static std::mutex model_to_mutex;
+        std::lock_guard<std::mutex> lock(model_to_mutex);
+        model->To(device);
+        auto upload_impl = core::GetDeviceGuardImpl(device.type());
+        upload_impl->SynchronizeDevice(device);
+    } else {
+        model->To(device);
+    }
+
+    // Synchronize model upload across all DP threads before any MCCL init runs.
+    // The barrier ensures no thread enters mcclCommInitAll while peer threads
+    // are still mid-mcMemcpyAsync.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        auto pre_pg_impl = core::GetDeviceGuardImpl(device.type());
+        pre_pg_impl->SynchronizeDevice(device);
+        static std::barrier pre_pg_barrier(FLAGS_nthread_per_process);
+        pre_pg_barrier.arrive_and_wait();
+    }
+
+    if (rank.IsParallel()) {
+        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
+        if (ddp_world_size > 1) {
+            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
+                                             GetDataParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (tp_world_size > 1) {
+            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
+                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (pp_world_size > 1) {
+            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
+                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
+        }
+    }
 
     utils::PrecisionChecker::BuildNameMap(model.get());
 
@@ -470,13 +516,6 @@ int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
-    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
-    // deadlocks and P2P teardown crashes.  Must be set before any mcclCommInitAll
-    // call (i.e. before threads that create ProcessGroups are spawned).
-    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
-        setenv("MACA_P2P_DISABLE", "1", 1);
-    }
-
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -492,13 +492,6 @@ int main(int argc, char *argv[]) {
     gflags::ParseCommandLineFlags(&argc, &argv, true);
     google::InitGoogleLogging(argv[0]);
 
-    // On MACA, when TP > 1 disable P2P to prevent MCCL communication-ordering
-    // deadlocks and P2P teardown crashes.  Must be set before any mcclCommInitAll
-    // call (i.e. before threads that create ProcessGroups are spawned).
-    if (FLAGS_device == kDeviceMACA && FLAGS_tensor_parallel > 1) {
-        setenv("MACA_P2P_DISABLE", "1", 1);
-    }
-
     auto precision_config = utils::PrecisionCheckConfig::Parse(FLAGS_precision_check);
     nn::parallel::global::InitAllEnv(FLAGS_nthread_per_process, FLAGS_tensor_parallel, FLAGS_sequence_parallel,
                                      FLAGS_pipeline_parallel, FLAGS_virtual_pipeline_parallel);
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -2,8 +2,12 @@
 
 #include <array>
 #include <cstdlib>
+#include <fstream>
 #include <memory>
 #include <mutex>
+#include <sstream>
+#include <string>
+#include <vector>
 
 #include "infini_train/include/common/maca/common_maca.h"
 #include "infini_train/include/core/runtime/runtime_common.h"
@@ -15,6 +19,47 @@ namespace infini_train::core::maca {
 namespace {
 constexpr int kMaxGpus = 8;
 
+// Read /proc/self/cmdline and return --tensor_parallel value, or 1 if absent /
+// unparseable. Must be callable from static init (before main runs), so we
+// cannot use gflags here.
+int ReadTensorParallelFromCmdline() {
+    std::ifstream in("/proc/self/cmdline", std::ios::binary); // arguments are split on '\0' below
+    if (!in) {
+        return 1; // file could not be opened: fall back to 1
+    }
+    std::vector<std::string> args; // reconstructed argument list, in file order
+    std::string cur;               // argument currently being accumulated
+    char c;
+    while (in.get(c)) {
+        if (c == '\0') {
+            if (!cur.empty()) { // empty arguments are dropped rather than stored as ""
+                args.push_back(std::move(cur));
+                cur.clear(); // reset the moved-from string before reusing it
+            }
+        } else {
+            cur.push_back(c);
+        }
+    }
+    if (!cur.empty()) { // flush a final argument that was not followed by '\0'
+        args.push_back(std::move(cur));
+    }
+    for (size_t i = 0; i < args.size(); ++i) { // NOTE(review): only "--" spellings match; gflags reportedly also accepts "-tensor_parallel" — confirm
+        const auto &a = args[i];
+        std::string value;
+        if (a.rfind("--tensor_parallel=", 0) == 0) { // "--tensor_parallel=N" form (prefix match at position 0)
+            value = a.substr(std::string("--tensor_parallel=").size());
+        } else if (a == "--tensor_parallel" && i + 1 < args.size()) { // "--tensor_parallel N" form
+            value = args[i + 1];
+        } else {
+            continue; // not the flag we are looking for
+        }
+        try {
+            return std::stoi(value); // first matching occurrence wins; stoi ignores trailing non-digits ("2x" -> 2)
+        } catch (...) { return 1; } // non-numeric or out-of-range value: fall back to 1
+    }
+    return 1; // flag absent from the command line
+}
+
 static std::array<std::unique_ptr<MacaStream>, kMaxGpus> maca_streams;
 static std::array<std::unique_ptr<MacaBlasHandle>, kMaxGpus> maca_blas_handles;
 
@@ -84,6 +129,20 @@ MacaGuardImpl::MacaGuardImpl() {
     // themselves before launch.
     setenv("MACA_LAUNCH_BLOCKING", "1", 0);
 
+    // When TP > 1 on MACA, disable both the MACA runtime P2P mapping and the
+    // MCCL-level P2P path to prevent multi-PG init deadlocks (threads
+    // concurrently creating both DP and TP comms hang in mcclCommInitAll).
+    // MACA_P2P_DISABLE alone is not sufficient for TP+SP / TP+SP+PP+VPP
+    // configurations — MCCL still establishes its own P2P buffers during init,
+    // so we must disable that too. Both must be set before mcInit(0); setenv
+    // from main() is too late because this ctor runs at static init. Peek at
+    // /proc/self/cmdline to keep single-card / DP-only / PP-only runs on the
+    // P2P fast path.
+    if (ReadTensorParallelFromCmdline() > 1) {
+        setenv("MACA_P2P_DISABLE", "1", 0);
+        setenv("MCCL_P2P_DISABLE", "1", 0);
+    }
+
     // The MACA runtime requires an explicit mcInit(0) before any other call.
     // CUDA has no equivalent; mirroring the DeviceManager ctor from 87390cd.
     MACA_CHECK(mcInit(0));