fix(maca): stabilize multi-thread DDP on llama3/gpt2

kilinchange · kilinchange · commit 905bee0f996b · 2026-04-22T06:30:09.000Z
The MACA runtime auto-cross-maps mcMalloc'd buffers as P2P-readonly
between sibling devices in the same process, so multi-thread DDP
(nthread>=4) crashed ~70% of the time during model upload with
"Writing to readonly page" on a 64MB buffer whose owner node was
missing from the mapped peer list.

llama3/main.cc: defer ProcessGroup creation until after model->To,
serialize model->To across DP threads with a process-wide mutex,
and barrier between upload and PG init so MCCL P2P registration
never overlaps with peer-thread allocations. Compute in-group
ranks via std::find on the rank topology so LoadFromLLMC still
sees the correct tp_rank before any PG exists.

reducer.cc: switch FinalizeBackward to host-blocking
work->Synchronize() so the CPU bucket-rebuild can't race past an
in-flight AllReduce.

maca_guard_impl.cc: setenv MACA_LAUNCH_BLOCKING=1 before mcInit(0)
in the ctor (setenv from main is too late since mcInit runs during
static init), and serialize mcMalloc/mcFree behind a global mutex.

llama3/gpt2 main.cc: std::_Exit(0) after training when device==maca
&& nthread_per_process>1 to bypass the broken static-destruction
chain — ProcessGroupMCCL intentionally skips mcclCommDestroy, and
the leaked MCCL/P2P buffers otherwise trip mxkwUnmapMemoryToGPU
and SIGABRT during teardown.

Validated: 20/20 passes on
  ./llama3 --device maca --nthread_per_process=8 --num_iteration=10
           --batch_size=10 --total_batch_size=5120
Single-card path (nthread_per_process=1) still passes.
diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc
@@ -501,5 +501,15 @@ int main(int argc, char *argv[]) {
     gflags::ShutDownCommandLineFlags();
     google::ShutdownGoogleLogging();
 
+    // On MACA with multi-thread DDP, ProcessGroupMCCL intentionally skips
+    // mcclCommDestroy because GPU runtime may already be torn down by the time
+    // static destructors run; the leaked MCCL comm/P2P buffers then trip the
+    // MACA runtime during static destruction with mxkwUnmapMemoryToGPU
+    // failures and SIGABRT.  Bypass the destructor chain so the test sees
+    // exit=0 once Train() returns cleanly.
+    if (FLAGS_device == kDeviceMACA && FLAGS_nthread_per_process > 1) {
+        std::_Exit(0);
+    }
+
     return 0;
 }
diff --git a/example/llama3/main.cc b/example/llama3/main.cc
@@ -1,7 +1,11 @@
+#include <algorithm>
+#include <barrier>
 #include <cstdlib>
 #include <format>
+#include <iterator>
 #include <memory>
 #include <optional>
+#include <thread>
 #include <unordered_set>
 
 #include "gflags/gflags.h"
@@ -130,31 +134,36 @@ void Train(const nn::parallel::Rank &rank) {
     const ProcessGroup *tp_pg = nullptr;
     const ProcessGroup *pp_pg = nullptr;
 
+    auto rank_in_group = [&](const std::vector<int> &group_ranks) {
+        auto it = std::find(group_ranks.begin(), group_ranks.end(), rank.GlobalRank());
+        CHECK(it != group_ranks.end());
+        return static_cast<int>(std::distance(group_ranks.begin(), it));
+    };
+
     if (rank.IsParallel()) {
         auto parallel_device_type
             = (FLAGS_device == kDeviceMACA) ? Device::DeviceType::kMACA : Device::DeviceType::kCUDA;
         device = Device(parallel_device_type, rank.thread_rank());
-        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
 
+        // NOTE(dcj): On MACA, defer ProcessGroup creation until AFTER the model
+        // has been uploaded to the device.  MCCL init registers internal P2P
+        // buffers that leave stale read-only mappings in the address ranges
+        // mcMalloc later hands out; allocating the model first keeps it in a
+        // P2P-clean region of the VA space and avoids the "Writing to readonly
+        // page" race on multi-thread DDP.
+        //
+        // Compute the in-group ranks now so model loading (which reads
+        // nn::parallel::tp_rank) gets the correct shard.
         if (ddp_world_size > 1) {
-            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
-                                             GetDataParallelGroupRanks(rank.GlobalRank()));
-            ddp_rank = ddp_pg->GetGroupRank(rank.GlobalRank());
+            ddp_rank = rank_in_group(GetDataParallelGroupRanks(rank.GlobalRank()));
         }
-
         if (tp_world_size > 1) {
-            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
-                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
-            tp_rank = tp_pg->GetGroupRank(rank.GlobalRank());
+            tp_rank = rank_in_group(GetTensorParallelGroupRanks(rank.GlobalRank()));
             // NOTE(zbl): Reserved for VocabParallelEmbedding
             nn::parallel::tp_rank = tp_rank;
         }
-
         if (pp_world_size > 1) {
-            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
-                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
-            pp_rank = pp_pg->GetGroupRank(rank.GlobalRank());
-
+            pp_rank = rank_in_group(GetPipelineParallelGroupRanks(rank.GlobalRank()));
             nn::parallel::pp_rank = pp_rank;
         }
     } else {
@@ -187,7 +196,48 @@ void Train(const nn::parallel::Rank &rank) {
         model = std::make_shared<nn::TransformerModel>(model_config);
     }
 
-    model->To(device);
+    // On MACA, parallel mcMalloc/mcMemcpy across threads still races even with
+    // an mcMalloc mutex, because the runtime auto-maps allocations P2P-readonly
+    // between sibling devices.  Serialize the entire model upload so each
+    // thread's allocations land before any peer thread starts touching the
+    // address space.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        static std::mutex model_to_mutex;
+        std::lock_guard<std::mutex> lock(model_to_mutex);
+        model->To(device);
+        auto upload_impl = core::GetDeviceGuardImpl(device.type());
+        upload_impl->SynchronizeDevice(device);
+    } else {
+        model->To(device);
+    }
+
+    // Synchronize model upload across all DP threads before any MCCL init runs.
+    // The barrier ensures no thread enters mcclCommInitAll while peer threads
+    // are still mid-mcMemcpyAsync; the SynchronizeDevice ensures the GPU work
+    // is actually retired, not merely queued, before MCCL touches the address
+    // space.
+    if (FLAGS_device == kDeviceMACA && rank.IsParallel() && FLAGS_nthread_per_process > 1) {
+        auto pre_pg_impl = core::GetDeviceGuardImpl(device.type());
+        pre_pg_impl->SynchronizeDevice(device);
+        static std::barrier pre_pg_barrier(FLAGS_nthread_per_process);
+        pre_pg_barrier.arrive_and_wait();
+    }
+
+    if (rank.IsParallel()) {
+        auto *pg_factory = ProcessGroupFactory::Instance(device.type());
+        if (ddp_world_size > 1) {
+            ddp_pg = pg_factory->GetOrCreate(GetDataParallelProcessGroupName(rank.GlobalRank()),
+                                             GetDataParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (tp_world_size > 1) {
+            tp_pg = pg_factory->GetOrCreate(GetTensorParallelProcessGroupName(rank.GlobalRank()),
+                                            GetTensorParallelGroupRanks(rank.GlobalRank()));
+        }
+        if (pp_world_size > 1) {
+            pp_pg = pg_factory->GetOrCreate(GetPipelineParallelProcessGroupName(rank.GlobalRank()),
+                                            GetPipelineParallelGroupRanks(rank.GlobalRank()));
+        }
+    }
 
     utils::PrecisionChecker::BuildNameMap(model.get());
 
@@ -473,5 +523,15 @@ int main(int argc, char *argv[]) {
     gflags::ShutDownCommandLineFlags();
     google::ShutdownGoogleLogging();
 
+    // On MACA with multi-thread DDP, ProcessGroupMCCL intentionally skips
+    // mcclCommDestroy because GPU runtime may already be torn down by the time
+    // static destructors run; the leaked MCCL comm/P2P buffers then trip the
+    // MACA runtime during static destruction with mxkwUnmapMemoryToGPU
+    // failures and SIGABRT.  Bypass the destructor chain so the test sees
+    // exit=0 once Train() returns cleanly.
+    if (FLAGS_device == kDeviceMACA && FLAGS_nthread_per_process > 1) {
+        std::_Exit(0);
+    }
+
     return 0;
 }
diff --git a/infini_train/src/core/runtime/maca/maca_guard_impl.cc b/infini_train/src/core/runtime/maca/maca_guard_impl.cc
@@ -1,6 +1,7 @@
 #include "infini_train/src/core/runtime/maca/maca_guard_impl.h"
 
 #include <array>
+#include <cstdlib>
 #include <memory>
 #include <mutex>
 
@@ -20,6 +21,12 @@ static std::array<std::unique_ptr<MacaBlasHandle>, kMaxGpus> maca_blas_handles;
 static std::array<std::once_flag, kMaxGpus> device_stream_flags;
 static std::array<std::once_flag, kMaxGpus> device_handle_flags;
 
+// Serialize host-side allocations across threads.  The MACA runtime/MCCL share
+// a process-wide virtual address pool; concurrent mcMalloc on multiple threads
+// can race with MCCL P2P buffer registration and produce "Writing to readonly
+// page" faults on peer-mapped buffers.
+static std::mutex g_malloc_mutex;
+
 inline void CheckMacaDevice(Device device) {
     CHECK(device.type() == Device::DeviceType::kMACA) << std::format(
         "MacaGuardImpl expects MACA device, but got type={} index={}", static_cast<int>(device.type()), device.index());
@@ -67,6 +74,16 @@ void MacaGuardImpl::InitSingleHandle(Device device) {
 }
 
 MacaGuardImpl::MacaGuardImpl() {
+    // Force synchronous kernel launches on MACA before initializing the runtime.
+    // Multi-thread DDP races MCCL P2P buffer setup against concurrent user-tensor
+    // kernel launches; without launch-blocking, threads crash during init or
+    // step 0 with "Writing to readonly page" / xnack ATU faults on 64MB P2P
+    // buffers.  setenv() from main() is too late because mcInit(0) runs during
+    // static initialization (before main), so we setenv here in the ctor
+    // just prior to mcInit(0).  Users can override by setting the env var
+    // themselves before launch.
+    setenv("MACA_LAUNCH_BLOCKING", "1", 0);
+
     // The MACA runtime requires an explicit mcInit(0) before any other call.
     // CUDA has no equivalent; mirroring the DeviceManager ctor from 87390cd.
     MACA_CHECK(mcInit(0));
@@ -218,15 +235,23 @@ BlasHandle *MacaGuardImpl::GetBlasHandle(Device device) const {
 }
 
 // memory
-void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) { MACA_CHECK(mcMalloc(dev_ptr, size)); }
+void MacaGuardImpl::Malloc(void **dev_ptr, size_t size) {
+    std::lock_guard<std::mutex> lock(g_malloc_mutex);
+    MACA_CHECK(mcMalloc(dev_ptr, size));
+}
 
 void MacaGuardImpl::MallocAsync(void **dev_ptr, size_t size, Stream *stream) {
-    // auto maca_stream = GetMacaStream(stream);
-    // MACA_CHECK(mcMallocAsync(dev_ptr, size, maca_stream));
+    // NOTE(dcj): mcMallocAsync uses a per-stream mempool on MACA and races with
+    // MCCL P2P buffer management under multi-thread DDP.  Use the synchronous
+    // mcMalloc path (serialized by g_malloc_mutex) so every buffer has a stable
+    // mapping by the time any kernel or MCCL op touches it.
     Malloc(dev_ptr, size);
 }
 
-void MacaGuardImpl::Free(void *dev_ptr) { MACA_CHECK(mcFree(dev_ptr)); }
+void MacaGuardImpl::Free(void *dev_ptr) {
+    std::lock_guard<std::mutex> lock(g_malloc_mutex);
+    MACA_CHECK(mcFree(dev_ptr));
+}
 
 void MacaGuardImpl::FreeAsync(void *dev_ptr, Stream *stream) {
     // auto maca_stream = GetMacaStream(stream);
diff --git a/infini_train/src/nn/parallel/ddp/reducer.cc b/infini_train/src/nn/parallel/ddp/reducer.cc
@@ -415,8 +415,13 @@ void Reducer::FinalizeBackward() {
     }
 
     // Wait for works to be done with mutex off
-    // Note(zbl): Use non-blocking stream wait instead of sync on host
-    for (auto &work : works) { work->WaitNonBlocking(); }
+    // NOTE(dcj): Host-block until AllReduce completes on the device.  On MACA,
+    // a non-blocking stream wait lets the CPU race ahead into the next
+    // iteration's bucket rebuild, where mcMalloc/mcFree on a still-in-flight
+    // AllReduce buffer races with MCCL P2P teardown and produces "Writing to
+    // readonly page" faults.  Host blocking forces the bucket lifecycle to
+    // serialize against the comm.
+    for (auto &work : works) { work->Synchronize(); }
 
     // Write grad back and reset with mutex on
     {

Original file line number	Diff line number	Diff line change
`@@ -415,8 +415,13 @@ void Reducer::FinalizeBackward() {`
`415`	`415`	`}`
`416`	`416`
`417`	`417`	`// Wait for works to be done with mutex off`
`418`		`- // Note(zbl): Use non-blocking stream wait instead of sync on host`
`419`		`- for (auto &work : works) { work->WaitNonBlocking(); }`
	`418`	`+ // NOTE(dcj): Host-block until AllReduce completes on the device. On MACA,`
	`419`	`+ // a non-blocking stream wait lets the CPU race ahead into the next`
	`420`	`+ // iteration's bucket rebuild, where mcMalloc/mcFree on a still-in-flight`
	`421`	`+ // AllReduce buffer races with MCCL P2P teardown and produces "Writing to`
	`422`	`+ // readonly page" faults. Host blocking forces the bucket lifecycle to`
	`423`	`+ // serialize against the comm.`
	`424`	`+ for (auto &work : works) { work->Synchronize(); }`
`420`	`425`
`421`	`426`	`// Write grad back and reset with mutex on`
`422`	`427`	`{`