Commit e7230e2

fix(engine): synchronize on CPU group before destroying NCCL PG
Before calling ``dist.destroy_process_group()``, FSDPEngine and MegatronEngine now perform a barrier on the gloo CPU subgroup. This ensures every rank finishes its final NCCL abort together, instead of rank-0 tearing down the global TCPStore while the HeartbeatMonitor threads on non-zero ranks are still polling it.

Symptom this fixes: a noisy stderr backtrace at the very end of a successful training run, e.g.

    [W TCPStore.cpp] recvValue failed on SocketImpl ... no error
    [W ProcessGroupNCCL.cpp] ... HeartbeatMonitor::runLoop()

emitted from the NCCL HeartbeatMonitor C++ thread on non-zero ranks after rank-0's TCPStore server has already been shut down.

Also make ``FSDPEngine.destroy()`` idempotent by flipping ``own_global_group`` to False after tearing the group down, matching ``MegatronEngine.destroy()``. This protects against double-destroy from future cleanup hooks.

The barrier is wrapped in try/except so a half-dead process group at teardown never turns a warning into a hard failure.
1 parent ae8c792 commit e7230e2

2 files changed: 31 additions & 0 deletions


areal/engine/fsdp_engine.py

Lines changed: 18 additions & 0 deletions

@@ -419,7 +419,25 @@ def destroy(self):
         # handles still exist and we expect another engine to
         # clean up these groups.
         if dist.is_initialized() and self.own_global_group:
+            # Pre-destroy synchronization on a CPU (gloo) group so that all
+            # ranks leave the NCCL collective phase together. Without this
+            # barrier, rank-0 (which owns the TCPStore server) may exit
+            # before peers finish their final NCCL abort, causing
+            # HeartbeatMonitor background threads on other ranks to observe
+            # "recvValue failed" on the already-closed store. This is
+            # harmless but produces a noisy stderr backtrace at teardown.
+            if getattr(self, "_cpu_group", None) is not None:
+                try:
+                    dist.barrier(group=self._cpu_group)
+                except Exception as e:  # pragma: no cover - best-effort
+                    self.logger.warning(
+                        f"pre-destroy CPU barrier failed (ignored): {e}"
+                    )
             dist.destroy_process_group()
+            # Make destroy() idempotent: if the controller calls destroy
+            # more than once (e.g. via cleanup hooks), the second call
+            # must not try to destroy already-destroyed groups.
+            self.own_global_group = False

     @property
     def initialized(self) -> bool:

areal/engine/megatron_engine.py

Lines changed: 13 additions & 0 deletions

@@ -510,6 +510,19 @@ def destroy(self):
         # handles still exist and we expect another engine to
         # clean up these groups.
         if dist.is_initialized() and self.own_global_group:
+            # Pre-destroy synchronization on a CPU (gloo) group so that all
+            # ranks leave the NCCL collective phase together. Without this
+            # barrier, rank-0 (which owns the TCPStore server) may exit
+            # before peers finish their final NCCL abort, causing
+            # HeartbeatMonitor background threads on other ranks to observe
+            # "recvValue failed" on the already-closed store.
+            if getattr(self, "_cpu_group", None) is not None:
+                try:
+                    dist.barrier(group=self._cpu_group)
+                except Exception as e:  # pragma: no cover - best-effort
+                    self.logger.warning(
+                        f"pre-destroy CPU barrier failed (ignored): {e}"
+                    )
             mpu.destroy_model_parallel()
             dist.destroy_process_group()
             self.own_global_group = False
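The teardown pattern shared by both engines (best-effort CPU barrier, destroy, then flip the ownership flag so a second call is a no-op) can be sketched standalone. This is a minimal illustration, not the project's real classes: ``FakeDist`` stands in for ``torch.distributed`` so the sketch runs without a cluster, and ``Engine`` condenses the two ``destroy()`` methods above.

```python
class FakeDist:
    """Stand-in for torch.distributed, just enough to show the pattern."""

    def __init__(self):
        self.destroy_calls = 0
        self.initialized = True

    def is_initialized(self):
        return self.initialized

    def barrier(self, group=None):
        pass  # real code blocks here until every rank arrives

    def destroy_process_group(self):
        self.destroy_calls += 1
        self.initialized = False


class Engine:
    def __init__(self, dist):
        self.dist = dist
        self.own_global_group = True
        self._cpu_group = object()  # stands in for the gloo subgroup

    def destroy(self):
        if self.dist.is_initialized() and self.own_global_group:
            if self._cpu_group is not None:
                try:
                    # Sync on the CPU group so no rank tears down the
                    # store while peers are still finishing NCCL work.
                    self.dist.barrier(group=self._cpu_group)
                except Exception:
                    pass  # best-effort: never turn a warning into a failure
            self.dist.destroy_process_group()
            # Flip the flag so a second destroy() is a no-op.
            self.own_global_group = False


dist = FakeDist()
engine = Engine(dist)
engine.destroy()
engine.destroy()  # idempotent: second call does nothing
print(dist.destroy_calls)  # 1
```

The key design point is ordering: the barrier runs while the group is still alive, and the ownership flag flips only after the group is actually gone, so cleanup hooks can call ``destroy()`` repeatedly without hitting an already-destroyed group.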
