
Commit 29b8495

fix(controller): tear down workers in reverse rank order with concurrent dispatch
When ``TrainController.destroy()`` asks the scheduler to kill its workers, rank-0 is now signalled last instead of first. Rank-0 hosts the global TCPStore server that all other ranks' ``ProcessGroupNCCL::HeartbeatMonitor`` threads still poll during their final cleanup; killing it first leaves peers observing a closed socket, which surfaces as [W TCPStore.cpp] recvValue failed ... no error [W ProcessGroupNCCL.cpp] ... HeartbeatMonitor::runLoop() in stderr at the very end of a successful run. Reverse rank order alone is necessary but not sufficient: the original ``LocalScheduler._cleanup_workers`` iterated workers serially and blocked on ``kill_process_tree(..., timeout=3, graceful=True)`` for each one. A 4-rank job therefore spent ~12s in cleanup, with only one rank inside its ``engine.destroy()`` path at a time. The CPU ``dist.barrier`` added in the companion FSDP commit could never rendezvous -- every rank timed out and the NCCL/TCPStore teardown race still fired. The local scheduler now dispatches SIGTERM to all workers concurrently via daemon threads and joins them in parallel, so every rank enters ``engine.destroy()`` within the same small window while rank-0 still receives its signal last (by a few milliseconds). Changes ------- * ``Scheduler.delete_workers`` grows an optional ``reverse_order: bool`` keyword, documented in the abstract API. Existing callers stay source-compatible. * ``LocalScheduler._cleanup_workers`` is restructured into three phases: synchronous port release, concurrent SIGTERM dispatch (one daemon thread per worker, started in caller-provided order), and parallel join with a bounded timeout. Per-worker SIGTERM -> wait -> SIGKILL escalation is preserved via the existing ``kill_process_tree`` helper. * ``RayScheduler`` honours the flag by iterating workers in reverse before invoking ``actor.destroy.remote()``. No concurrency change is needed because Ray's ``.remote()`` is already async-dispatched. 
* ``SlurmScheduler`` accepts the keyword for API parity but ignores it, since ``scancel`` tears down the whole job step atomically. * ``TrainController.destroy()`` now: - passes ``reverse_order=True`` with a ``TypeError`` fallback so third-party schedulers keep working; - inspects the ``asyncio.gather(..., return_exceptions=True)`` result and logs per-rank engine-destroy failures as warnings instead of silently discarding them; - documents the new two-phase teardown invariant in its docstring. * Mock schedulers in ``tests/test_train_controller.py`` and ``tests/test_rollout_controller.py`` accept the new kwarg. Tests ----- * ``tests/test_train_controller.py`` asserts ``delete_workers`` is called with ``reverse_order=True`` and verifies the ``TypeError`` fallback path for legacy schedulers. * ``tests/test_local_scheduler.py`` verifies that ``reverse_order=True`` produces the expected reverse iteration over workers. * Verified end-to-end with the HH-RLHF DPO example under ``scheduler.type=local``: the previously reproducible ``TCPStore.recvValue failed`` / HeartbeatMonitor warnings no longer appear on clean shutdown.
1 parent e7230e2 commit 29b8495

8 files changed

Lines changed: 230 additions & 19 deletions


areal/api/scheduler_api.py

Lines changed: 10 additions & 1 deletion
@@ -106,13 +106,22 @@ def get_workers(self, role: str, timeout: int | None = None) -> list[Worker]:
         raise NotImplementedError()
 
     @abc.abstractmethod
-    def delete_workers(self, role: str | None = None):
+    def delete_workers(self, role: str | None = None, reverse_order: bool = False):
         """Stop and clean up worker processes.
 
         Parameters
         ----------
         role : str, optional
             Specific role to delete. If None, all workers are deleted
+        reverse_order : bool, optional
+            If True, terminate workers in reverse order of their IDs so that
+            rank-0 (which typically owns the global TCPStore server) is the
+            last one to be killed. This helps avoid a noisy
+            ``TCPStore.recvValue failed`` warning emitted by NCCL's
+            HeartbeatMonitor background thread on non-zero ranks during
+            teardown. Implementations that tear down all workers as a single
+            atomic operation (e.g. ``scancel`` for Slurm) may safely ignore
+            this argument. Defaults to False for backward compatibility.
 
         Raises
         ------

areal/infra/controller/train_controller.py

Lines changed: 40 additions & 6 deletions
@@ -404,6 +404,19 @@ def destroy(self):
         """Destroy the controller and release GPU memory of models.
 
         Cleans up all resources including workers, engines, and internal state.
+
+        The teardown order is carefully chosen to avoid a noisy
+        ``TCPStore.recvValue failed`` warning from NCCL's HeartbeatMonitor
+        on non-zero ranks:
+
+        1. Remote engines' ``destroy()`` runs first so that every rank calls
+           ``dist.destroy_process_group()`` after a CPU barrier. This
+           guarantees all ranks finish NCCL abort together before any store
+           shuts down.
+        2. Workers are killed in reverse rank order so that rank-0 (owner
+           of the global TCPStore server) receives SIGTERM last. This
+           avoids the short window where non-zero ranks' HeartbeatMonitor
+           threads poll a store whose TCP listener has already been closed.
         """
         logger.info("Destroying TrainController...")
 
@@ -421,17 +434,38 @@ async def _destroy_all_engines():
                     )
                     for rank, worker in enumerate(self.workers)
                 ]
-                await asyncio.gather(*tasks, return_exceptions=True)
-
-            run_async_task(_destroy_all_engines)
+                return await asyncio.gather(*tasks, return_exceptions=True)
+
+            results = run_async_task(_destroy_all_engines)
+            # Surface per-worker failures instead of silently swallowing them.
+            for rank, res in enumerate(results or []):
+                if isinstance(res, BaseException):
+                    logger.warning(
+                        f"Engine destroy on rank {rank} raised "
+                        f"{type(res).__name__}: {res}"
+                    )
             logger.info("Engines destroyed")
         except Exception as e:
             logger.error(f"Error destroying engines: {e}")
 
-        # Then delete workers via scheduler
+        # Then delete workers via scheduler. Pass reverse_order=True so
+        # that rank-0 (TCPStore owner) is killed last; keep a TypeError
+        # fallback for third-party Scheduler implementations that do not
+        # yet support the new keyword.
         try:
-            logger.info("Deleting all workers...")
-            self.scheduler.delete_workers(role=self._worker_role)
+            logger.info("Deleting all workers (reverse rank order)...")
+            try:
+                self.scheduler.delete_workers(
+                    role=self._worker_role, reverse_order=True
+                )
+            except TypeError:
+                # Backward-compat path for custom schedulers that have not
+                # been updated to accept `reverse_order`.
+                logger.warning(
+                    "Scheduler.delete_workers does not accept reverse_order; "
+                    "falling back to legacy behaviour."
+                )
+                self.scheduler.delete_workers(role=self._worker_role)
             logger.info("Workers deleted")
         except Exception as e:
             logger.error(f"Error deleting workers: {e}")
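The ``try/except TypeError`` probe used in ``destroy()`` generalises to any optional-keyword rollout across independently maintained implementations. A minimal sketch -- the two scheduler classes here are hypothetical stand-ins, not the repository's classes:

```python
class NewScheduler:
    """Hypothetical scheduler that already understands reverse_order."""

    def delete_workers(self, role=None, reverse_order=False):
        return ("new", role, reverse_order)

class LegacyScheduler:
    """Hypothetical scheduler predating the keyword."""

    def delete_workers(self, role=None):
        return ("legacy", role)

def delete_with_fallback(scheduler, role):
    try:
        return scheduler.delete_workers(role=role, reverse_order=True)
    except TypeError:
        # Legacy signature: retry without the new keyword.
        return scheduler.delete_workers(role=role)

print(delete_with_fallback(NewScheduler(), "train"))     # ('new', 'train', True)
print(delete_with_fallback(LegacyScheduler(), "train"))  # ('legacy', 'train')
```

One caveat with this probe: a ``TypeError`` raised *inside* a new-style scheduler's body would also trigger the fallback and invoke the method twice. Checking ``inspect.signature(scheduler.delete_workers).parameters`` up front is a stricter alternative when that risk matters.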

areal/infra/scheduler/local.py

Lines changed: 91 additions & 6 deletions
@@ -1082,23 +1082,27 @@ def _check_worker_health(self, role: str):
                 stderr,
             )
 
-    def delete_workers(self, role: str | None = None):
+    def delete_workers(self, role: str | None = None, reverse_order: bool = False):
         """Delete workers and clean up resources.
 
         Parameters
         ----------
         role : str, optional
             Specific worker role to delete, or None to delete all
+        reverse_order : bool, optional
+            If True, terminate workers in reverse rank order so that rank-0
+            (owner of the global TCPStore) is signalled last. See
+            ``Scheduler.delete_workers`` for background.
         """
         if role is None:
             # Delete colocated roles first (they don't own processes)
             colocated_roles = list(self._colocated_roles.keys())
             for r in colocated_roles:
-                self.delete_workers(r)
+                self.delete_workers(r, reverse_order=reverse_order)
             # Then delete actual worker roles
             roles = list(self._workers.keys())
             for r in roles:
-                self.delete_workers(r)
+                self.delete_workers(r, reverse_order=reverse_order)
             return
 
@@ -1107,6 +1111,8 @@ def delete_workers(self, role: str | None = None):
         if role in self._workers:
             logger.info(f"Removing forked role '{role}' (managed by parent worker)")
             workers = self._workers[role]
+            if reverse_order:
+                workers = list(reversed(workers))
             self._cleanup_workers(
                 workers
             )  # Release ports, but process=None skips kill
@@ -1124,29 +1130,108 @@ def delete_workers(self, role: str | None = None):
         workers = self._workers[role]
         logger.info(f"Deleting {len(workers)} workers for role '{role}'")
 
+        if reverse_order:
+            workers = list(reversed(workers))
         self._cleanup_workers(workers)
 
         del self._workers[role]
 
         logger.info(f"Successfully deleted workers for role '{role}'")
 
     def _cleanup_workers(self, workers: list[WorkerInfo]):
+        """Tear down a batch of workers with coordinated teardown semantics.
+
+        The previous implementation iterated ``workers`` serially and called
+        ``kill_process_tree(..., timeout=3, graceful=True)`` on each one.
+        Because that helper blocks for up to ``timeout`` seconds between
+        SIGTERM and the fallback SIGKILL, a 4-rank job could spend ~12 s
+        killing workers one-by-one. During that window only a single rank
+        was executing its ``engine.destroy()`` path, so the CPU barrier
+        added in ``FSDPEngine.destroy()`` could never actually synchronise
+        -- every rank timed out on its barrier and the NCCL teardown race
+        that produced ``TCPStore.recvValue failed`` / HeartbeatMonitor
+        warnings was not fixed.
+
+        The corrected behaviour is:
+
+        1. Release port allocations synchronously (cheap, no I/O).
+        2. Send SIGTERM to every worker in the order provided by the
+           caller, with no blocking waits in between. ``delete_workers``
+           passes the list in reverse rank order when
+           ``reverse_order=True``, which preserves the "rank-0 signalled
+           last" guarantee while keeping the dispatch window in the
+           millisecond range.
+        3. Wait for every worker to exit in parallel using one thread per
+           worker. Each thread re-uses ``kill_process_tree`` so the
+           existing SIGTERM -> wait -> SIGKILL escalation is preserved
+           per-worker; we just no longer serialise the waits.
+
+        With this change every rank enters ``engine.destroy()`` within the
+        same small window, the CPU ``dist.barrier`` inside can actually
+        rendezvous, and the NCCL / TCPStore teardown becomes race-free.
+        """
+        import threading
+
+        # Phase 1: always release ports, regardless of whether the worker
+        # owns a process (forked workers have ``process is None``).
+        live_workers: list[WorkerInfo] = []
         for worker_info in workers:
             try:
                 for port_str in worker_info.worker.worker_ports:
                     self._allocated_ports.discard(int(port_str))
+            except Exception as e:
+                logger.error(
+                    f"Error releasing ports for worker {worker_info.worker.id}: {e}",
+                    exc_info=True,
+                )
+            if worker_info.process is not None:
+                live_workers.append(worker_info)
+            else:
+                logger.debug(f"Cleaned up worker {worker_info.worker.id}")
 
-                # Only kill process if we own it (non-forked workers)
-                if worker_info.process is not None:
-                    kill_process_tree(worker_info.process.pid, timeout=3, graceful=True)
+        if not live_workers:
+            return
 
+        # Phase 2: dispatch SIGTERM to every worker concurrently via
+        # background threads so that all ranks reach their teardown
+        # barrier within the same window. The list order is preserved as
+        # thread start order: when the caller requests reverse_order,
+        # rank-0 is the last thread to be started, which keeps the
+        # "rank-0 dies last" property while staying non-blocking.
+        def _finalize(worker_info: WorkerInfo) -> None:
+            try:
+                kill_process_tree(worker_info.process.pid, timeout=3, graceful=True)
                 logger.debug(f"Cleaned up worker {worker_info.worker.id}")
             except Exception as e:
                 logger.error(
                     f"Error cleaning up worker {worker_info.worker.id}: {e}",
                     exc_info=True,
                 )
 
+        threads: list[threading.Thread] = []
+        for worker_info in live_workers:
+            t = threading.Thread(
+                target=_finalize,
+                args=(worker_info,),
+                name=f"cleanup-{worker_info.worker.id}",
+                daemon=True,
+            )
+            t.start()
+            threads.append(t)
+
+        # Phase 3: wait for every cleanup thread. Each ``kill_process_tree``
+        # call internally waits up to ``timeout=3`` seconds for graceful
+        # shutdown and then SIGKILLs stragglers, so a small safety margin
+        # on ``join`` is sufficient.
+        join_timeout = 10.0
+        for t in threads:
+            t.join(timeout=join_timeout)
+            if t.is_alive():
+                logger.warning(
+                    f"Cleanup thread {t.name} did not finish within "
+                    f"{join_timeout}s; leaving it as daemon."
+                )
+
     def _read_log_tail(self, log_file: str, lines: int = 50) -> str:
         try:
             with open(log_file) as f:
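Each cleanup thread above relies on the repository's ``kill_process_tree`` helper for the SIGTERM -> wait -> SIGKILL escalation. The per-process core of that escalation can be sketched with the standard library alone; this hypothetical ``kill_child_sketch`` handles a single ``Popen`` child and ignores the target's own children, which the real helper walks:

```python
import signal
import subprocess

def kill_child_sketch(proc: subprocess.Popen, timeout: float = 3.0) -> None:
    """Graceful-then-forceful shutdown of one child process (POSIX)."""
    proc.terminate()                # SIGTERM: ask nicely first
    try:
        proc.wait(timeout=timeout)  # bounded graceful-shutdown window
    except subprocess.TimeoutExpired:
        proc.kill()                 # SIGKILL any straggler
        proc.wait()                 # reap so no zombie is left behind

# Demo: a sleeping child exits on SIGTERM well within the window.
child = subprocess.Popen(["sleep", "30"])
kill_child_sketch(child)
print(child.returncode == -signal.SIGTERM)  # True on POSIX
```

Because ``wait(timeout=...)`` bounds only the graceful phase, running one such call per worker in its own thread -- as the diff above does -- caps total cleanup time near the single-worker timeout instead of multiplying it by the rank count.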

areal/infra/scheduler/ray.py

Lines changed: 11 additions & 3 deletions
@@ -492,24 +492,28 @@ def get_workers(self, role: str, timeout: float | None = None) -> list[Worker]:
 
         return [wi.worker for wi in worker_info_list]
 
-    def delete_workers(self, role: str | None = None):
+    def delete_workers(self, role: str | None = None, reverse_order: bool = False):
         """
         Delete workers and clean up resources
 
         Parameters
         --------
         role: str, optional
             Specific worker role to delete, or None to delete all
+        reverse_order: bool, optional
+            If True, iterate workers in reverse rank order when issuing
+            ``actor.destroy.remote()`` so that rank-0 is signalled last.
+            Note: Ray kills are asynchronous, so ordering here is best-effort.
         """
         if role is None:
             # Delete colocated roles first (they're just mappings)
             colocated_roles = list(self._colocated_roles.keys())
             for r in colocated_roles:
-                self.delete_workers(r)
+                self.delete_workers(r, reverse_order=reverse_order)
             # Then delete actual worker roles
             roles = list(self._workers.keys())
             for r in roles:
-                self.delete_workers(r)
+                self.delete_workers(r, reverse_order=reverse_order)
             return
 
         # Handle colocated role
@@ -521,6 +525,8 @@ def delete_workers(self, role: str | None = None):
             logger.info(
                 f"Cleaning up {len(workers)} forked actors for role '{role}'"
             )
+            if reverse_order:
+                workers = list(reversed(workers))
             self._cleanup_forked_workers(workers)
             del self._workers[role]
         else:
@@ -536,6 +542,8 @@ def delete_workers(self, role: str | None = None):
         workers = self._workers[role]
         logger.info(f"Deleting {len(workers)} workers for role '{role}'")
 
+        if reverse_order:
+            workers = list(reversed(workers))
         self._cleanup_workers(workers)
 
         del self._workers[role]

areal/infra/scheduler/slurm.py

Lines changed: 6 additions & 1 deletion
@@ -1221,14 +1221,19 @@ def get_workers(self, role: str, timeout: float | None = None) -> list[Worker]:
 
         raise WorkerTimeoutError(role, timeout)
 
-    def delete_workers(self, role: str | None = None):
+    def delete_workers(self, role: str | None = None, reverse_order: bool = False):
         """Delete workers and cancel Slurm jobs.
 
         Parameters
         ----------
         role : str, optional
             Role to delete. If None, deletes all roles.
+        reverse_order : bool, optional
+            Accepted for API compatibility with other schedulers but ignored
+            here: Slurm tears down the entire job step atomically via
+            ``scancel``, so per-rank ordering cannot be enforced.
         """
+        del reverse_order  # unused, see docstring
         if role is None:
             # Delete colocated/forked roles first (they don't own Slurm jobs)
             colocated_roles = list(self._colocated_roles.keys())

tests/test_local_scheduler.py

Lines changed: 33 additions & 0 deletions
@@ -1100,6 +1100,39 @@ def test_delete_workers_nonexistent_role(self, scheduler):
         # Should not raise
         scheduler.delete_workers("nonexistent")
 
+    def test_delete_workers_reverse_order(self, scheduler, tmp_path, monkeypatch):
+        """With reverse_order=True, workers are cleaned up in reverse rank order.
+
+        This protects rank-0 (owner of the global TCPStore server) from being
+        torn down before non-zero ranks finish their final NCCL abort.
+        """
+        workers = [
+            create_worker_info(
+                worker_id=f"role1/{i}",
+                role="role1",
+                ports=[str(8000 + i)],
+                log_file=str(tmp_path / f"role1-{i}.log"),
+            )
+            for i in range(4)
+        ]
+        scheduler._workers["role1"] = workers
+        scheduler._allocated_ports = {8000, 8001, 8002, 8003}
+
+        observed_order: list[str] = []
+
+        original_cleanup = scheduler._cleanup_workers
+
+        def spy(workers_arg):
+            observed_order.extend(w.worker.id for w in workers_arg)
+            original_cleanup(workers_arg)
+
+        monkeypatch.setattr(scheduler, "_cleanup_workers", spy)
+
+        scheduler.delete_workers("role1", reverse_order=True)
+
+        assert observed_order == ["role1/3", "role1/2", "role1/1", "role1/0"]
+        assert "role1" not in scheduler._workers
+
     def test_cleanup_workers_releases_ports(self, scheduler, tmp_path):
         """Should release allocated ports when cleaning up workers."""
         worker = create_worker_info(
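The spy technique in the test above -- replace a bound method with a wrapper that records its argument before delegating, exactly what ``monkeypatch.setattr`` does for the pytest fixture -- can be reproduced without pytest. ``TinyScheduler`` here is a hypothetical stand-in for ``LocalScheduler`` with strings in place of ``WorkerInfo`` objects:

```python
class TinyScheduler:
    """Hypothetical stand-in for the scheduler under test."""

    def __init__(self, workers):
        self._workers = {"role1": list(workers)}

    def _cleanup_workers(self, workers):
        pass  # real port release / process kill elided

    def delete_workers(self, role, reverse_order=False):
        workers = self._workers[role]
        if reverse_order:
            workers = list(reversed(workers))
        self._cleanup_workers(workers)
        del self._workers[role]

sched = TinyScheduler([f"role1/{i}" for i in range(4)])
observed = []
original = sched._cleanup_workers  # keep the bound method to delegate to

def spy(workers):
    observed.extend(workers)  # record cleanup order
    original(workers)         # then run the real cleanup

sched._cleanup_workers = spy  # instance attribute shadows the method
sched.delete_workers("role1", reverse_order=True)
print(observed)  # ['role1/3', 'role1/2', 'role1/1', 'role1/0']
```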

tests/test_rollout_controller.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ async def _async_call_engine_internal(self, worker_id, method, *args, **kwargs):
         await asyncio.sleep(0.001)
         return None
 
-    def delete_workers(self, role):
+    def delete_workers(self, role, reverse_order: bool = False):
         self.workers.clear()
         self._pending_results.clear()
         self._task_counter = 0
