fix(kubeflow): resolve rank-0 and last rank before forwarding logs

ko3n1g · ko3n1g · commit c23cecf7d5de · 2026-05-31T22:12:30.000Z
On first attach the GROUP_RANK pod map is empty until the torchrun workers
finish rendezvous, so _forward_to_stdout fell back to rank-0-only forwarding.
The last rank's early per-step loss/throughput lines (replayed via --tail=-1)
were therefore written to log-allranks but never forwarded to stdout: the CI
log silently dropped the beginning of the run. Forwarding of the last rank
only started on a re-attach ~120s later, by which point --since-time skips
the replayed history, so the early lines were never recovered.

On the first attach, poll until both rank 0 and the last rank resolve before
forwarding, capped at 600s (after which forwarding proceeds with the
completion-index fallback). The wait is gated on a non-empty pod list, so it
is a no-op when pods can't be listed (no kubectl / unit tests) and engages
only for real runs.

Signed-off-by: oliver könig <okoenig@nvidia.com>
diff --git a/nemo_run/core/execution/kubeflow.py b/nemo_run/core/execution/kubeflow.py
@@ -609,10 +609,40 @@ def _forward_to_stdout(log_line: str, pod_index: dict[str, int]) -> bool:
             # older than REORDER_HOLD_S — long enough to absorb cross-node clock
             # skew + flush jitter, short enough to keep the console near-live.
             reorder_hold_s = 2.0
+            # First attach: resolve BOTH rank 0 and the last rank before forwarding
+            # any line. GROUP_RANK is only readable once the torchrun workers have
+            # rendezvoused, so the map is empty at first and _forward_to_stdout would
+            # fall back to rank-0-only — the last rank's early per-step loss lines
+            # (replayed via --tail=-1) would land in log-allranks but never reach
+            # stdout, silently dropping the beginning of the run from the CI log.
+            # Poll until both are resolved, capped so a run that never exposes
+            # GROUP_RANK still streams (with the completion-index fallback).
+            rank_resolve_timeout_s = 600.0
+            rank_resolve_poll_s = 5.0
             since_time: Optional[str] = None
             while True:
                 pod_index = _pod_index_map()
                 _ensure_group_ranks(set(pod_index))
+                if since_time is None:
+                    # First attach: wait until BOTH rank 0 and the last rank are
+                    # resolved before forwarding, so the last rank's early per-step
+                    # lines (replayed via --tail=-1) reach stdout instead of only
+                    # log-allranks. Only wait while pods are actually listable; an
+                    # empty list (no kubectl / unit tests) skips the wait and streams
+                    # with the existing completion-index fallback.
+                    resolve_deadline = time.time() + rank_resolve_timeout_s
+                    while pod_index and not {0, last_group_rank} <= set(group_rank_map.values()):
+                        if time.time() >= resolve_deadline:
+                            logger.warning(
+                                "rank 0 / last rank (%d) not both resolved within %.0fs; "
+                                "forwarding with completion-index fallback",
+                                last_group_rank,
+                                rank_resolve_timeout_s,
+                            )
+                            break
+                        time.sleep(rank_resolve_poll_s)
+                        pod_index = _pod_index_map()
+                        _ensure_group_ranks(set(pod_index))
                 attempt_cmd = base_cmd + ["--timestamps", "-f"]
                 # First attach replays history (--tail=-1); reconnects resume from
                 # the last seen timestamp so re-attaching never re-emits old lines.