perf(fsdp2): reduce per-step host/comm overhead (#181)

kcz358 · web-flow · commit 2fe89ebde489 · 2026-05-28T09:43:59.000+08:00
* perf(fsdp2): overlap host-to-device copy with non_blocking=True

send_to_device defaulted to non_blocking=False, which made the H2D
transfer a synchronous step even when the dataloader produced pinned
tensors (the repo's default). With non_blocking=True the copy is
submitted to the CUDA stream and overlaps the first kernels of the
following training_step, eliminating the dedicated h2d wait.

Adds a one-time warning at trainer init when dataloader_pin_memory is
False, since the flag becomes a no-op on pageable memory.

* perf(fsdp2): collapse training metrics collectives to a single all_gather

calculate_training_metrics previously issued five independent all_reduce
calls (mfu/sum/avg/min/max) on tiny scalar tensors, each paying full
collective latency and an extra GPU-&gt;CPU sync via .item(). Replace them
with one all_gather_into_tensor over a 2-element per-rank tensor
[mfu_local, seq_len_sum_local], reduce locally (mean/sum/min/max), and
do a single batched .tolist() to pull all scalars at once.

Also drops the redundant torch.tensor(flops, device=...) wrapper since
the callee now accepts a Python float directly, removing one host-&gt;device
roundtrip per step.
diff --git a/src/lmms_engine/train/fsdp2/fsdp2_trainer.py b/src/lmms_engine/train/fsdp2/fsdp2_trainer.py
@@ -95,6 +95,16 @@ def __init__(
         # Optional EMA (fully opt-in)
         self.ema = EMAHelper(self.args)
 
+        # send_to_device uses non_blocking=True to overlap H2D with the next
+        # training step. This requires pinned memory; pageable memory falls
+        # back to a synchronous copy and the flag becomes a no-op.
+        if not self.args.dataloader_pin_memory:
+            logger.warning(
+                "send_to_device uses non_blocking=True but dataloader_pin_memory "
+                "is False; H2D copies will fall back to synchronous. Enable "
+                "dataloader_pin_memory for best throughput."
+            )
+
         # Optional Eval Server Backend (only on rank 0)
         self.eval_backend = None
         if dist.get_rank() == 0 and self.args.eval_config is not None and self.args.eval_strategy != "no":
@@ -380,7 +390,7 @@ def train(self, resume_from_checkpoint: bool = False):
                     break
                 # send batch to device
                 with self.cuda_event_profiler.record("host_to_device", self.global_step):
-                    batch = send_to_device(batch, self.fsdp2_model.device)
+                    batch = send_to_device(batch, self.fsdp2_model.device, non_blocking=True)
                 self.memory_snapshot_profiler.step(self.global_step)
                 start_time = time.perf_counter()
                 try:
@@ -408,14 +418,13 @@ def train(self, resume_from_checkpoint: bool = False):
                     )
                     self.compute_tracker.accumulate_flops(raw_flops)
                     device = self.fsdp2_model.device
-                    flops_tensor = torch.tensor(flops, device=device)
                     sp_size = pgm.process_group_manager.cp_world_size
                     tp_size = pgm.process_group_manager.tp_world_size
                     parallel_size = sp_size * tp_size
 
                     # Calculate training metrics (MFU, token stats, throughput)
                     perf_metrics, self.total_tokens = self.calculate_training_metrics(
-                        flops_tensor=flops_tensor,
+                        flops=flops,
                         parallel_size=parallel_size,
                         promised_flops=promised_flops,
                         device=device,
@@ -654,7 +663,7 @@ def print_batch_input(self, batch):
 
     @staticmethod
     def calculate_training_metrics(
-        flops_tensor: torch.Tensor,
+        flops: float,
         parallel_size: int,
         promised_flops: float,
         device: torch.device,
@@ -666,54 +675,61 @@ def calculate_training_metrics(
         """
         Calculate training performance metrics including MFU, token statistics, and throughput.
 
+        Uses a single ``all_gather`` over a 2-element per-rank tensor
+        ``[mfu_local, seq_len_sum_local]`` and reduces (mean/sum/min/max)
+        locally, replacing five independent ``all_reduce`` calls.
+
         Args:
-            flops_tensor: Tensor containing FLOPs count
-            parallel_size: Product of sequence and tensor parallel sizes
-            promised_flops: Promised FLOPs capacity
-            device: Device to perform computations on
-            seq_len: List of sequence lengths per batch
-            total_tokens: Current total token count
-            delta_time: Time taken for the training step
-            world_size: Distributed training world size
+            flops: Per-rank FLOPs count (Python float from ``estimate_flops``).
+            parallel_size: Product of sequence and tensor parallel sizes.
+            promised_flops: Promised FLOPs capacity.
+            device: Device to perform computations on.
+            seq_len: List of sequence lengths per batch (one entry per local sample).
+            total_tokens: Current total token count.
+            delta_time: Time taken for the training step.
+            world_size: Distributed training world size.
 
         Returns:
             tuple: (metrics_dict, updated_total_tokens)
         """
-        metrics = {}
-
-        # Calculate mfu per rank
-        # Divide by parallel size because seq_len/flops are estimated before SP/TP sharding.
-        mfu = flops_tensor.item() / parallel_size / promised_flops
-        mfu = torch.tensor(mfu, device=device)
-        torch.distributed.all_reduce(mfu, op=torch.distributed.ReduceOp.AVG)
-        mfu = mfu.item()
-
-        # Calculating token stats
-        seq_len = torch.tensor(seq_len, device=device, dtype=torch.float32)
-        # Divide by parallel size to avoid counting replicated SP/TP batches multiple times.
-        total_seq_len = seq_len.sum() / parallel_size
-        torch.distributed.all_reduce(total_seq_len, op=torch.distributed.ReduceOp.SUM)
-        # Avg seq len won't be effected by sp since we perform all reduce
-        # across world size
-        global_seq_len_avg = seq_len.sum()
-        torch.distributed.all_reduce(global_seq_len_avg, op=torch.distributed.ReduceOp.AVG)
-        metrics["perf/global_seq_len_avg"] = global_seq_len_avg.item()
-        global_seq_len_min = seq_len.sum()
-        torch.distributed.all_reduce(global_seq_len_min, op=torch.distributed.ReduceOp.MIN)
-        metrics["perf/global_seq_len_min"] = global_seq_len_min.item()
-        global_seq_len_max = seq_len.sum()
-        torch.distributed.all_reduce(global_seq_len_max, op=torch.distributed.ReduceOp.MAX)
-        metrics["perf/global_seq_len_max"] = global_seq_len_max.item()
-
-        metrics["train/mfu"] = round(mfu, 2)
-        total_tokens += total_seq_len.item()
-
-        tokens_per_second = total_seq_len.item() / delta_time
+        # Divide mfu by parallel size because seq_len/flops are estimated
+        # before SP/TP sharding. seq_len comes in as a plain Python list so we
+        # avoid an extra GPU sync and just sum it on the host.
+        mfu_local = flops / parallel_size / promised_flops
+        seq_len_sum_local = float(sum(seq_len))
+
+        local = torch.tensor([mfu_local, seq_len_sum_local], device=device, dtype=torch.float32)
+        gathered = torch.empty(world_size * 2, device=device, dtype=torch.float32)
+        torch.distributed.all_gather_into_tensor(gathered, local)
+        gathered = gathered.view(world_size, 2)
+
+        mfu_all = gathered[:, 0]
+        sl_all = gathered[:, 1]
+
+        # Reduce on-device, then do a single batched .tolist() sync to pull
+        # all five scalars at once.
+        reduced = torch.stack(
+            [
+                mfu_all.mean(),
+                sl_all.sum() / parallel_size,  # total_seq_len (deduped across SP/TP)
+                sl_all.mean(),  # global_seq_len_avg
+                sl_all.amin(),  # global_seq_len_min
+                sl_all.amax(),  # global_seq_len_max
+            ]
+        ).tolist()
+        mfu, total_seq_len, global_seq_len_avg, global_seq_len_min, global_seq_len_max = reduced
+
+        total_tokens += total_seq_len
+        tokens_per_second = total_seq_len / delta_time
         tokens_per_gpu = tokens_per_second / world_size
 
-        # Log total tokens and total tokens per second
-        metrics["train/total_tokens"] = TrainUtilities.format_tokens(total_tokens)
-        metrics["train/tokens_per_second"] = round(tokens_per_second)
-        metrics["train/tokens_per_gpu"] = round(tokens_per_gpu)
-
+        metrics = {
+            "train/mfu": round(mfu, 2),
+            "perf/global_seq_len_avg": global_seq_len_avg,
+            "perf/global_seq_len_min": global_seq_len_min,
+            "perf/global_seq_len_max": global_seq_len_max,
+            "train/total_tokens": TrainUtilities.format_tokens(total_tokens),
+            "train/tokens_per_second": round(tokens_per_second),
+            "train/tokens_per_gpu": round(tokens_per_gpu),
+        }
         return metrics, total_tokens