add comm example

yushangdi · yushangdi · commit b95d72fb3905 · 2026-06-05T21:13:35.000Z
diff --git a/advanced_source/cuda_graph_annotations_tutorial.py b/advanced_source/cuda_graph_annotations_tutorial.py
@@ -16,6 +16,9 @@
        * How to profile annotated graphs
        * How to post-process traces with semantic kernel lanes
        * How to visualize graph execution with custom stream assignments
+       * How to annotate communication collectives with the metadata
+         (collective type, message size, group, rank) that eager NCCL
+         traces expose but CUDA graphs drop
 
     .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
        :class-card: card-prerequisites
@@ -34,6 +37,14 @@
 labels to kernels within CUDA graphs. These annotations can be merged back into
 profiler traces to create custom visualization lanes, making it easier to
 understand and debug complex graph executions.
+
+Annotations are not limited to compute kernels. One of the most valuable uses
+is annotating **communication collectives**. In eager mode, the profiler
+attaches rich metadata to every NCCL kernel -- the collective type, message
+size, process group, and ranks -- so you can see exactly what each comm is
+doing. Under CUDA graphs that metadata is lost: the collective replays as an
+opaque kernel. This tutorial shows how to re-attach that metadata with
+annotations so graphed comms read just like eager ones.
 """
 
 ###############################################################################
@@ -95,15 +106,18 @@
 
 import copy
 import math
+import os
 import pickle
 import sys
 from collections import Counter
 from pathlib import Path
 
 import torch
+import torch.distributed as dist
 from torch.profiler import profile, ProfilerActivity
 from torch.cuda._graph_annotations import (
     get_kernel_annotations,
+    get_stream_for_pg,
     mark_kernels,
     _is_tools_id_unavailable,
 )
@@ -446,6 +460,180 @@ def main():
 # the semantic kernel lanes.
 # ============================================================
 
+###############################################################################
+# Annotating Communication Collectives
+# -------------------------------------
+#
+# In eager mode the profiler records an NCCL collective with a set of metadata
+# fields -- the collective type, input/output message sizes, the process group,
+# its size, and the participating ranks. When you select an ``all_reduce`` in
+# the trace viewer you see all of it, which is invaluable for spotting an
+# undersized bucket, a collective on the wrong group, or a rank imbalance.
+#
+# Under CUDA graphs that context disappears. The collective is captured once
+# and then replayed as an anonymous kernel node, so the profiler has nothing to
+# attach the NCCL metadata to. The kernels still show up in the trace, but they
+# are opaque: you cannot tell an all-reduce from an all-gather, let alone how
+# many bytes moved.
+#
+# Annotations close this gap. By wrapping the collective in ``mark_kernels``
+# with the same fields eager records, we re-attach that metadata to the graphed
+# kernel. After post-processing, a graphed collective reads just like an eager
+# one. The helper below builds the metadata dict; using the field names the
+# profiler uses in eager (``Collective name``, ``In msg nelems``,
+# ``Group size``, ...) keeps the annotated trace consistent with non-graphed
+# traces, so the same tooling and muscle memory apply.
+
+def annotate_collective(collective_name, tensor, group=None):
+    """Return a ``mark_kernels`` context that tags a collective's kernels.
+
+    Kernels launched inside are tagged with the collective type, dtype,
+    message size, group name/size, and rank, and use the lane id that
+    ``get_stream_for_pg`` returns for the group. ``Out msg nelems`` is set
+    equal to ``In msg nelems``, so it is exact only when both sizes match.
+    """
+    initialized = dist.is_available() and dist.is_initialized()
+    world_size = dist.get_world_size(group) if initialized else 1
+    rank = dist.get_rank(group) if initialized else 0
+    # ``getattr`` falls back to "default" for ``None`` too; no truthiness test.
+    pg_name = getattr(group, "group_name", "default")
+    metadata = {
+        "name": collective_name,
+        # Field names mirror what the profiler records for eager collectives.
+        "Collective name": collective_name,
+        "dtype": str(tensor.dtype).replace("torch.", ""),
+        "In msg nelems": tensor.numel(),
+        "Out msg nelems": tensor.numel(),
+        "Group size": world_size,
+        "Process Group Name": pg_name,
+        "rank": rank,
+        # Give every process group its own lane (a stable id >= 60).
+        "stream": get_stream_for_pg(pg_name),
+    }
+    return mark_kernels(metadata)
+
+###############################################################################
+# A Block That Mixes Compute and Communication
+# ----------------------------------------------
+#
+# A tensor- or data-parallel layer interleaves matmuls with collectives. Here
+# the projection output is all-reduced across the group, mirroring the comm in
+# a tensor-parallel linear. The collective is annotated with
+# ``annotate_collective`` and lands on its own lane.
+
+def build_comm_block(group=None):
+    """Return a closure: annotated matmul, then all-reduce on ``group``."""
+    device = "cuda"
+    torch.manual_seed(0)
+    dim = 1024
+    params = {
+        "x": torch.randn(4, 256, dim, device=device),
+        "W": torch.randn(dim, dim, device=device) / math.sqrt(dim),
+    }
+
+    def forward():
+        with mark_kernels({"name": "proj", "stream": 61}):
+            h = params["x"] @ params["W"]
+
+        # All-reduce the projection output on ``group`` (e.g. tensor
+        # parallel). The collective and its annotation must use the same
+        # group, or the re-attached metadata would describe the wrong comm.
+        if dist.is_available() and dist.is_initialized():
+            with annotate_collective("all_reduce", h, group):
+                dist.all_reduce(h, group=group)
+        return h
+
+    return forward
+
+###############################################################################
+# Running the Communication Demo
+# -------------------------------
+#
+# Collectives need a process group. In real training the group already exists
+# with ``world_size > 1``; to keep this tutorial runnable on a single GPU we
+# initialize a trivial one-rank NCCL group. The capture, profiling, and
+# post-processing steps are exactly the same helpers used for the compute demo
+# -- comm annotations need no special handling on the trace side.
+
+def maybe_init_single_rank_pg():
+    """Init a 1-rank NCCL group; return False without dist, NCCL, or CUDA."""
+    if not (dist.is_available() and dist.is_nccl_available()):
+        return False  # ``init_process_group("nccl")`` would raise otherwise
+    if torch.cuda.is_available() and not dist.is_initialized():
+        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
+        os.environ.setdefault("MASTER_PORT", "29500")
+        dist.init_process_group("nccl", rank=0, world_size=1)
+    return torch.cuda.is_available()
+
+def comm_annotation_demo():
+    """Run the comm demo: capture, profile, post-process, print comm args."""
+    # Fields to echo for each annotated collective, in display order.
+    shown_fields = (
+        "Collective name",
+        "dtype",
+        "In msg nelems",
+        "Group size",
+        "Process Group Name",
+        "rank",
+        "stream",
+    )
+    if not maybe_init_single_rank_pg():
+        print("Distributed/NCCL unavailable; skipping comm annotation demo.")
+        return
+
+    out_dir = Path("traces_comm")
+    print("\nBuilding compute + collective block...")
+    block_fn = build_comm_block()
+    print("Capturing CUDA graph with annotations...")
+    graph, _ = capture_graph_with_annotations(block_fn)
+
+    ann_path = save_annotations(out_dir)
+    raw_path = profile_graph(graph, out_dir)
+    result = post_process_trace(raw_path, ann_path, out_dir)
+    annotated_path, _, annotated_trace = result
+
+    # Events whose args carry a non-empty "Collective name" are treated as
+    # annotated collectives; echo the eager-style fields present on each.
+    print("\nAnnotated collective kernels (metadata restored):")
+    for event in annotated_trace["traceEvents"]:
+        event_args = event.get("args", {})
+        if not event_args.get("Collective name"):
+            continue
+        print(f"  {event.get('name', '?')[:40]}")
+        for field in shown_fields:
+            if field in event_args:
+                print(f"      {field}: {event_args[field]}")
+    print(f"\nAnnotated trace: {annotated_path}")
+
+# Example usage and output:
+# if __name__ == "__main__":
+#     comm_annotation_demo()
+#
+# Building compute + collective block...
+# Capturing CUDA graph with annotations...
+# Captured graph with 3 annotated nodes
+# Saved 3 annotations to traces_comm/kernel_annotations_rank0_fwd_bwd.pkl
+# Saved raw trace to traces_comm/trace_raw.json.gz
+# Annotated 3 kernels in the trace
+# Saved annotated trace to traces_comm/trace_annotated.json.gz
+#
+# Annotated collective kernels (metadata restored):
+#   ncclDevKernel_AllReduce_Sum_f32_RING_LL
+#       Collective name: all_reduce
+#       dtype: float32
+#       In msg nelems: 1048576
+#       Group size: 1
+#       Process Group Name: default
+#       rank: 0
+#       stream: 60
+#
+# Annotated trace: traces_comm/trace_annotated.json.gz
+#
+# In the trace viewer the all-reduce now sits on its own ``comm`` lane, and
+# selecting it shows the collective type, message size, group, and rank --
+# the same fields you would see in an eager trace, recovered for a graphed
+# collective.
+
 ###############################################################################
 # Visualizing Results
 # -------------------
@@ -528,6 +716,9 @@ def main():
 #
 # - Use ``mark_kernels()`` to label regions during graph capture
 # - Enable annotations with ``enable_annotations=True``
+# - Annotate communication collectives to recover the NCCL metadata
+#   (collective type, message size, group, rank) that CUDA graphs drop but
+#   eager traces expose
 # - Post-process traces with ``annotate_trace()`` and cleanup passes
 # - View results in chrome://tracing for intuitive visualization
 #