Add allreduce tensor shape logging for profiling

github-actions[bot] · functionstackx · github-actions[bot] · commit 1d3f38dab34d · 2026-04-05T21:12:36.000Z
- Add `benchmarks/patches/inject_ar_shape_logging.py`: patches SGLang's parallel_state.py and custom_all_reduce.py inside the container to log tensor .shape, .dtype, and byte size on rank 0 for every allreduce call - Modify `dsr1_fp4_mi355x.sh` to run the injection when AR_SHAPE_LOGGING=1 - Add `ar-shape-logging` input to profile.yml workflow Closes #1005 Co-authored-by: functionstackx <functionstackx@users.noreply.github.com>
diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml
@@ -22,6 +22,11 @@ on:
         required: false
         type: boolean
         default: false
+      ar-shape-logging:
+        description: "Enable allreduce tensor shape logging (AR_SHAPE_LOGGING)"
+        required: false
+        type: boolean
+        default: false
       ref:
         description: "Ref (branch/sha) to checkout"
         required: false
@@ -117,6 +122,7 @@ jobs:
       DISAGG: ${{ matrix.config.disagg }}
       MOE_DEBUG: '0'
       MOE_DEBUG_LOG: ${{ (inputs.moe-debug) && '/workspace/moe_debug.tp0.log' || '' }}
+      AR_SHAPE_LOGGING: ${{ (inputs.ar-shape-logging) && '1' || '0' }}
     steps:
       - name: Resource cleanup
         run: |
diff --git a/benchmarks/patches/allreduce_shape_logger.py b/benchmarks/patches/allreduce_shape_logger.py
@@ -0,0 +1,132 @@
+"""
+Monkey-patch SGLang's GroupCoordinator.all_reduce to log tensor shapes
+entering the custom allreduce kernel (cross_device_reduce_2stage).
+
+Usage: Set PYTHONPATH to include the directory containing sitecustomize.py
+which imports this module, OR call patch() directly before launching SGLang.
+
+On rank 0, at most 200 per-call lines are printed to stdout. At process exit an
+aggregated summary of shapes and call counts is printed and written to
+/workspace/allreduce_shapes.log (override the path with AR_SHAPE_LOG).
+"""
+
+import atexit
+import collections
+import os
+
+# Call counts keyed by (shape tuple, dtype string, method label); filled in by
+# the wrappers below on rank 0 and read by _print_summary().
+_shape_counts = collections.Counter()
+# NOTE(review): not referenced anywhere else in the visible code — looks unused.
+_log_file = None
+# Original GroupCoordinator methods, captured by patch() before replacement so
+# the wrappers can delegate to them.
+_original_all_reduce = None
+_original_all_reduce_out_place = None
+# Set by patch() once the wrappers are installed; makes repeated calls no-ops.
+_patched = False
+# Limit per-call logging to avoid flooding stdout; summary is printed at exit.
+_MAX_LOG_LINES = 200
+# Number of per-call lines printed so far; shared by both wrappers.
+_log_line_count = 0
+
+
+def _get_rank():
+    """Return the torch.distributed rank, or 0 when it cannot be determined.
+
+    Falls back to 0 if torch.distributed cannot be imported, is not
+    initialized, or raises while being queried.
+    """
+    rank = 0
+    try:
+        import torch.distributed as torch_dist
+
+        if torch_dist.is_initialized():
+            rank = torch_dist.get_rank()
+    except Exception:
+        # Best effort: any failure is treated the same as "not distributed".
+        rank = 0
+    return rank
+
+
+def _patched_all_reduce_out_place(self, input_, outplace_all_reduce_method):
+    """Record and optionally print the input tensor, then call the original method.
+
+    On rank 0 every call is tallied in ``_shape_counts`` under the given
+    out-of-place method; a ``[AR_SHAPE]`` line is printed only while the shared
+    line budget (``_MAX_LOG_LINES``) is not exhausted. The result of the
+    original ``_all_reduce_out_place`` is returned unchanged.
+    """
+    global _log_line_count
+    if _get_rank() == 0:
+        _shape_counts[
+            (tuple(input_.shape), str(input_.dtype), outplace_all_reduce_method)
+        ] += 1
+        budget_left = _log_line_count < _MAX_LOG_LINES
+        if budget_left:
+            print(
+                f"[AR_SHAPE] method={outplace_all_reduce_method} "
+                f"shape={list(input_.shape)} dtype={input_.dtype} "
+                f"numel={input_.numel()} bytes={input_.numel() * input_.element_size()}",
+                flush=True,
+            )
+            _log_line_count += 1
+    return _original_all_reduce_out_place(self, input_, outplace_all_reduce_method)
+
+
+def _patched_all_reduce(self, input_):
+    """Record and optionally print the input tensor, then call the original all_reduce.
+
+    On rank 0 every call is tallied in ``_shape_counts`` under the method label
+    ``"all"``. Only the per-call ``[AR_SHAPE_ENTRY]`` print is subject to the
+    shared ``_MAX_LOG_LINES`` budget, so the exit summary keeps counting after
+    the per-call output has been silenced. The result of the original
+    ``all_reduce`` is returned unchanged.
+    """
+    global _log_line_count
+    if _get_rank() == 0:
+        # Count unconditionally: the cap limits stdout noise, not the statistics.
+        _shape_counts[(tuple(input_.shape), str(input_.dtype), "all")] += 1
+        if _log_line_count < _MAX_LOG_LINES:
+            print(
+                f"[AR_SHAPE_ENTRY] shape={list(input_.shape)} dtype={input_.dtype} "
+                f"numel={input_.numel()} bytes={input_.numel() * input_.element_size()}",
+                flush=True,
+            )
+            _log_line_count += 1
+    return _original_all_reduce(self, input_)
+
+
+def _print_summary():
+    """Print the aggregated allreduce shape table and save it to a file.
+
+    Does nothing unless this is rank 0 and at least one call was recorded.
+    The table is printed to stdout and written to the path given by the
+    ``AR_SHAPE_LOG`` environment variable (default
+    ``/workspace/allreduce_shapes.log``). A failed write is reported on stdout
+    rather than raised.
+    """
+    if _get_rank() != 0 or not _shape_counts:
+        return
+
+    import re  # function-scope: this module has no top-level ``re`` import
+
+    def _element_size(dtype_name):
+        """Return bytes per element parsed from a dtype string, or None if unknown."""
+        # The bit width trails the type family, e.g. "torch.bfloat16" -> 16,
+        # "torch.float8_e4m3fn" -> 8, "torch.uint8" -> 8, "torch.int64" -> 64.
+        width = re.search(r"(?:float|int|complex)(\d+)", dtype_name)
+        if width and int(width.group(1)) >= 8:
+            return int(width.group(1)) // 8
+        if dtype_name.endswith("bool"):
+            return 1
+        return None
+
+    log_path = os.environ.get("AR_SHAPE_LOG", "/workspace/allreduce_shapes.log")
+    lines = [
+        "\n" + "=" * 80,
+        "[AR_SHAPE_SUMMARY] AllReduce tensor shapes (rank 0):",
+        f"{'Count':>8}  {'Method':<12}  {'Shape':<30}  {'Dtype':<16}  {'Bytes':<12}",
+        "-" * 80,
+    ]
+
+    for (shape, dtype, method), count in _shape_counts.most_common():
+        numel = 1
+        for dim in shape:
+            numel *= dim
+        elem_size = _element_size(dtype)
+        # Show "?" instead of guessing a size for dtypes we cannot parse.
+        nbytes = numel * elem_size if elem_size is not None else "?"
+        lines.append(f"{count:>8}  {method:<12}  {str(list(shape)):<30}  {dtype:<16}  {nbytes:<12}")
+
+    lines.append("=" * 80)
+    summary = "\n".join(lines)
+    print(summary, flush=True)
+
+    try:
+        with open(log_path, "w") as f:
+            f.write(summary + "\n")
+    except OSError as e:
+        print(f"[AR_SHAPE] Failed to write log: {e}", flush=True)
+    else:
+        print(f"[AR_SHAPE] Summary written to {log_path}", flush=True)
+
+
+def patch():
+    """Install the logging wrappers on SGLang's GroupCoordinator (idempotent).
+
+    Saves the original ``all_reduce`` and ``_all_reduce_out_place`` methods,
+    replaces them with the logging wrappers from this module, and registers
+    the exit-time summary. If ``GroupCoordinator`` cannot be imported, a
+    message is printed and nothing is changed.
+    """
+    global _original_all_reduce, _original_all_reduce_out_place, _patched
+    if _patched:
+        return
+
+    try:
+        from sglang.srt.distributed.parallel_state import GroupCoordinator
+    except ImportError:
+        print("[AR_SHAPE] Could not import GroupCoordinator, skipping patch", flush=True)
+        return
+
+    # Keep the originals so the wrappers can delegate to them.
+    _original_all_reduce = GroupCoordinator.all_reduce
+    _original_all_reduce_out_place = GroupCoordinator._all_reduce_out_place
+
+    replacements = (
+        ("all_reduce", _patched_all_reduce),
+        ("_all_reduce_out_place", _patched_all_reduce_out_place),
+    )
+    for attr_name, wrapper in replacements:
+        setattr(GroupCoordinator, attr_name, wrapper)
+    _patched = True
+
+    atexit.register(_print_summary)
+    print("[AR_SHAPE] Monkey-patch installed: logging allreduce tensor shapes on rank 0", flush=True)
diff --git a/benchmarks/patches/inject_ar_shape_logging.py b/benchmarks/patches/inject_ar_shape_logging.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+"""
+Inject allreduce shape logging into SGLang's parallel_state.py at runtime.
+
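+Targets GroupCoordinator.all_reduce and GroupCoordinator._all_reduce_out_place
+in parallel_state.py, plus all_reduce_unreg in the sglang and aiter
+custom_all_reduce modules, injecting code that prints tensor shapes on rank 0.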
+This patches the actual source file inside the container so that all worker
+processes (forked by SGLang) pick up the change.
+
+Usage: python3 inject_ar_shape_logging.py
+"""
+import importlib
+import os
+import re
+import sys
+import textwrap
+
+
+def find_and_patch(module_path: str, target_method: str, log_tag: str) -> bool:
+    """Inject rank-0 shape logging at the top of a method in a module's source file.
+
+    The module is imported to discover its source file, which is then rewritten
+    in place with a guarded logging snippet inserted directly after the
+    method's ``def`` line.
+
+    Args:
+        module_path: Dotted name of the module whose source file is rewritten.
+        target_method: Name of a method defined at 4-space indentation whose
+            first parameter is ``self``.
+        log_tag: Label embedded in every ``[AR_SHAPE]`` line the snippet prints.
+
+    Returns:
+        True if the method carries the logging after the call (freshly patched
+        or already patched); False if the module, file, or method was not found.
+    """
+    try:
+        mod = importlib.import_module(module_path)
+        filepath = mod.__file__
+    except (ImportError, AttributeError) as e:
+        print(f"[AR_SHAPE] Could not import {module_path}: {e}")
+        return False
+
+    if not filepath or not os.path.exists(filepath):
+        print(f"[AR_SHAPE] File not found for {module_path}")
+        return False
+
+    with open(filepath, "r", encoding="utf-8") as f:
+        src = f.read()
+
+    # Look for the method definition
+    # Match: "def <method_name>(self, <args>):"
+    pattern = rf"(    def {re.escape(target_method)}\(self[^)]*\)[^:]*:.*\n)"
+    match = re.search(pattern, src)
+    if not match:
+        print(f"[AR_SHAPE] Could not find {target_method} in {filepath}")
+        return False
+
+    end_of_def = match.end()
+
+    # Check if THIS method is already patched: the injected marker must be the
+    # first thing after its def line. A file-wide search would wrongly skip a
+    # second method living in a file where another method was patched earlier.
+    if re.compile(r"\s*# \[AR_SHAPE_LOG\]").match(src, end_of_def):
+        print(f"[AR_SHAPE] Already patched {target_method} in {filepath}")
+        return True
+
+    # Find the first argument name after self (the tensor), looking only at the
+    # signature that was matched above rather than the whole file.
+    sig_match = re.search(r"\(self,\s*(\w+)", match.group(0))
+    tensor_name = sig_match.group(1) if sig_match else "input_"
+
+    # Build the logging code to insert after the method def line. The snippet
+    # swallows its own errors so that logging can never break the allreduce.
+    log_code = textwrap.dedent(f"""\
+        # [AR_SHAPE_LOG] Injected shape logging
+        try:
+            import torch.distributed as _dist
+            if not _dist.is_initialized() or _dist.get_rank() == 0:
+                _s = list({tensor_name}.shape)
+                _b = {tensor_name}.numel() * {tensor_name}.element_size()
+                print(f"[AR_SHAPE] {log_tag} shape={{_s}} dtype={{{tensor_name}.dtype}} bytes={{_b}}", flush=True)
+        except Exception:
+            pass
+    """)
+
+    # Indent to match method body (8 spaces)
+    indented_log = textwrap.indent(log_code, "        ")
+
+    # Insert after the method definition line
+    new_src = src[:end_of_def] + indented_log + src[end_of_def:]
+
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(new_src)
+    print(f"[AR_SHAPE] Patched {target_method} in {filepath}")
+    return True
+
+
+def patch_parallel_state():
+    """Inject logging into ``_all_reduce_out_place`` in sglang's parallel_state module.
+
+    Returns the boolean result of find_and_patch(); lines are tagged ``out_place``.
+    """
+    return find_and_patch(
+        module_path="sglang.srt.distributed.parallel_state",
+        target_method="_all_reduce_out_place",
+        log_tag="out_place",
+    )
+
+
+def patch_sglang_custom_ar():
+    """Inject logging into ``all_reduce_unreg`` in sglang's custom_all_reduce module.
+
+    Returns the boolean result of find_and_patch(); lines are tagged ``sglang_unreg``.
+    """
+    return find_and_patch(
+        module_path="sglang.srt.distributed.device_communicators.custom_all_reduce",
+        target_method="all_reduce_unreg",
+        log_tag="sglang_unreg",
+    )
+
+
+def patch_aiter_custom_ar():
+    """Inject logging into ``all_reduce_unreg`` in aiter's custom_all_reduce module.
+
+    Returns the boolean result of find_and_patch(); lines are tagged ``aiter_unreg``.
+    """
+    return find_and_patch(
+        module_path="aiter.dist.device_communicators.custom_all_reduce",
+        target_method="all_reduce_unreg",
+        log_tag="aiter_unreg",
+    )
+
+
+def patch_top_level_all_reduce():
+    """Inject logging into ``all_reduce`` in sglang's parallel_state module.
+
+    This is the top-level allreduce entry point; lines are tagged ``entry``.
+    Returns the boolean result of find_and_patch().
+    """
+    return find_and_patch(
+        module_path="sglang.srt.distributed.parallel_state",
+        target_method="all_reduce",
+        log_tag="entry",
+    )
+
+
+if __name__ == "__main__":
+    print("[AR_SHAPE] Starting allreduce shape logging injection...")
+
+    # Applied in order:
+    #   1. the top-level entry point (catches ALL allreduce calls),
+    #   2. the out-of-place path (catches custom AR method selection),
+    #   3. the low-level unreg call in sglang, then in aiter.
+    # Each step is best-effort; its boolean result is intentionally ignored.
+    for apply_patch in (
+        patch_top_level_all_reduce,
+        patch_parallel_state,
+        patch_sglang_custom_ar,
+        patch_aiter_custom_ar,
+    ):
+        apply_patch()
+
+    print("[AR_SHAPE] Done. Shape logs will appear as [AR_SHAPE] lines in server output.")
diff --git a/benchmarks/patches/sitecustomize.py b/benchmarks/patches/sitecustomize.py
@@ -0,0 +1,49 @@
+"""Auto-patch SGLang allreduce shape logging via import hook.
+
+When AR_SHAPE_LOGGING=1, installs a meta-path finder that waits for
+sglang.srt.distributed.parallel_state to be imported, then applies the
+monkey-patch to log tensor shapes entering the custom allreduce kernel.
+"""
+import importlib
+import os
+import sys
+
+
+if os.environ.get("AR_SHAPE_LOGGING") == "1":
+
+    class _AllReducePatchFinder:
+        """Meta-path finder that applies the allreduce patch once parallel_state has executed.
+
+        Implements the PEP 451 ``find_spec`` protocol. The legacy
+        ``find_module``/``load_module`` pair is no longer consulted by the
+        import system on Python 3.12+, so a finder exposing only those methods
+        would silently never fire there.
+        """
+        _target = "sglang.srt.distributed.parallel_state"
+        _done = False
+
+        def find_spec(self, fullname, path=None, target=None):
+            """Return the real spec for the target module, with its loader hooked.
+
+            Every other module name, and every request after the first hit, gets
+            ``None`` so the regular finders handle it. The finder is left on
+            ``sys.meta_path`` (inert once ``_done`` is set) because removing an
+            entry while the import system iterates that list would make it skip
+            the following finder.
+            """
+            if self._done or fullname != self._target:
+                return None
+            # One-shot: never intercept again, whatever happens below.
+            self._done = True
+
+            # Ask the remaining finders for the genuine spec.
+            spec = None
+            for finder in list(sys.meta_path):
+                delegate = getattr(finder, "find_spec", None)
+                if finder is self or delegate is None:
+                    continue
+                spec = delegate(fullname, path, target)
+                if spec is not None:
+                    break
+            if spec is None:
+                # Not found; let the normal machinery raise the ImportError.
+                return None
+
+            loader = spec.loader
+            real_exec = getattr(loader, "exec_module", None)
+            if real_exec is None:
+                print(f"[AR_SHAPE] Cannot hook loader for {fullname}, skipping patch", flush=True)
+                return spec
+
+            def exec_then_patch(module):
+                # Let the real import happen, then patch the now-populated module.
+                real_exec(module)
+                self._apply_patch()
+
+            try:
+                loader.exec_module = exec_then_patch
+            except AttributeError:
+                # Loader does not allow attribute assignment; import proceeds unpatched.
+                print(f"[AR_SHAPE] Cannot hook loader for {fullname}, skipping patch", flush=True)
+            return spec
+
+        @staticmethod
+        def _apply_patch():
+            """Import allreduce_shape_logger from this file's directory and call patch()."""
+            try:
+                _patch_dir = os.path.dirname(os.path.abspath(__file__))
+                if _patch_dir not in sys.path:
+                    sys.path.insert(0, _patch_dir)
+                import allreduce_shape_logger
+                allreduce_shape_logger.patch()
+            except Exception as e:
+                # Profiling aid only: never let a patch failure break the import.
+                print(f"[AR_SHAPE] Deferred patch failed: {e}", flush=True)
+
+    sys.meta_path.insert(0, _AllReducePatchFinder())
+    print("[AR_SHAPE] Import hook installed, will patch after parallel_state loads", flush=True)
diff --git a/benchmarks/single_node/dsr1_fp4_mi355x.sh b/benchmarks/single_node/dsr1_fp4_mi355x.sh
@@ -20,6 +20,13 @@ hf download "$MODEL"
 export SGLANG_USE_AITER=1
 export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
+# Optionally inject allreduce shape logging (set AR_SHAPE_LOGGING=1 to enable)
+if [[ "${AR_SHAPE_LOGGING:-}" == "1" ]]; then
+    echo "[AR_SHAPE] Injecting allreduce shape logger..."
+    python3 /workspace/benchmarks/patches/inject_ar_shape_logging.py
+    echo "[AR_SHAPE] Injection complete"
+fi
+
 PREFILL_SIZE=196608
 if [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
 	if [[ "$CONC" -gt "32" ]]; then