Add PCIe transfer benchmark (H2D/D2H) to Ironwood

linamy85 · linamy85 · commit 252715f9d9b2 · 2026-01-16T14:51:47.000Z
- Implement benchmark_pcie_transfer.py to measure H2D and D2H transfer performance using JAX, supporting various transfer modes (Standard, Parallel, Threaded, Chunked).
- Integrate the new benchmark into run_benchmark.py.
- Add configuration files for single device, single chip, and single VM topologies in configs/pcie_transfer/.
- Add scripts/run_pcie_transfer_benchmark.sh for bulk execution with numactl interleaving option.
diff --git a/Ironwood/configs/host_device/host_device_single_chip.yaml b/Ironwood/configs/host_device/host_device_single_chip.yaml
@@ -0,0 +1,9 @@
+# Sweep definition for the host_device benchmark on a single chip.
+benchmarks:
+- benchmark_name: host_device
+  # NOTE(review): presumably forwarded as `num_runs` (timed iterations per
+  # sweep point) to benchmark_host_device() -- confirm in run_benchmark.py.
+  num_runs: 20
+  benchmark_sweep_params:
+  # Single Chip (1 Chip, 2 Devices)
+  # mesh_shape is an "x"-separated list of mesh dimensions. Sizes are in MiB:
+  # the benchmark allocates data_size_mb * 2**20 bytes of float32 host data.
+  # NOTE(review): data_size_mb_list is presumably expanded into one run per
+  # entry by run_benchmark.py -- confirm.
+  - {mesh_shape: "1x2", data_size_mb_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]}
+
+  # Relative output locations for the CSV results and profiler traces.
+  csv_path: "../microbenchmarks/host_device/single_chip"
+  trace_dir: "../microbenchmarks/host_device/single_chip/trace"
diff --git a/Ironwood/guides/host_device/host_device.md b/Ironwood/guides/host_device/host_device.md
@@ -0,0 +1,30 @@
+# Host Device Microbenchmarks on tpu7x-2x2x1
+
+This guide provides instructions for running Host Device (Host-to-Device and Device-to-Host) microbenchmarks on tpu7x-2x2x1 Google Kubernetes Engine (GKE) clusters. It covers creating a node pool, running the benchmarks, and viewing the output.
+
+> [!NOTE]
+> This benchmark is currently a Work In Progress (WIP). Expected bandwidth numbers are not yet finalized.
+
+## Create Node Pools
+
+Follow the [Setup section](../../Ironwood_Microbenchmarks_readme.md#setup) to create a GKE cluster with one 2x2x1 node pool.
+
+## Run Host Device Microbenchmarks
+
+To run the microbenchmarks, apply the following Kubernetes configuration:
+```bash
+kubectl apply -f tpu7x-host-device-benchmark.yaml
+```
+
+To view the benchmark logs, use `kubectl logs`:
+```bash
+kubectl logs tpu7x-host-device-benchmark
+```
+
+Once the benchmark completes, you should see logs reporting bandwidth statistics.
+
+To retrieve the complete results, including the trace and CSV output files, you must keep the pod running after the benchmark completes. To do this, add a `sleep` command (for example, `sleep infinity`) after the benchmark script invocation in the `tpu7x-host-device-benchmark.yaml` file and re-apply the configuration. You can then use `kubectl cp` to copy the output from the pod.
+
+```bash
+kubectl cp tpu7x-host-device-benchmark:/microbenchmarks/host_device host_device
+```
diff --git a/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml b/Ironwood/guides/host_device/tpu7x-host-device-benchmark.yaml
@@ -0,0 +1,33 @@
+# Single pod that clones the microbenchmark repository and runs the
+# host-device (H2D/D2H) transfer benchmark on a tpu7x 2x2x1 node pool.
+apiVersion: v1
+kind: Pod
+metadata:
+  # Must match the pod name used by the `kubectl logs` and `kubectl cp`
+  # commands in host_device.md.
+  name: tpu7x-host-device-benchmark
+spec:
+  restartPolicy: Never
+  nodeSelector:
+    cloud.google.com/gke-tpu-accelerator: tpu7x
+    cloud.google.com/gke-tpu-topology: 2x2x1
+  containers:
+  - name: tpu-job
+    image: python:3.12
+    ports:
+    - containerPort: 8431
+    securityContext:
+      privileged: false
+    command:
+    - bash
+    - -c
+    - |
+      set -ex
+
+      git clone https://github.com/AI-Hypercomputer/accelerator-microbenchmarks.git
+      cd accelerator-microbenchmarks
+      pip install -r requirements.txt
+
+      bash ./Ironwood/scripts/run_host_device_benchmark.sh
+
+      # To copy results out with `kubectl cp`, keep the pod alive by adding
+      # e.g. `sleep infinity` here.
+
+    resources:
+      # One TPU resource per chip of the 2x2x1 topology selected above.
+      requests:
+        google.com/tpu: 4
+      limits:
+        google.com/tpu: 4
diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Runs the host-device transfer benchmark for one config file, or for every
+# *.yaml file in CONFIG_DIR when --config is not given. Exits non-zero if any
+# config fails.
+
+# Default values
+CONFIG_DIR="Ironwood/configs/host_device"
+SPECIFIC_CONFIG=""
+INTERLEAVED=false
+
+# Print usage and exit. The optional first argument is the exit status; it
+# defaults to 1 so that misuse is reported as a failure.
+usage() {
+    echo "Usage: $0 [OPTIONS]"
+    echo "Options:"
+    echo "  --config <path>       Path to specific config file (optional)"
+    echo "  --interleaved         Run with numactl --interleave=all"
+    echo "  --help                Show this help message"
+    exit "${1:-1}"
+}
+
+# Parse arguments
+while [[ "$#" -gt 0 ]]; do
+    case "$1" in
+        --config)
+            # Without this check a trailing "--config" would leave
+            # SPECIFIC_CONFIG empty and silently run every config.
+            if [[ "$#" -lt 2 || -z "$2" ]]; then
+                echo "Error: --config requires a path argument"
+                usage
+            fi
+            SPECIFIC_CONFIG="$2"
+            shift
+            ;;
+        --interleaved) INTERLEAVED=true ;;
+        --help) usage 0 ;;
+        *) echo "Unknown parameter passed: $1"; usage ;;
+    esac
+    shift
+done
+
+echo "--- Starting Host-Device Transfer Benchmark (H2D/D2H) ---"
+echo "Note: This benchmark is work in progress"
+echo "Interleaved: $INTERLEAVED"
+
+if [ -n "$SPECIFIC_CONFIG" ]; then
+    CONFIGS=("$SPECIFIC_CONFIG")
+else
+    # nullglob makes the array empty (instead of holding the literal pattern)
+    # when no file matches, so the check below can detect it.
+    shopt -s nullglob
+    CONFIGS=("$CONFIG_DIR"/*.yaml)
+    shopt -u nullglob
+fi
+
+if [ ${#CONFIGS[@]} -eq 0 ]; then
+    echo "No configuration files found!"
+    exit 1
+fi
+
+FAILED=0
+for CONFIG_FILE in "${CONFIGS[@]}"; do
+    echo "--- Running Config: $CONFIG_FILE ---"
+    # An array keeps every argument intact, even if the path contains spaces.
+    CMD=(python Ironwood/src/run_benchmark.py "--config=${CONFIG_FILE}")
+
+    if [ "$INTERLEAVED" = true ]; then
+        if command -v numactl &> /dev/null; then
+            echo "Running with numactl --interleave=all"
+            CMD=(numactl --interleave=all "${CMD[@]}")
+        else
+            echo "Warning: numactl not found. Running without interleaving."
+        fi
+    fi
+
+    # Keep going after a failure so the remaining configs still run, but
+    # remember it so the script's exit status reflects the failure.
+    if "${CMD[@]}"; then
+        echo "--- Finished Config: $CONFIG_FILE ---"
+    else
+        STATUS=$?
+        echo "--- Config FAILED (exit code $STATUS): $CONFIG_FILE ---"
+        FAILED=$((FAILED + 1))
+    fi
+    echo ""
+done
+
+if [ "$FAILED" -gt 0 ]; then
+    echo "--- $FAILED of ${#CONFIGS[@]} benchmark config(s) failed ---"
+    exit 1
+fi
+
+echo "--- All Benchmarks Finished ---"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
@@ -0,0 +1,222 @@
+"""Benchmarks Host-to-Device and Device-to-Host transfer performance."""
+
+import concurrent.futures
+import gc
+import time
+import os
+from typing import Any, Dict, Tuple, List
+
+import jax
+from jax import sharding
+import numpy as np
+from benchmark_utils import MetricsStatistics
+
+# Both TPU premapped-buffer variables are set to 64 GiB (64 * 2**30 bytes) at
+# import time; any value already present in the environment is overwritten.
+os.environ.update(
+    {
+        "TPU_PREMAPPED_BUFFER_SIZE": "68719476736",
+        "TPU_PREMAPPED_BUFFER_TRANSFER_THRESHOLD_BYTES": "68719476736",
+    }
+)
+
+def get_tpu_devices(num_devices: int):
+    """Return the first ``num_devices`` entries of ``jax.devices()``.
+
+    Args:
+        num_devices: Number of devices the benchmark needs.
+
+    Returns:
+        The leading ``num_devices`` devices reported by JAX.
+
+    Raises:
+        RuntimeError: If JAX reports fewer than ``num_devices`` devices.
+    """
+    available = jax.devices()
+    found = len(available)
+    if num_devices > found:
+        raise RuntimeError(f"Require {num_devices} devices, found {found}")
+    return available[:num_devices]
+
+def _run_chunked(host_data, data_sharding, host_shards, target_devices, num_devices, chunks_per_device):
+    """Time one chunked host-to-device pass and one chunked device-to-host pass.
+
+    Args:
+        host_data: Full host array; placed on device with ``data_sharding`` to
+            stage the D2H measurement.
+        data_sharding: Sharding passed to ``jax.device_put`` for ``host_data``.
+        host_shards: Per-device pieces of the host data, paired positionally
+            with ``target_devices``.
+        target_devices: Destination device for each entry of ``host_shards``.
+        num_devices: Number of devices; used here only to size the thread pools.
+        chunks_per_device: Number of pieces each shard is split into.
+
+    Returns:
+        ``(h2d_ms, d2h_ms)``: wall-clock duration of each pass in milliseconds.
+    """
+    # Smart Chunked H2D
+    # The timed window covers pool creation, chunk splitting, all device_put
+    # calls, waiting for every buffer, and pool shutdown on leaving the `with`.
+    chk_h2d_start = time.perf_counter()
+    total_workers = num_devices * chunks_per_device
+    with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
+        chunked_futures = []
+        for shard, dev in zip(host_shards, target_devices):
+            # Split this device's shard along axis 0 and submit one transfer
+            # task per chunk.
+            sub_chunks = np.array_split(shard, chunks_per_device, axis=0)
+            for chunk in sub_chunks:
+                chunked_futures.append(
+                    executor.submit(jax.device_put, chunk, dev)
+                )
+        chunked_buffers = [f.result() for f in chunked_futures]
+        # Wait for every buffer before stopping the clock.
+        for db in chunked_buffers:
+            db.block_until_ready()
+    chk_h2d_end = time.perf_counter()
+    h2d_ms = (chk_h2d_end - chk_h2d_start) * 1000
+    # Buffer deletion happens after the clock stops, so it is not timed.
+    for db in chunked_buffers:
+        db.delete()
+
+    # Smart Chunked D2H
+    # Staging the full array on device is setup for the D2H pass and is not
+    # part of either timed window.
+    data_on_device = jax.device_put(host_data, data_sharding)
+    data_on_device.block_until_ready()
+    
+    chk_d2h_start = time.perf_counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
+        d2h_futures = []
+        for shard in data_on_device.addressable_shards:
+            # Direct slicing on device array to avoid copy
+            # NOTE(review): the slice expression below is evaluated in this
+            # submitting thread, inside the timed window, before the task is
+            # queued. Whether slicing a device array allocates a new device
+            # buffer is not visible here -- confirm the "avoid copy" claim.
+            shard_len = shard.data.shape[0]
+            # Ceiling division: every chunk except possibly the last has
+            # chunk_size rows.
+            chunk_size = (shard_len + chunks_per_device - 1) // chunks_per_device
+            for i in range(chunks_per_device):
+                start = i * chunk_size
+                end = min((i + 1) * chunk_size, shard_len)
+                # Skip empty trailing chunks (possible when shard_len is
+                # smaller than chunks_per_device * chunk_size).
+                if start < end:
+                    d2h_futures.append(
+                        executor.submit(jax.device_get, shard.data[start:end])
+                    )
+        # Results are only awaited, not kept.
+        _ = [f.result() for f in d2h_futures]
+    chk_d2h_end = time.perf_counter()
+    d2h_ms = (chk_d2h_end - chk_d2h_start) * 1000
+    data_on_device.delete()
+        
+    return h2d_ms, d2h_ms
+
+
+def _run_warmup(host_data, data_sharding, data_size_mb):
+    """Run untimed H2D/D2H round trips before the measured iterations.
+
+    The number of round trips depends on the payload size: 50 up to 128 MB,
+    3 from 8192 MB upwards, and 10 in between.
+
+    Args:
+        host_data: Host array to transfer.
+        data_sharding: Sharding passed to ``jax.device_put``.
+        data_size_mb: Payload size used to pick the repetition count.
+    """
+    repetitions = (
+        50 if data_size_mb <= 128 else (3 if data_size_mb >= 8192 else 10)
+    )
+
+    for _ in range(repetitions):
+        on_device = jax.device_put(host_data, data_sharding)
+        on_device.block_until_ready()
+        jax.device_get(on_device)
+        on_device.delete()
+
+    # Collect garbage left over from the warm-up before returning.
+    gc.collect()
+
+def _get_chunks_per_device(data_size_mb, num_devices):
+    """Choose how many chunks each device's share of the data is split into.
+
+    Targets chunks of about 16 MB and caps the total thread count
+    (``num_devices * chunks``) at 256.
+
+    Args:
+        data_size_mb: Total payload size.
+        num_devices: Number of devices the payload is spread over.
+
+    Returns:
+        Chunks per device; always at least 1.
+    """
+    chunk_target_mb = 16
+    thread_cap = 256
+
+    per_device_mb = data_size_mb / num_devices
+    chunks = (
+        1
+        if per_device_mb < chunk_target_mb
+        else int(per_device_mb / chunk_target_mb)
+    )
+
+    # Over the cap: share the thread budget evenly between devices instead.
+    if num_devices * chunks > thread_cap:
+        return max(1, int(thread_cap / num_devices))
+    return chunks
+
+
+def benchmark_host_device(
+    mesh_shape: str,
+    data_size_mb: int,
+    num_runs: int = 100,
+    trace_dir: str = None,
+) -> Dict[str, Any]:
+    """Benchmarks H2D/D2H transfer using smart chunking.
+
+    Args:
+        mesh_shape: Device mesh as an "x"-separated string, e.g. "1x2" or
+            "2x2x1". Any number of dimensions is accepted.
+        data_size_mb: Total payload size in MiB. The resulting row count must
+            divide evenly across the devices of the mesh.
+        num_runs: Number of timed iterations.
+        trace_dir: If set, a JAX profiler trace is written to this directory
+            and every iteration is wrapped in a step annotation.
+
+    Returns:
+        A dict with per-run H2D and D2H durations in milliseconds under
+        "H2D_Bandwidth" and "D2H_Bandwidth" (they are turned into bandwidths
+        by benchmark_host_device_calculate_metrics), plus "Chunk_Count" and
+        "Thread_Count".
+
+    Raises:
+        RuntimeError: If fewer devices are available than the mesh needs.
+        ValueError: If the mesh is empty or the payload cannot be split evenly
+            across the devices.
+    """
+    # Imported once up front so both the trace and the no-trace paths can use
+    # it regardless of which branch ran first.
+    import contextlib
+
+    mesh_dims = tuple(int(d) for d in mesh_shape.split("x"))
+    num_devices = int(np.prod(mesh_dims))
+    if num_devices < 1:
+        raise ValueError(f"mesh_shape {mesh_shape!r} describes no devices")
+    tpu_devices = get_tpu_devices(num_devices)
+
+    # Each row is one (8, 128) float32 tile; choose the row count so the whole
+    # array is data_size_mb MiB.
+    row_bytes = 8 * 128 * np.dtype(np.float32).itemsize
+    rows = data_size_mb * 1024 * 1024 // row_bytes
+    if rows < 1 or rows % num_devices != 0:
+        raise ValueError(
+            f"data_size_mb={data_size_mb} gives {rows} rows, which cannot be"
+            f" split evenly across {num_devices} devices"
+        )
+
+    host_data = np.ones((rows, 8, 128), dtype=np.float32)
+
+    print(
+        f"Benchmarking Transfer with Data Size: {data_size_mb} MB on"
+        f" {num_devices} devices for {num_runs} iterations"
+    )
+
+    # Setup Mesh Sharding: one axis name per mesh dimension, so 1-D, 2-D and
+    # 3-D (e.g. "2x2x1") meshes are all accepted.
+    if len(mesh_dims) <= 3:
+        axis_names = ("x", "y", "z")[: len(mesh_dims)]
+    else:
+        axis_names = tuple(f"axis{i}" for i in range(len(mesh_dims)))
+    mesh = sharding.Mesh(
+        np.array(tpu_devices).reshape(mesh_dims), axis_names=axis_names
+    )
+    # Shard the leading (row) dimension across every mesh axis.
+    row_axes = axis_names[0] if len(axis_names) == 1 else axis_names
+    data_sharding = sharding.NamedSharding(
+        mesh, sharding.PartitionSpec(row_axes)
+    )
+
+    # --- ADAPTIVE WARM UP ---
+    _run_warmup(host_data, data_sharding, data_size_mb)
+
+    # A tiny sharded put (one row per device) is used to read back the device
+    # behind each addressable shard.
+    # NOTE(review): host_shards[i] is paired positionally with
+    # target_devices[i]; this assumes addressable_shards follows the same
+    # order as np.split -- confirm.
+    dummy_put = jax.device_put(host_data[:num_devices], data_sharding)
+    target_devices = [s.device for s in dummy_put.addressable_shards]
+    dummy_put.delete()
+
+    host_shards = np.split(host_data, num_devices, axis=0)
+
+    # Per-run durations in milliseconds.
+    h2d_perf, d2h_perf = [], []
+
+    # --- SMART CHUNKING CONFIG ---
+    chunks_per_device = _get_chunks_per_device(data_size_mb, num_devices)
+
+    # Profile only when a trace directory was requested.
+    if trace_dir:
+        profiler_context = jax.profiler.trace(trace_dir)
+    else:
+        profiler_context = contextlib.nullcontext()
+
+    with profiler_context:
+        for i in range(num_runs):
+            if trace_dir:
+                step_context = jax.profiler.StepTraceAnnotation(
+                    "host_device", step_num=i
+                )
+            else:
+                step_context = contextlib.nullcontext()
+
+            with step_context:
+                # Optimized Chunked Transfer (Sole Strategy)
+                h2d_ms, d2h_ms = _run_chunked(
+                    host_data, data_sharding, host_shards, target_devices,
+                    num_devices, chunks_per_device
+                )
+                h2d_perf.append(h2d_ms)
+                d2h_perf.append(d2h_ms)
+
+    # Drop the host buffers before returning so the next sweep point starts
+    # without them.
+    del host_data, host_shards
+    gc.collect()
+
+    return {
+        "H2D_Bandwidth": h2d_perf,
+        "D2H_Bandwidth": d2h_perf,
+        "Chunk_Count": chunks_per_device,
+        "Thread_Count": num_devices * chunks_per_device,
+    }
+
+def benchmark_host_device_calculate_metrics(
+    mesh_shape: str,
+    data_size_mb: int,
+    H2D_Bandwidth: List[float],
+    D2H_Bandwidth: List[float],
+    Chunk_Count: int,
+    Thread_Count: int,
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """Turn per-run transfer durations into bandwidth statistics.
+
+    Args:
+        mesh_shape: Mesh description, copied into the metadata.
+        data_size_mb: Payload size; divided by 1024 to express it in GiB.
+        H2D_Bandwidth: Per-run host-to-device durations in milliseconds.
+        D2H_Bandwidth: Per-run device-to-host durations in milliseconds.
+        Chunk_Count: Chunks per device, copied into the metadata.
+        Thread_Count: Total thread count, copied into the metadata.
+
+    Returns:
+        ``(metadata, metrics)``. ``metadata`` holds the scalar parameters;
+        ``metrics`` holds the serialized statistics for both directions.
+    """
+    # Only the scalar parameters go into the metadata; the per-run lists are
+    # left out.
+    metadata = {
+        "mesh_shape": mesh_shape,
+        "data_size_mb": data_size_mb,
+        "Chunk_Count": Chunk_Count,
+        "Thread_Count": Thread_Count,
+    }
+
+    metrics = {}
+    for direction, durations_ms in (
+        ("H2D", H2D_Bandwidth),
+        ("D2H", D2H_Bandwidth),
+    ):
+        bandwidths = []
+        for ms in durations_ms:
+            if ms > 0:
+                # GiB transferred divided by seconds elapsed.
+                bandwidths.append((data_size_mb / 1024) / (ms / 1000))
+            else:
+                # A non-positive duration would divide by zero; report 0.
+                bandwidths.append(0.0)
+        stats = MetricsStatistics(bandwidths, f"{direction}_bw (GiB/s)")
+        metrics.update(stats.serialize_statistics())
+
+    return metadata, metrics
diff --git a/Ironwood/src/run_benchmark.py b/Ironwood/src/run_benchmark.py