Add PCIe transfer benchmark (H2D/D2H) to Ironwood

linamy85 · linamy85 · commit 1fa3c99d1960 · 2026-01-20T06:55:13.000Z
- Implement benchmark_pcie_transfer.py to measure host-to-device (H2D) and device-to-host (D2H) transfer performance using JAX, supporting several transfer modes (Standard, Parallel, Threaded, Chunked).
- Integrate the new benchmark into run_benchmark.py.
- Add configuration files for single device, single chip, and single VM topologies in configs/pcie_transfer/.
- Add scripts/run_pcie_transfer_benchmark.sh for bulk execution, with an optional numactl interleaving mode.
diff --git a/Ironwood/guides/host_device/host_device.md b/Ironwood/guides/host_device/host_device.md
@@ -2,7 +2,7 @@
 
 This guide provides instructions for running Host Device (Host-to-Device and Device-to-Host) microbenchmarks on tpu7x-2x2x1 Google Kubernetes Engine (GKE) clusters. It covers creating a node pool, running the benchmarks, and viewing the output.
 
-> [!WARNING]
+> [!NOTE]
 > This benchmark is currently a Work In Progress (WIP). Expected bandwidth numbers are not yet finalized.
 
 ## Create Node Pools
diff --git a/Ironwood/scripts/run_host_device_benchmark.sh b/Ironwood/scripts/run_host_device_benchmark.sh
@@ -27,13 +27,8 @@ while [[ "$#" -gt 0 ]]; do
 done
 
 echo "--- Starting Host-Device Transfer Benchmark (H2D/D2H) ---"
-echo "********************************************************"
-echo "WARNING: This benchmark is currently a WORK IN PROGRESS"
-echo "********************************************************"
-echo ""
-echo "Configuration:"
-echo "    Interleaved: $INTERLEAVED"
-echo ""
+echo "Note: This benchmark is work in progress"
+echo "Interleaved: $INTERLEAVED"
 
 if [ -n "$SPECIFIC_CONFIG" ]; then
     CONFIGS=("$SPECIFIC_CONFIG")
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
@@ -1,5 +1,7 @@
-"""Benchmarks Host-to-Device and Device-to-Host transfer performance (Simple Baseline)."""
+"""Benchmarks Host-to-Device and Device-to-Host transfer performance."""
 
+import concurrent.futures
+import gc
 import time
 import os
 from typing import Any, Dict, Tuple, List
@@ -9,8 +11,7 @@
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
-# 64 GiB
-os.environ["TPU_PREMAPPED_BUFFER_SIZE"] = "68719476736"
+os.environ["TPU_PREMAPPED_BUFFER_SIZE"] = "68719476736" # 64 GiB
 os.environ["TPU_PREMAPPED_BUFFER_TRANSFER_THRESHOLD_BYTES"] = "68719476736"
 
 def get_tpu_devices(num_devices: int):
@@ -19,26 +20,107 @@ def get_tpu_devices(num_devices: int):
         raise RuntimeError(f"Require {num_devices} devices, found {len(devices)}")
     return devices[:num_devices]
 
+def _run_chunked(host_data, data_sharding, host_shards, target_devices, num_devices, chunks_per_device):
+    # Smart Chunked H2D
+    chk_h2d_start = time.perf_counter()
+    total_workers = num_devices * chunks_per_device
+    with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
+        chunked_futures = []
+        for shard, dev in zip(host_shards, target_devices):
+            sub_chunks = np.array_split(shard, chunks_per_device, axis=0)
+            for chunk in sub_chunks:
+                chunked_futures.append(
+                    executor.submit(jax.device_put, chunk, dev)
+                )
+        chunked_buffers = [f.result() for f in chunked_futures]
+        for db in chunked_buffers:
+            db.block_until_ready()
+    chk_h2d_end = time.perf_counter()
+    h2d_ms = (chk_h2d_end - chk_h2d_start) * 1000
+    for db in chunked_buffers:
+        db.delete()
+
+    # Smart Chunked D2H
+    data_on_device = jax.device_put(host_data, data_sharding)
+    data_on_device.block_until_ready()
+    
+    chk_d2h_start = time.perf_counter()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
+        d2h_futures = []
+        for shard in data_on_device.addressable_shards:
+            # Direct slicing on device array to avoid copy
+            shard_len = shard.data.shape[0]
+            chunk_size = (shard_len + chunks_per_device - 1) // chunks_per_device
+            for i in range(chunks_per_device):
+                start = i * chunk_size
+                end = min((i + 1) * chunk_size, shard_len)
+                if start < end:
+                    d2h_futures.append(
+                        executor.submit(jax.device_get, shard.data[start:end])
+                    )
+        _ = [f.result() for f in d2h_futures]
+    chk_d2h_end = time.perf_counter()
+    d2h_ms = (chk_d2h_end - chk_d2h_start) * 1000
+    data_on_device.delete()
+        
+    return h2d_ms, d2h_ms
+
+
+def _run_warmup(host_data, data_sharding, data_size_mb):
+    # --- ADAPTIVE WARM UP ---
+    if data_size_mb <= 128:
+        warmup_iters = 50
+    elif data_size_mb >= 8192:
+        warmup_iters = 3
+    else:
+        warmup_iters = 10
+
+    for _ in range(warmup_iters):
+        data_on_device = jax.device_put(host_data, data_sharding)
+        data_on_device.block_until_ready()
+        _ = jax.device_get(data_on_device)
+        data_on_device.delete()
+
+    gc.collect()
+
+def _get_chunks_per_device(data_size_mb, num_devices):
+    # --- SMART CHUNKING CONFIG ---
+    target_chunk_size_mb = 16
+    max_global_threads = 256
+    
+    data_per_device_mb = data_size_mb / num_devices
+
+    if data_per_device_mb < target_chunk_size_mb:
+        chunks_per_device = 1
+    else:
+        chunks_per_device = int(data_per_device_mb / target_chunk_size_mb)
+
+    total_threads = num_devices * chunks_per_device
+    if total_threads > max_global_threads:
+        chunks_per_device = max(1, int(max_global_threads / num_devices))
+    
+    return chunks_per_device
+
+
 def benchmark_host_device(
     mesh_shape: str,
     data_size_mb: int,
     num_runs: int = 100,
     trace_dir: str = None,
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using smart chunking."""
     dims = [int(d) for d in mesh_shape.split("x")]
     mesh_shape = tuple(dims)
          
     num_devices = int(np.prod(mesh_shape))
     tpu_devices = get_tpu_devices(num_devices)
     
-    num_elements = 1024 * 1024 * data_size_mb // np.dtype(np.float32).itemsize
+    rows = 1024 * data_size_mb // np.dtype(np.float32).itemsize
     
-    # Allocate Host Source Buffer
-    host_data = np.ones((num_elements,), dtype=np.float32)
+    host_data = np.ones((rows, 8, 128), dtype=np.float32)
     
     print(
-        f"Benchmarking (Simple) Transfer with Data Size: {data_size_mb} MB on"
+        f"Benchmarking Transfer with Data Size: {data_size_mb} MB on"
         f" {num_devices} devices for {num_runs} iterations"
     )
 
@@ -47,25 +129,37 @@ def benchmark_host_device(
         mesh = sharding.Mesh(
             np.array(tpu_devices).reshape(mesh_shape), axis_names=("x",)
         )
-        # Shard the 1D array across "x"
-        partition_spec = sharding.PartitionSpec("x")
+        data_sharding = sharding.NamedSharding(mesh, sharding.PartitionSpec("x"))
     else:
         mesh = sharding.Mesh(
             np.array(tpu_devices).reshape(mesh_shape), axis_names=("x", "y")
         )
-        # Shard the 1D array across BOTH "x" and "y" (product sharding)
-        partition_spec = sharding.PartitionSpec(("x", "y"))
-    
-    data_sharding = sharding.NamedSharding(mesh, partition_spec)
+        data_sharding = sharding.NamedSharding(
+            mesh, sharding.PartitionSpec(("x", "y"))
+        )
     
+    # --- ADAPTIVE WARM UP ---
+    _run_warmup(host_data, data_sharding, data_size_mb)
+
+    # Pre-calculate sharding info
+    dummy_put = jax.device_put(host_data[:num_devices], data_sharding)
+    target_devices = [s.device for s in dummy_put.addressable_shards]
+    dummy_put.delete()
+
+    host_shards = np.split(host_data, num_devices, axis=0)
+
     # Performance Lists
     h2d_perf, d2h_perf = [], []
+
+    # --- SMART CHUNKING CONFIG ---
+    chunks_per_device = _get_chunks_per_device(data_size_mb, num_devices)
         
     # Profiling Context
-    import contextlib
     if trace_dir:
         profiler_context = jax.profiler.trace(trace_dir)
     else:
+        # No-op context manager
+        import contextlib
         profiler_context = contextlib.nullcontext()
 
     with profiler_context:
@@ -77,53 +171,37 @@ def benchmark_host_device(
                 step_context = contextlib.nullcontext()
             
             with step_context:
-                 # H2D
-                t0 = time.perf_counter()
-                
-                # Simple device_put
-                device_array = jax.device_put(host_data, data_sharding)
-                device_array.block_until_ready()
-                
-                t1 = time.perf_counter()
-                h2d_perf.append((t1 - t0) * 1000)
-                
-                # Verify H2D shape/sharding
-                assert device_array.shape == host_data.shape
-                assert device_array.sharding == data_sharding
-                
-                # D2H
-                t2 = time.perf_counter()
-                
-                # Simple device_get
-                # Note: device_get returns a numpy array (copy)
-                _ = jax.device_get(device_array)
-                
-                t3 = time.perf_counter()
-                d2h_perf.append((t3 - t2) * 1000)
-                
-                device_array.delete()
+                # Optimized Chunked Transfer (Sole Strategy)
+                h2d_ms, d2h_ms = _run_chunked(
+                    host_data, data_sharding, host_shards, target_devices, 
+                    num_devices, chunks_per_device
+                )
+                h2d_perf.append(h2d_ms)
+                d2h_perf.append(d2h_ms)
+
+    del host_data, host_shards
+    gc.collect()
 
     return {
         "H2D_Bandwidth": h2d_perf,
         "D2H_Bandwidth": d2h_perf,
+        "Chunk_Count": chunks_per_device,
+        "Thread_Count": num_devices * chunks_per_device,
     }
 
 def benchmark_host_device_calculate_metrics(
     mesh_shape: str,
     data_size_mb: int,
     H2D_Bandwidth: List[float],
     D2H_Bandwidth: List[float],
+    Chunk_Count: int,
+    Thread_Count: int,
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()
     
-    data_size_mib = data_size_mb
-    
     # Filter out list params from metadata to avoid explosion
-    metadata_keys = {
-        "mesh_shape", 
-        "data_size_mib", 
-    }
+    metadata_keys = {"mesh_shape", "data_size_mb", "Chunk_Count", "Thread_Count"}
     metadata = {k: v for k, v in params if k in metadata_keys}
     
     metrics = {}