Add baseline pipelined flow to H2D benchmark

leonardchan · leonardchan · commit a86475d6a50b · 2026-02-05T08:41:15.000+08:00
diff --git a/Ironwood/configs/host_device/host_device.yaml b/Ironwood/configs/host_device/host_device.yaml
@@ -3,7 +3,8 @@ benchmarks:
   num_runs: 20
   benchmark_sweep_params:
   - {
-      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
+      h2d_type: ["simple", "pipelined"],
+      data_size_mib_list: [1, 16, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768],
     }
   csv_path: "../microbenchmarks/host_device"
   trace_dir: "../microbenchmarks/host_device/trace"
diff --git a/Ironwood/src/benchmark_host_device.py b/Ironwood/src/benchmark_host_device.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, Tuple, List
 
 import jax
-from jax import sharding
+from jax import numpy as jnp
 import numpy as np
 from benchmark_utils import MetricsStatistics
 
@@ -23,17 +23,23 @@ def benchmark_host_device(
     data_size_mib: int,
     num_runs: int = 100,
     trace_dir: str = None,
+    h2d_type: str = "simple",
 ) -> Dict[str, Any]:
-    """Benchmarks H2D/D2H transfer using simple device_put/device_get."""
+    """Benchmarks H2D/D2H transfer using device_put/device_get."""
     
     num_elements = 1024 * 1024 * data_size_mib // np.dtype(np.float32).itemsize
     
     # Allocate Host Source Buffer
     column = 128
     host_data = np.random.normal(size=(num_elements // column, column)).astype(np.float32)
     
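+    # Devices targeted by the pipelined flow (the simple flow calls device_put without an explicit device).
+    # TODO: turn the number of target devices into a param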
+    num_devices_to_perform_h2d = 1
+    target_devices = jax.devices()[:num_devices_to_perform_h2d]
+
     print(
-        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations",
+        f"Benchmarking Transfer with Data Size: {data_size_mib} MB for {num_runs} iterations with {h2d_type=}",
         flush=True
     )
 
@@ -65,29 +71,98 @@ def benchmark_host_device(
             
             with step_context:
                  # H2D
-                t0 = time.perf_counter()
-                
-                # Simple device_put
-                device_array = jax.device_put(host_data)
-                device_array.block_until_ready()
-                
-                t1 = time.perf_counter()
-                h2d_perf.append((t1 - t0) * 1000)
-                
-                # Verify H2D shape
-                assert device_array.shape == host_data.shape
-                
-                # D2H
-                t2 = time.perf_counter()
-                
-                # Simple device_get
-                # Note: device_get returns a numpy array (copy)
-                _ = jax.device_get(device_array)
-                
-                t3 = time.perf_counter()
-                d2h_perf.append((t3 - t2) * 1000)
+                if h2d_type == "simple":
+                    t0 = time.perf_counter()
+                    # Simple device_put
+                    device_array = jax.device_put(host_data)
+                    device_array.block_until_ready()
+                    t1 = time.perf_counter()
+                    
+                    # Verify H2D shape
+                    assert device_array.shape == host_data.shape
+
+                    h2d_perf.append((t1 - t0) * 1000)
                 
-                device_array.delete()
+                    # D2H
+                    t2 = time.perf_counter()
+                    
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(device_array)
+                    
+                    t3 = time.perf_counter()
+                    d2h_perf.append((t3 - t2) * 1000)
+                    
+                    device_array.delete()
+                elif h2d_type == "pipelined":
+                    target_chunk_size_mib = 16  # Sweet spot from profiling
+                    num_devices = len(target_devices)
+
+                    tensors_on_device = []
+                    
+                    # Calculate chunks per device
+                    data_per_dev = data_size_mib / num_devices
+                    chunks_per_dev = int(data_per_dev / target_chunk_size_mib)
+                    chunks_per_dev = max(1, chunks_per_dev)
+
+                    chunks = np.array_split(host_data, chunks_per_dev * num_devices, axis=0)
+
+                    t0 = time.perf_counter()
+                    if chunks_per_dev > 1:    
+                        # Map each chunk to its target device, round-robin over target_devices.
+                        # This simple example assumes the chunks are evenly divisible and kept in order.
+                        # In production, use `jax.sharding` mesh logic for complex layouts.
+
+                        # approach 1: simple for loop
+                        for idx, chunk in enumerate(chunks):
+                            if num_devices > 1:
+                                dev = target_devices[idx % num_devices]
+                            else:
+                                dev = target_devices[0]
+                            tensors_on_device.append(jax.device_put(chunk, dev))
+                        # Re-assemble array
+                        result = jnp.vstack(tensors_on_device)
+                        # Wait for all chunks to be transferred
+                        result.block_until_ready()
+
+                        # approach 2: generator (slightly less overhead)
+                        # def chunk_generator(num_devices, chunks_per_dev):
+                        #     for n in range(chunks_per_dev):
+                        #         for d in range(num_devices):
+                        #             # 1. Get the specific small chunk
+                        #             chunk = chunks[d*chunks_per_dev+n]
+
+                        #             # 2. Trigger an individual DMA transfer for this specific chunk
+                        #             # This is where NUMA-local memory access matters
+                        #             yield jax.device_put(chunk, target_devices[d])
+
+                        # # Re-assemble array
+                        # result = jnp.vstack(list(chunk_generator(num_devices, chunks_per_dev)))
+                        # # Wait for all chunks to be transferred
+                        # result.block_until_ready()
+                    else:
+                        print(f"Warning: {data_size_mib=} is not larger than {target_chunk_size_mib=}, falling back to standard JAX put.")
+                        # Fallback to standard JAX put for small data
+                        result = jax.device_put(host_data, target_devices[0])
+                        result.block_until_ready()
+
+                    t1 = time.perf_counter()
+                    h2d_perf.append((t1 - t0) * 1000)
+
+                    # D2H
+                    t2 = time.perf_counter()
+                    # Simple device_get
+                    # Note: device_get returns a numpy array (copy)
+                    _ = jax.device_get(result)
+
+                    t3 = time.perf_counter()
+                    if not np.allclose(result, host_data):
+                        print("pipelined result not equal to host_data")
+                    d2h_perf.append((t3 - t2) * 1000)
+
+                    for r in tensors_on_device:
+                        r.delete()
+                    del tensors_on_device
 
     return {
         "H2D_Bandwidth_ms": h2d_perf,
@@ -98,6 +173,7 @@ def benchmark_host_device_calculate_metrics(
     data_size_mib: int,
     H2D_Bandwidth_ms: List[float],
     D2H_Bandwidth_ms: List[float],
+    h2d_type: str = "simple",
 ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     """Calculates metrics for Host-Device transfer."""
     params = locals().items()