@@ -20,9 +20,7 @@ def get_tpu_devices(num_devices: int):
2020 raise RuntimeError (f"Require { num_devices } devices, found { len (devices )} " )
2121 return devices [:num_devices ]
2222
23- def _run_chunked (host_data , data_sharding , host_shards , target_devices , num_devices , chunks_per_device ):
24- # Smart Chunked H2D
25- chk_h2d_start = time .perf_counter ()
23+ def _run_h2d_chunked (host_shards , target_devices , num_devices , chunks_per_device ):
2624 total_workers = num_devices * chunks_per_device
2725 with concurrent .futures .ThreadPoolExecutor (max_workers = total_workers ) as executor :
2826 chunked_futures = []
@@ -35,16 +33,10 @@ def _run_chunked(host_data, data_sharding, host_shards, target_devices, num_devi
3533 chunked_buffers = [f .result () for f in chunked_futures ]
3634 for db in chunked_buffers :
3735 db .block_until_ready ()
38- chk_h2d_end = time .perf_counter ()
39- h2d_ms = (chk_h2d_end - chk_h2d_start ) * 1000
40- for db in chunked_buffers :
41- db .delete ()
36+ return chunked_buffers
4237
43- # Smart Chunked D2H
44- data_on_device = jax .device_put (host_data , data_sharding )
45- data_on_device .block_until_ready ()
46-
47- chk_d2h_start = time .perf_counter ()
38+ def _run_d2h_chunked (data_on_device , num_devices , chunks_per_device ):
39+ total_workers = num_devices * chunks_per_device
4840 with concurrent .futures .ThreadPoolExecutor (max_workers = total_workers ) as executor :
4941 d2h_futures = []
5042 for shard in data_on_device .addressable_shards :
@@ -58,49 +50,94 @@ def _run_chunked(host_data, data_sharding, host_shards, target_devices, num_devi
5850 d2h_futures .append (
5951 executor .submit (jax .device_get , shard .data [start :end ])
6052 )
61- _ = [f .result () for f in d2h_futures ]
62- chk_d2h_end = time .perf_counter ()
63- d2h_ms = (chk_d2h_end - chk_d2h_start ) * 1000
64- data_on_device .delete ()
65-
66- return h2d_ms , d2h_ms
67-
53+ for f in d2h_futures :
54+ f .result ()
6855
69- def _run_warmup (host_data , data_sharding , data_size_mb ):
70- # --- ADAPTIVE WARM UP ---
71- if data_size_mb <= 128 :
72- warmup_iters = 50
73- elif data_size_mb >= 8192 :
74- warmup_iters = 3
75- else :
76- warmup_iters = 10
77-
78- for _ in range (warmup_iters ):
79- data_on_device = jax .device_put (host_data , data_sharding )
80- data_on_device .block_until_ready ()
81- _ = jax .device_get (data_on_device )
82- data_on_device .delete ()
83-
84- gc .collect ()
8556
86- def _get_chunks_per_device (data_size_mb , num_devices ):
87- # --- SMART CHUNKING CONFIG ---
88- target_chunk_size_mb = 16
89- max_global_threads = 256
57+ def _find_optimal_chunk_size (
58+ run_fn ,
59+ num_devices ,
60+ data_size_mb ,
61+ search_min_size_mb = 1 ,
62+ max_global_threads = 256
63+ ):
64+ """Finds optimal chunk size by iterating over candidates."""
65+ print (" Searching for optimal chunk size..." )
9066
67+ # Generate size candidates
68+ candidates_mb = []
69+ curr = search_min_size_mb
9170 data_per_device_mb = data_size_mb / num_devices
71+
72+ # Iterate until we cover the full data size per device
73+ while curr <= data_per_device_mb :
74+ candidates_mb .append (curr )
75+ curr *= 2
76+ # Ensure we test at least one candidate (e.g. if data < min_size)
77+ if not candidates_mb :
78+ candidates_mb .append (data_per_device_mb )
9279
93- if data_per_device_mb < target_chunk_size_mb :
94- chunks_per_device = 1
95- else :
96- chunks_per_device = int (data_per_device_mb / target_chunk_size_mb )
97-
98- total_threads = num_devices * chunks_per_device
99- if total_threads > max_global_threads :
100- chunks_per_device = max (1 , int (max_global_threads / num_devices ))
80+ # Map sizes to counts, keeping track of unique counts to test
81+ candidates_counts = []
82+ seen_counts = set ()
10183
102- return chunks_per_device
84+ for size_mb in candidates_mb :
85+ if size_mb > data_per_device_mb :
86+ count = 1
87+ else :
88+ count = int (data_per_device_mb / size_mb )
89+ if count < 1 : count = 1
90+
91+ # Filter by max global threads
92+ if (count * num_devices ) > max_global_threads :
93+ continue
94+
95+ if count not in seen_counts :
96+ candidates_counts .append (count )
97+ seen_counts .add (count )
98+
99+ # Sort candidates (counts) ascending for clean output
100+ candidates_counts .sort ()
101+
102+ if not candidates_counts :
103+ candidates_counts = [1 ]
103104
105+ best_chunk_count = 1
106+ best_median_bw = - 1.0
107+
108+ # 5 search iterations + 3 warmup (before search)
109+ warmup_iters = 3
110+ search_iters = 5
111+
112+ try :
113+ for _ in range (warmup_iters ):
114+ run_fn (1 ) # Warmup with 1 chunk
115+ except Exception :
116+ pass
117+
118+ for chunk_count in candidates_counts :
119+ times_ms = []
120+ try :
121+ for _ in range (search_iters ):
122+ t_start = time .perf_counter ()
123+ res = run_fn (chunk_count )
124+ t_end = time .perf_counter ()
125+
126+ if isinstance (res , (int , float )):
127+ times_ms .append (res )
128+ else :
129+ times_ms .append ((t_end - t_start ) * 1000 )
130+
131+ median_ms = np .median (times_ms )
132+ if median_ms > 0 :
133+ if best_median_bw < 0 or median_ms < best_median_bw :
134+ best_median_bw = median_ms
135+ best_chunk_count = chunk_count
136+ except Exception as e :
137+ continue
138+
139+ print (f" Found optimal chunk count: { best_chunk_count } (approx size: { data_per_device_mb / best_chunk_count :.2f} MB)" )
140+ return best_chunk_count
104141
105142def benchmark_host_device (
106143 mesh_shape : str ,
@@ -138,21 +175,47 @@ def benchmark_host_device(
138175 mesh , sharding .PartitionSpec (("x" , "y" ))
139176 )
140177
141- # --- ADAPTIVE WARM UP ---
142- _run_warmup (host_data , data_sharding , data_size_mb )
143-
144178 # Pre-calculate sharding info
145179 dummy_put = jax .device_put (host_data [:num_devices ], data_sharding )
146180 target_devices = [s .device for s in dummy_put .addressable_shards ]
147181 dummy_put .delete ()
148182
149183 host_shards = np .split (host_data , num_devices , axis = 0 )
150184
185+ # --- SEARCH OPTIMAL CHUNKS ---
186+ # Define wrappers for search
187+
188+ def h2d_run_fn (c ):
189+ bufs = _run_h2d_chunked (host_shards , target_devices , num_devices , c )
190+ for b in bufs : b .delete ()
191+
192+ # H2D Search
193+ h2d_chunks = _find_optimal_chunk_size (h2d_run_fn , num_devices , data_size_mb )
194+
195+ # D2H Search
196+ # We need persistent data on device for D2H search to avoid H2D overhead in D2H measurement
197+ data_on_device_for_search = jax .device_put (host_data , data_sharding )
198+ data_on_device_for_search .block_until_ready ()
199+
200+ def d2h_run_fn (c ):
201+ # Force a new buffer to avoid host-side caching of device_get
202+ # Adding 0.0 creates a new DeviceArray with same sharding
203+ fresh_data = jax .lax .add (data_on_device_for_search , 0.0 )
204+ fresh_data .block_until_ready ()
205+
206+ t0 = time .perf_counter ()
207+ _run_d2h_chunked (fresh_data , num_devices , c )
208+ t1 = time .perf_counter ()
209+
210+ fresh_data .delete ()
211+ return (t1 - t0 ) * 1000
212+
213+ d2h_chunks = _find_optimal_chunk_size (d2h_run_fn , num_devices , data_size_mb )
214+
215+ data_on_device_for_search .delete ()
216+
151217 # Performance Lists
152218 h2d_perf , d2h_perf = [], []
153-
154- # --- SMART CHUNKING CONFIG ---
155- chunks_per_device = _get_chunks_per_device (data_size_mb , num_devices )
156219
157220 # Profiling Context
158221 if trace_dir :
@@ -171,37 +234,54 @@ def benchmark_host_device(
171234 step_context = contextlib .nullcontext ()
172235
173236 with step_context :
174- # Optimized Chunked Transfer (Sole Strategy)
175- h2d_ms , d2h_ms = _run_chunked (
176- host_data , data_sharding , host_shards , target_devices ,
177- num_devices , chunks_per_device
237+ # H2D
238+ t0 = time . perf_counter ()
239+ chunked_buffers = _run_h2d_chunked (
240+ host_shards , target_devices , num_devices , h2d_chunks
178241 )
179- h2d_perf .append (h2d_ms )
180- d2h_perf .append (d2h_ms )
242+ t1 = time .perf_counter ()
243+ h2d_perf .append ((t1 - t0 ) * 1000 )
244+
245+ for db in chunked_buffers :
246+ db .delete ()
247+
248+ # D2H
249+ # We need data on device again
250+ data_on_device = jax .device_put (host_data , data_sharding )
251+ data_on_device .block_until_ready ()
252+
253+ t2 = time .perf_counter ()
254+ _run_d2h_chunked (data_on_device , num_devices , d2h_chunks )
255+ t3 = time .perf_counter ()
256+ d2h_perf .append ((t3 - t2 ) * 1000 )
257+
258+ data_on_device .delete ()
181259
182260 del host_data , host_shards
183261 gc .collect ()
184262
185263 return {
186264 "H2D_Bandwidth" : h2d_perf ,
187265 "D2H_Bandwidth" : d2h_perf ,
188- "Chunk_Count" : chunks_per_device ,
189- "Thread_Count" : num_devices * chunks_per_device ,
266+ "H2D_Chunk_Size_MB" : (data_size_mb / num_devices ) / h2d_chunks if h2d_chunks > 0 else 0 ,
267+ "D2H_Chunk_Size_MB" : (data_size_mb / num_devices ) / d2h_chunks if d2h_chunks > 0 else 0 ,
268+ "Thread_Count" : num_devices * max (h2d_chunks , d2h_chunks ), # Approx
190269 }
191270
192271def benchmark_host_device_calculate_metrics (
193272 mesh_shape : str ,
194273 data_size_mb : int ,
195274 H2D_Bandwidth : List [float ],
196275 D2H_Bandwidth : List [float ],
197- Chunk_Count : int ,
276+ H2D_Chunk_Size_MB : float ,
277+ D2H_Chunk_Size_MB : float ,
198278 Thread_Count : int ,
199279) -> Tuple [Dict [str , Any ], Dict [str , Any ]]:
200280 """Calculates metrics for Host-Device transfer."""
201281 params = locals ().items ()
202282
203283 # Filter out list params from metadata to avoid explosion
204- metadata_keys = {"mesh_shape" , "data_size_mb" , "Chunk_Count " , "Thread_Count" }
204284+ metadata_keys = {"mesh_shape" , "data_size_mb" , "H2D_Chunk_Size_MB" , "D2H_Chunk_Size_MB" , "Thread_Count" }
205285 metadata = {k : v for k , v in params if k in metadata_keys }
206286
207287 metrics = {}
0 commit comments