feat(agentic): use Mooncake store for B300 offload

cquil11 · cquil11 · commit 2f27beacda4e · 2026-06-04T16:51:34.000-05:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -9414,8 +9414,8 @@ dsv4-fp4-b300-vllm-agentic:
     agentic-coding:
     - duration: 1800
       search-space:
-      # TEMPORARY: run only native CPU-offload scenarios while diagnosing
-      # asynchronous CUDA failures.
+      # TEMPORARY: run only MooncakeStore CPU-offload scenarios while
+      # diagnosing the native/SimpleCPU offload failures.
       # - { tp: 4, offloading: none,  conc-list: [1, 4, 8, 16, 32] }
       # - { tp: 8, offloading: none,  conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] }
       # - { tp: 4, ep: 4, dp-attn: true, offloading: none,  conc-list: [8, 16, 32, 64, 128] }
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -241,6 +241,8 @@ jobs:
             results/server.log
             results/router.log
             results/lmcache_server.log
+            results/mooncake_master.log
+            results/mooncake_config.json
             results/benchmark.log
             results/config.yaml
             results/lmcache_command.txt
@@ -282,6 +284,8 @@ jobs:
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }}
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }}
           if-no-files-found: ignore
 
       - name: Upload GPU metrics
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -85,6 +85,7 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 ROUTER_LOG="$RESULT_DIR/router.log"
+MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log"
 mkdir -p "$RESULT_DIR"
 
 OFFLOAD_ARGS=()
@@ -93,24 +94,52 @@ case "$OFFLOADING" in
     cpu)
         # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
         # individual jobs to a fraction of that. Aim for ~2.5 TB total host
-        # CPU pool across the engine(s).
+        # CPU pool across all GPU ranks.
         #
-        # --kv_offloading_size configures one native OffloadingConnector pool
-        # per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide
-        # the aggregate host budget across those engines.
+        # Mooncake embedded mode contributes one global segment per GPU rank to
+        # a shared distributed store. Pre-divide the aggregate host budget
+        # across those rank-contributed segments.
         TOTAL_CPU_DRAM_GB=2500
-        if [ "$DP_ATTENTION" = "true" ]; then
-            PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP))
-        else
-            PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
+        PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP))
+
+        MOONCAKE_VERSION=0.3.11.post1
+        agentic_pip_install --quiet --no-cache-dir --no-deps \
+            --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION"
+        python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null
+
+        MOONCAKE_MASTER_PORT=$((PORT + 12000))
+        MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json"
+        cat > "$MOONCAKE_CONFIG_PATH" <<EOF
+{
+  "mode": "embedded",
+  "metadata_server": "P2PHANDSHAKE",
+  "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT",
+  "global_segment_size": "${PER_RANK_GB}GB",
+  "local_buffer_size": "4GB",
+  "protocol": "rdma",
+  "device_name": "",
+  "enable_offload": false
+}
+EOF
+        export MOONCAKE_CONFIG_PATH
+        # Disable Python hash randomization so identical prefixes hash to identical store keys across DP ranks.
+        export PYTHONHASHSEED=0
+
+        echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..."
+        mooncake_master --port "$MOONCAKE_MASTER_PORT" \
+            > "$MOONCAKE_MASTER_LOG" 2>&1 &
+        MOONCAKE_MASTER_PID=$!
+        sleep 2
+        if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then
+            echo "Mooncake master died during startup." >&2
+            cat "$MOONCAKE_MASTER_LOG" >&2
+            exit 1
         fi
 
-        # The native backend resolves to OffloadingConnector while this env var
-        # is unset.
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
         OFFLOAD_ARGS=(
-            --kv_offloading_backend native
-            --kv_offloading_size "$PER_ENGINE_GB"
+            --kv-transfer-config
+            '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}'
         )
         ;;
     *)
@@ -144,9 +173,6 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
-# Temporary diagnostic: surface asynchronous CUDA failures at the operation
-# that caused them instead of at a later synchronization point.
-export CUDA_LAUNCH_BLOCKING=1
 
 vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \