Skip to content

Commit 2f27bea

Browse files
committed
feat(agentic): use Mooncake store for B300 offload
1 parent fb362a6 commit 2f27bea

3 files changed

Lines changed: 47 additions & 17 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9414,8 +9414,8 @@ dsv4-fp4-b300-vllm-agentic:
94149414
agentic-coding:
94159415
- duration: 1800
94169416
search-space:
9417-
# TEMPORARY: run only native CPU-offload scenarios while diagnosing
9418-
# asynchronous CUDA failures.
9417+
# TEMPORARY: run only MooncakeStore CPU-offload scenarios while
9418+
# diagnosing the native/SimpleCPU offload failures.
94199419
# - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] }
94209420
# - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] }
94219421
# - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] }

.github/workflows/benchmark-tmpl.yml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -241,6 +241,8 @@ jobs:
241241
results/server.log
242242
results/router.log
243243
results/lmcache_server.log
244+
results/mooncake_master.log
245+
results/mooncake_config.json
244246
results/benchmark.log
245247
results/config.yaml
246248
results/lmcache_command.txt
@@ -282,6 +284,8 @@ jobs:
282284
${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
283285
${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }}
284286
${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
287+
${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }}
288+
${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }}
285289
if-no-files-found: ignore
286290

287291
- name: Upload GPU metrics

benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh

Lines changed: 41 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
8585
# ---- Server config ----------------------------------------------------------
8686
SERVER_LOG="$RESULT_DIR/server.log"
8787
ROUTER_LOG="$RESULT_DIR/router.log"
88+
MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log"
8889
mkdir -p "$RESULT_DIR"
8990

9091
OFFLOAD_ARGS=()
@@ -93,24 +94,52 @@ case "$OFFLOADING" in
9394
cpu)
9495
# B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
9596
# individual jobs to a fraction of that. Aim for ~2.5 TB total host
96-
# CPU pool across the engine(s).
97+
# CPU pool across all GPU ranks.
9798
#
98-
# --kv_offloading_size configures one native OffloadingConnector pool
99-
# per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide
100-
# the aggregate host budget across those engines.
99+
# Mooncake embedded mode contributes one global segment per GPU rank to
100+
# a shared distributed store. Pre-divide the aggregate host budget
101+
# across those rank-contributed segments.
101102
TOTAL_CPU_DRAM_GB=2500
102-
if [ "$DP_ATTENTION" = "true" ]; then
103-
PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP))
104-
else
105-
PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
103+
PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP))
104+
105+
MOONCAKE_VERSION=0.3.11.post1
106+
agentic_pip_install --quiet --no-cache-dir --no-deps \
107+
--force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION"
108+
python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null
109+
110+
MOONCAKE_MASTER_PORT=$((PORT + 12000))
111+
MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json"
112+
cat > "$MOONCAKE_CONFIG_PATH" <<EOF
113+
{
114+
"mode": "embedded",
115+
"metadata_server": "P2PHANDSHAKE",
116+
"master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT",
117+
"global_segment_size": "${PER_RANK_GB}GB",
118+
"local_buffer_size": "4GB",
119+
"protocol": "rdma",
120+
"device_name": "",
121+
"enable_offload": false
122+
}
123+
EOF
124+
export MOONCAKE_CONFIG_PATH
125+
# Identical prefixes must hash to identical store keys across DP ranks.
126+
export PYTHONHASHSEED=0
127+
128+
echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..."
129+
mooncake_master --port "$MOONCAKE_MASTER_PORT" \
130+
> "$MOONCAKE_MASTER_LOG" 2>&1 &
131+
MOONCAKE_MASTER_PID=$!
132+
sleep 2
133+
if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then
134+
echo "Mooncake master died during startup." >&2
135+
cat "$MOONCAKE_MASTER_LOG" >&2
136+
exit 1
106137
fi
107138

108-
# The native backend resolves to OffloadingConnector while this env var
109-
# is unset.
110139
unset VLLM_USE_SIMPLE_KV_OFFLOAD
111140
OFFLOAD_ARGS=(
112-
--kv_offloading_backend native
113-
--kv_offloading_size "$PER_ENGINE_GB"
141+
--kv-transfer-config
142+
'{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}'
114143
)
115144
;;
116145
*)
@@ -144,9 +173,6 @@ echo "Starting vllm server..."
144173
export TORCH_CUDA_ARCH_LIST="10.0"
145174
export PYTHONNOUSERSITE=1
146175
export VLLM_FLOAT32_MATMUL_PRECISION=high
147-
# Temporary diagnostic: surface asynchronous CUDA failures at the operation
148-
# that caused them instead of at a later synchronization point.
149-
export CUDA_LAUNCH_BLOCKING=1
150176

151177
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
152178
--host 0.0.0.0 \

0 commit comments

Comments
 (0)