@@ -85,6 +85,7 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
# ---- Server config ----------------------------------------------------------
# Per-run log locations. Everything lands under RESULT_DIR so one directory
# holds the server, router, and Mooncake master output for a run.
SERVER_LOG="$RESULT_DIR/server.log"
ROUTER_LOG="$RESULT_DIR/router.log"
MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log"
mkdir -p "$RESULT_DIR"

# Extra vLLM CLI arguments for KV offloading; filled in by the OFFLOADING case
# below and left empty when no offloading arguments apply.
OFFLOAD_ARGS=()
@@ -93,24 +94,52 @@ case "$OFFLOADING" in
9394 cpu)
9495 # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
9596 # individual jobs to a fraction of that. Aim for ~2.5 TB total host
96- # CPU pool across the engine(s) .
97+ # CPU pool across all GPU ranks .
9798 #
98- # --kv_offloading_size configures one native OffloadingConnector pool
99- # per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide
100- # the aggregate host budget across those engines .
99+ # Mooncake embedded mode contributes one global segment per GPU rank to
100+ # a shared distributed store. Pre-divide the aggregate host budget
101+ # across those rank-contributed segments .
101102 TOTAL_CPU_DRAM_GB=2500
102- if [ " $DP_ATTENTION " = " true" ]; then
103- PER_ENGINE_GB=$(( TOTAL_CPU_DRAM_GB / TP))
104- else
105- PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
103+ PER_RANK_GB=$(( TOTAL_CPU_DRAM_GB / TP))
104+
105+ MOONCAKE_VERSION=0.3.11.post1
106+ agentic_pip_install --quiet --no-cache-dir --no-deps \
107+ --force-reinstall " mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION "
108+ python3 -c " from mooncake.store import MooncakeDistributedStore" > /dev/null
109+
110+ MOONCAKE_MASTER_PORT=$(( PORT + 12000 ))
111+ MOONCAKE_CONFIG_PATH=" $RESULT_DIR /mooncake_config.json"
112+ cat > " $MOONCAKE_CONFIG_PATH " << EOF
113+ {
114+ "mode": "embedded",
115+ "metadata_server": "P2PHANDSHAKE",
116+ "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT ",
117+ "global_segment_size": "${PER_RANK_GB} GB",
118+ "local_buffer_size": "4GB",
119+ "protocol": "rdma",
120+ "device_name": "",
121+ "enable_offload": false
122+ }
123+ EOF
124+ export MOONCAKE_CONFIG_PATH
125+ # Identical prefixes must hash to identical store keys across DP ranks.
126+ export PYTHONHASHSEED=0
127+
128+ echo " Starting Mooncake master on port $MOONCAKE_MASTER_PORT ..."
129+ mooncake_master --port " $MOONCAKE_MASTER_PORT " \
130+ > " $MOONCAKE_MASTER_LOG " 2>&1 &
131+ MOONCAKE_MASTER_PID=$!
132+ sleep 2
133+ if ! kill -0 " $MOONCAKE_MASTER_PID " 2> /dev/null; then
134+ echo " Mooncake master died during startup." >&2
135+ cat " $MOONCAKE_MASTER_LOG " >&2
136+ exit 1
106137 fi
107138
108- # The native backend resolves to OffloadingConnector while this env var
109- # is unset.
110139 unset VLLM_USE_SIMPLE_KV_OFFLOAD
111140 OFFLOAD_ARGS=(
112- --kv_offloading_backend native
113- --kv_offloading_size " $PER_ENGINE_GB "
141+ --kv-transfer-config
142+ ' {"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}} '
114143 )
115144 ;;
116145 * )
@@ -144,9 +173,6 @@ echo "Starting vllm server..."
# Environment for the vLLM server process started below.
export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1
export VLLM_FLOAT32_MATMUL_PRECISION=high
150176
151177vllm serve " $MODEL_PATH " --served-model-name " $MODEL " \
152178--host 0.0.0.0 \
0 commit comments