@@ -87,38 +87,31 @@ SERVER_LOG="$RESULT_DIR/server.log"
8787ROUTER_LOG=" $RESULT_DIR /router.log"
8888mkdir -p " $RESULT_DIR "
8989
90- OFFLOAD_ARGS=" "
90+ OFFLOAD_ARGS=()
9191case " $OFFLOADING " in
9292 none) ;;
9393 cpu)
9494 # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
95- # individual jobs to a fraction of that. Aim for ~2.2 TB total host
95+ # individual jobs to a fraction of that. Aim for ~2.5 TB total host
9696 # CPU pool across the engine(s).
9797 #
98- # SimpleCPUOffloadConnector divides cpu_bytes_to_use by
99- # parallel_config.world_size (= TP*PP, NOT including DP — see
100- # vllm/config/parallel.py docstring). So:
101- # - DP-attn=true → each of $TP DP engines has world_size=1 in
102- # its parallel_config; the connector does no internal divide,
103- # and each engine torch.zeros + pin_tensor allocates the full
104- # --kv_offloading_size value. Pre-divide by $TP here so the
105- # aggregate host commit ≈ TOTAL_CPU_DRAM_GB.
106- # - DP-attn=false → single engine with world_size=TP. Pass the
107- # full TOTAL_CPU_DRAM_GB; the connector's internal divide
108- # yields TOTAL/TP per rank, and TP-shared mmap (PR #37206)
109- # keeps the aggregate at TOTAL.
110- TOTAL_CPU_DRAM_GB=2200
98+ # --kv_offloading_size configures one native OffloadingConnector pool
99+ # per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide
100+ # the aggregate host budget across those engines.
101+ TOTAL_CPU_DRAM_GB=2500
111102 if [ " $DP_ATTENTION " = " true" ]; then
112103 PER_ENGINE_GB=$(( TOTAL_CPU_DRAM_GB / TP))
113104 else
114105 PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
115106 fi
116- PER_ENGINE_BYTES=$(( PER_ENGINE_GB * 1024 * 1024 * 1024 ))
117- # Temporarily run eager mode to isolate whether lazy offloading is
118- # required to reproduce the SimpleCPUOffloadConnector CUDA failures.
119- # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob.
120- export VLLM_USE_SIMPLE_KV_OFFLOAD=1
121- OFFLOAD_ARGS=" --kv-transfer-config {\" kv_connector\" :\" SimpleCPUOffloadConnector\" ,\" kv_role\" :\" kv_both\" ,\" kv_connector_extra_config\" :{\" cpu_bytes_to_use\" :$PER_ENGINE_BYTES ,\" lazy_offload\" :false}}"
107+
108+ # The native backend resolves to OffloadingConnector while this env var
109+ # is unset.
110+ unset VLLM_USE_SIMPLE_KV_OFFLOAD
111+ OFFLOAD_ARGS=(
112+ --kv_offloading_backend native
113+ --kv_offloading_size " $PER_ENGINE_GB "
114+ )
122115 ;;
123116 * )
124117 echo " Error: unsupported OFFLOADING value '$OFFLOADING ' (expected one of: none, cpu)" >&2
@@ -170,7 +163,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
170163--no-disable-hybrid-kv-cache-manager \
171164--max-model-len " $MAX_MODEL_LEN " \
172165--max-num-seqs " $PER_ENGINE_MAX_NUM_SEQS " \
173- $ OFFLOAD_ARGS > " $SERVER_LOG " 2>&1 &
166+ " ${ OFFLOAD_ARGS[@]} " > " $SERVER_LOG " 2>&1 &
174167SERVER_PID=$!
175168echo " Server PID: $SERVER_PID "
176169
0 commit comments