Skip to content

Commit 3747263

Browse files
committed
fix(agentic): use native B300 KV offloading
1 parent 60f3be0 commit 3747263

1 file changed

Lines changed: 15 additions & 22 deletions

File tree

benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh

Lines changed: 15 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -87,38 +87,31 @@ SERVER_LOG="$RESULT_DIR/server.log"
8787
ROUTER_LOG="$RESULT_DIR/router.log"
8888
mkdir -p "$RESULT_DIR"
8989

90-
OFFLOAD_ARGS=""
90+
OFFLOAD_ARGS=()
9191
case "$OFFLOADING" in
9292
none) ;;
9393
cpu)
9494
# B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
95-
# individual jobs to a fraction of that. Aim for ~2.2 TB total host
95+
# individual jobs to a fraction of that. Aim for ~2.5 TB total host
9696
# CPU pool across the engine(s).
9797
#
98-
# SimpleCPUOffloadConnector divides cpu_bytes_to_use by
99-
# parallel_config.world_size (= TP*PP, NOT including DP — see
100-
# vllm/config/parallel.py docstring). So:
101-
# - DP-attn=true → each of $TP DP engines has world_size=1 in
102-
# its parallel_config; the connector does no internal divide,
103-
# and each engine torch.zeros + pin_tensor allocates the full
104-
# --kv_offloading_size value. Pre-divide by $TP here so the
105-
# aggregate host commit ≈ TOTAL_CPU_DRAM_GB.
106-
# - DP-attn=false → single engine with world_size=TP. Pass the
107-
# full TOTAL_CPU_DRAM_GB; the connector's internal divide
108-
# yields TOTAL/TP per rank, and TP-shared mmap (PR #37206)
109-
# keeps the aggregate at TOTAL.
110-
TOTAL_CPU_DRAM_GB=2200
98+
# --kv_offloading_size configures one native OffloadingConnector pool
99+
# per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide
100+
# the aggregate host budget across those engines (there are $TP of them).
101+
TOTAL_CPU_DRAM_GB=2500
111102
if [ "$DP_ATTENTION" = "true" ]; then
112103
PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP))
113104
else
114105
PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
115106
fi
116-
PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024))
117-
# Temporarily run eager mode to isolate whether lazy offloading is
118-
# required to reproduce the SimpleCPUOffloadConnector CUDA failures.
119-
# See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob.
120-
export VLLM_USE_SIMPLE_KV_OFFLOAD=1
121-
OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":false}}"
107+
108+
# The native backend resolves to OffloadingConnector while this env var
109+
# is unset.
110+
unset VLLM_USE_SIMPLE_KV_OFFLOAD
111+
OFFLOAD_ARGS=(
112+
--kv_offloading_backend native
113+
--kv_offloading_size "$PER_ENGINE_GB"
114+
)
122115
;;
123116
*)
124117
echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
@@ -170,7 +163,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
170163
--no-disable-hybrid-kv-cache-manager \
171164
--max-model-len "$MAX_MODEL_LEN" \
172165
--max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" \
173-
$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
166+
"${OFFLOAD_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
174167
SERVER_PID=$!
175168
echo "Server PID: $SERVER_PID"
176169

0 commit comments

Comments
 (0)