Skip to content

Commit c050d08

Browse files
committed
manual
Signed-off-by: seungrokj <seungrok.jung@amd.com>
1 parent 18eb2d5 commit c050d08

2 files changed

Lines changed: 383 additions & 375 deletions

File tree

benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh

Lines changed: 35 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -90,43 +90,6 @@ wait_for_lmcache_ready() {
9090
exit 1
9191
}
9292

93-
echo "Starting vllm server..."
94-
export TORCH_CUDA_ARCH_LIST="10.0"
95-
export PYTHONNOUSERSITE=1
96-
# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
97-
# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
98-
# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
99-
# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
100-
# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
101-
# net the estimator provides, so the estimator is redundant here and
102-
# disabling it is safe.
103-
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
104-
105-
{ set +x; } 2>/dev/null
106-
VLLM_CMD=(
107-
vllm serve "$MODEL"
108-
--host 0.0.0.0
109-
--port "$PORT"
110-
--tensor-parallel-size="$TP"
111-
--gpu-memory-utilization 0.90
112-
--max-num-seqs "$CONC"
113-
--reasoning-parser kimi_k2
114-
--tool-call-parser kimi_k2
115-
--compilation_config.pass_config.fuse_allreduce_rms true
116-
--kv-cache-dtype fp8
117-
--max-cudagraph-capture-size 2048
118-
--stream-interval 20
119-
--trust-remote-code
120-
"${PREFIX_CACHE_ARGS[@]}"
121-
"${OFFLOAD_ARGS[@]}"
122-
)
123-
printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
124-
printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
125-
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
126-
SERVER_PID=$!
127-
echo "Server PID: $SERVER_PID"
128-
129-
13093
case "$OFFLOADING" in
13194
none)
13295
;;
@@ -210,6 +173,41 @@ case "$OFFLOADING" in
210173
;;
211174
esac
212175

176+
echo "Starting vllm server..."
177+
export TORCH_CUDA_ARCH_LIST="10.0"
178+
export PYTHONNOUSERSITE=1
179+
# Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
180+
# eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
181+
# (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
182+
# trips before the engine starts. Our --gpu-memory-utilization=0.90 already
183+
# leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
184+
# net the estimator provides, so the estimator is redundant here and
185+
# disabling it is safe.
186+
export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
187+
188+
{ set +x; } 2>/dev/null
189+
VLLM_CMD=(
190+
vllm serve "$MODEL"
191+
--host 0.0.0.0
192+
--port "$PORT"
193+
--tensor-parallel-size="$TP"
194+
--gpu-memory-utilization 0.90
195+
--max-num-seqs "$CONC"
196+
--reasoning-parser kimi_k2
197+
--tool-call-parser kimi_k2
198+
--compilation_config.pass_config.fuse_allreduce_rms true
199+
--kv-cache-dtype fp8
200+
--max-cudagraph-capture-size 2048
201+
--stream-interval 20
202+
--trust-remote-code
203+
"${PREFIX_CACHE_ARGS[@]}"
204+
"${OFFLOAD_ARGS[@]}"
205+
)
206+
printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
207+
printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
208+
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
209+
SERVER_PID=$!
210+
echo "Server PID: $SERVER_PID"
213211

214212
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
215213

0 commit comments

Comments
 (0)