@@ -90,43 +90,6 @@ wait_for_lmcache_ready() {
9090 exit 1
9191}
9292
93- echo " Starting vllm server..."
94- export TORCH_CUDA_ARCH_LIST=" 10.0"
95- export PYTHONNOUSERSITE=1
96- # Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
97- # eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
98- # (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
99- # trips before the engine starts. Our --gpu-memory-utilization=0.90 already
100- # leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
101- # net the estimator provides, so disabling it is redundant rather than
102- # unsafe.
103- export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
104-
105- { set +x; } 2> /dev/null
106- VLLM_CMD=(
107- vllm serve " $MODEL "
108- --host 0.0.0.0
109- --port " $PORT "
110- --tensor-parallel-size=" $TP "
111- --gpu-memory-utilization 0.90
112- --max-num-seqs " $CONC "
113- --reasoning-parser kimi_k2
114- --tool-call-parser kimi_k2
115- --compilation_config.pass_config.fuse_allreduce_rms true
116- --kv-cache-dtype fp8
117- --max-cudagraph-capture-size 2048
118- --stream-interval 20
119- --trust-remote-code
120- " ${PREFIX_CACHE_ARGS[@]} "
121- " ${OFFLOAD_ARGS[@]} "
122- )
123- printf ' %q ' " ${VLLM_CMD[@]} " | tee " $RESULT_DIR /vllm_command.txt"
124- printf ' \n' | tee -a " $RESULT_DIR /vllm_command.txt"
125- " ${VLLM_CMD[@]} " > " $SERVER_LOG " 2>&1 &
126- SERVER_PID=$!
127- echo " Server PID: $SERVER_PID "
128-
129-
13093case " $OFFLOADING " in
13194 none)
13295 ;;
@@ -210,6 +173,41 @@ case "$OFFLOADING" in
210173 ;;
211174esac
212175
176+ echo " Starting vllm server..."
177+ export TORCH_CUDA_ARCH_LIST=" 10.0"
178+ export PYTHONNOUSERSITE=1
179+ # Disable vLLM v0.21+ CUDA-graph memory estimator. Its pre-reservation
180+ # eats ~32% of HBM upfront which, combined with FP4 weights at TP=4
181+ # (~62 GB/GPU), leaves no room for KV blocks -- _check_enough_kv_cache_memory
182+ # trips before the engine starts. Our --gpu-memory-utilization=0.90 already
183+ # leaves ~18 GB/GPU slack outside vLLM's budget, which is the same safety
184+ # net the estimator provides, so the estimator is redundant here and
185+ # disabling it is safe.
186+ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
187+
188+ { set +x; } 2> /dev/null
189+ VLLM_CMD=(
190+ vllm serve " $MODEL "
191+ --host 0.0.0.0
192+ --port " $PORT "
193+ --tensor-parallel-size=" $TP "
194+ --gpu-memory-utilization 0.90
195+ --max-num-seqs " $CONC "
196+ --reasoning-parser kimi_k2
197+ --tool-call-parser kimi_k2
198+ --compilation_config.pass_config.fuse_allreduce_rms true
199+ --kv-cache-dtype fp8
200+ --max-cudagraph-capture-size 2048
201+ --stream-interval 20
202+ --trust-remote-code
203+ " ${PREFIX_CACHE_ARGS[@]} "
204+ " ${OFFLOAD_ARGS[@]} "
205+ )
206+ printf ' %q ' " ${VLLM_CMD[@]} " | tee " $RESULT_DIR /vllm_command.txt"
207+ printf ' \n' | tee -a " $RESULT_DIR /vllm_command.txt"
208+ " ${VLLM_CMD[@]} " > " $SERVER_LOG " 2>&1 &
209+ SERVER_PID=$!
210+ echo " Server PID: $SERVER_PID "
213211
214212wait_for_server_ready --port " $PORT " --server-log " $SERVER_LOG " --server-pid " $SERVER_PID "
215213
0 commit comments