@@ -75,20 +75,40 @@ cleanup_lmcache_server() {
7575trap cleanup_lmcache_server EXIT
7676
#######################################
# Stop a background `tail -f` started by wait_for_lmcache_ready.
# Arguments: $1 - PID of the tail process (may be empty)
# Returns:   always 0; a tail that already exited is not an error.
#######################################
_lmcache_stop_log_tail() {
  local pid="${1:-}"
  [[ -n "$pid" ]] || return 0
  kill "$pid" 2>/dev/null || true
  # Reap the job with stderr silenced so bash does not print a
  # "Terminated" notice for it.
  wait "$pid" 2>/dev/null || true
}

#######################################
# Block until the LMCache server answers its HTTP healthcheck.
#
# While polling, the server log is streamed to stdout via `tail -f`; the
# tail is stopped on every path out of the function.
# Globals:
#   LMCACHE_LOG             (read) log file the server output is written to
#   LMCACHE_PID             (read) server PID; liveness checks are skipped
#                                  when it is empty or unset
#   LMCACHE_HTTP_PORT       (read) port probed at /healthcheck on 127.0.0.1
#   LMCACHE_READY_ATTEMPTS  (read) number of one-second polls per phase
#                                  (default 120)
# Returns:
#   0 once the healthcheck succeeds. Exits the whole script with status 1
#   if the server process dies, the log file never appears, or the
#   healthcheck does not succeed within the attempt budget.
#######################################
wait_for_lmcache_ready() {
  # Disable xtrace without tracing the `set +x` itself.
  { set +x; } 2>/dev/null

  local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
  local tail_pid=""
  local failure=""
  local i

  # Phase 1: wait for the log file. Bounded by the same attempt budget as
  # the healthcheck so a missing log can never hang the script forever.
  for (( i = 1; i <= attempts; i++ )); do
    [[ -f "$LMCACHE_LOG" ]] && break
    if [[ -n "${LMCACHE_PID:-}" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
      echo "LMCache server died before creating log file. Exiting." >&2
      exit 1
    fi
    sleep 1
  done
  if [[ ! -f "$LMCACHE_LOG" ]]; then
    echo "Timed out waiting for LMCache log file. Exiting." >&2
    exit 1
  fi

  # Phase 2: stream the log from its first line while polling the endpoint.
  tail -f -n +1 "$LMCACHE_LOG" &
  tail_pid=$!

  for (( i = 1; i <= attempts; i++ )); do
    if curl --output /dev/null --silent --fail \
      "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
      _lmcache_stop_log_tail "$tail_pid"
      return 0
    fi
    if [[ -n "${LMCACHE_PID:-}" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
      failure="LMCache server died before becoming healthy. Log follows:"
      break
    fi
    sleep 1
  done

  # Failure path (server died or attempts exhausted): stop the live stream
  # first so it cannot interleave with the diagnostic dump below.
  _lmcache_stop_log_tail "$tail_pid"
  echo "${failure:-Timed out waiting for LMCache server healthcheck. Log follows:}" >&2
  cat "$LMCACHE_LOG" >&2 || true
  exit 1
}
@@ -131,6 +151,7 @@ case "$OFFLOADING" in
131151 )
132152 ;;
133153 lmcache-mp)
154+ { set +x; } 2> /dev/null
134155 # LMCache docs recommend MP mode for production: start an external
135156 # `lmcache server`, then point vLLM's LMCacheMPConnector at it. For
136157 # vLLM >= 0.20, prefer the LMCache-shipped connector module because it
@@ -154,16 +175,21 @@ case "$OFFLOADING" in
154175 LMCACHE_MAX_WORKERS=" ${LMCACHE_MAX_WORKERS:- $TP } "
155176
156177 echo " Starting LMCache MP server..."
157- lmcache server \
158- --host " $LMCACHE_HOST " \
159- --port " $LMCACHE_PORT " \
160- --http-host " $LMCACHE_HOST " \
161- --http-port " $LMCACHE_HTTP_PORT " \
162- --l1-size-gb " $LMCACHE_L1_SIZE_GB " \
163- --l1-init-size-gb " $LMCACHE_L1_INIT_SIZE_GB " \
164- --chunk-size " $LMCACHE_CHUNK_SIZE " \
165- --max-workers " $LMCACHE_MAX_WORKERS " \
166- --eviction-policy LRU > " $LMCACHE_LOG " 2>&1 &
178+ LMCACHE_CMD=(
179+ lmcache server
180+ --host " $LMCACHE_HOST "
181+ --port " $LMCACHE_PORT "
182+ --http-host " $LMCACHE_HOST "
183+ --http-port " $LMCACHE_HTTP_PORT "
184+ --l1-size-gb " $LMCACHE_L1_SIZE_GB "
185+ --l1-init-size-gb " $LMCACHE_L1_INIT_SIZE_GB "
186+ --chunk-size " $LMCACHE_CHUNK_SIZE "
187+ --max-workers " $LMCACHE_MAX_WORKERS "
188+ --eviction-policy LRU
189+ )
190+ printf ' %q ' " ${LMCACHE_CMD[@]} " > " $RESULT_DIR /lmcache_command.txt"
191+ printf ' \n' >> " $RESULT_DIR /lmcache_command.txt"
192+ " ${LMCACHE_CMD[@]} " > " $LMCACHE_LOG " 2>&1 &
167193 LMCACHE_PID=$!
168194 echo " LMCache server PID: $LMCACHE_PID "
169195 wait_for_lmcache_ready
@@ -206,25 +232,31 @@ export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1
export VLLM_FLOAT32_MATMUL_PRECISION=high

# Disable xtrace without tracing the `set +x` itself.
{ set +x; } 2>/dev/null

# Build the vLLM server command as an array so the exact argv that is
# logged is also the argv that is executed.
VLLM_CMD=(vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT")
VLLM_CMD+=(--trust-remote-code --kv-cache-dtype fp8 --block-size 256)
VLLM_CMD+=("${PARALLEL_ARGS[@]}" "${EP_ARGS[@]}")
VLLM_CMD+=(
  --compilation-config
  '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
  --attention_config.use_fp4_indexer_cache=True
)
VLLM_CMD+=(
  --tokenizer-mode deepseek_v4
  --tool-call-parser deepseek_v4
  --enable-auto-tool-choice
  --reasoning-parser deepseek_v4
  --enable-prefix-caching
)
VLLM_CMD+=("${HYBRID_KV_ARGS[@]}")
VLLM_CMD+=(--max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS")
VLLM_CMD+=("${OFFLOAD_ARGS[@]}")

# Echo the shell-quoted command line to stdout and save it next to the
# results; one tee writes both the arguments and the trailing newline.
{
  printf '%q ' "${VLLM_CMD[@]}"
  printf '\n'
} | tee "$RESULT_DIR/vllm_command.txt"

# Launch the server in the background with all of its output in SERVER_LOG.
"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "Server PID: $SERVER_PID"
230262
0 commit comments