#!/bin/bash
#
# Launches NUM_INSTANCES vLLM engines, each on its own free port and idle GPU,
# waits until they all answer on /v1/models, then starts the round-robin
# router in front of them.
#
# Should be launched from 2-serving-engines/flat/choose-and-deploy.sh; the
# config and router paths used further down are relative to the working
# directory.

echo "VIRTUAL_ENV is: $VIRTUAL_ENV"

# Preflight: bail out early, with enough context to debug the environment,
# if the vllm CLI is not on PATH.
if ! command -v vllm &> /dev/null; then
  echo "ERROR: vllm command not found in PATH" >&2
  echo "Please ensure vLLM is installed and accessible:" >&2
  echo "  pip install vllm" >&2
  echo "Or activate the appropriate virtual environment" >&2
  echo "Current PATH: $PATH" >&2
  echo "Python location: $(command -v python3 2> /dev/null || echo 'not found')" >&2
  exit 1
fi

# Number of vLLM engines to launch; one free port and one idle GPU are
# claimed per engine.
NUM_INSTANCES=4


#######################################
# Find N TCP ports that have no listener, scanning upward from a start port.
# Arguments:
#   $1 - first port to probe
#   $2 - number of free ports wanted
# Outputs:
#   The free ports on stdout, space-separated on a single line.
# Returns:
#   0 on success; 1 (printing nothing on stdout) if the scan runs past
#   port 65535 before enough free ports are found.
# NOTE: a port counts as free when `lsof` exits non-zero for it, so if lsof
# is not installed every probed port is reported as free.
#######################################
find_free_ports() {
  local start=$1
  local count=$2
  local port=$start
  local free_ports=()

  while [[ "${#free_ports[@]}" -lt "$count" ]]; do
    # Never hand out a port number outside the valid TCP range.
    if (( port > 65535 )); then
      echo "ERROR: Only found ${#free_ports[@]} free ports from $start, need $count" >&2
      return 1
    fi

    # lsof exits 0 only when some process is listening on the port.
    if ! lsof -iTCP:"$port" -sTCP:LISTEN &> /dev/null; then
      free_ports+=("$port")
    fi
    port=$(( port + 1 ))
  done

  echo "${free_ports[@]}"
}

#######################################
# Find N GPUs that currently have no compute process running on them.
# Arguments:
#   $1 - number of idle GPUs wanted
# Outputs:
#   The indices of the idle GPUs on stdout, space-separated on one line.
#   An error message on stderr if fewer than $1 idle GPUs exist.
# Returns:
#   0 on success. On shortage it calls `exit 1`; inside a command or process
#   substitution that only ends the subshell, so callers should still
#   validate how many indices they received.
#######################################
find_free_gpus() {
  local count=$1
  local free_gpus=()
  local busy_uuids idx uuid

  # One query for every GPU UUID that has a compute process attached;
  # hoisted out of the loop instead of re-running nvidia-smi per GPU.
  busy_uuids=$(nvidia-smi --query-compute-apps=gpu_uuid --format=csv,noheader)

  # Each line looks like "0, GPU-xxxxxxxx"; split on the comma/space.
  while IFS=', ' read -r idx uuid; do
    # Skip anything that is not a well-formed "index, uuid" row (for example
    # an nvidia-smi error message); an empty UUID would match every busy row.
    [[ "$idx" =~ ^[0-9]+$ && -n "$uuid" ]] || continue

    if [[ "$busy_uuids" != *"$uuid"* ]]; then
      free_gpus+=("$idx")
    fi
    if [[ "${#free_gpus[@]}" -ge "$count" ]]; then
      break
    fi
  done < <(nvidia-smi --query-gpu=index,uuid --format=csv,noheader)

  if [[ "${#free_gpus[@]}" -lt "$count" ]]; then
    echo "ERROR: Only found ${#free_gpus[@]} free GPUs, need $count" >&2
    exit 1
  fi

  echo "${free_gpus[@]}"
}

# Claim NUM_INSTANCES free ports (scanning upward from 8000) and the same
# number of idle GPUs. Both helpers print one space-separated line, which
# `read -a` splits into an array without glob expansion.
read -r -a free_ports < <(find_free_ports 8000 "$NUM_INSTANCES")
read -r -a free_gpus < <(find_free_gpus "$NUM_INSTANCES")

echo "Using ports: ${free_ports[*]}"
echo "Using GPUs: ${free_gpus[*]}"

# The helpers run in subshells, so a failure inside them cannot stop this
# script; validate the counts here instead.
if [[ "${#free_ports[@]}" -ne "$NUM_INSTANCES" ]]; then
  echo "ERROR: Only found ${#free_ports[@]} free ports, need $NUM_INSTANCES" >&2
  exit 1
fi

if [[ "${#free_gpus[@]}" -ne "$NUM_INSTANCES" ]]; then
  echo "ERROR: Only found ${#free_gpus[@]} free GPUs, need $NUM_INSTANCES" >&2
  exit 1
fi


# Start one engine per (port, GPU) pair in the background; each engine sees
# only its own GPU and logs to vllm_<port>.log in the working directory.
for (( i = 0; i < NUM_INSTANCES; i++ )); do
  port="${free_ports[i]}"
  gpu="${free_gpus[i]}"
  log_file="vllm_${port}.log"

  echo "Launching vLLM on port $port with GPU $gpu..."
  CUDA_VISIBLE_DEVICES="$gpu" \
  LMCACHE_CONFIG_FILE="configs/cpu-offload.yaml" \
    nohup vllm serve \
      meta-llama/Llama-3.1-8B-Instruct \
      --max-model-len 32000 \
      --port "$port" \
      > "$log_file" 2>&1 &
done

# Readiness gate: query each engine's /v1/models endpoint in round-robin
# order and proceed only after NUM_INSTANCES consecutive successes, i.e.
# every engine answered in the same pass. Any failure resets the streak.
# After every probe the tail of that engine's log is printed so startup
# progress (or a crash) is visible.
# NOTE: this loop has no overall deadline; if an engine dies it polls forever.
echo "Waiting for all $NUM_INSTANCES engines to be ready in a row..."


ready_in_a_row=0
i=0
while true; do
  port="${free_ports[i % NUM_INSTANCES]}"
  log_file="vllm_${port}.log"

  echo "⏳ Checking port $port..."
  # -f turns HTTP error statuses into a failure so only a real success
  # counts; --max-time keeps a hung engine from stalling the loop.
  if curl -sf --max-time 5 "http://localhost:${port}/v1/models" > /dev/null 2>&1; then
    echo "✅ Port $port responded OK"
    ready_in_a_row=$(( ready_in_a_row + 1 ))
  else
    echo "❌ Port $port not ready. Resetting counter."
    ready_in_a_row=0
  fi

  echo "↪ Log tail for port $port:"
  # Silence tail's own error; the fallback message covers a missing log.
  tail -n 5 "$log_file" 2> /dev/null || echo "(no log yet)"
  echo ""

  if [[ "$ready_in_a_row" -ge "$NUM_INSTANCES" ]]; then
    echo "🎉 All $NUM_INSTANCES engines responded successfully in a row"
    break
  fi

  sleep 2
  i=$(( i + 1 ))
done

# Join the ports with commas (IFS is changed only inside the subshell) and
# hand them to the router, which is left running in the background.
port_arg=$(IFS=,; echo "${free_ports[*]}")

nohup python routers/round-robin-router.py --ports "$port_arg" &