-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstartup.sh
More file actions
153 lines (135 loc) · 4.47 KB
/
Copy pathstartup.sh
File metadata and controls
153 lines (135 loc) · 4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/bin/bash
#
# vLLM Models Template - Startup Script
#
# Brings up the vLLM server for the configured model, then the Web UI.

# Abort on the first unhandled command failure.
set -o errexit

# Resolve the directory that holds this script and work from there, so the
# relative paths used below do not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Keep the HuggingFace cache next to the script: that is the workspace
# volume, which has more space than the container disk.
HF_HOME="$SCRIPT_DIR/.hf_cache"
export HF_HOME
mkdir -p "$HF_HOME"

# PIDs of the servers launched by this script, consulted during cleanup.
# Both start out empty, meaning "nothing launched yet".
WEB_PID=""
VLLM_PID=""
# Graceful shutdown handler.
#
# Asks the Web UI to checkpoint its database, sends SIGTERM to whichever
# servers this script started, waits for them to exit, then exits 0.
#
# Globals:
#   WEB_PID  (read) - PID of the Web UI started by this script, or empty
#   VLLM_PID (read) - PID of the vLLM server started by this script, or empty
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Does not return; terminates the script with status 0.
cleanup() {
  echo ""
  echo "Shutting down..."

  # Checkpoint database for clean sync (consolidates WAL into main db).
  # Best effort: failures are ignored, and --max-time bounds the request so
  # an unresponsive Web UI cannot stall the shutdown.
  curl -sf --max-time 10 http://localhost:8080/api/checkpoint > /dev/null 2>&1 || true

  # kill fails when the target has already exited. Under `set -e` such a
  # failure would abort this handler before the other server is stopped, so
  # every kill is explicitly allowed to fail.
  if [ -n "$WEB_PID" ]; then
    kill "$WEB_PID" 2>/dev/null || true
  fi
  if [ -n "$VLLM_PID" ]; then
    kill "$VLLM_PID" 2>/dev/null || true
  fi

  # Reap all remaining background children before reporting completion.
  wait
  echo "Shutdown complete."
  exit 0
}
# Route termination and interrupt signals through the shutdown handler.
trap cleanup SIGTERM SIGINT

printf '%s\n' \
  "=== vLLM Models Template ===" \
  "Working directory: $SCRIPT_DIR" \
  ""

# All settings come from models.json; stop early when the file is absent.
[ -f "./models.json" ] || {
  echo "Error: models.json not found"
  exit 1
}

# Model name plus the vLLM tuning values. jq's `//` operator supplies the
# fallback shown when the key is missing or null.
MODEL="$(jq -r '.model' ./models.json)"
GPU_MEM="$(jq -r '.vllm_args.gpu_memory_utilization // 0.9' ./models.json)"
MAX_LEN="$(jq -r '.vllm_args.max_model_len // 32768' ./models.json)"
TP_SIZE="$(jq -r '.vllm_args.tensor_parallel_size // 1' ./models.json)"

# `jq -r` prints the literal text "null" for a missing key, so reject that
# as well as an empty string.
case "$MODEL" in
  "" | "null")
    echo "Error: No model specified in models.json"
    exit 1
    ;;
esac

printf '%s\n' \
  "Configuration:" \
  " Model: $MODEL" \
  " GPU Memory Utilization: $GPU_MEM" \
  " Max Model Length: $MAX_LEN" \
  " Tensor Parallel Size: $TP_SIZE" \
  ""
# Assemble the vLLM server command line, one option/value pair at a time.
# An array keeps each value a single word regardless of its content.
VLLM_ARGS=()
VLLM_ARGS+=("--model" "$MODEL")
VLLM_ARGS+=("--gpu-memory-utilization" "$GPU_MEM")
VLLM_ARGS+=("--max-model-len" "$MAX_LEN")
VLLM_ARGS+=("--tensor-parallel-size" "$TP_SIZE")
VLLM_ARGS+=("--host" "0.0.0.0")
VLLM_ARGS+=("--port" "8000")

# Gated models need credentials: when HF_TOKEN is non-empty, export the same
# value under the HUGGING_FACE_HUB_TOKEN name for child processes.
if [[ -n "$HF_TOKEN" ]]; then
  echo "HuggingFace token detected, enabling gated model access"
  HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
  export HUGGING_FACE_HUB_TOKEN
fi
# Start the vLLM server unless one is already answering on port 8000.
#
# Reads:  VLLM_ARGS - command-line arguments for the server
# Writes: VLLM_PID  - PID of the server started here; empty when an existing
#                     server is reused
#         TIMEOUT, ELAPSED - readiness-poll bookkeeping, in seconds
# Exits with status 1 when the server process dies during startup or does not
# become ready within TIMEOUT seconds.

# Check if vLLM is already running (from previous job).
# --max-time bounds each probe so that a port which accepts the connection
# but never answers cannot block the script.
if curl -sf --max-time 5 http://localhost:8000/v1/models > /dev/null 2>&1; then
  echo "vLLM is already running (reusing existing server)"
  VLLM_PID=""
else
  # Start vLLM server in background
  # vLLM downloads models automatically with progress output (keeps SSH alive)
  echo "Starting vLLM server..."
  echo "Command: python -m vllm.entrypoints.openai.api_server ${VLLM_ARGS[*]}"
  echo ""
  echo "vLLM will download the model if not cached. This may take several minutes."
  echo ""
  python -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" &
  VLLM_PID=$!

  # Wait for vLLM to be ready
  echo "Waiting for vLLM API (downloading model and loading into GPU memory)..."
  TIMEOUT=900 # 15 minutes - for large model downloads
  ELAPSED=0
  while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
    if curl -sf --max-time 5 http://localhost:8000/v1/models > /dev/null 2>&1; then
      echo "vLLM is ready!"
      break
    fi

    # kill -0 sends no signal; it only reports whether the process exists.
    if ! kill -0 "$VLLM_PID" 2>/dev/null; then
      echo "Error: vLLM process died unexpectedly"
      exit 1
    fi

    sleep 3
    ELAPSED=$((ELAPSED + 3))
    echo " Still loading... ($ELAPSED/${TIMEOUT}s)"
  done

  # The loop leaves ELAPSED below TIMEOUT only via the "ready" break above.
  if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
    echo "Error: vLLM failed to start within ${TIMEOUT} seconds"
    # Do not leave the half-started server behind: it would keep running,
    # untracked, after this script has reported failure.
    kill "$VLLM_PID" 2>/dev/null || true
    exit 1
  fi
fi
echo ""

# Publish models.json into ui/ so the web server can serve it.
# Best effort: a failed copy is silenced and does not stop the script.
cp -f ./models.json ./ui/models.json 2>/dev/null || true

# Launch the Web UI only when nothing answers on port 8080 yet (a server
# left over from a previous job is reused instead).
if ! curl -sf http://localhost:8080/ > /dev/null 2>&1; then
  echo "Starting Web UI on port 8080 (with conversation persistence)..."
  python server.py &
  WEB_PID=$!
else
  echo "Web UI is already running on port 8080 (reusing existing server)"
  WEB_PID=""
fi
# Print the "ready" banner: endpoints, sample API calls, and how to switch
# models. Each argument below becomes one output line.
printf '%s\n' \
  "" \
  "========================================" \
  " vLLM MODELS READY" \
  "========================================" \
  "" \
  " Model: $MODEL" \
  " vLLM API: http://localhost:8000" \
  " Web UI: http://localhost:8080" \
  "" \
  " API Examples:" \
  " # List models" \
  " curl http://localhost:8000/v1/models" \
  "" \
  " # Chat (OpenAI-compatible)" \
  " curl http://localhost:8000/v1/chat/completions \\" \
  " -H 'Content-Type: application/json' \\" \
  " -d '{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello!\"}]}'" \
  "" \
  " To change models:" \
  " 1. Edit models.json" \
  " 2. Run: gpu restart" \
  "" \
  "========================================"
# Block on vLLM (the main process) so the container keeps running.
# When no vLLM PID was recorded (an existing server was reused), wait for
# every remaining background child of this script instead.
if [ -n "$VLLM_PID" ]; then
  wait "$VLLM_PID"
else
  wait
fi