gpu/templates/vllm-models/startup.sh at main · gpu-cli/gpu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/bin/bash
# vLLM Models Template - Startup Script
# Starts vLLM server with configured model and launches Web UI
set -e

# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Store HuggingFace models on the workspace volume (has more space than container disk)
export HF_HOME="$SCRIPT_DIR/.hf_cache"
mkdir -p "$HF_HOME"

# Process IDs for cleanup
VLLM_PID=""
WEB_PID=""

# Graceful shutdown handler
cleanup() {
  echo ""
  echo "Shutting down..."
  # Checkpoint database for clean sync (consolidates WAL into main db)
  curl -sf http://localhost:8080/api/checkpoint > /dev/null 2>&1 || true
  [ -n "$WEB_PID" ] && kill "$WEB_PID" 2>/dev/null
  [ -n "$VLLM_PID" ] && kill "$VLLM_PID" 2>/dev/null
  wait
  echo "Shutdown complete."
  exit 0
}
trap cleanup SIGTERM SIGINT

echo "=== vLLM Models Template ==="
echo "Working directory: $SCRIPT_DIR"
echo ""

# Read configuration from models.json
if [ ! -f "./models.json" ]; then
  echo "Error: models.json not found"
  exit 1
fi

MODEL=$(jq -r '.model' ./models.json)
GPU_MEM=$(jq -r '.vllm_args.gpu_memory_utilization // 0.9' ./models.json)
MAX_LEN=$(jq -r '.vllm_args.max_model_len // 32768' ./models.json)
TP_SIZE=$(jq -r '.vllm_args.tensor_parallel_size // 1' ./models.json)

if [ -z "$MODEL" ] || [ "$MODEL" = "null" ]; then
  echo "Error: No model specified in models.json"
  exit 1
fi

echo "Configuration:"
echo "  Model: $MODEL"
echo "  GPU Memory Utilization: $GPU_MEM"
echo "  Max Model Length: $MAX_LEN"
echo "  Tensor Parallel Size: $TP_SIZE"
echo ""

# Build vLLM command arguments
VLLM_ARGS=(
  "--model" "$MODEL"
  "--gpu-memory-utilization" "$GPU_MEM"
  "--max-model-len" "$MAX_LEN"
  "--tensor-parallel-size" "$TP_SIZE"
  "--host" "0.0.0.0"
  "--port" "8000"
)

# Add HuggingFace token if available (for gated models)
if [ -n "$HF_TOKEN" ]; then
  echo "HuggingFace token detected, enabling gated model access"
  export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
fi

# Check if vLLM is already running (from previous job)
if curl -sf http://localhost:8000/v1/models > /dev/null 2>&1; then
  echo "vLLM is already running (reusing existing server)"
  VLLM_PID=""
else
  # Start vLLM server in background
  # vLLM downloads models automatically with progress output (keeps SSH alive)
  echo "Starting vLLM server..."
  echo "Command: python -m vllm.entrypoints.openai.api_server ${VLLM_ARGS[*]}"
  echo ""
  echo "vLLM will download the model if not cached. This may take several minutes."
  echo ""
  python -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" &
  VLLM_PID=$!

  # Wait for vLLM to be ready
  echo "Waiting for vLLM API (downloading model and loading into GPU memory)..."
  TIMEOUT=900  # 15 minutes - for large model downloads
  ELAPSED=0
  while [ $ELAPSED -lt $TIMEOUT ]; do
    if curl -sf http://localhost:8000/v1/models > /dev/null 2>&1; then
      echo "vLLM is ready!"
      break
    fi
    if ! kill -0 $VLLM_PID 2>/dev/null; then
      echo "Error: vLLM process died unexpectedly"
      exit 1
    fi
    sleep 3
    ELAPSED=$((ELAPSED + 3))
    echo "  Still loading... ($ELAPSED/${TIMEOUT}s)"
  done

  if [ $ELAPSED -ge $TIMEOUT ]; then
    echo "Error: vLLM failed to start within ${TIMEOUT} seconds"
    exit 1
  fi
fi

# Copy models.json to ui/ so the web server can serve it
echo ""
cp -f ./models.json ./ui/models.json 2>/dev/null || true

# Check if Web UI is already running (from previous job)
if curl -sf http://localhost:8080/ > /dev/null 2>&1; then
  echo "Web UI is already running on port 8080 (reusing existing server)"
  WEB_PID=""
else
  echo "Starting Web UI on port 8080 (with conversation persistence)..."
  python server.py &
  WEB_PID=$!
fi

echo ""
echo "========================================"
echo "         vLLM MODELS READY"
echo "========================================"
echo ""
echo "  Model:     $MODEL"
echo "  vLLM API:  http://localhost:8000"
echo "  Web UI:    http://localhost:8080"
echo ""
echo "  API Examples:"
echo "    # List models"
echo "    curl http://localhost:8000/v1/models"
echo ""
echo "    # Chat (OpenAI-compatible)"
echo "    curl http://localhost:8000/v1/chat/completions \\"
echo "      -H 'Content-Type: application/json' \\"
echo "      -d '{\"model\":\"$MODEL\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello!\"}]}'"
echo ""
echo "  To change models:"
echo "    1. Edit models.json"
echo "    2. Run: gpu restart"
echo ""
echo "========================================"

# Wait for vLLM (main process) - keeps container running
wait $VLLM_PID