|
#!/bin/bash
#
# Provisions a TensorRT-LLM inference server inside Docker and writes a couple
# of client examples into the target user's home directory.
#
# Environment variables read further down in this script:
#   TRTLLM_MODEL, TRTLLM_PORT, TRTLLM_IMAGE, HF_TOKEN

# Abort on the first command that fails outside of a conditional context.
set -o errexit
| 3 | + |
#######################################
# Detect the Brev user (handles ubuntu, nvidia, shadeform, etc.).
#
# Detection order:
#   1. $SUDO_USER, when it is set and is not "root".
#   2. The first directory under the home root (skipping "launchpad") that
#      carries a Brev-specific marker: a .lifecycle-script-ls-*.log file, a
#      .verb-setup.log file, or a .cache symlink pointing at
#      /ephemeral/cache.
#   3. "nvidia" when <home root>/nvidia exists, otherwise "ubuntu".
#
# Globals:   SUDO_USER (read)
# Arguments: $1 - directory holding the user homes (optional, default /home)
# Outputs:   writes the detected username to stdout
# Returns:   0 always
#######################################
detect_brev_user() {
  local home_root="${1:-/home}"
  local user_home username lifecycle_log has_lifecycle_log

  if [[ -n "${SUDO_USER:-}" && "${SUDO_USER}" != "root" ]]; then
    printf '%s\n' "${SUDO_USER}"
    return 0
  fi

  # Check for Brev-specific markers
  for user_home in "${home_root}"/*; do
    # An unmatched glob stays literal; skip it along with any stray files.
    [[ -d "${user_home}" ]] || continue

    username="${user_home##*/}"
    if [[ "${username}" == "launchpad" ]]; then
      continue
    fi

    # Glob directly instead of parsing `ls` output.
    has_lifecycle_log=0
    for lifecycle_log in "${user_home}"/.lifecycle-script-ls-*.log; do
      if [[ -e "${lifecycle_log}" || -L "${lifecycle_log}" ]]; then
        has_lifecycle_log=1
        break
      fi
    done

    if (( has_lifecycle_log )) \
      || [[ -f "${user_home}/.verb-setup.log" ]] \
      || [[ -L "${user_home}/.cache" \
        && "$(readlink "${user_home}/.cache")" == "/ephemeral/cache" ]]; then
      printf '%s\n' "${username}"
      return 0
    fi
  done

  # Fallback to common users
  if [[ -d "${home_root}/nvidia" ]]; then
    printf 'nvidia\n'
    return 0
  fi
  # "ubuntu" is the answer whether or not its home directory exists.
  printf 'ubuntu\n'
}
| 26 | + |
# When the script runs with root privileges (effective UID 0, or USER is
# literally "root"), switch USER and HOME to the detected Brev account so that
# everything written below ends up under that account's home directory.
if (( EUID == 0 )) || [[ "${USER:-}" == "root" ]]; then
  DETECTED_USER="$(detect_brev_user)"
  USER="${DETECTED_USER}"
  HOME="/home/${DETECTED_USER}"
  export USER HOME
fi
| 33 | + |
# Configuration (override with environment variables)
#   TRTLLM_MODEL - model identifier handed to trtllm-serve
#   TRTLLM_PORT  - host port published for the API
#   TRTLLM_IMAGE - container image to run
MODEL="${TRTLLM_MODEL:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}"
PORT="${TRTLLM_PORT:-8000}"
IMAGE="${TRTLLM_IMAGE:-nvcr.io/nvidia/tensorrt-llm/release:latest}"

#######################################
# Check that a value is a usable TCP port number.
# Arguments: $1 - candidate port
# Returns:   0 if $1 is a decimal integer in 1..65535 without leading zeros,
#            1 otherwise
#######################################
is_valid_port() {
  local port="${1:-}"
  [[ "${port}" =~ ^[1-9][0-9]{0,4}$ ]] && (( port <= 65535 ))
}

# Fail fast: a bad port would otherwise only surface as a docker error after
# the image is requested, and would be baked into the generated examples.
if ! is_valid_port "${PORT}"; then
  echo "❌ Invalid TRTLLM_PORT '${PORT}' - expected an integer between 1 and 65535" >&2
  exit 1
fi

echo "⚡ Setting up TensorRT-LLM inference server..."
echo "User: $USER | Home: $HOME"
echo "Model: $MODEL | Port: $PORT"
echo "Image: $IMAGE"

# Note: Brev already has Docker and NVIDIA Container Toolkit installed
echo "Using existing Docker installation..."
| 46 | + |
# Verify GPU is available
if ! command -v nvidia-smi > /dev/null 2>&1; then
  echo "❌ No GPU detected - This is a script meant to be run on a NVIDIA Brev GPU instance!" >&2
  exit 1
fi

# The binary being installed does not prove a GPU is usable, so ask it for the
# first GPU name and treat an empty answer (or a failing query) as "no GPU".
gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2> /dev/null | head -n 1)" \
  || gpu_name=""
if [[ -z "${gpu_name}" ]]; then
  echo "❌ nvidia-smi is installed but reported no GPU - check the NVIDIA driver on this instance." >&2
  exit 1
fi
echo "GPU detected: ${gpu_name}"

# Create cache directory for HuggingFace models
mkdir -p "$HOME/.cache/huggingface"
| 57 | + |
# Clear out any previous container named "trtllm", whether running or stopped.
# Both docker calls are best-effort: a failure here is not fatal on its own.
if docker ps -a --format '{{.Names}}' | grep -qx 'trtllm'; then
  echo "Removing existing TensorRT-LLM container..."
  docker stop trtllm 2>/dev/null || true
  docker rm trtllm 2>/dev/null || true
fi

# Launch the TensorRT-LLM server container in the background.
echo "Starting TensorRT-LLM server with $MODEL..."
echo "This may take 10-20+ minutes on first run (engine building + model download)..."

# Options for `docker run`, kept in an array so each word stays intact.
docker_run_args=(
  -d
  --name trtllm
  --restart unless-stopped
  --gpus all
  --ipc host
  --ulimit memlock=-1
  --ulimit stack=67108864
  # Share the host HuggingFace cache with the container.
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface"
  # Publish container port 8000 on the configured host port.
  -p "$PORT:8000"
  # Forward the HuggingFace token under both variable names (empty if unset).
  -e "HF_TOKEN=${HF_TOKEN:-}"
  -e "HUGGING_FACE_HUB_TOKEN=${HF_TOKEN:-}"
)

docker run "${docker_run_args[@]}" \
  "$IMAGE" \
  trtllm-serve serve "$MODEL" --host 0.0.0.0 --port 8000
| 81 | + |
# Directory that receives the generated client examples.
examples_dir="$HOME/trtllm-examples"
mkdir -p "${examples_dir}"

# Python example. The heredoc delimiter is unquoted on purpose so that
# ${PORT} and ${MODEL} are substituted when the file is generated.
cat << EOF > "${examples_dir}/chat.py"
#!/usr/bin/env python3
"""Example: Chat with TensorRT-LLM using OpenAI SDK"""
from openai import OpenAI

client = OpenAI(base_url="http://localhost:${PORT}/v1", api_key="tensorrt_llm")

response = client.chat.completions.create(
    model="${MODEL}",
    messages=[{"role": "user", "content": "Explain what TensorRT-LLM is in two sentences."}]
)

print(response.choices[0].message.content)
EOF

# curl example. Each doubled backslash is written out as a single
# line-continuation backslash in the generated script.
cat << EOF > "${examples_dir}/test_api.sh"
#!/bin/bash
# Test TensorRT-LLM API with curl
curl -s http://localhost:${PORT}/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -d '{
    "model": "${MODEL}",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 100
  }' | python3 -m json.tool
EOF

# Make both examples directly executable.
chmod +x "${examples_dir}/chat.py" "${examples_dir}/test_api.sh"
| 115 | + |
# Fix permissions if running as root: everything created above is owned by
# root at this point and has to be handed to the target user.
if [[ "$(id -u)" -eq 0 ]]; then
  # Use the user's real primary group rather than assuming a group named
  # after the user exists; if the lookup fails, change the owner only.
  owner_group="$(id -gn "$USER" 2> /dev/null)" || owner_group=""
  owner_spec="${USER}${owner_group:+:${owner_group}}"

  chown -R "${owner_spec}" "$HOME/.cache/huggingface" "$HOME/trtllm-examples"

  # `mkdir -p` may have created ~/.cache itself as root. Hand it over too, but
  # only when it is a real directory (not a symlink) currently owned by the
  # user running this script, i.e. root; contents are not touched.
  if [[ -d "$HOME/.cache" && ! -L "$HOME/.cache" && -O "$HOME/.cache" ]]; then
    chown "${owner_spec}" "$HOME/.cache"
  fi
fi
| 121 | + |
# Wait for container to start
echo "Waiting for TensorRT-LLM to initialize..."
echo "(Engine building and model loading may take 10-20+ minutes on first run)"
sleep 5

# Verify
echo ""
echo "Verifying installation..."
docker ps --filter "name=trtllm" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"

# `docker ps --filter` succeeds even when nothing matches, so read the actual
# container state before claiming success. Anything other than "running"
# (exited, restarting, missing, ...) is reported as a failure.
container_state="$(docker inspect --format '{{.State.Status}}' trtllm 2> /dev/null)" \
  || container_state="missing"
if [[ "${container_state}" != "running" ]]; then
  {
    echo ""
    echo "❌ TensorRT-LLM container is not running (state: ${container_state})."
    echo "   Recent container logs (docker logs trtllm):"
  } >&2
  # Best-effort: the container may not exist at all.
  docker logs --tail 20 trtllm 1>&2 || true
  exit 1
fi

echo ""
echo "✅ TensorRT-LLM container running!"
echo ""
echo "⏳ The engine is still building — first run takes 10-20+ minutes."
echo "   Subsequent starts are much faster (engine is cached)."
echo "   Run this to watch progress:"
echo "     docker logs -f trtllm"
echo ""
echo "   The API is ready when you see: \"Started server process\""
echo ""
echo "Model: $MODEL"
echo "API Endpoint: http://localhost:$PORT"
echo "OpenAI-compatible: http://localhost:$PORT/v1"
echo ""
echo "⚠️  To access from outside Brev, open port: ${PORT}/tcp"
echo ""
echo "Quick start (after engine finishes building):"
echo "  pip install openai"
echo "  python3 $HOME/trtllm-examples/chat.py"
echo "  bash $HOME/trtllm-examples/test_api.sh"
echo ""
echo "Manage:"
echo "  docker logs -f trtllm     # Watch startup progress"
echo "  docker restart trtllm     # Restart server"
echo "  docker stop trtllm        # Stop server"
echo ""
echo "Run with a different model:"
echo "  export HF_TOKEN={YOUR_HF_TOKEN}"
echo "  TRTLLM_MODEL=nvidia/Llama-3.1-8B-Instruct-FP8 bash setup.sh"
0 commit comments