feat: configure Fly.io Volumes for NVMe SSD storage

gHashTag · ona-agent · gHashTag · commit 99639ff778a0 · 2026-02-02T07:34:27.000Z
CRITICAL FIX: Use NVMe volume instead of ephemeral disk

Volume performance (performance-16x):
- NVMe: 32,000 IOPs, 128 MiB/s
- Ephemeral: 2,000 IOPs, 8 MiB/s
- Improvement: 16x faster I/O

Changes:
- fly.toml: Add [[mounts]] for trinity_models volume
- Dockerfile: Use entrypoint.sh for model initialization
- entrypoint.sh: Download model to volume on first run

Expected load time: 208s → ~13s

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,7 @@
 # TRINITY LLM - Zig-based LLM Inference Engine
 # phi^2 + 1/phi^2 = 3 = TRINITY
+#
+# Uses Fly.io Volumes for NVMe SSD storage (16x faster than ephemeral)
 
 FROM debian:bookworm-slim AS builder
 
@@ -36,23 +38,17 @@ WORKDIR /app
 # Copy binary from builder
 COPY --from=builder /build/vibee /app/vibee
 
-# Create models directory
-RUN mkdir -p /app/models
-
-# Download SmolLM2-1.7B Q8_0 (better quality, larger model)
-# Size: ~1.8GB, loads in ~10-15 seconds
-# For smaller/faster option, use SmolLM2-360M or SmolLM-135M
-RUN echo "Downloading SmolLM2-1.7B-Instruct Q8_0..." && \
-    curl -L -o /app/models/smollm2-1.7b-instruct-q8_0.gguf \
-    "https://huggingface.co/bartowski/SmolLM2-1.7B-Instruct-GGUF/resolve/main/SmolLM2-1.7B-Instruct-Q8_0.gguf" && \
-    ls -la /app/models/
+# Copy entrypoint script (downloads the model to the volume if missing, then execs the server)
+COPY entrypoint.sh /app/entrypoint.sh
+RUN chmod +x /app/entrypoint.sh
 
 # Set environment
-ENV MODEL_PATH=/app/models/smollm2-1.7b-instruct-q8_0.gguf
+# MODEL_PATH points to volume mount (NVMe SSD); entrypoint.sh hardcodes the same path, keep them in sync
+ENV MODEL_PATH=/data/models/smollm2-1.7b-instruct-q8_0.gguf
 ENV TEMPERATURE=0.7
 ENV TOP_P=0.9
 ENV NUM_THREADS=16
 
 # Run HTTP API server
 EXPOSE 8080
-CMD ["/app/vibee", "serve", "--model", "/app/models/smollm2-1.7b-instruct-q8_0.gguf", "--port", "8080"]
+ENTRYPOINT ["/app/entrypoint.sh"]
diff --git a/docs/DISCOVERIES.md b/docs/DISCOVERIES.md
@@ -234,17 +234,55 @@ Dequantization and SIMD are fast - the bottleneck is FILE READ.
 
 ### Recommended Solutions
 
-1. **Fly.io Volumes** - Use local SSD storage (HIGH IMPACT)
+1. **Fly.io Volumes** - Use local SSD storage (HIGH IMPACT) ✅ IMPLEMENTED
 2. **Memory-map model** - mmap() for lazy loading (MEDIUM)
 3. **Smaller model** - Use 360M instead of 1.7B (WORKAROUND)
 4. **Pre-warm on deploy** - Keep model in memory (WORKAROUND)
 
 ---
 
+## Fly.io Volumes Configuration
+
+**Status**: ✅ Implemented
+
+### Volume Performance (performance-16x)
+
+| Storage Type | IOPs | Bandwidth |
+|--------------|------|-----------|
+| Ephemeral disk | 2,000 | 8 MiB/s |
+| **NVMe Volume** | **32,000** | **128 MiB/s** |
+| **Improvement** | **16x** | **16x** |
+
+### Configuration Changes
+
+**fly.toml:**
+```toml
+[[mounts]]
+  source = "trinity_models"
+  destination = "/data/models"
+  initial_size = "3gb"
+```
+
+**entrypoint.sh:**
+- Downloads model to volume on first run
+- Subsequent starts skip the download and load the cached model straight from the volume
+- Model persists across deploys
+
+### Expected Impact
+
+| Metric | Before (Ephemeral) | After (Volume) |
+|--------|-------------------|----------------|
+| Load time | 208s | ~13s (estimated) |
+| First deploy | 208s | ~60s (download) |
+| Subsequent | 208s | ~13s |
+
+---
+
 ## Version History
 
 | Version | Date | Changes |
 |---------|------|---------|
+| v1.4.0 | 2026-02-02 | Fly.io Volumes for NVMe SSD storage |
 | v1.3.0 | 2026-02-02 | Load profiling - found I/O bottleneck |
 | v1.2.0 | 2026-02-02 | Parallel dequantization (OPT-003) |
 | v1.1.0 | 2026-02-02 | SIMD optimization (OPT-001) |
diff --git a/entrypoint.sh b/entrypoint.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# ═══════════════════════════════════════════════════════════════════════════════
+# TRINITY LLM - Entrypoint Script
+# Ensures the model file is present on the volume, then starts the server.
+# Downloads model to NVMe volume on first run
+# φ² + 1/φ² = 3 = TRINITY
+# ═══════════════════════════════════════════════════════════════════════════════
+
+# Abort on the first failing command so a failed download never reaches the server start.
+set -e
+
+MODEL_DIR="/data/models"
+MODEL_FILE="smollm2-1.7b-instruct-q8_0.gguf"
+MODEL_PATH="${MODEL_DIR}/${MODEL_FILE}"
+MODEL_URL="https://huggingface.co/bartowski/SmolLM2-1.7B-Instruct-GGUF/resolve/main/SmolLM2-1.7B-Instruct-Q8_0.gguf"
+# Staging file for the download; it is renamed to MODEL_PATH only after curl
+# succeeds, so an interrupted transfer is never mistaken for a cached model.
+MODEL_TMP="${MODEL_PATH}.part"
+
+echo "╔══════════════════════════════════════════════════════════════╗"
+echo "║           TRINITY LLM - Volume Initialization                ║"
+echo "╚══════════════════════════════════════════════════════════════╝"
+
+# Create models directory on volume
+mkdir -p "${MODEL_DIR}"
+
+# Use the cached model only if it is a non-empty regular file
+if [ -f "${MODEL_PATH}" ] && [ -s "${MODEL_PATH}" ]; then
+    echo "✓ Model found on NVMe volume: ${MODEL_PATH}"
+    ls -lh "${MODEL_PATH}"
+else
+    echo "⚡ Model not found on volume. Downloading to NVMe SSD..."
+    echo "   This is a one-time operation. Future starts will be instant."
+    echo ""
+    echo "   Downloading: ${MODEL_FILE}"
+    echo "   From: ${MODEL_URL}"
+    echo ""
+
+    # Discard any partial file left by a previously interrupted download
+    rm -f "${MODEL_TMP}"
+
+    # Download with progress. --fail turns HTTP errors (404, 5xx) into a non-zero exit
+    # instead of saving the error page as the model; --retry covers transient failures.
+    curl -fL --retry 3 --progress-bar -o "${MODEL_TMP}" "${MODEL_URL}"
+
+    # Publish the finished file; both paths are in the same directory, so this is a rename
+    mv "${MODEL_TMP}" "${MODEL_PATH}"
+
+    echo ""
+    echo "✓ Download complete!"
+    ls -lh "${MODEL_PATH}"
+fi
+
+echo ""
+echo "Starting TRINITY LLM server..."
+echo "Model: ${MODEL_PATH}"
+echo ""
+
+# Start the server; exec replaces this shell with the server process
+exec /app/vibee serve --model "${MODEL_PATH}" --port 8080
diff --git a/fly.toml b/fly.toml
@@ -11,27 +11,25 @@ primary_region = "iad"
   dockerfile = "Dockerfile"
 
 [env]
-  MODEL_PATH = "/app/models/smollm2-1.7b-instruct-q8_0.gguf"
+  # Model path on the [[mounts]] volume below (NVMe SSD - 16x faster than ephemeral); entrypoint.sh hardcodes the same path
+  MODEL_PATH = "/data/models/smollm2-1.7b-instruct-q8_0.gguf"
   TEMPERATURE = "0.7"
   TOP_P = "0.9"
   NUM_THREADS = "16"
 
 # SmolLM2-1.7B requires more RAM (~4GB for model + buffers)
-# performance-4x: 4 dedicated CPU cores, 8GB RAM
 [[vm]]
   size = "performance-16x"
   memory = "32gb"
   cpus = 16
 
-# Alternative sizes:
-# performance-8x: 8 CPU, 16GB RAM (faster, more expensive)
-# performance-16x: 16 CPU, 32GB RAM (maximum speed)
-# shared-cpu-4x: 4 shared CPU, 8GB RAM (cheaper)
-
-# Persistent volume for models
-# [[mounts]]
-#   source = "trinity_models"
-#   destination = "/app/models"
+# Persistent volume for models (NVMe SSD)
+# Volume limits for performance-16x: 32,000 IOPs, 128 MiB/s
+# vs ephemeral disk: 2,000 IOPs, 8 MiB/s (16x slower!)
+[[mounts]]
+  source = "trinity_models"
+  destination = "/data/models"
+  initial_size = "3gb"  # headroom over the ~1.8GB model file
 
 # HTTP API service
 [http_service]