Skip to content

Commit d0b0752

Browse files
gHashTagona-agent
andcommitted
feat: add multi-model support (360M fast, 1.7B quality)
MODEL_SIZE env var selects model: - 360m: 0.39GB, 2.17s load, ~7 tok/s (fast) - 1.7b: 1.7GB, 4.82s load, ~1.4 tok/s (quality) Default changed to 360M for faster responses. Set MODEL_SIZE=1.7b for better quality. Co-authored-by: Ona <no-reply@ona.com>
1 parent 3b2b425 commit d0b0752

3 files changed

Lines changed: 60 additions & 4 deletions

File tree

docs/DISCOVERIES.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ Dequantization and SIMD are fast - the bottleneck is FILE READ.
294294

295295
| Version | Date | Changes |
296296
|---------|------|---------|
297+
| v1.6.0 | 2026-02-02 | Multi-model support (360M fast, 1.7B quality) |
297298
| v1.5.0 | 2026-02-02 | Batch metrics & throughput tracking (INF-004) |
298299
| v1.4.0 | 2026-02-02 | Fly.io Volumes - **43x faster load (208s→4.8s)** |
299300
| v1.3.0 | 2026-02-02 | Load profiling - found I/O bottleneck |
@@ -305,6 +306,36 @@ Dequantization and SIMD are fast - the bottleneck is FILE READ.
305306

306307
---
307308

309+
## Multi-Model Support
310+
311+
**Status**: ✅ Implemented
312+
313+
### Available Models
314+
315+
| Model | Size | Load Time | Inference | Use Case |
316+
|-------|------|-----------|-----------|----------|
317+
| SmolLM2-360M | 0.39GB | **2.17s** | ~7 tok/s | Fast responses |
318+
| SmolLM2-1.7B | 1.7GB | 4.82s | ~1.4 tok/s | Quality responses |
319+
320+
### Configuration
321+
322+
Set `MODEL_SIZE` environment variable in `fly.toml`:
323+
324+
```toml
325+
[env]
326+
MODEL_SIZE = "360m" # Options: "360m" (fast) or "1.7b" (quality)
327+
```
328+
329+
### Performance Comparison
330+
331+
| Metric | 360M | 1.7B | Improvement |
332+
|--------|------|------|-------------|
333+
| Model size | 0.39GB | 1.7GB | 4.4x smaller |
334+
| Load time | 2.17s | 4.82s | 2.2x faster |
335+
| Inference | ~7 tok/s | ~1.4 tok/s | ~5x faster |
336+
337+
---
338+
308339
## Batch Processing Metrics (INF-004)
309340

310341
**Status**: ✅ Phase 1 Implemented (Metrics)

entrypoint.sh

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,37 @@
22
# ═══════════════════════════════════════════════════════════════════════════════
33
# TRINITY LLM - Entrypoint Script
44
# Downloads model to NVMe volume on first run
5+
# Supports multiple model sizes via MODEL_SIZE env var
56
# φ² + 1/φ² = 3 = TRINITY
67
# ═══════════════════════════════════════════════════════════════════════════════
78

89
set -e
910

1011
MODEL_DIR="/data/models"
11-
MODEL_FILE="smollm2-1.7b-instruct-q8_0.gguf"
12+
13+
# Model selection based on MODEL_SIZE env var
14+
# Options: 360m (fast), 1.7b (default/quality)
15+
MODEL_SIZE="${MODEL_SIZE:-1.7b}"
16+
17+
case "${MODEL_SIZE}" in
18+
"360m"|"360M"|"fast")
19+
MODEL_FILE="smollm2-360m-instruct-q8_0.gguf"
20+
MODEL_URL="https://huggingface.co/bartowski/SmolLM2-360M-Instruct-GGUF/resolve/main/SmolLM2-360M-Instruct-Q8_0.gguf"
21+
MODEL_DESC="SmolLM2-360M (fast, 0.39GB)"
22+
;;
23+
"1.7b"|"1.7B"|"quality"|*)
24+
MODEL_FILE="smollm2-1.7b-instruct-q8_0.gguf"
25+
MODEL_URL="https://huggingface.co/bartowski/SmolLM2-1.7B-Instruct-GGUF/resolve/main/SmolLM2-1.7B-Instruct-Q8_0.gguf"
26+
MODEL_DESC="SmolLM2-1.7B (quality, 1.7GB)"
27+
;;
28+
esac
29+
1230
MODEL_PATH="${MODEL_DIR}/${MODEL_FILE}"
13-
MODEL_URL="https://huggingface.co/bartowski/SmolLM2-1.7B-Instruct-GGUF/resolve/main/SmolLM2-1.7B-Instruct-Q8_0.gguf"
1431

1532
echo "╔══════════════════════════════════════════════════════════════╗"
1633
echo "║ TRINITY LLM - Volume Initialization ║"
34+
echo "╠══════════════════════════════════════════════════════════════╣"
35+
echo "║ Model: ${MODEL_DESC}"
1736
echo "╚══════════════════════════════════════════════════════════════╝"
1837

1938
# Create models directory on volume
@@ -42,6 +61,7 @@ fi
4261
echo ""
4362
echo "Starting TRINITY LLM server..."
4463
echo "Model: ${MODEL_PATH}"
64+
echo "Size: ${MODEL_SIZE}"
4565
echo ""
4666

4767
# Start the server

fly.toml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,13 @@ primary_region = "iad"
1111
dockerfile = "Dockerfile"
1212

1313
[env]
14-
# Model path on persistent volume (NVMe SSD - 16x faster than ephemeral)
15-
MODEL_PATH = "/data/models/smollm2-1.7b-instruct-q8_0.gguf"
14+
# Model selection: "360m" (fast) or "1.7b" (quality)
15+
# 360M: 0.39GB, ~5x faster inference
16+
# 1.7B: 1.7GB, better quality responses
17+
MODEL_SIZE = "360m"
18+
19+
# Model path on persistent volume (NVMe SSD - 43x faster than ephemeral)
20+
MODEL_PATH = "/data/models/smollm2-360m-instruct-q8_0.gguf"
1621
TEMPERATURE = "0.7"
1722
TOP_P = "0.9"
1823
NUM_THREADS = "16"

0 commit comments

Comments
 (0)