Skip to content

Commit d8f169e

Browse files
gHashTagona-agent
andcommitted
Add ternary quantization, Flash Attention, and parallel inference
- specs/tri/ternary_smollm2.vibee: Ternary {-1,0,+1} quantization spec - specs/tri/flash_attention.vibee: IO-aware tiled attention spec - specs/tri/parallel_inference.vibee: Multi-threaded inference spec - src/vibeec/gguf_to_tri.zig: GGUF to .tri format converter - src/vibeec/tri_inference.zig: Ternary model inference engine - src/vibeec/flash_attention.zig: Online softmax, SIMD dot product - src/vibeec/parallel_inference.zig: Thread-parallel matmul - src/vibeec/flash_benchmark.zig: Flash vs standard attention benchmark - fly.toml: Updated for performance-16x (16 CPU cores) - Dockerfile.flyio: Fly.io deployment container - benchmark_flyio.sh: Performance estimation script Performance on 2-core: - GGUF Q8_0: 8.73 tok/s (baseline) - TRI + SIMD: 7.97 tok/s (-9%) Expected on Fly.io performance-16x: - ~50 tok/s (6x speedup) Co-authored-by: Ona <no-reply@ona.com>
1 parent 7945962 commit d8f169e

11 files changed

Lines changed: 2920 additions & 7 deletions

Dockerfile.flyio

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# ═══════════════════════════════════════════════════════════════════════════════
2+
# TRINITY INFERENCE - Fly.io Deployment
3+
# Multi-threaded LLM inference with 16 CPU cores
4+
# φ² + 1/φ² = 3 = TRINITY
5+
# ═══════════════════════════════════════════════════════════════════════════════
6+
7+
FROM debian:bookworm-slim AS builder
8+
9+
# Install Zig
10+
RUN apt-get update && apt-get install -y \
11+
curl \
12+
xz-utils \
13+
&& rm -rf /var/lib/apt/lists/*
14+
15+
# Download and install Zig 0.13.0
16+
RUN curl -L https://ziglang.org/download/0.13.0/zig-linux-x86_64-0.13.0.tar.xz | tar -xJ -C /opt
17+
ENV PATH="/opt/zig-linux-x86_64-0.13.0:${PATH}"
18+
19+
# Copy source code
20+
WORKDIR /app
21+
COPY src/vibeec/*.zig ./src/vibeec/
22+
23+
# Build with ReleaseFast optimization
24+
WORKDIR /app/src/vibeec
25+
RUN zig build-exe tri_inference.zig -O ReleaseFast -o /app/tri_inference
26+
27+
# ═══════════════════════════════════════════════════════════════════════════════
28+
# Runtime image
29+
# ═══════════════════════════════════════════════════════════════════════════════
30+
31+
FROM debian:bookworm-slim
32+
33+
# Install runtime dependencies
34+
RUN apt-get update && apt-get install -y \
35+
curl \
36+
&& rm -rf /var/lib/apt/lists/*
37+
38+
WORKDIR /app
39+
40+
# Copy binary
41+
COPY --from=builder /app/tri_inference /app/tri_inference
42+
43+
# Copy models (will be downloaded at runtime if not present)
44+
# Models should be mounted as volumes or downloaded
45+
46+
# Create models directory
47+
RUN mkdir -p /app/models
48+
49+
# Download SmolLM2 360M model
50+
RUN curl -L -o /app/models/smollm2-360m.tri \
51+
"https://huggingface.co/bartowski/SmolLM2-360M-Instruct-GGUF/resolve/main/SmolLM2-360M-Instruct-Q8_0.gguf" || true
52+
53+
# Set environment
54+
ENV NUM_THREADS=16
55+
56+
# Run benchmark
57+
CMD ["/app/tri_inference", "/app/models/smollm2-360m.tri"]

benchmark_flyio.sh

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/bin/bash
2+
# ═══════════════════════════════════════════════════════════════════════════════
3+
# TRINITY BENCHMARK - Fly.io Performance Estimation
4+
# φ² + 1/φ² = 3 = TRINITY
5+
# ═══════════════════════════════════════════════════════════════════════════════
6+
7+
echo "╔══════════════════════════════════════════════════════════════╗"
8+
echo "║ TRINITY BENCHMARK - FLY.IO ESTIMATION ║"
9+
echo "║ φ² + 1/φ² = 3 = TRINITY ║"
10+
echo "╚══════════════════════════════════════════════════════════════╝"
11+
echo ""
12+
13+
# Current environment
14+
CURRENT_CORES=$(nproc)
15+
echo "Current environment: ${CURRENT_CORES} cores"
16+
echo ""
17+
18+
# Run benchmark
19+
echo "Running benchmark on ${CURRENT_CORES} cores..."
20+
cd /workspaces/trinity/src/vibeec
21+
RESULT=$(./tri_inference ../../models/smollm2-360m.tri 2>&1 | grep "Speed")
22+
CURRENT_SPEED=$(echo "$RESULT" | grep -oP '[\d.]+(?= tokens/sec)')
23+
24+
echo "Current speed: ${CURRENT_SPEED} tok/s"
25+
echo ""
26+
27+
# Estimate Fly.io performance
28+
echo "═══════════════════════════════════════════════════════════════"
29+
echo "ESTIMATED FLY.IO PERFORMANCE (based on linear scaling):"
30+
echo "═══════════════════════════════════════════════════════════════"
31+
echo ""
32+
33+
# Calculate estimates (assuming ~80% parallel efficiency)
34+
EFFICIENCY=0.8
35+
36+
for CORES in 4 8 16; do
37+
SPEEDUP=$(echo "scale=2; 1 + ($CORES - $CURRENT_CORES) * $EFFICIENCY / $CURRENT_CORES" | bc)
38+
ESTIMATED=$(echo "scale=2; $CURRENT_SPEED * $SPEEDUP" | bc)
39+
40+
case $CORES in
41+
4) SIZE="performance-4x" ;;
42+
8) SIZE="performance-8x" ;;
43+
16) SIZE="performance-16x" ;;
44+
esac
45+
46+
echo " $SIZE ($CORES cores): ~${ESTIMATED} tok/s (${SPEEDUP}x speedup)"
47+
done
48+
49+
echo ""
50+
echo "═══════════════════════════════════════════════════════════════"
51+
echo "TO DEPLOY ON FLY.IO:"
52+
echo "═══════════════════════════════════════════════════════════════"
53+
echo ""
54+
echo "1. Login: flyctl auth login"
55+
echo "2. Create app: flyctl apps create trinity-inference"
56+
echo "3. Deploy: flyctl deploy --config fly.toml"
57+
echo ""
58+
echo "KOSCHEI IS IMMORTAL | GOLDEN CHAIN IS CLOSED"

fly.toml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
# ═══════════════════════════════════════════════════════════════════════════════
12
# TRINITY LLM - Fly.io Configuration
2-
# phi^2 + 1/phi^2 = 3 = TRINITY
3+
# Multi-threaded LLM inference with MAXIMUM CPU cores
4+
# φ² + 1/φ² = 3 = TRINITY
5+
# ═══════════════════════════════════════════════════════════════════════════════
36

47
app = "trinity-llm"
58
primary_region = "iad"
@@ -11,15 +14,21 @@ primary_region = "iad"
1114
MODEL_PATH = "/app/models/smollm-135m-instruct-q8_0.gguf"
1215
TEMPERATURE = "0.7"
1316
TOP_P = "0.9"
17+
NUM_THREADS = "16"
1418

15-
# Use shared-cpu-2x for SmolLM-135M
16-
# 2GB RAM needed: model (139MB) + dequantized weights (~600MB) + buffers
19+
# MAXIMUM CPU: performance-16x = 16 dedicated CPU cores, 32GB RAM
20+
# For benchmark testing multi-threaded inference
1721
[[vm]]
18-
size = "shared-cpu-2x"
19-
memory = "2gb"
20-
cpus = 2
22+
size = "performance-16x"
23+
memory = "32gb"
24+
cpus = 16
2125

22-
# Persistent volume for models (optional - model is baked into image)
26+
# Alternative sizes:
27+
# performance-8x: 8 CPU, 16GB RAM
28+
# performance-4x: 4 CPU, 8GB RAM
29+
# shared-cpu-8x: 8 shared CPU, 16GB RAM
30+
31+
# Persistent volume for models
2332
# [[mounts]]
2433
# source = "trinity_models"
2534
# destination = "/app/models"

0 commit comments

Comments
 (0)