Fix: Use TinyLlama Q8_0 (supported quantization)

gHashTag · ona-agent · gHashTag · commit a05b614307da · 2026-02-01T17:21:28.000Z
Q4_K_M is not yet supported by the TRINITY weight loader.
Switch to TinyLlama-1.1B, which is smaller and uses the supported Q8_0 format.

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/Dockerfile b/Dockerfile
@@ -39,19 +39,19 @@ COPY --from=builder /build/vibee /app/vibee
 # Create models directory
 RUN mkdir -p /app/models
 
-# Download Mistral-7B-Instruct Q4_K_M (best open source model for quality)
-# Size: ~4.4GB, excellent instruction following
-RUN echo "Downloading Mistral-7B-Instruct-v0.2 Q4_K_M..." && \
-    curl -L -o /app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf \
-    "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
+# Download TinyLlama-1.1B Q8_0 (supported quantization format)
+# Size: ~1.1GB, fast inference, good for testing
+RUN echo "Downloading TinyLlama-1.1B-Chat Q8_0..." && \
+    curl -L -o /app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \
+    "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
 
 # Set environment
-ENV MODEL_PATH=/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
+ENV MODEL_PATH=/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf
 ENV TEMPERATURE=0.7
 ENV TOP_P=0.9
 
 # Expose port (for future HTTP API)
 EXPOSE 8080
 
 # Run chat
-CMD ["/app/vibee", "chat", "--model", "/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf", "--temperature", "0.7", "--top-p", "0.9"]
+CMD ["/app/vibee", "chat", "--model", "/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf", "--temperature", "0.7", "--top-p", "0.9"]
diff --git a/fly.toml b/fly.toml
@@ -8,16 +8,15 @@ primary_region = "iad"
   dockerfile = "Dockerfile"
 
 [env]
-  MODEL_PATH = "/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
+  MODEL_PATH = "/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
   TEMPERATURE = "0.7"
   TOP_P = "0.9"
 
-# Use performance-8x for LLM inference (8 CPU, 16GB RAM)
-# This is needed for Mistral-7B model
+# Use performance-2x for TinyLlama (2 CPU, 4GB RAM)
 [[vm]]
-  size = "performance-8x"
-  memory = "16gb"
-  cpus = 8
+  size = "performance-2x"
+  memory = "4gb"
+  cpus = 2
 
 # Persistent volume for models (optional - model is baked into image)
 # [[mounts]]