Skip to content

Commit a05b614

Browse files
gHashTag and ona-agent committed
Fix: Use TinyLlama Q8_0 (supported quantization)
Q4_K_M not yet supported by TRINITY weight loader. TinyLlama-1.1B is smaller and uses Q8_0 format. Co-authored-by: Ona <no-reply@ona.com>
1 parent 16cf95f commit a05b614

2 files changed

Lines changed: 12 additions & 13 deletions

File tree

Dockerfile

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,19 +39,19 @@ COPY --from=builder /build/vibee /app/vibee
3939
# Create models directory
4040
RUN mkdir -p /app/models
4141

42-
# Download Mistral-7B-Instruct Q4_K_M (best open source model for quality)
43-
# Size: ~4.4GB, excellent instruction following
44-
RUN echo "Downloading Mistral-7B-Instruct-v0.2 Q4_K_M..." && \
45-
curl -L -o /app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf \
46-
"https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
42+
# Download TinyLlama-1.1B Q8_0 (supported quantization format)
43+
# Size: ~1.1GB, fast inference, good for testing
44+
RUN echo "Downloading TinyLlama-1.1B-Chat Q8_0..." && \
45+
curl -L -o /app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \
46+
"https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
4747

4848
# Set environment
49-
ENV MODEL_PATH=/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
49+
ENV MODEL_PATH=/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf
5050
ENV TEMPERATURE=0.7
5151
ENV TOP_P=0.9
5252

5353
# Expose port (for future HTTP API)
5454
EXPOSE 8080
5555

5656
# Run chat
57-
CMD ["/app/vibee", "chat", "--model", "/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf", "--temperature", "0.7", "--top-p", "0.9"]
57+
CMD ["/app/vibee", "chat", "--model", "/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf", "--temperature", "0.7", "--top-p", "0.9"]

fly.toml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,15 @@ primary_region = "iad"
88
dockerfile = "Dockerfile"
99

1010
[env]
11-
MODEL_PATH = "/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
11+
MODEL_PATH = "/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
1212
TEMPERATURE = "0.7"
1313
TOP_P = "0.9"
1414

15-
# Use performance-8x for LLM inference (8 CPU, 16GB RAM)
16-
# This is needed for Mistral-7B model
15+
# Use performance-2x for TinyLlama (2 CPU, 4GB RAM)
1716
[[vm]]
18-
size = "performance-8x"
19-
memory = "16gb"
20-
cpus = 8
17+
size = "performance-2x"
18+
memory = "4gb"
19+
cpus = 2
2120

2221
# Persistent volume for models (optional - model is baked into image)
2322
# [[mounts]]

0 commit comments

Comments
 (0)