Skip to content

Commit eae8299

Browse files
gHashTag and ona-agent committed
Add HTTP API server with /v1/chat/completions endpoint
- OpenAI-compatible API
- SmolLM-135M model (fast loading)
- /health endpoint for Fly.io checks

Co-authored-by: Ona <no-reply@ona.com>
1 parent 2504feb commit eae8299

5 files changed

Lines changed: 349 additions & 16 deletions

File tree

Dockerfile

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -39,16 +39,19 @@ COPY --from=builder /build/vibee /app/vibee
3939
# Create models directory
4040
RUN mkdir -p /app/models
4141

42-
# Download TinyLlama-1.1B Q8_0 (supported quantization format)
43-
# Size: ~1.1GB, fast inference, good for testing
44-
RUN echo "Downloading TinyLlama-1.1B-Chat Q8_0..." && \
45-
curl -L -o /app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \
46-
"https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
42+
# Download SmolLM-135M Q8_0 (smallest model, fast loading)
43+
# Size: ~135MB, loads in <1 second, good for demos
44+
RUN echo "Downloading SmolLM-135M-Instruct Q8_0..." && \
45+
curl -L -o /app/models/smollm-135m-instruct-q8_0.gguf \
46+
"https://huggingface.co/TheBloke/SmolLM-135M-Instruct-GGUF/resolve/main/smollm-135m-instruct.Q8_0.gguf" || \
47+
curl -L -o /app/models/smollm-135m-instruct-q8_0.gguf \
48+
"https://huggingface.co/Felladrin/gguf-smollm-135M-instruct-v0.2/resolve/main/smollm-135M-instruct-v0.2-Q8_0.gguf"
4749

4850
# Set environment
49-
ENV MODEL_PATH=/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf
51+
ENV MODEL_PATH=/app/models/smollm-135m-instruct-q8_0.gguf
5052
ENV TEMPERATURE=0.7
5153
ENV TOP_P=0.9
5254

53-
# Keep container running for SSH access
54-
CMD ["/bin/sleep", "infinity"]
55+
# Run HTTP API server
56+
EXPOSE 8080
57+
CMD ["/app/vibee", "serve", "--model", "/app/models/smollm-135m-instruct-q8_0.gguf", "--port", "8080"]

bin/vibee

164 KB
Binary file not shown.

fly.toml

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,32 @@ primary_region = "iad"
88
dockerfile = "Dockerfile"
99

1010
[env]
11-
MODEL_PATH = "/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
11+
MODEL_PATH = "/app/models/smollm-135m-instruct-q8_0.gguf"
1212
TEMPERATURE = "0.7"
1313
TOP_P = "0.9"
1414

15-
# Use performance-2x for TinyLlama (2 CPU, 4GB RAM)
15+
# Use shared-cpu-1x for SmolLM-135M (small model)
1616
[[vm]]
17-
size = "performance-2x"
18-
memory = "4gb"
19-
cpus = 2
17+
size = "shared-cpu-1x"
18+
memory = "512mb"
19+
cpus = 1
2020

2121
# Persistent volume for models (optional - model is baked into image)
2222
# [[mounts]]
2323
# source = "trinity_models"
2424
# destination = "/app/models"
2525

26-
# CLI application - no HTTP service
27-
# Access via: fly ssh console -a trinity-llm
28-
# Then run: /app/vibee chat --model /app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf
26+
# HTTP API service
27+
[http_service]
28+
internal_port = 8080
29+
force_https = true
30+
auto_stop_machines = true
31+
auto_start_machines = true
32+
min_machines_running = 0
33+
34+
[[http_service.checks]]
35+
grace_period = "60s"
36+
interval = "30s"
37+
method = "GET"
38+
path = "/health"
39+
timeout = "10s"

src/vibeec/gen_cmd.zig

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ const vibee_parser = @import("vibee_parser.zig");
33
const zig_codegen = @import("zig_codegen.zig");
44
const verilog_codegen = @import("verilog_codegen.zig");
55
const gguf_chat = @import("gguf_chat.zig");
6+
const http_server = @import("http_server.zig");
67

78
pub fn main() !void {
89
const allocator = std.heap.page_allocator;
@@ -73,6 +74,28 @@ pub fn main() !void {
7374
}
7475

7576
try gguf_chat.runChat(allocator, model_path.?, prompt, max_tokens, temperature, top_p);
77+
} else if (std.mem.eql(u8, command, "serve")) {
78+
// HTTP API server
79+
var model_path: ?[]const u8 = null;
80+
var port: u16 = 8080;
81+
82+
var i: usize = 2;
83+
while (i < args.len) : (i += 1) {
84+
if (std.mem.eql(u8, args[i], "--model") and i + 1 < args.len) {
85+
model_path = args[i + 1];
86+
i += 1;
87+
} else if (std.mem.eql(u8, args[i], "--port") and i + 1 < args.len) {
88+
port = std.fmt.parseInt(u16, args[i + 1], 10) catch 8080;
89+
i += 1;
90+
}
91+
}
92+
93+
if (model_path == null) {
94+
std.debug.print("Error: --model required\n", .{});
95+
return;
96+
}
97+
98+
try http_server.runServer(allocator, model_path.?, port);
7699
} else if (std.mem.eql(u8, command, "help") or std.mem.eql(u8, command, "--help")) {
77100
printUsage();
78101
} else {
@@ -96,6 +119,8 @@ fn printUsage() void {
96119
\\ --max-tokens N Max tokens to generate (default: 100)
97120
\\ --temperature F Sampling temperature (default: 0.7)
98121
\\ --top-p F Top-p nucleus sampling (default: 0.9)
122+
\\ vibeec serve --model <path.gguf> [options] HTTP API server (OpenAI compatible)
123+
\\ --port N Port to listen on (default: 8080)
99124
\\ vibeec help Show this help
100125
\\
101126
, .{});

0 commit comments

Comments
 (0)