gHashTag
diff --git a/‎Dockerfile‎
Lines changed: 11 additions & 8 deletions b/‎Dockerfile‎
Lines changed: 11 additions & 8 deletions
diff --git a/‎bin/vibee‎
164 KB b/‎bin/vibee‎
164 KB
diff --git a/‎fly.toml‎
Lines changed: 19 additions & 8 deletions b/‎fly.toml‎
Lines changed: 19 additions & 8 deletions
diff --git a/‎src/vibeec/gen_cmd.zig‎
Lines changed: 25 additions & 0 deletions b/‎src/vibeec/gen_cmd.zig‎
Lines changed: 25 additions & 0 deletions
@@ -39,16 +39,19 @@ COPY --from=builder /build/vibee /app/vibee
 # Create models directory
 RUN mkdir -p /app/models
 
-# Download TinyLlama-1.1B Q8_0 (supported quantization format)
-# Size: ~1.1GB, fast inference, good for testing
-RUN echo "Downloading TinyLlama-1.1B-Chat Q8_0..." && \
-    curl -L -o /app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf \
-    "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
+# Download SmolLM-135M Q8_0 (smallest model, fast loading)
+# Size: ~135MB, loads in <1 second, good for demos; -f makes HTTP errors fail so the || mirror runs
+RUN echo "Downloading SmolLM-135M-Instruct Q8_0..." && \
+    curl -fL -o /app/models/smollm-135m-instruct-q8_0.gguf \
+    "https://huggingface.co/TheBloke/SmolLM-135M-Instruct-GGUF/resolve/main/smollm-135m-instruct.Q8_0.gguf" || \
+    curl -fL -o /app/models/smollm-135m-instruct-q8_0.gguf \
+    "https://huggingface.co/Felladrin/gguf-smollm-135M-instruct-v0.2/resolve/main/smollm-135M-instruct-v0.2-Q8_0.gguf"
 
 # Set environment
-ENV MODEL_PATH=/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf
+ENV MODEL_PATH=/app/models/smollm-135m-instruct-q8_0.gguf
 ENV TEMPERATURE=0.7
 ENV TOP_P=0.9
 
-# Keep container running for SSH access
-CMD ["/bin/sleep", "infinity"]
+# Run HTTP API server
+EXPOSE 8080
+CMD ["/app/vibee", "serve", "--model", "/app/models/smollm-135m-instruct-q8_0.gguf", "--port", "8080"]
@@ -8,21 +8,32 @@ primary_region = "iad"
   dockerfile = "Dockerfile"
 
 [env]
-  MODEL_PATH = "/app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
+  MODEL_PATH = "/app/models/smollm-135m-instruct-q8_0.gguf"
   TEMPERATURE = "0.7"
   TOP_P = "0.9"
 
-# Use performance-2x for TinyLlama (2 CPU, 4GB RAM)
+# Use shared-cpu-1x for SmolLM-135M (small model)
 [[vm]]
-  size = "performance-2x"
-  memory = "4gb"
-  cpus = 2
+  size = "shared-cpu-1x"
+  memory = "512mb"
+  cpus = 1
 
 # Persistent volume for models (optional - model is baked into image)
 # [[mounts]]
 #   source = "trinity_models"
 #   destination = "/app/models"
 
-# CLI application - no HTTP service
-# Access via: fly ssh console -a trinity-llm
-# Then run: /app/vibee chat --model /app/models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf
+# HTTP API service
+[http_service]
+  internal_port = 8080
+  force_https = true
+  auto_stop_machines = true
+  auto_start_machines = true
+  min_machines_running = 0
+
+[[http_service.checks]]
+  grace_period = "60s"
+  interval = "30s"
+  method = "GET"
+  path = "/health"
+  timeout = "10s"
@@ -3,6 +3,7 @@ const vibee_parser = @import("vibee_parser.zig");
 const zig_codegen = @import("zig_codegen.zig");
 const verilog_codegen = @import("verilog_codegen.zig");
 const gguf_chat = @import("gguf_chat.zig");
+const http_server = @import("http_server.zig");
 
 pub fn main() !void {
     const allocator = std.heap.page_allocator;
@@ -73,6 +74,28 @@ pub fn main() !void {
         }
 
         try gguf_chat.runChat(allocator, model_path.?, prompt, max_tokens, temperature, top_p);
+    } else if (std.mem.eql(u8, command, "serve")) {
+        // HTTP API server
+        var model_path: ?[]const u8 = null;
+        var port: u16 = 8080;
+
+        var i: usize = 2;
+        while (i < args.len) : (i += 1) {
+            if (std.mem.eql(u8, args[i], "--model") and i + 1 < args.len) {
+                model_path = args[i + 1];
+                i += 1;
+            } else if (std.mem.eql(u8, args[i], "--port") and i + 1 < args.len) {
+                port = std.fmt.parseInt(u16, args[i + 1], 10) catch 8080;
+                i += 1;
+            }
+        }
+
+        if (model_path == null) {
+            std.debug.print("Error: --model required\n", .{});
+            return;
+        }
+
+        try http_server.runServer(allocator, model_path.?, port);
     } else if (std.mem.eql(u8, command, "help") or std.mem.eql(u8, command, "--help")) {
         printUsage();
     } else {
@@ -96,6 +119,8 @@ fn printUsage() void {
         \\    --max-tokens N                            Max tokens to generate (default: 100)
         \\    --temperature F                           Sampling temperature (default: 0.7)
         \\    --top-p F                                 Top-p nucleus sampling (default: 0.9)
+        \\  vibeec serve --model <path.gguf> [options]  HTTP API server (OpenAI compatible)
+        \\    --port N                                  Port to listen on (default: 8080)
         \\  vibeec help                                 Show this help
         \\
     , .{});