Upgrade to SmolLM2-1.7B and add tok/s logging

gHashTag · ona-agent · gHashTag · commit 18ebbfb5cff0 · 2026-02-02T06:10:08.000Z
- Dockerfile: Switch from SmolLM-135M to SmolLM2-1.7B for better quality
- fly.toml: Update to performance-4x (4 CPU, 8GB RAM) for larger model
- fly.toml: Increase health-check grace_period to 180s and timeout to 30s for model loading
- http_server.zig: Add detailed tok/s logging for performance monitoring

Expected performance on Fly.io performance-4x:
- SmolLM2-1.7B: ~8-12 tok/s with 4 cores

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/Dockerfile b/Dockerfile
@@ -39,18 +39,20 @@ COPY --from=builder /build/vibee /app/vibee
 # Create models directory
 RUN mkdir -p /app/models
 
-# Download SmolLM-135M Q8_0 (official HuggingFace model)
-# Size: ~145MB, loads in <1 second, good for demos
-RUN echo "Downloading SmolLM-135M-Instruct Q8_0..." && \
-    curl -L -o /app/models/smollm-135m-instruct-q8_0.gguf \
-    "https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/resolve/main/smollm-135m-instruct-add-basics-q8_0.gguf" && \
+# Download SmolLM2-1.7B Q8_0 (better quality, larger model)
+# Size: ~1.8GB, loads in ~10-15 seconds
+# For smaller/faster option, use SmolLM2-360M or SmolLM-135M
+RUN echo "Downloading SmolLM2-1.7B-Instruct Q8_0..." && \
+    curl -L -o /app/models/smollm2-1.7b-instruct-q8_0.gguf \
+    "https://huggingface.co/bartowski/SmolLM2-1.7B-Instruct-GGUF/resolve/main/SmolLM2-1.7B-Instruct-Q8_0.gguf" && \
     ls -la /app/models/
 
 # Set environment
-ENV MODEL_PATH=/app/models/smollm-135m-instruct-q8_0.gguf
+ENV MODEL_PATH=/app/models/smollm2-1.7b-instruct-q8_0.gguf
 ENV TEMPERATURE=0.7
 ENV TOP_P=0.9
+ENV NUM_THREADS=16
 
 # Run HTTP API server
 EXPOSE 8080
-CMD ["/app/vibee", "serve", "--model", "/app/models/smollm-135m-instruct-q8_0.gguf", "--port", "8080"]
+CMD ["/app/vibee", "serve", "--model", "/app/models/smollm2-1.7b-instruct-q8_0.gguf", "--port", "8080"]
diff --git a/fly.toml b/fly.toml
@@ -11,22 +11,22 @@ primary_region = "iad"
   dockerfile = "Dockerfile"
 
 [env]
-  MODEL_PATH = "/app/models/smollm-135m-instruct-q8_0.gguf"
+  MODEL_PATH = "/app/models/smollm2-1.7b-instruct-q8_0.gguf"
   TEMPERATURE = "0.7"
   TOP_P = "0.9"
-  NUM_THREADS = "16"
+  NUM_THREADS = "4"
 
-# MAXIMUM CPU: performance-16x = 16 dedicated CPU cores, 32GB RAM
-# For benchmark testing multi-threaded inference
+# SmolLM2-1.7B requires more RAM (~4GB for model + buffers)
+# performance-4x: 4 dedicated CPU cores, 8GB RAM
 [[vm]]
-  size = "performance-16x"
-  memory = "32gb"
-  cpus = 16
+  size = "performance-4x"
+  memory = "8gb"
+  cpus = 4
 
 # Alternative sizes:
-# performance-8x: 8 CPU, 16GB RAM
-# performance-4x: 4 CPU, 8GB RAM
-# shared-cpu-8x: 8 shared CPU, 16GB RAM
+# performance-8x: 8 CPU, 16GB RAM (faster, more expensive)
+# performance-16x: 16 CPU, 32GB RAM (maximum speed)
+# shared-cpu-4x: 4 shared CPU, 8GB RAM (cheaper)
 
 # Persistent volume for models
 # [[mounts]]
@@ -42,8 +42,8 @@ primary_region = "iad"
   min_machines_running = 0
 
 [[http_service.checks]]
-  grace_period = "120s"
+  grace_period = "180s"  # SmolLM2-1.7B needs ~30-60s to load
   interval = "30s"
   method = "GET"
   path = "/health"
-  timeout = "15s"
+  timeout = "30s"
diff --git a/src/vibeec/http_server.zig b/src/vibeec/http_server.zig
@@ -210,6 +210,9 @@ pub const HttpServer = struct {
 
         std.debug.print("  Prompt: {s}\n", .{prompt});
 
+        // Start timing for tok/s measurement
+        var gen_timer = std.time.Timer.start() catch null;
+
         // Generate response with system prompt for better quality
         const sampling = SamplingParams{
             .temperature = 0.7,
@@ -234,11 +237,13 @@ pub const HttpServer = struct {
         const tokens = tokenizer.encode(self.allocator, full_prompt) catch null;
         defer if (tokens) |t| self.allocator.free(t);
 
+        var generated_token_count: usize = 0;
+
         if (tokens) |toks| {
             var output_tokens = std.ArrayList(u32).init(self.allocator);
             defer output_tokens.deinit();
 
-            // Process input tokens
+            // Process input tokens (prefill)
             var pos: usize = 0;
             for (toks) |tok| {
                 _ = model.forward(tok, pos) catch null;
@@ -258,6 +263,8 @@ pub const HttpServer = struct {
                 pos += 1;
             }
 
+            generated_token_count = output_tokens.items.len;
+
             // Decode tokens
             if (output_tokens.items.len > 0) {
                 generated = tokenizer.decode(self.allocator, output_tokens.items) catch null;
@@ -267,7 +274,15 @@ pub const HttpServer = struct {
             }
         }
 
+        // Calculate and log generation speed
+        const gen_time_ns = if (gen_timer) |*timer| timer.read() else 0;
+        const gen_time_s = @as(f64, @floatFromInt(gen_time_ns)) / 1e9;
+        const input_token_count = if (tokens) |toks| toks.len else 0;
+        const tok_per_sec = if (gen_time_s > 0) @as(f64, @floatFromInt(generated_token_count)) / gen_time_s else 0;
+
         std.debug.print("  Response: {s}\n", .{response_text});
+        std.debug.print("  Tokens: {d} input + {d} output = {d} total\n", .{ input_token_count, generated_token_count, input_token_count + generated_token_count });
+        std.debug.print("  Time: {d:.2}s | Speed: {d:.2} tok/s (generation only)\n", .{ gen_time_s, tok_per_sec });
 
         // Escape JSON string
         var escaped = std.ArrayList(u8).init(self.allocator);