Skip to content

Commit 18ebbfb

Browse files
gHashTag and ona-agent committed
Upgrade to SmolLM2-1.7B and add tok/s logging
- Dockerfile: Switch from SmolLM-135M to SmolLM2-1.7B for better quality
- fly.toml: Update to performance-4x (4 CPU, 8GB RAM) for larger model
- fly.toml: Increase grace_period to 180s for model loading
- http_server.zig: Add detailed tok/s logging for performance monitoring

Expected performance on Fly.io performance-4x:
- SmolLM2-1.7B: ~8-12 tok/s with 4 cores

Co-authored-by: Ona <no-reply@ona.com>
1 parent 1de2e4f commit 18ebbfb

3 files changed

Lines changed: 37 additions & 20 deletions

File tree

Dockerfile

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,18 +39,20 @@ COPY --from=builder /build/vibee /app/vibee
3939
# Create models directory
4040
RUN mkdir -p /app/models
4141

42-
# Download SmolLM-135M Q8_0 (official HuggingFace model)
43-
# Size: ~145MB, loads in <1 second, good for demos
44-
RUN echo "Downloading SmolLM-135M-Instruct Q8_0..." && \
45-
curl -L -o /app/models/smollm-135m-instruct-q8_0.gguf \
46-
"https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/resolve/main/smollm-135m-instruct-add-basics-q8_0.gguf" && \
42+
# Download SmolLM2-1.7B Q8_0 (better quality, larger model)
43+
# Size: ~1.8GB; load time is host-dependent (estimated at ~10-15 seconds here, while the fly.toml health-check comment assumes ~30-60s)
44+
# For a smaller/faster option, use SmolLM2-360M or SmolLM-135M
45+
RUN echo "Downloading SmolLM2-1.7B-Instruct Q8_0..." && \
46+
curl -L -o /app/models/smollm2-1.7b-instruct-q8_0.gguf \
47+
"https://huggingface.co/bartowski/SmolLM2-1.7B-Instruct-GGUF/resolve/main/SmolLM2-1.7B-Instruct-Q8_0.gguf" && \
4748
ls -la /app/models/
4849

4950
# Set environment
50-
ENV MODEL_PATH=/app/models/smollm-135m-instruct-q8_0.gguf
51+
ENV MODEL_PATH=/app/models/smollm2-1.7b-instruct-q8_0.gguf
5152
ENV TEMPERATURE=0.7
5253
ENV TOP_P=0.9
54+
ENV NUM_THREADS=16
5355

5456
# Run HTTP API server
5557
EXPOSE 8080
56-
CMD ["/app/vibee", "serve", "--model", "/app/models/smollm-135m-instruct-q8_0.gguf", "--port", "8080"]
58+
CMD ["/app/vibee", "serve", "--model", "/app/models/smollm2-1.7b-instruct-q8_0.gguf", "--port", "8080"]

fly.toml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,22 @@ primary_region = "iad"
1111
dockerfile = "Dockerfile"
1212

1313
[env]
14-
MODEL_PATH = "/app/models/smollm-135m-instruct-q8_0.gguf"
14+
MODEL_PATH = "/app/models/smollm2-1.7b-instruct-q8_0.gguf"
1515
TEMPERATURE = "0.7"
1616
TOP_P = "0.9"
17-
NUM_THREADS = "16"
17+
NUM_THREADS = "4"
1818

19-
# MAXIMUM CPU: performance-16x = 16 dedicated CPU cores, 32GB RAM
20-
# For benchmark testing multi-threaded inference
19+
# SmolLM2-1.7B requires more RAM (~4GB for model + buffers)
20+
# performance-4x: 4 dedicated CPU cores, 8GB RAM
2121
[[vm]]
22-
size = "performance-16x"
23-
memory = "32gb"
24-
cpus = 16
22+
size = "performance-4x"
23+
memory = "8gb"
24+
cpus = 4
2525

2626
# Alternative sizes:
27-
# performance-8x: 8 CPU, 16GB RAM
28-
# performance-4x: 4 CPU, 8GB RAM
29-
# shared-cpu-8x: 8 shared CPU, 16GB RAM
27+
# performance-8x: 8 CPU, 16GB RAM (faster, more expensive)
28+
# performance-16x: 16 CPU, 32GB RAM (maximum speed)
29+
# shared-cpu-4x: 4 shared CPU, 8GB RAM (cheaper)
3030

3131
# Persistent volume for models
3232
# [[mounts]]
@@ -42,8 +42,8 @@ primary_region = "iad"
4242
min_machines_running = 0
4343

4444
[[http_service.checks]]
45-
grace_period = "120s"
45+
grace_period = "180s" # SmolLM2-1.7B needs ~30-60s to load
4646
interval = "30s"
4747
method = "GET"
4848
path = "/health"
49-
timeout = "15s"
49+
timeout = "30s"

src/vibeec/http_server.zig

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ pub const HttpServer = struct {
210210

211211
std.debug.print(" Prompt: {s}\n", .{prompt});
212212

213+
// Start timing for tok/s measurement
214+
var gen_timer = std.time.Timer.start() catch null;
215+
213216
// Generate response with system prompt for better quality
214217
const sampling = SamplingParams{
215218
.temperature = 0.7,
@@ -234,11 +237,13 @@ pub const HttpServer = struct {
234237
const tokens = tokenizer.encode(self.allocator, full_prompt) catch null;
235238
defer if (tokens) |t| self.allocator.free(t);
236239

240+
var generated_token_count: usize = 0;
241+
237242
if (tokens) |toks| {
238243
var output_tokens = std.ArrayList(u32).init(self.allocator);
239244
defer output_tokens.deinit();
240245

241-
// Process input tokens
246+
// Process input tokens (prefill)
242247
var pos: usize = 0;
243248
for (toks) |tok| {
244249
_ = model.forward(tok, pos) catch null;
@@ -258,6 +263,8 @@ pub const HttpServer = struct {
258263
pos += 1;
259264
}
260265

266+
generated_token_count = output_tokens.items.len;
267+
261268
// Decode tokens
262269
if (output_tokens.items.len > 0) {
263270
generated = tokenizer.decode(self.allocator, output_tokens.items) catch null;
@@ -267,7 +274,15 @@ pub const HttpServer = struct {
267274
}
268275
}
269276

277+
// Calculate and log generation speed
278+
const gen_time_ns = if (gen_timer) |*timer| timer.read() else 0;
279+
const gen_time_s = @as(f64, @floatFromInt(gen_time_ns)) / 1e9;
280+
const input_token_count = if (tokens) |toks| toks.len else 0;
281+
const tok_per_sec = if (gen_time_s > 0) @as(f64, @floatFromInt(generated_token_count)) / gen_time_s else 0;
282+
270283
std.debug.print(" Response: {s}\n", .{response_text});
284+
std.debug.print(" Tokens: {d} input + {d} output = {d} total\n", .{ input_token_count, generated_token_count, input_token_count + generated_token_count });
285+
std.debug.print(" Time: {d:.2}s | Speed: {d:.2} tok/s (generation only)\n", .{ gen_time_s, tok_per_sec });
271286

272287
// Escape JSON string
273288
var escaped = std.ArrayList(u8).init(self.allocator);

0 commit comments

Comments
 (0)