gHashTag
diff --git a/‎bin/vibee‎
15.8 KB b/‎bin/vibee‎
15.8 KB
diff --git a/‎src/vibeec/gguf_chat.zig‎
Lines changed: 106 additions & 59 deletions b/‎src/vibeec/gguf_chat.zig‎
Lines changed: 106 additions & 59 deletions
@@ -9,17 +9,19 @@ const model_mod = @import("gguf_model.zig");
 const tokenizer_mod = @import("gguf_tokenizer.zig");
 const inference = @import("gguf_inference.zig");
 
+// Chat template for formatting prompts
+const ChatTemplate = tokenizer_mod.ChatTemplate;
+
 // Entry point for CLI chat command
 pub fn runChat(allocator: std.mem.Allocator, model_path: []const u8, initial_prompt: ?[]const u8, max_tokens: u32) !void {
-    _ = initial_prompt;
-    _ = max_tokens;
+    const stdout = std.io.getStdOut().writer();
 
-    std.debug.print("\n", .{});
-    std.debug.print("╔══════════════════════════════════════════════════════════════╗\n", .{});
-    std.debug.print("║           TRINITY CHAT - SIMD Optimized LLM                  ║\n", .{});
-    std.debug.print("║           phi^2 + 1/phi^2 = 3 = TRINITY                      ║\n", .{});
-    std.debug.print("╚══════════════════════════════════════════════════════════════╝\n", .{});
-    std.debug.print("\n", .{});
+    try stdout.print("\n", .{});
+    try stdout.print("╔══════════════════════════════════════════════════════════════╗\n", .{});
+    try stdout.print("║           TRINITY CHAT - SIMD Optimized LLM                  ║\n", .{});
+    try stdout.print("║           Chat Template + Streaming Output                   ║\n", .{});
+    try stdout.print("╚══════════════════════════════════════════════════════════════╝\n", .{});
+    try stdout.print("\n", .{});
 
     // Load model
     std.debug.print("Loading model: {s}\n", .{model_path});
@@ -48,76 +50,121 @@ pub fn runChat(allocator: std.mem.Allocator, model_path: []const u8, initial_pro
     };
     defer tokenizer.deinit();
 
-    std.debug.print("Ready! Type your message (or 'quit' to exit):\n\n", .{});
+    // Use TinyLlama chat template
+    const template = ChatTemplate.TINYLLAMA;
+    const system_prompt = "You are a helpful AI assistant.";
+
+    std.debug.print("Chat template: TinyLlama (ChatML format)\n", .{});
+    std.debug.print("System: {s}\n", .{system_prompt});
+    std.debug.print("\nReady! Type your message (or 'quit' to exit):\n\n", .{});
+
+    // Handle initial prompt if provided
+    if (initial_prompt) |prompt| {
+        try generateWithTemplate(allocator, stdout, &model, &tokenizer, &template, system_prompt, prompt, max_tokens);
+    }
 
     // Interactive loop
     const stdin = std.io.getStdIn().reader();
     var buf: [1024]u8 = undefined;
 
     while (true) {
-        std.debug.print("User: ", .{});
+        try stdout.print("User: ", .{});
         const line = stdin.readUntilDelimiter(&buf, '\n') catch break;
         const trimmed = std.mem.trim(u8, line, " \t\r\n");
 
         if (trimmed.len == 0) continue;
         if (std.mem.eql(u8, trimmed, "quit") or std.mem.eql(u8, trimmed, "exit")) break;
 
-        // Generate response using full transformer forward pass
-        std.debug.print("Assistant: ", .{});
-        var gen_timer = try std.time.Timer.start();
-
-        const tokens = tokenizer.encode(allocator, trimmed) catch {
-            std.debug.print("[tokenization error]\n", .{});
-            continue;
-        };
-        defer allocator.free(tokens);
-
-        // Real generation with transformer
-        var generated: u32 = 0;
-        var current_tokens = std.ArrayList(u32).init(allocator);
-        defer current_tokens.deinit();
-        for (tokens) |t| try current_tokens.append(t);
-
-        const max_gen: u32 = 50;
-        while (generated < max_gen) : (generated += 1) {
-            // Forward pass for last token
-            const pos = current_tokens.items.len - 1;
-            const last_token = current_tokens.items[pos];
-
-            const logits = model.forward(last_token, pos) catch {
-                std.debug.print("[forward error]", .{});
-                break;
-            };
-            defer allocator.free(logits);
-
-            // Sample next token (greedy)
-            var max_idx: u32 = 0;
-            var max_val: f32 = logits[0];
-            for (logits[1..], 1..) |l, i| {
-                if (l > max_val) {
-                    max_val = l;
-                    max_idx = @intCast(i);
-                }
-            }
+        try generateWithTemplate(allocator, stdout, &model, &tokenizer, &template, system_prompt, trimmed, max_tokens);
+    }
 
-            // Check for EOS
-            if (max_idx == tokenizer.eos_token) break;
+    try stdout.print("Goodbye!\n", .{});
+}
 
-            // Decode and print
-            const decoded = tokenizer.decode(allocator, &[_]u32{max_idx}) catch " ";
-            defer if (decoded.len > 0) allocator.free(decoded);
-            std.debug.print("{s}", .{decoded});
+/// Generate one assistant reply for `user_input` and stream it to `writer`.
+///
+/// The prompt is built with `template.formatPrompt(allocator, system, user_input)`,
+/// tokenized, and fed through `model.forward` one token at a time after
+/// `model.resetKVCache()`. Tokens are then picked greedily (argmax of the
+/// logits) until `max_tokens` have been produced, the tokenizer's EOS token is
+/// selected, or the decoded text contains "</s>" or "<|".
+///
+/// Tokenizer and model failures are reported inline on `writer` instead of
+/// being returned; only `formatPrompt`, timer, and `writer` errors propagate.
+/// Every buffer obtained here (prompt, tokens, logits, decoded text) is freed
+/// with `allocator` before returning.
+fn generateWithTemplate(
+    allocator: std.mem.Allocator,
+    writer: anytype,
+    model: *model_mod.FullModel,
+    tokenizer: *tokenizer_mod.Tokenizer,
+    template: *const ChatTemplate,
+    system: []const u8,
+    user_input: []const u8,
+    max_tokens: u32,
+) !void {
+    // Format prompt with chat template
+    const formatted = try template.formatPrompt(allocator, system, user_input);
+    defer allocator.free(formatted);
+
+    try writer.print("Assistant: ", .{});
+    // Started before tokenization, so the reported rate includes prefill time.
+    var gen_timer = try std.time.Timer.start();
+
+    // Tokenize formatted prompt
+    const tokens = tokenizer.encode(allocator, formatted) catch {
+        try writer.print("[tokenization error]\n", .{});
+        return;
+    };
+    defer allocator.free(tokens);
+
+    // The KV cache is reset on every call: each reply is generated from the
+    // system prompt plus this single user message only.
+    model.resetKVCache();
+
+    // `logits` holds the single live logits buffer (or null). The defer
+    // releases it on every exit path, including stopping at `max_tokens`.
+    var logits: ?[]f32 = null;
+    defer if (logits) |l| allocator.free(l);
+
+    // Process prompt tokens (prefill) - build up KV cache
+    for (tokens, 0..) |token, pos| {
+        if (logits) |l| {
+            allocator.free(l);
+            logits = null; // avoid a double free if forward() fails below
+        }
+        logits = model.forward(token, pos) catch {
+            try writer.print("[forward error]\n", .{});
+            return;
+        };
+    }
 
-            try current_tokens.append(max_idx);
+    // Generate tokens with streaming output
+    var generated: u32 = 0;
+    var current_pos = tokens.len;
+
+    while (generated < max_tokens) : (generated += 1) {
+        // Logits from the last prefill token seed the first iteration; an
+        // empty prompt leaves nothing to sample from.
+        const current_logits = logits orelse break;
+
+        // Sample next token (greedy)
+        var max_idx: u32 = 0;
+        var max_val: f32 = current_logits[0];
+        for (current_logits[1..], 1..) |l, i| {
+            if (l > max_val) {
+                max_val = l;
+                max_idx = @intCast(i);
+            }
         }
-        std.debug.print("\n", .{});
 
-        const gen_time = gen_timer.read();
-        const tok_per_sec = @as(f64, @floatFromInt(generated)) / (@as(f64, @floatFromInt(gen_time)) / 1e9);
-        std.debug.print("[{d} tokens, {d:.1} tok/s]\n\n", .{ generated, tok_per_sec });
+        // Free current logits; clear the slot so the defer cannot free it again
+        allocator.free(current_logits);
+        logits = null;
+
+        // Check for EOS
+        if (max_idx == tokenizer.eos_token) break;
+
+        // Decode the token. On failure fall back to a literal space, which is
+        // not heap memory and therefore must never reach allocator.free.
+        const decoded = tokenizer.decode(allocator, &[_]u32{max_idx}) catch null;
+        defer if (decoded) |d| allocator.free(d);
+        const text: []const u8 = decoded orelse " ";
+
+        // Stream: print each token as soon as it is decoded
+        try writer.print("{s}", .{text});
+
+        // Check for </s> or end markers in decoded text
+        if (std.mem.indexOf(u8, text, "</s>") != null) break;
+        if (std.mem.indexOf(u8, text, "<|") != null) break;
+
+        // Get next logits
+        logits = model.forward(max_idx, current_pos) catch {
+            try writer.print("[forward error]", .{});
+            break;
+        };
+        current_pos += 1;
     }
+    try writer.print("\n", .{});
 
-    std.debug.print("Goodbye!\n", .{});
+    const gen_time = gen_timer.read();
+    const elapsed_s = @as(f64, @floatFromInt(gen_time)) / 1e9;
+    // Guard against a zero timer reading so the rate is never inf/NaN.
+    const tok_per_sec: f64 = if (elapsed_s > 0) @as(f64, @floatFromInt(generated)) / elapsed_s else 0.0;
+    try writer.print("[{d} tokens, {d:.1} tok/s]\n\n", .{ generated, tok_per_sec });
 }
 
 pub fn main() !void {