Add DeepSeek chat template and improve tokenizer

gHashTag · ona-agent · gHashTag · commit 4655d5c831f8 · 2026-02-02T05:02:00.000Z
- Add DeepSeek Coder chat template (<|User|>/<|Assistant|>/<|EOT|>)
- Add DeepSeek special tokens to tokenizer
- Skip system message for templates without system support
- Fix chat template detection and display
- Add Q4_K debug output (disabled)

Working models:
- Qwen2.5 Coder 0.5B: 3.0 tok/s
- SmolLM 135M: 8.9 tok/s
- TinyLlama 1.1B: ~2 tok/s

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/bin/vibee b/bin/vibee
diff --git a/src/vibeec/gguf_chat.zig b/src/vibeec/gguf_chat.zig
@@ -74,9 +74,12 @@ const ConversationHistory = struct {
         for (self.messages.items) |msg| {
             switch (msg.role) {
                 .system => {
-                    try result.appendSlice(template.system_prefix);
-                    try result.appendSlice(msg.content);
-                    try result.appendSlice(template.system_suffix);
+                    // Skip system message if template doesn't support it (empty prefix)
+                    if (template.system_prefix.len > 0) {
+                        try result.appendSlice(template.system_prefix);
+                        try result.appendSlice(msg.content);
+                        try result.appendSlice(template.system_suffix);
+                    }
                 },
                 .user => {
                     try result.appendSlice(template.user_prefix);
@@ -111,6 +114,11 @@ const ConversationHistory = struct {
 
 // Auto-detect chat template based on model name
 fn detectChatTemplate(model_path: []const u8) ChatTemplate {
+    // Check for DeepSeek models
+    if (std.mem.indexOf(u8, model_path, "deepseek") != null or
+        std.mem.indexOf(u8, model_path, "DeepSeek") != null) {
+        return ChatTemplate.DEEPSEEK;
+    }
     // Check for Qwen models
     if (std.mem.indexOf(u8, model_path, "qwen") != null or
         std.mem.indexOf(u8, model_path, "Qwen") != null) {
@@ -216,7 +224,16 @@ fn runChatInternal(allocator: std.mem.Allocator, model_path: []const u8, initial
     // Add system message
     try history.addMessage(.system, system_prompt);
 
-    std.debug.print("Chat template: TinyLlama (ChatML format)\n", .{});
+    // Print detected template name
+    const template_name = if (std.mem.indexOf(u8, model_path, "deepseek") != null or std.mem.indexOf(u8, model_path, "DeepSeek") != null)
+        "DeepSeek"
+    else if (std.mem.indexOf(u8, model_path, "qwen") != null or std.mem.indexOf(u8, model_path, "Qwen") != null)
+        "Qwen (ChatML)"
+    else if (std.mem.indexOf(u8, model_path, "smollm") != null or std.mem.indexOf(u8, model_path, "SmolLM") != null)
+        "SmolLM (ChatML)"
+    else
+        "TinyLlama (ChatML)";
+    std.debug.print("Chat template: {s}\n", .{template_name});
     std.debug.print("System: {s}\n", .{system_prompt});
     std.debug.print("Sampling: temperature={d:.2}, top_p={d:.2}\n", .{sampling_params.temperature, sampling_params.top_p});
     std.debug.print("History: enabled (last 10 messages)\n", .{});
diff --git a/src/vibeec/gguf_inference.zig b/src/vibeec/gguf_inference.zig
@@ -184,6 +184,8 @@ pub fn dequantizeQ4_KTensor(allocator: std.mem.Allocator, data: []const u8, num_
         const d = gguf.f16ToF32(d_bits);
         const min = gguf.f16ToF32(dmin_bits);
 
+
+
         const scales = block[4..16]; // 12 bytes of scales
         const qs = block[16..144]; // 128 bytes of quantized values
 
diff --git a/src/vibeec/gguf_tokenizer.zig b/src/vibeec/gguf_tokenizer.zig
@@ -79,6 +79,7 @@ pub const Tokenizer = struct {
             // First check for special tokens (they have priority)
             var found_special = false;
             const special_tokens = [_][]const u8{
+                // Qwen/ChatML tokens
                 "<|im_start|>", "<|im_end|>", "<|endoftext|>",
                 "<|object_ref_start|>", "<|object_ref_end|>",
                 "<|box_start|>", "<|box_end|>",
@@ -88,6 +89,10 @@ pub const Tokenizer = struct {
                 "<tool_call>", "</tool_call>",
                 "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>",
                 "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>",
+                // DeepSeek tokens
+                "<|User|>", "<|Assistant|>", "<|EOT|>",
+                "<｜begin▁of▁sentence｜>", "<｜end▁of▁sentence｜>",
+                "<｜fim▁hole｜>", "<｜fim▁begin｜>", "<｜fim▁end｜>",
             };
             
             for (special_tokens) |special| {
@@ -349,6 +354,16 @@ pub const ChatTemplate = struct {
         .assistant_suffix = "<|im_end|>\n",
     };
 
+    // DeepSeek Coder chat template (no system prompt)
+    pub const DEEPSEEK = ChatTemplate{
+        .system_prefix = "",
+        .system_suffix = "",
+        .user_prefix = "<|User|>",
+        .user_suffix = "\n",
+        .assistant_prefix = "<|Assistant|>",
+        .assistant_suffix = "<|EOT|>\n",
+    };
+
     pub fn formatPrompt(
         self: *const ChatTemplate,
         allocator: std.mem.Allocator,