Add multi-model support with auto-detection

gHashTag · ona-agent · gHashTag · commit 3602d37c9a82 · 2026-02-02T03:29:31.000Z
- ChatTemplate: QWEN, SMOLLM, LLAMA2, TINYLLAMA
- Auto-detect model type from filename
- Auto-select system prompt (coder vs general)
- ChatML format for HTTP API (im_start/im_end)

Supported models:
- SmolLM-135M (general, fast)
- Qwen2.5-Coder-0.5B/1.5B (coding)
- TinyLlama-1.1B (general)

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/bin/vibee b/bin/vibee
diff --git a/src/vibeec/gguf_chat.zig b/src/vibeec/gguf_chat.zig
@@ -109,6 +109,39 @@ const ConversationHistory = struct {
     }
 };
 
+/// Selects the chat template by searching `model_path` for a known model family name.
+/// Matching ignores ASCII case, so spellings such as "QWEN2.5" or "smolLM" are recognized.
+/// The whole path is searched, so a directory name can also select the template.
+/// Returns `ChatTemplate.TINYLLAMA` when no family name is found.
+fn detectChatTemplate(model_path: []const u8) ChatTemplate {
+    // Qwen family
+    if (std.ascii.indexOfIgnoreCase(model_path, "qwen") != null) {
+        return ChatTemplate.QWEN;
+    }
+    // SmolLM family
+    if (std.ascii.indexOfIgnoreCase(model_path, "smollm") != null) {
+        return ChatTemplate.SMOLLM;
+    }
+    // Llama 2 family
+    if (std.ascii.indexOfIgnoreCase(model_path, "llama-2") != null) {
+        return ChatTemplate.LLAMA2;
+    }
+    // Nothing matched: fall back to the TinyLlama template
+    return ChatTemplate.TINYLLAMA;
+}
+
+/// Picks the system prompt from the model file name; the match ignores ASCII case.
+/// Directories in `model_path` are skipped so a path like "~/code/m.gguf" stays general.
+fn detectSystemPrompt(model_path: []const u8) []const u8 {
+    const file_name = std.fs.path.basename(model_path);
+    // "code" is a substring of "coder", so one case-insensitive search covers both spellings.
+    if (std.ascii.indexOfIgnoreCase(file_name, "code") != null) {
+        return "You are Qwen, a helpful coding assistant. Write clean, efficient code with clear explanations.";
+    }
+    // Default assistant
+    return "You are a helpful AI assistant. Be concise and direct.";
+}
+
 // Entry point for CLI chat command (with ternary support)
 pub fn runChatWithTernary(allocator: std.mem.Allocator, model_path: []const u8, initial_prompt: ?[]const u8, max_tokens: u32, temperature: f32, top_p: f32, use_ternary: bool) !void {
     return runChatInternal(allocator, model_path, initial_prompt, max_tokens, temperature, top_p, use_ternary);
@@ -172,9 +205,9 @@ fn runChatInternal(allocator: std.mem.Allocator, model_path: []const u8, initial
     };
     defer tokenizer.deinit();
 
-    // Use TinyLlama chat template
-    const template = ChatTemplate.TINYLLAMA;
-    const system_prompt = "You are a helpful AI assistant.";
+    // Auto-detect model and select appropriate chat template
+    const template = detectChatTemplate(model_path);
+    const system_prompt = detectSystemPrompt(model_path);
 
     // Initialize conversation history (keep last 10 messages + system)
     var history = ConversationHistory.init(allocator, 12);
diff --git a/src/vibeec/gguf_tokenizer.zig b/src/vibeec/gguf_tokenizer.zig
@@ -212,6 +212,26 @@ pub const ChatTemplate = struct {
         .assistant_suffix = " </s><s>[INST] ",
     };
 
+    // Qwen2.5 chat template: each role's text sits between "<|im_start|>ROLE\n" and "<|im_end|>\n"
+    pub const QWEN = ChatTemplate{
+        .system_prefix = "<|im_start|>system\n",
+        .system_suffix = "<|im_end|>\n",
+        .user_prefix = "<|im_start|>user\n",
+        .user_suffix = "<|im_end|>\n",
+        .assistant_prefix = "<|im_start|>assistant\n",
+        .assistant_suffix = "<|im_end|>\n",
+    };
+
+    // SmolLM chat template: same "<|im_start|>ROLE\n" / "<|im_end|>\n" markers, field-for-field equal to QWEN
+    pub const SMOLLM = ChatTemplate{
+        .system_prefix = "<|im_start|>system\n",
+        .system_suffix = "<|im_end|>\n",
+        .user_prefix = "<|im_start|>user\n",
+        .user_suffix = "<|im_end|>\n",
+        .assistant_prefix = "<|im_start|>assistant\n",
+        .assistant_suffix = "<|im_end|>\n",
+    };
+
     pub fn formatPrompt(
         self: *const ChatTemplate,
         allocator: std.mem.Allocator,
diff --git a/src/vibeec/http_server.zig b/src/vibeec/http_server.zig
@@ -222,10 +222,10 @@ pub const HttpServer = struct {
         var generated: ?[]u8 = null;
         defer if (generated) |g| self.allocator.free(g);
 
-        // Build full prompt with system instruction
+        // Build full prompt with ChatML format (works with most models)
         const system_prompt = "You are TRINITY, a helpful AI assistant. Be concise and direct.";
         const full_prompt = std.fmt.allocPrint(self.allocator, 
-            "<|system|>\n{s}<|end|>\n<|user|>\n{s}<|end|>\n<|assistant|>\n", 
+            "<|im_start|>system\n{s}<|im_end|>\n<|im_start|>user\n{s}<|im_end|>\n<|im_start|>assistant\n", 
             .{system_prompt, prompt}
         ) catch prompt;
         defer if (full_prompt.ptr != prompt.ptr) self.allocator.free(full_prompt);
@@ -325,10 +325,10 @@ pub const HttpServer = struct {
             "Connection: keep-alive\r\n\r\n";
         try connection.stream.writeAll(sse_header);
 
-        // Build prompt with system instruction
+        // Build prompt with ChatML format
         const system_prompt = "You are TRINITY, a helpful AI assistant. Be concise and direct.";
         const full_prompt = std.fmt.allocPrint(self.allocator, 
-            "<|system|>\n{s}<|end|>\n<|user|>\n{s}<|end|>\n<|assistant|>\n", 
+            "<|im_start|>system\n{s}<|im_end|>\n<|im_start|>user\n{s}<|im_end|>\n<|im_start|>assistant\n", 
             .{system_prompt, prompt}
         ) catch prompt;
         defer if (full_prompt.ptr != prompt.ptr) self.allocator.free(full_prompt);