Add SmolLM 135M support (12-14 tok/s)

gHashTag · ona-agent · gHashTag · commit 97ee5e2b0a74 · 2026-02-01T16:00:21.000Z
- Tied-embeddings fallback: when output.weight is missing, reuse token_embd.weight as the output projection
- GPT-2 tokenizer support (Ġ → space, Ċ → newline)
- SmolLM 135M: 12-14 tok/s, 0.27s load, 139MB
- TinyLlama 1.1B: 1.5 tok/s, 2.76s load, 1.1GB
- ~9x faster generation with the smaller model (SmolLM 135M at 12-14 tok/s vs TinyLlama 1.1B at 1.5 tok/s)

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/bin/vibee b/bin/vibee
diff --git a/src/vibeec/gguf_model.zig b/src/vibeec/gguf_model.zig
@@ -118,7 +118,17 @@ pub const FullModel = struct {
 
         // Load embeddings
         self.token_embedding = try self.loadTensor("token_embd.weight");
-        self.output_weight = try self.loadTensor("output.weight");
+        
+        // Try to load output.weight, fallback to tied embeddings (token_embd)
+        self.output_weight = self.loadTensor("output.weight") catch |err| blk: {
+            if (err == error.TensorNotFound) {
+                // Tied embeddings: output = token_embd (common in smaller models)
+                std.debug.print("  Using tied embeddings (output = token_embd)\n", .{});
+                break :blk self.token_embedding;
+            }
+            return err;
+        };
+        
         self.output_norm = try self.loadTensor("output_norm.weight");
 
         // Initialize RoPE
diff --git a/src/vibeec/gguf_tokenizer.zig b/src/vibeec/gguf_tokenizer.zig
@@ -139,13 +139,25 @@ pub const Tokenizer = struct {
         for (tokens) |token| {
             if (token < self.vocab_size) {
                 const text = self.vocab[token];
-                // Replace special space character with regular space
+                // Replace special space characters with regular space
                 var i: usize = 0;
                 while (i < text.len) {
+                    // Llama-style space: ▁ (U+2581) = 0xE2 0x96 0x81
                     if (i + 2 < text.len and text[i] == 0xE2 and text[i + 1] == 0x96 and text[i + 2] == 0x81) {
                         try result.append(' ');
                         i += 3;
-                    } else {
+                    }
+                    // GPT-2 style space: Ġ (U+0120) = 0xC4 0xA0
+                    else if (i + 1 < text.len and text[i] == 0xC4 and text[i + 1] == 0xA0) {
+                        try result.append(' ');
+                        i += 2;
+                    }
+                    // Newline token: Ċ (U+010A) = 0xC4 0x8A
+                    else if (i + 1 < text.len and text[i] == 0xC4 and text[i + 1] == 0x8A) {
+                        try result.append('\n');
+                        i += 2;
+                    }
+                    else {
                         try result.append(text[i]);
                         i += 1;
                     }