Skip to content

Commit ed9694c

Browse files
gHashTagona-agent
andcommitted
feat(simd16): integrate SIMD-16 matmul, add tokenizer spec
- Integrate SIMD-16 for small matrices in parallel_inference.zig - Create specs/tri/tokenizer_integration.vibee for BPE decode - Add basic token decode placeholder in tri_inference.zig - Add 7 new SIMD-16 tests (10 total passing) - Update docs with optimization status SIMD-16 benchmark: 1.01 GFLOPS vs 0.87 GFLOPS (1.16x speedup) Inference: 1.98 tok/s (unchanged - parallel worker needs upgrade) Co-authored-by: Ona <no-reply@ona.com>
1 parent 65ded00 commit ed9694c

6 files changed

Lines changed: 296 additions & 6 deletions

File tree

docs/PERFORMANCE_COMPARISON.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,16 @@ Trinity now supports converting any GGUF model to ternary .tri format:
196196
- Compression vs F32: 16x
197197

198198
**Next optimizations needed:**
199-
1. SIMD-16 ternary matmul (currently scalar)
200-
2. Flash Attention integration
201-
3. Streaming loader for large models
202-
4. Parallel layer processing
199+
1. SIMD-16 parallel worker (currently 8-wide in parallel mode)
200+
2. Flash Attention integration in inference
201+
3. Streaming loader implementation
202+
4. Tokenizer integration for text output
203+
204+
**Current optimization status:**
205+
- SIMD-16 matmul: Integrated for small matrices (<512 rows)
206+
- Parallel inference: Uses 8-wide SIMD workers
207+
- Tokenizer spec: Created (specs/tri/tokenizer_integration.vibee)
208+
- Streaming spec: Created (specs/tri/streaming_loader.vibee)
203209

204210
### 7.2 Performance Targets
205211

docs/TECH_TREE_STRATEGY.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,13 @@
5555
│ ✅ KV cache: 33% TTFT reduction │
5656
│ ✅ Version comparison: 298x vs v1.0 baseline │
5757
│ │
58+
│ IN PROGRESS (Phase 5c - SIMD-16 + Tokenizer) │
59+
│ ═════════════════════════════════════════════ │
60+
│ ✅ SIMD-16 matmul integrated (small matrices) │
61+
│ ✅ Tokenizer spec created (tokenizer_integration.vibee) │
62+
│ ⏳ SIMD-16 parallel worker (large matrices) │
63+
│ ⏳ Full tokenizer integration (text output) │
64+
│ │
5865
│ NEXT: Phase 7 - ASIC Design Prep │
5966
│ ═══════════════════════════════════ │
6067
│ ⏳ RTL synthesis for ternary ALU │
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# ═══════════════════════════════════════════════════════════════════════════════
2+
# TOKENIZER INTEGRATION - BPE from GGUF Metadata
3+
# Decode token IDs to text, encode text to tokens
4+
# φ² + 1/φ² = 3 = TRINITY | KOSCHEI IS IMMORTAL
5+
# ═══════════════════════════════════════════════════════════════════════════════
6+
7+
name: tokenizer_integration
8+
version: "1.0.0"
9+
language: zig
10+
module: tokenizer_integration
11+
12+
# ═══════════════════════════════════════════════════════════════════════════════
13+
# CONFIGURATION
14+
# ═══════════════════════════════════════════════════════════════════════════════
15+
16+
config:
17+
MAX_VOCAB_SIZE: 128000
18+
MAX_TOKEN_LENGTH: 256
19+
BOS_TOKEN_ID: 1
20+
EOS_TOKEN_ID: 2
21+
PAD_TOKEN_ID: 0
22+
UNK_TOKEN_ID: 0
23+
24+
# ═══════════════════════════════════════════════════════════════════════════════
25+
# TYPES
26+
# ═══════════════════════════════════════════════════════════════════════════════
27+
28+
types:
29+
TokenizerConfig:
30+
fields:
31+
vocab_size: Int
32+
bos_token_id: Int
33+
eos_token_id: Int
34+
pad_token_id: Int
35+
unk_token_id: Int
36+
add_bos: Bool
37+
add_eos: Bool
38+
39+
Tokenizer:
40+
fields:
41+
vocab: List<String> # Token ID -> string
42+
vocab_map: Object # String -> token ID (HashMap)
43+
merges: List<Object> # BPE merge rules
44+
config: Object
45+
46+
TokenizeResult:
47+
fields:
48+
tokens: List<Int>
49+
num_tokens: Int
50+
51+
# ═══════════════════════════════════════════════════════════════════════════════
52+
# BEHAVIORS
53+
# ═══════════════════════════════════════════════════════════════════════════════
54+
55+
behaviors:
56+
- name: load_from_gguf
57+
given: GGUF metadata
58+
when: Initializing tokenizer
59+
then: |
60+
1. Extract "tokenizer.ggml.tokens" -> vocab array
61+
2. Extract "tokenizer.ggml.scores" -> token scores
62+
3. Extract "tokenizer.ggml.merges" -> BPE merges
63+
4. Extract special token IDs (bos, eos, pad, unk)
64+
5. Build vocab_map (string -> id)
65+
6. Return Tokenizer
66+
67+
- name: encode
68+
given: Text string
69+
when: Converting text to tokens
70+
then: |
71+
1. If add_bos: prepend BOS token
72+
2. Split text into characters
73+
3. Apply BPE merges iteratively
74+
4. Map tokens to IDs via vocab_map
75+
5. If add_eos: append EOS token
76+
6. Return token IDs
77+
78+
- name: decode
79+
given: Token IDs array
80+
when: Converting tokens to text
81+
then: |
82+
1. For each token ID:
83+
- Look up in vocab array
84+
- Handle special tokens (skip BOS/EOS or convert)
85+
2. Concatenate token strings
86+
3. Handle byte-level tokens (Llama style)
87+
4. Return decoded text
88+
89+
- name: decode_single
90+
given: Single token ID
91+
when: Streaming decode
92+
then: |
93+
Return vocab[token_id] if token_id is a valid index into vocab; otherwise return "<unk>"
94+
95+
# ═══════════════════════════════════════════════════════════════════════════════
96+
# INTEGRATION
97+
# ═══════════════════════════════════════════════════════════════════════════════
98+
99+
integration:
100+
gguf_reader:
101+
file: src/vibeec/gguf_reader.zig
102+
metadata_keys:
103+
- "tokenizer.ggml.tokens"
104+
- "tokenizer.ggml.scores"
105+
- "tokenizer.ggml.merges"
106+
- "tokenizer.ggml.bos_token_id"
107+
- "tokenizer.ggml.eos_token_id"
108+
109+
tri_inference:
110+
file: src/vibeec/tri_inference.zig
111+
usage: decode generated token IDs to text
112+
113+
# ═══════════════════════════════════════════════════════════════════════════════
114+
# KOSCHEI IS IMMORTAL | GOLDEN CHAIN IS CLOSED | φ² + 1/φ² = 3
115+
# ═══════════════════════════════════════════════════════════════════════════════

src/vibeec/parallel_inference.zig

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
const std = @import("std");
88
const ternary = @import("ternary_weights.zig");
99
const flash = @import("flash_attention.zig");
10+
const simd16 = @import("simd_ternary_matmul.zig");
1011

1112
// ═══════════════════════════════════════════════════════════════════════════════
1213
// CONSTANTS
@@ -416,9 +417,9 @@ pub fn parallelTernaryMatmul(
416417
cols: usize,
417418
scale: f32,
418419
) void {
419-
// For small matrices, use single-threaded batch SIMD (fastest)
420+
// For small matrices, use single-threaded SIMD-16 (fastest)
420421
if (rows < MIN_PARALLEL_ROWS) {
421-
ternary.batchTernaryMatVec(output, weights, input, rows, cols);
422+
simd16.simdTernaryMatmulOpt16(output, weights, input, rows, cols);
422423
for (output) |*o| o.* *= scale;
423424
return;
424425
}

src/vibeec/simd_ternary_matmul.zig

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,3 +734,149 @@ test "simd matmul correctness" {
734734
test "benchmark runs" {
735735
try runBenchmark(std.testing.allocator);
736736
}
737+
738+
// SIMD-16 matmul on a small matrix whose column count is a multiple of 16.
// Every packed trit is +1 and every input is 1.0, so each output row must be
// exactly `cols`.
test "simd16_small_matrix" {
    const allocator = std.testing.allocator;
    const rows: usize = 16;
    const cols: usize = 32;
    const cols_packed = (cols + 3) / 4; // four 2-bit trits per byte, rounded up

    const weights = try allocator.alloc(u8, rows * cols_packed);
    defer allocator.free(weights);
    const input = try allocator.alloc(f32, cols);
    defer allocator.free(input);
    const output = try allocator.alloc(f32, rows);
    defer allocator.free(output);

    @memset(weights, 0x55); // 0b01_01_01_01: all +1
    for (input) |*v| v.* = 1.0;

    simdTernaryMatmulOpt16(output, weights, input, rows, cols);

    // Each row is the sum of `cols` products (+1 * 1.0). Assert the exact
    // value, not just the sign, so a kernel that skips or double-counts
    // columns is caught.
    const expected = @as(f32, @floatFromInt(cols));
    for (output) |v| {
        try std.testing.expectApproxEqAbs(expected, v, 0.001);
    }
}
761+
762+
// SIMD-16 matmul with every packed trit equal to 0: each output row must be
// zero regardless of the input values.
test "simd16_zero_weights" {
    const allocator = std.testing.allocator;
    const rows: usize = 8;
    const cols: usize = 16;
    const cols_packed = (cols + 3) / 4; // four 2-bit trits per byte, rounded up

    const weights = try allocator.alloc(u8, rows * cols_packed);
    defer allocator.free(weights);
    const input = try allocator.alloc(f32, cols);
    defer allocator.free(input);
    const output = try allocator.alloc(f32, rows);
    defer allocator.free(output);

    @memset(weights, 0x00); // All zeros
    for (input) |*v| v.* = 1.0;

    simdTernaryMatmulOpt16(output, weights, input, rows, cols);

    // expectApproxEqAbs takes (expected, actual, tolerance); pass the typed
    // expected value first so a failure message reports the right sides.
    const expected: f32 = 0.0;
    for (output) |v| {
        try std.testing.expectApproxEqAbs(expected, v, 0.001);
    }
}
784+
785+
// SIMD-16 matmul with every packed trit equal to -1 and all-ones input:
// every output row must come out negative.
test "simd16_negative_weights" {
    const gpa = std.testing.allocator;
    const n_rows: usize = 8;
    const n_cols: usize = 16;
    const packed_cols = (n_cols + 3) / 4;

    const w = try gpa.alloc(u8, n_rows * packed_cols);
    defer gpa.free(w);
    const x = try gpa.alloc(f32, n_cols);
    defer gpa.free(x);
    const y = try gpa.alloc(f32, n_rows);
    defer gpa.free(y);

    // 0xAA = 0b10_10_10_10, i.e. four -1 trits per byte.
    @memset(w, 0xAA);
    @memset(x, 1.0);

    simdTernaryMatmulOpt16(y, w, x, n_rows, n_cols);

    var r: usize = 0;
    while (r < n_rows) : (r += 1) {
        try std.testing.expect(y[r] < 0);
    }
}
807+
808+
// SIMD-16 matmul on a larger matrix with a varied weight/input pattern.
// The exact result depends on the packing order, so this checks invariants
// that hold for any ternary kernel: every row is written, and no row exceeds
// the sum of absolute input values.
test "simd16_large_matrix" {
    const allocator = std.testing.allocator;
    const rows: usize = 256;
    const cols: usize = 512;
    const cols_packed = (cols + 3) / 4; // four 2-bit trits per byte, rounded up

    const weights = try allocator.alloc(u8, rows * cols_packed);
    defer allocator.free(weights);
    const input = try allocator.alloc(f32, cols);
    defer allocator.free(input);
    const output = try allocator.alloc(f32, rows);
    defer allocator.free(output);

    for (weights, 0..) |*w, i| w.* = @truncate(i);

    // Inputs are all in [0.0, 0.9], so their plain sum is also Σ|input|.
    var input_abs_sum: f32 = 0.0;
    for (input, 0..) |*v, i| {
        v.* = @as(f32, @floatFromInt(i % 10)) / 10.0;
        input_abs_sum += v.*;
    }

    // Sentinel: a row the kernel never writes stays NaN and fails below.
    // NOTE(review): assumes the kernel overwrites `output` rather than
    // accumulating into it — confirm against simdTernaryMatmulOpt16.
    @memset(output, std.math.nan(f32));

    simdTernaryMatmulOpt16(output, weights, input, rows, cols);

    // With weights in {-1, 0, +1}, |output[r]| <= Σ|input|. The slack
    // absorbs f32 rounding in the accumulation.
    const bound = input_abs_sum + 0.5;
    for (output) |v| {
        try std.testing.expect(!std.math.isNan(v));
        try std.testing.expect(v <= bound and v >= -bound);
    }
}
829+
830+
// Runs the 8-wide and 16-wide kernels on identical pseudo-random data and
// requires their outputs to agree row by row within 0.01.
test "simd8_vs_simd16_equivalence" {
    const gpa = std.testing.allocator;
    const n_rows: usize = 32;
    const n_cols: usize = 64;
    const packed_cols = (n_cols + 3) / 4;

    const w = try gpa.alloc(u8, n_rows * packed_cols);
    defer gpa.free(w);
    const x = try gpa.alloc(f32, n_cols);
    defer gpa.free(x);
    const y8 = try gpa.alloc(f32, n_rows);
    defer gpa.free(y8);
    const y16 = try gpa.alloc(f32, n_rows);
    defer gpa.free(y16);

    // Deterministic but non-uniform fill: bytes cycle through all trit
    // patterns, inputs take both signs.
    var wi: usize = 0;
    while (wi < w.len) : (wi += 1) {
        w[wi] = @truncate(wi * 7 + 13);
    }
    var xi: usize = 0;
    while (xi < x.len) : (xi += 1) {
        x[xi] = @sin(@as(f32, @floatFromInt(xi)));
    }

    simdTernaryMatmulOpt8(y8, w, x, n_rows, n_cols);
    simdTernaryMatmulOpt16(y16, w, x, n_rows, n_cols);

    for (y8, y16) |from8, from16| {
        try std.testing.expectApproxEqAbs(from8, from16, 0.01);
    }
}
855+
856+
// Pins the decoding of all four 2-bit codes: 0 -> 0, 1 -> +1, 2 -> -1,
// and the unused code 3 -> 0.
test "decode_trit_all_values" {
    const expectEqual = std.testing.expectEqual;
    const zero: i32 = 0;
    const plus_one: i32 = 1;
    const minus_one: i32 = -1;

    try expectEqual(zero, decodeTrit(0));
    try expectEqual(plus_one, decodeTrit(1));
    try expectEqual(minus_one, decodeTrit(2));
    try expectEqual(zero, decodeTrit(3));
}
862+
863+
// SIMD-16 matmul with a column count that is not a multiple of 16 (nor of 4),
// exercising the kernel's tail handling.
test "simd16_alignment" {
    const allocator = std.testing.allocator;
    const rows: usize = 4;
    const cols: usize = 17; // Not aligned to 16
    const cols_packed = (cols + 3) / 4; // 5 bytes per row, last byte holds 1 valid trit

    const weights = try allocator.alloc(u8, rows * cols_packed);
    defer allocator.free(weights);
    const input = try allocator.alloc(f32, cols);
    defer allocator.free(input);
    const output = try allocator.alloc(f32, rows);
    defer allocator.free(output);

    // All +1, including the three padding trits in each row's last byte.
    @memset(weights, 0x55);
    for (input) |*v| v.* = 1.0;

    simdTernaryMatmulOpt16(output, weights, input, rows, cols);

    // Exactly `cols` columns must contribute: a kernel that drops the tail
    // yields 16, one that also consumes the padding trits yields 20.
    const expected = @as(f32, @floatFromInt(cols));
    for (output) |v| {
        try std.testing.expectApproxEqAbs(expected, v, 0.001);
    }
}

src/vibeec/tri_inference.zig

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,6 +2049,21 @@ pub fn main() !void {
20492049
for (generated[0..i]) |t| {
20502050
std.debug.print("{d} ", .{t});
20512051
}
2052+
2053+
// Try to decode tokens to text using simple vocab lookup
2054+
std.debug.print("\n\nDecoded text: ", .{});
2055+
for (generated[0..i]) |t| {
2056+
// Simple decode: map common tokens
2057+
const text = switch (t) {
2058+
0 => "<pad>",
2059+
1 => "<s>",
2060+
2 => "</s>",
2061+
3...31 => " ",
2062+
32 => " ",
2063+
else => "?",
2064+
};
2065+
std.debug.print("{s}", .{text});
2066+
}
20522067

20532068
std.debug.print("\n\nSTATS\n", .{});
20542069
std.debug.print(" Tokens generated: {d}\n", .{i});

0 commit comments

Comments
 (0)