@@ -5,6 +5,7 @@ const std = @import("std");
55const gguf = @import ("gguf_reader.zig" );
66const inference = @import ("gguf_inference.zig" );
77const transformer = @import ("gguf_transformer.zig" );
8+ const simd = @import ("simd_matmul.zig" );
89
910pub const FullModel = struct {
1011 allocator : std.mem.Allocator ,
@@ -23,6 +24,20 @@ pub const FullModel = struct {
2324 rope : transformer.RoPE ,
2425 kv_caches : []transformer.KVCache ,
2526
27+ // Pre-allocated buffers for forward pass (avoid allocations in hot path)
28+ buf_hidden : []f32 ,
29+ buf_temp : []f32 ,
30+ buf_normed : []f32 ,
31+ buf_q : []f32 ,
32+ buf_k : []f32 ,
33+ buf_v : []f32 ,
34+ buf_attn_out : []f32 ,
35+ buf_attn_proj : []f32 ,
36+ buf_ffn_gate : []f32 ,
37+ buf_ffn_up : []f32 ,
38+ buf_ffn_out : []f32 ,
39+ buf_scores : []f32 ,
40+
2641 pub const LayerWeights = struct {
2742 attn_norm : []f32 ,
2843 ffn_norm : []f32 ,
@@ -75,6 +90,19 @@ pub const FullModel = struct {
7590 .layers = undefined ,
7691 .rope = undefined ,
7792 .kv_caches = undefined ,
93+ // Pre-allocated buffers (initialized in loadWeights)
94+ .buf_hidden = undefined ,
95+ .buf_temp = undefined ,
96+ .buf_normed = undefined ,
97+ .buf_q = undefined ,
98+ .buf_k = undefined ,
99+ .buf_v = undefined ,
100+ .buf_attn_out = undefined ,
101+ .buf_attn_proj = undefined ,
102+ .buf_ffn_gate = undefined ,
103+ .buf_ffn_up = undefined ,
104+ .buf_ffn_out = undefined ,
105+ .buf_scores = undefined ,
78106 };
79107
80108 model .config .head_dim = model .config .hidden_size / model .config .num_heads ;
@@ -131,6 +159,27 @@ pub const FullModel = struct {
131159 }
132160
133161 std .debug .print (" Loaded {d} layers \n " , .{self .config .num_layers });
162+
163+ // Pre-allocate buffers for forward pass (avoid allocations in hot path)
164+ const hidden_size = self .config .hidden_size ;
165+ const num_heads = self .config .num_heads ;
166+ const num_kv_heads = self .config .num_kv_heads ;
167+ const head_dim = self .config .head_dim ;
168+ const intermediate_size = self .config .intermediate_size ;
169+ const context_length = self .config .context_length ;
170+
171+ self .buf_hidden = try self .allocator .alloc (f32 , hidden_size );
172+ self .buf_temp = try self .allocator .alloc (f32 , hidden_size );
173+ self .buf_normed = try self .allocator .alloc (f32 , hidden_size );
174+ self .buf_q = try self .allocator .alloc (f32 , num_heads * head_dim );
175+ self .buf_k = try self .allocator .alloc (f32 , num_kv_heads * head_dim );
176+ self .buf_v = try self .allocator .alloc (f32 , num_kv_heads * head_dim );
177+ self .buf_attn_out = try self .allocator .alloc (f32 , num_heads * head_dim );
178+ self .buf_attn_proj = try self .allocator .alloc (f32 , hidden_size );
179+ self .buf_ffn_gate = try self .allocator .alloc (f32 , intermediate_size );
180+ self .buf_ffn_up = try self .allocator .alloc (f32 , intermediate_size );
181+ self .buf_ffn_out = try self .allocator .alloc (f32 , hidden_size );
182+ self .buf_scores = try self .allocator .alloc (f32 , context_length );
134183 }
135184
136185 fn loadTensor (self : * FullModel , name : []const u8 ) ! []f32 {
@@ -174,6 +223,21 @@ pub const FullModel = struct {
174223 self .allocator .free (self .token_embedding );
175224 self .allocator .free (self .output_weight );
176225 self .allocator .free (self .output_norm );
226+
227+ // Free pre-allocated buffers
228+ self .allocator .free (self .buf_hidden );
229+ self .allocator .free (self .buf_temp );
230+ self .allocator .free (self .buf_normed );
231+ self .allocator .free (self .buf_q );
232+ self .allocator .free (self .buf_k );
233+ self .allocator .free (self .buf_v );
234+ self .allocator .free (self .buf_attn_out );
235+ self .allocator .free (self .buf_attn_proj );
236+ self .allocator .free (self .buf_ffn_gate );
237+ self .allocator .free (self .buf_ffn_up );
238+ self .allocator .free (self .buf_ffn_out );
239+ self .allocator .free (self .buf_scores );
240+
177241 self .reader .deinit ();
178242 }
179243
@@ -183,31 +247,26 @@ pub const FullModel = struct {
183247 }
184248 }
185249
/// Runs a forward pass for a single token and returns its logits.
///
/// The embedding row for `token` is copied into the pre-allocated
/// `buf_hidden`, passed through every layer via `forwardLayerOptimized`
/// (ping-ponging through `buf_temp`), RMS-normalised with `output_norm`,
/// and finally projected with `output_weight`.
///
/// Parameters:
///   - `token`: index of the embedding row to read from `token_embedding`.
///   - `pos`: position value forwarded unchanged to every layer.
///
/// Returns a newly allocated slice of `config.vocab_size` values. The caller
/// owns it and must release it with `self.allocator`.
///
/// Errors:
///   - `error.TokenOutOfRange` if the embedding row for `token` does not lie
///     inside `token_embedding`.
///   - Any error produced by `self.allocator.alloc` for the logits buffer.
pub fn forward(self: *FullModel, token: u32, pos: usize) ![]f32 {
    const hidden_size = self.config.hidden_size;

    // Widen the token id before multiplying so the row offset is computed in
    // usize instead of a narrower integer type.
    const emb_start = @as(usize, token) * hidden_size;

    // Reject ids whose row would fall outside the embedding table. Without
    // this check the slice below is only guarded in safety-checked builds.
    if (emb_start + hidden_size > self.token_embedding.len) {
        return error.TokenOutOfRange;
    }

    // Seed the hidden state from the embedding table (no allocation).
    @memcpy(self.buf_hidden, self.token_embedding[emb_start..][0..hidden_size]);

    // Each layer reads buf_hidden and writes buf_temp; copy the result back so
    // the next layer sees it as its input.
    for (0..self.config.num_layers) |i| {
        self.forwardLayerOptimized(self.buf_temp, self.buf_hidden, i, pos);
        @memcpy(self.buf_hidden, self.buf_temp);
    }

    // Final RMS norm: buf_hidden -> buf_temp.
    inference.rmsNorm(self.buf_temp, self.buf_hidden, self.output_norm, self.config.rms_norm_eps);

    // Output projection. The logits slice is the only allocation made here
    // and is handed to the caller.
    const logits = try self.allocator.alloc(f32, self.config.vocab_size);
    inference.matVec(logits, self.output_weight, self.buf_temp, self.config.vocab_size, hidden_size);

    return logits;
}
@@ -332,6 +391,99 @@ pub const FullModel = struct {
332391 }
333392 }
334393
/// Runs one transformer layer for a single token, writing the result to
/// `output`. All intermediate values live in the model's pre-allocated
/// `buf_*` scratch slices; this function makes no allocator calls of its own.
///
/// Order of operations:
///   1. RMS norm of `input` with the layer's `attn_norm`.
///   2. Q/K/V projections, then RoPE on every query and key head at `pos`.
///   3. Append of the new K/V vectors to this layer's KV cache.
///   4. Per-head attention over every cached position (scores scaled by
///      1/sqrt(head_dim), softmax, weighted sum of cached values).
///   5. Output projection and residual add into `output`.
///   6. RMS norm with `ffn_norm`, SwiGLU feed-forward, second residual add.
///
/// Parameters:
///   - `output`: destination; its first `hidden_size` elements are written.
///   - `input`: layer input; its first `hidden_size` elements are read.
///   - `layer_idx`: selects the entry of `self.layers` and `self.kv_caches`.
///   - `pos`: position passed to `self.rope.apply`.
fn forwardLayerOptimized(self: *FullModel, output: []f32, input: []const f32, layer_idx: usize, pos: usize) void {
    const weights = self.layers[layer_idx];
    const cache = &self.kv_caches[layer_idx];

    const hidden_size = self.config.hidden_size;
    const num_heads = self.config.num_heads;
    const num_kv_heads = self.config.num_kv_heads;
    const head_dim = self.config.head_dim;
    const intermediate_size = self.config.intermediate_size;
    const rms_eps = self.config.rms_norm_eps;

    // Widths of the query projection and of one cached K (or V) row.
    const q_dim = num_heads * head_dim;
    const kv_dim = num_kv_heads * head_dim;

    // ---- Attention block ----

    // Normalise the input into buf_normed.
    inference.rmsNorm(self.buf_normed, input, weights.attn_norm, rms_eps);

    // Project the normalised input to Q, K and V.
    inference.matVec(self.buf_q, weights.wq, self.buf_normed, q_dim, hidden_size);
    inference.matVec(self.buf_k, weights.wk, self.buf_normed, kv_dim, hidden_size);
    inference.matVec(self.buf_v, weights.wv, self.buf_normed, kv_dim, hidden_size);

    // Rotary embedding on each query head, then on each key head.
    for (0..num_heads) |head| {
        self.rope.apply(self.buf_q[head * head_dim ..][0..head_dim], pos);
    }
    for (0..num_kv_heads) |head| {
        self.rope.apply(self.buf_k[head * head_dim ..][0..head_dim], pos);
    }

    // Store the new key/value vectors; seq_len is read after the append, so
    // the attention loops below use the cache's updated length.
    cache.append(self.buf_k, self.buf_v);

    const inv_sqrt_dim = 1.0 / @sqrt(@as(f32, @floatFromInt(head_dim)));
    const heads_per_kv = num_heads / num_kv_heads;
    const seq_len = cache.seq_len;
    const scores = self.buf_scores[0..seq_len];

    for (0..num_heads) |head| {
        // Offset of the KV head used by this query head inside a cache row.
        const kv_base = (head / heads_per_kv) * head_dim;
        const query = self.buf_q[head * head_dim ..][0..head_dim];

        // Scaled dot product of the query against every cached key.
        for (scores, 0..) |*score, t| {
            const key = cache.k_cache[t * kv_dim + kv_base ..][0..head_dim];
            score.* = simd.simdDot(query, key) * inv_sqrt_dim;
        }

        // In-place softmax over the scores computed above.
        inference.softmax(scores, scores);

        // Accumulate the score-weighted sum of cached values for this head.
        const acc = self.buf_attn_out[head * head_dim ..][0..head_dim];
        @memset(acc, 0.0);
        for (scores, 0..) |weight, t| {
            const value = cache.v_cache[t * kv_dim + kv_base ..][0..head_dim];
            for (acc, value) |*a, v| {
                a.* += weight * v;
            }
        }
    }

    // Project the concatenated heads back to hidden_size.
    inference.matVec(self.buf_attn_proj, weights.wo, self.buf_attn_out, hidden_size, q_dim);

    // First residual connection: output = input + attention projection.
    for (output[0..hidden_size], input[0..hidden_size], self.buf_attn_proj[0..hidden_size]) |*out, x, proj| {
        out.* = x + proj;
    }

    // ---- Feed-forward block ----

    // Normalise the post-attention state (buf_normed is reused here).
    inference.rmsNorm(self.buf_normed, output, weights.ffn_norm, rms_eps);

    // Gate and up projections.
    inference.matVec(self.buf_ffn_gate, weights.w_gate, self.buf_normed, intermediate_size, hidden_size);
    inference.matVec(self.buf_ffn_up, weights.w_up, self.buf_normed, intermediate_size, hidden_size);

    // SwiGLU: gate <- silu(gate) * up, stored back into buf_ffn_gate.
    for (self.buf_ffn_gate[0..intermediate_size], self.buf_ffn_up[0..intermediate_size]) |*gate, up| {
        gate.* = inference.silu(gate.*) * up;
    }

    // Down projection back to hidden_size.
    inference.matVec(self.buf_ffn_out, weights.w_down, self.buf_ffn_gate, hidden_size, intermediate_size);

    // Second residual connection.
    for (output[0..hidden_size], self.buf_ffn_out[0..hidden_size]) |*out, delta| {
        out.* += delta;
    }
}
486+
335487 // Generate next token
336488 pub fn generate (self : * FullModel , token : u32 , pos : usize , temperature : f32 ) ! u32 {
337489 const logits = try self .forward (token , pos );
0 commit comments