@@ -133,13 +133,27 @@ pub const FullModel = struct {
133133
134134 pub fn loadWeights (self : * FullModel ) ! void {
135135 std .debug .print ("Loading weights...\n " , .{});
136+
137+ // ═══════════════════════════════════════════════════════════════════
138+ // PROFILING: Track time for each phase
139+ // ═══════════════════════════════════════════════════════════════════
140+ var total_timer = std .time .Timer .start () catch unreachable ;
141+ var phase_timer = std .time .Timer .start () catch unreachable ;
142+
143+ var time_thread_pool : u64 = 0 ;
144+ var time_embeddings : u64 = 0 ;
145+ var time_rope : u64 = 0 ;
146+ var time_kv_cache : u64 = 0 ;
147+ var time_layers : u64 = 0 ;
148+ var time_buffers : u64 = 0 ;
136149
137150 // Initialize thread pool for parallel matVec
151+ phase_timer .reset ();
138152 try simd .initThreadPool (self .allocator );
139-
140-
153+ time_thread_pool = phase_timer .read ();
141154
142155 // Load embeddings
156+ phase_timer .reset ();
143157 self .token_embedding = try self .loadTensor ("token_embd.weight" );
144158
145159 // Try to load output.weight, fallback to tied embeddings (token_embd)
@@ -152,19 +166,21 @@ pub const FullModel = struct {
152166 return err ;
153167 };
154168
155-
156-
157169 self .output_norm = try self .loadTensor ("output_norm.weight" );
170+ time_embeddings = phase_timer .read ();
158171
159172 // Initialize RoPE
173+ phase_timer .reset ();
160174 self .rope = try transformer .RoPE .init (
161175 self .allocator ,
162176 self .config .head_dim ,
163177 self .config .context_length ,
164178 self .config .rope_theta ,
165179 );
180+ time_rope = phase_timer .read ();
166181
167182 // Initialize KV caches for each layer
183+ phase_timer .reset ();
168184 self .kv_caches = try self .allocator .alloc (transformer .KVCache , self .config .num_layers );
169185 for (0.. self .config .num_layers ) | i | {
170186 self .kv_caches [i ] = try transformer .KVCache .init (
@@ -174,8 +190,10 @@ pub const FullModel = struct {
174190 self .config .context_length ,
175191 );
176192 }
193+ time_kv_cache = phase_timer .read ();
177194
178195 // Load layer weights
196+ phase_timer .reset ();
179197 self .layers = try self .allocator .alloc (LayerWeights , self .config .num_layers );
180198
181199 for (0.. self .config .num_layers ) | i | {
@@ -200,9 +218,11 @@ pub const FullModel = struct {
200218 };
201219 }
202220
221+ time_layers = phase_timer .read ();
203222 std .debug .print (" Loaded {d} layers \n " , .{self .config .num_layers });
204223
205224 // Pre-allocate buffers for forward pass (avoid allocations in hot path)
225+ phase_timer .reset ();
206226 const hidden_size = self .config .hidden_size ;
207227 const num_heads = self .config .num_heads ;
208228 const num_kv_heads = self .config .num_kv_heads ;
@@ -222,6 +242,44 @@ pub const FullModel = struct {
222242 self .buf_ffn_up = try self .allocator .alloc (f32 , intermediate_size );
223243 self .buf_ffn_out = try self .allocator .alloc (f32 , hidden_size );
224244 self .buf_scores = try self .allocator .alloc (f32 , context_length );
245+ time_buffers = phase_timer .read ();
246+
247+ // ═══════════════════════════════════════════════════════════════════
248+ // PROFILING RESULTS
249+ // ═══════════════════════════════════════════════════════════════════
250+ const total_time = total_timer .read ();
251+ std .debug .print ("\n ╔══════════════════════════════════════════════════════════════╗\n " , .{});
252+ std .debug .print ("║ LOAD WEIGHTS PROFILING ║\n " , .{});
253+ std .debug .print ("╠══════════════════════════════════════════════════════════════╣\n " , .{});
254+ std .debug .print ("║ Thread pool init: {d:>10.2} ms ({d:>5.1}%) ║\n " , .{
255+ @as (f64 , @floatFromInt (time_thread_pool )) / 1_000_000.0 ,
256+ @as (f64 , @floatFromInt (time_thread_pool )) / @as (f64 , @floatFromInt (total_time )) * 100.0
257+ });
258+ std .debug .print ("║ Embeddings: {d:>10.2} ms ({d:>5.1}%) ║\n " , .{
259+ @as (f64 , @floatFromInt (time_embeddings )) / 1_000_000.0 ,
260+ @as (f64 , @floatFromInt (time_embeddings )) / @as (f64 , @floatFromInt (total_time )) * 100.0
261+ });
262+ std .debug .print ("║ RoPE init: {d:>10.2} ms ({d:>5.1}%) ║\n " , .{
263+ @as (f64 , @floatFromInt (time_rope )) / 1_000_000.0 ,
264+ @as (f64 , @floatFromInt (time_rope )) / @as (f64 , @floatFromInt (total_time )) * 100.0
265+ });
266+ std .debug .print ("║ KV cache init: {d:>10.2} ms ({d:>5.1}%) ║\n " , .{
267+ @as (f64 , @floatFromInt (time_kv_cache )) / 1_000_000.0 ,
268+ @as (f64 , @floatFromInt (time_kv_cache )) / @as (f64 , @floatFromInt (total_time )) * 100.0
269+ });
270+ std .debug .print ("║ Layer weights: {d:>10.2} ms ({d:>5.1}%) ◄── BOTTLENECK ║\n " , .{
271+ @as (f64 , @floatFromInt (time_layers )) / 1_000_000.0 ,
272+ @as (f64 , @floatFromInt (time_layers )) / @as (f64 , @floatFromInt (total_time )) * 100.0
273+ });
274+ std .debug .print ("║ Buffer alloc: {d:>10.2} ms ({d:>5.1}%) ║\n " , .{
275+ @as (f64 , @floatFromInt (time_buffers )) / 1_000_000.0 ,
276+ @as (f64 , @floatFromInt (time_buffers )) / @as (f64 , @floatFromInt (total_time )) * 100.0
277+ });
278+ std .debug .print ("╠══════════════════════════════════════════════════════════════╣\n " , .{});
279+ std .debug .print ("║ TOTAL: {d:>10.2} ms ║\n " , .{
280+ @as (f64 , @floatFromInt (total_time )) / 1_000_000.0
281+ });
282+ std .debug .print ("╚══════════════════════════════════════════════════════════════╝\n " , .{});
225283 }
226284
227285 fn loadTensor (self : * FullModel , name : []const u8 ) ! []f32 {
0 commit comments