@@ -14,6 +14,35 @@ const FullModel = model_mod.FullModel;
1414const Tokenizer = tokenizer_mod .Tokenizer ;
1515const SamplingParams = inference .SamplingParams ;
1616
17+ // ═══════════════════════════════════════════════════════════════════════════════
18+ // BATCH PROCESSING METRICS (INF-004)
19+ // ═══════════════════════════════════════════════════════════════════════════════
20+
/// Atomic counters describing request traffic and generation volume (INF-004).
///
/// Every counter is read and updated independently with `.monotonic` ordering.
const BatchMetrics = struct {
    const Counter64 = std.atomic.Value(u64);
    const Counter32 = std.atomic.Value(u32);

    /// Incremented once per `recordRequest` call; never decremented.
    total_requests: Counter64 = Counter64.init(0),
    /// Incremented by `recordRequest`, decremented by `completeRequest`.
    active_requests: Counter32 = Counter32.init(0),
    /// Sum of the `tokens` arguments passed to `completeRequest`.
    total_tokens_generated: Counter64 = Counter64.init(0),
    /// Sum of the `time_ns` arguments passed to `completeRequest`.
    total_inference_time_ns: Counter64 = Counter64.init(0),

    /// Counts a newly started request in both the total and active counters.
    fn recordRequest(self: *BatchMetrics) void {
        _ = self.total_requests.fetchAdd(1, .monotonic);
        _ = self.active_requests.fetchAdd(1, .monotonic);
    }

    /// Marks one request as finished and accumulates its output.
    ///
    /// `tokens` is added to the generated-token total and `time_ns` to the
    /// accumulated inference time.
    fn completeRequest(self: *BatchMetrics, tokens: u64, time_ns: u64) void {
        _ = self.active_requests.fetchSub(1, .monotonic);
        _ = self.total_tokens_generated.fetchAdd(tokens, .monotonic);
        _ = self.total_inference_time_ns.fetchAdd(time_ns, .monotonic);
    }

    /// Returns accumulated tokens divided by accumulated inference time in
    /// seconds, or 0 when no inference time has been recorded yet.
    fn getThroughput(self: *BatchMetrics) f64 {
        const token_count = self.total_tokens_generated.load(.monotonic);
        const elapsed_ns = self.total_inference_time_ns.load(.monotonic);
        if (elapsed_ns == 0) return 0;

        const token_count_f: f64 = @floatFromInt(token_count);
        const elapsed_ns_f: f64 = @floatFromInt(elapsed_ns);
        // Nanoseconds -> seconds before dividing, same operation order as before.
        const elapsed_s = elapsed_ns_f / 1e9;
        return token_count_f / elapsed_s;
    }
};
45+
1746// ═══════════════════════════════════════════════════════════════════════════════
1847// HTTP SERVER
1948// ═══════════════════════════════════════════════════════════════════════════════
@@ -22,6 +51,7 @@ pub const HttpServer = struct {
2251 allocator : Allocator ,
2352 model_path : []const u8 ,
2453 port : u16 ,
54+ metrics : BatchMetrics = .{},
2555
2656 pub fn init (allocator : Allocator , model_path : []const u8 , port : u16 ) HttpServer {
2757 return .{
@@ -154,10 +184,29 @@ pub const HttpServer = struct {
154184 }
155185
/// Writes the server-info JSON response, including a snapshot of the batch
/// metrics (INF-004), to `connection`.
///
/// The JSON body is allocated from `self.allocator` and freed before
/// returning. If that allocation fails, a static body without the `metrics`
/// object is sent instead. Errors from writing to the connection stream are
/// returned to the caller.
fn sendInfo(self: *HttpServer, connection: *std.net.Server.Connection) !void {
    const total = self.metrics.total_requests.load(.monotonic);
    const active = self.metrics.active_requests.load(.monotonic);
    const throughput = self.metrics.getThroughput();
    const total_tokens = self.metrics.total_tokens_generated.load(.monotonic);

    const body = std.fmt.allocPrint(
        self.allocator,
        "{{\"name\":\"TRINITY LLM\",\"version\":\"1.4.0\",\"endpoints\":[\"/v1/chat/completions\",\"/health\",\"/metrics\"],\"metrics\":{{\"total_requests\":{d},\"active_requests\":{d},\"total_tokens\":{d},\"throughput_tok_s\":{d:.2}}}}}",
        .{ total, active, total_tokens, throughput },
    ) catch {
        // Allocation failed: answer with a static document that needs no heap.
        // Content-Length is derived from the literal at comptime so it cannot
        // drift from the body (the previous hard-coded 85 was 2 bytes short).
        const fallback_body = "{\"name\":\"TRINITY LLM\",\"version\":\"1.4.0\",\"endpoints\":[\"/v1/chat/completions\",\"/health\"]}";
        const fallback_response = "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nAccess-Control-Allow-Origin: *\r\nContent-Length: " ++
            std.fmt.comptimePrint("{d}", .{fallback_body.len}) ++
            "\r\nConnection: close\r\n\r\n" ++ fallback_body;
        try connection.stream.writeAll(fallback_response);
        return;
    };
    defer self.allocator.free(body);

    // Format the header on the stack so a second allocation failure cannot
    // silently drop the response. The fixed text is 120 bytes and a usize
    // prints in at most 20 digits, so 256 bytes always suffices.
    var header_buf: [256]u8 = undefined;
    const header = std.fmt.bufPrint(
        &header_buf,
        "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nAccess-Control-Allow-Origin: *\r\nContent-Length: {d}\r\nConnection: close\r\n\r\n",
        .{body.len},
    ) catch unreachable;

    try connection.stream.writeAll(header);
    try connection.stream.writeAll(body);
}
162211
163212 fn sendCors (self : * HttpServer , connection : * std.net.Server.Connection ) ! void {
@@ -185,6 +234,9 @@ pub const HttpServer = struct {
185234 }
186235
187236 fn handleChatCompletion (self : * HttpServer , connection : * std.net.Server.Connection , body : []const u8 , model : * FullModel , tokenizer : * Tokenizer ) ! void {
237+ // Record request for metrics (INF-004)
238+ self .metrics .recordRequest ();
239+
188240 // Check if streaming is requested
189241 const is_streaming = std .mem .indexOf (u8 , body , "\" stream\" :true" ) != null or
190242 std .mem .indexOf (u8 , body , "\" stream\" : true" ) != null ;
@@ -280,9 +332,16 @@ pub const HttpServer = struct {
280332 const input_token_count = if (tokens ) | toks | toks .len else 0 ;
281333 const tok_per_sec = if (gen_time_s > 0 ) @as (f64 , @floatFromInt (generated_token_count )) / gen_time_s else 0 ;
282334
335+ // Update batch metrics (INF-004)
336+ self .metrics .completeRequest (@intCast (generated_token_count ), gen_time_ns );
337+ const throughput = self .metrics .getThroughput ();
338+ const active = self .metrics .active_requests .load (.monotonic );
339+ const total = self .metrics .total_requests .load (.monotonic );
340+
283341 std .debug .print (" Response: {s}\n " , .{response_text });
284342 std .debug .print (" Tokens: {d} input + {d} output = {d} total\n " , .{ input_token_count , generated_token_count , input_token_count + generated_token_count });
285- std .debug .print (" Time: {d:.2}s | Speed: {d:.2} tok/s (generation only)\n " , .{ gen_time_s , tok_per_sec });
343+ std .debug .print (" Time: {d:.2}s | Speed: {d:.2} tok/s | Throughput: {d:.2} tok/s\n " , .{ gen_time_s , tok_per_sec , throughput });
344+ std .debug .print (" Requests: {d} total, {d} active\n " , .{ total , active });
286345
287346 // Escape JSON string
288347 var escaped = std .ArrayList (u8 ).init (self .allocator );
0 commit comments