1- # ═══════════════════════════════════════════════════════════════════════════════
2- # TRINITY BATCH PROCESSING (INF-004)
3- # Request batching for improved throughput under load
4- # φ² + 1/φ² = 3 = TRINITY
5- # ═══════════════════════════════════════════════════════════════════════════════
6-
71name: batch_processing
82version: "1.0.0"
93language: zig
10- module: batch_processing
11-
12- # ═══════════════════════════════════════════════════════════════════════════════
13- # PROBLEM ANALYSIS
14- # ═══════════════════════════════════════════════════════════════════════════════
15-
16- # Current state:
17- # - Sequential request processing (one at a time)
18- # - ~1.4 tok/s inference speed
19- # - Requests queue up during generation
20- # - No parallelism in request handling
21-
22- # Target:
23- # - Batch multiple requests together
24- # - Process batch in parallel where possible
25- # - Reduce per-request overhead
26- # - Target: 3-4x throughput improvement
27-
28- # ═══════════════════════════════════════════════════════════════════════════════
29- # TYPES
30- # ═══════════════════════════════════════════════════════════════════════════════
314
325types:
336 BatchRequest:
@@ -36,22 +9,19 @@ types:
369 messages: List<String>
3710 max_tokens: Int
3811 temperature: Float
39- connection: Object # HTTP connection to respond to
12+ connection: Object
4013 received_at: Timestamp
41-
4214 BatchResponse:
4315 fields:
4416 request_id: String
4517 content: String
4618 tokens_generated: Int
4719 latency_ms: Float
48-
4920 BatchConfig:
5021 fields:
51- max_batch_size: Int # Max requests per batch (default: 4)
52- batch_timeout_ms: Int # Max wait time for batch (default: 100ms)
53- max_queue_size: Int # Max pending requests (default: 32)
54-
22+ max_batch_size: Int
23+ batch_timeout_ms: Int
24+ max_queue_size: Int
5525 BatchMetrics:
5626 fields:
5727 total_requests: Int
@@ -60,78 +30,35 @@ types:
6030 avg_latency_ms: Float
6131 throughput_tok_per_sec: Float
6232
63- # ═══════════════════════════════════════════════════════════════════════════════
64- # BATCHING STRATEGY
65- # ═══════════════════════════════════════════════════════════════════════════════
66-
67- # Strategy: Continuous Batching
68- #
69- # 1. Accept thread: receives requests, adds to queue
70- # 2. Batch thread: collects requests, forms batches
71- # 3. Inference thread: processes batches
72- #
73- # Benefits:
74- # - Amortize model overhead across multiple requests
75- # - Better GPU/CPU utilization (when we add GPU)
76- # - Reduced latency variance
77-
78- batching_config:
79- max_batch_size: 4
80- batch_timeout_ms: 100
81- max_queue_size: 32
82-
83- # For CPU inference, batching helps less than GPU
84- # But still reduces per-request overhead:
85- # - HTTP parsing
86- # - Tokenization
87- # - Response formatting
88-
89- # ═══════════════════════════════════════════════════════════════════════════════
90- # IMPLEMENTATION APPROACH
91- # ═══════════════════════════════════════════════════════════════════════════════
92-
93- # Phase 1: Request Queue (simpler)
94- # - Add thread-safe queue for incoming requests
95- # - Process requests in FIFO order
96- # - Still sequential inference, but async HTTP handling
97-
98- # Phase 2: True Batching (complex)
99- # - Batch multiple prompts together
100- # - Requires padding/masking for different lengths
101- # - Shared KV cache management
102- # - Significant code changes
103-
104- # For now: Implement Phase 1 (async request handling)
105-
106- # ═══════════════════════════════════════════════════════════════════════════════
107- # BEHAVIORS
108- # ═══════════════════════════════════════════════════════════════════════════════
109-
11033behaviors:
11134 - name: enqueue_request
11235 given: HTTP connection and parsed request body
11336 when: New chat completion request received
11437 then: Add to request queue, return immediately
11538 implementation: |
116- pub fn enqueue_request(self: *@This(), req: BatchRequest) !void {
117- // Add to request queue with timestamp
118- _ = self;
119- _ = req;
120- // In real implementation: queue.append(req) with mutex lock
121- // For now: simple stub that compiles
122- }
39+ pub fn enqueue_request(self: *@This(), req: BatchRequest) !void {
40+ // Stub: meant to add `req` to the request queue; currently a no-op that never returns an error.
41+ _ = self;
42+ _ = req;
43+ // In real implementation: queue.append(req) with mutex lock
44+ // For now: both parameters are explicitly discarded above so the stub compiles
45+ }
46+
47+
12348
12449 - name: dequeue_batch
12550 given: Request queue and batch config
12651 when: Batch timeout or max_batch_size reached
12752 then: Return array of BatchRequest up to max_batch_size
12853 implementation: |
129- pub fn dequeue_batch(self: *@This()) []BatchRequest {
130- // Return batch of up to max_batch_size requests
131- _ = self;
132- // In real implementation: return queue items[0..max_batch_size]
133- return &[_]BatchRequest{};
134- }
54+ pub fn dequeue_batch(self: *@This()) []BatchRequest {
55+ // Stub: always returns an empty batch; the queue is not consulted yet.
56+ _ = self;
57+ // In real implementation: return queue items[0..n] with n = min(queue length, max_batch_size)
58+ return &[_]BatchRequest{};
59+ }
60+
61+
13562
13663 - name: process_batch
13764 given: Array of BatchRequest and model
@@ -143,11 +70,13 @@ behaviors:
14370 when: Generation complete
14471 then: Send HTTP response to client
14572 implementation: |
146- pub fn send_response(resp: BatchResponse) !void {
147- // Send HTTP response with JSON body
148- _ = resp;
149- // In real implementation: write to connection stream
150- }
73+ pub fn send_response(resp: BatchResponse) !void {
74+ // Stub: meant to send an HTTP response with a JSON body; currently discards `resp` and does nothing.
75+ _ = resp;
76+ // In real implementation: write to connection stream (BatchResponse declares no connection field, so it presumably must be looked up by request_id — TODO confirm)
77+ }
78+
79+
15180
15281 - name: get_metrics
15382 given: No input required
@@ -159,9 +88,11 @@ behaviors:
15988 when: Configuration update requested
16089 then: Update batching parameters
16190 implementation: |
162- pub fn configure_batching(self: *@This(), config: BatchConfig) void {
163- // Update batching parameters
164- _ = config;
165- self.max_batch_size = 4; // Default
166- // In real implementation: self.config = config
167- }
91+ pub fn configure_batching(self: *@This(), config: BatchConfig) void {
92+ // Apply the caller-supplied batch size; it was previously discarded and hard-coded to 4.
93+ // NOTE(review): assumes self.max_batch_size and config.max_batch_size share a type — confirm.
94+ self.max_batch_size = config.max_batch_size;
95+ // In real implementation: self.config = config (would also carry batch_timeout_ms and max_queue_size)
96+ }
97+
98+
0 commit comments