Skip to content

Commit 1872e3f

Browse files
danielbodart and claude committed
Support multiple concurrent TCP transcriptions sharing a single loaded model
TCP server now spawns a thread per connection, each getting an independent pipeline (~7.3MB) while sharing the model weights in VRAM/RAM. Local audio capture (--trigger) runs alongside TCP in a background thread. Output simplified to raw text stream (no timestamp framing). CoreML encoder caches moved from shared model struct to per-pipeline for thread safety. --input flag deprecated in favour of --trigger for enabling local mode.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d754621 commit 1872e3f

8 files changed

Lines changed: 146 additions & 88 deletions

File tree

CLAUDE.md

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,15 @@ Zig and Bun are installed automatically via `bootstrap.sh` + mise.
1616
# Clean build artifacts
1717
./run.ts clean
1818

19-
# Run directly (loads model, grabs keyboard, CapsLock = push-to-talk)
19+
# Run directly (TCP server + local push-to-talk via CapsLock)
2020
# Linux:
2121
./dist/linux/bin/capsper --trigger capslock --audio-channel FL --drop-terms drop-terms.txt
2222
# macOS:
2323
./dist/macos/bin/capsper --trigger capslock --drop-terms drop-terms.txt
2424

25+
# TCP-only mode (no local capture, for testing or remote transcription)
26+
./dist/linux/bin/capsper --drop-terms drop-terms.txt
27+
2528
# First-time setup (builds, configures permissions, installs service)
2629
./run.ts setup
2730
```
@@ -55,7 +58,7 @@ On macOS, `./run.ts build` produces one binary. On Linux, it builds both `capspe
5558

5659
## Architecture
5760

58-
Push-to-talk voice dictation for Linux and macOS. Self-contained binary per platform. On Linux: grabs keyboards via evdev, intercepts CapsLock, captures audio via PipeWire, transcribes with Nemotron RNNT (via onnxruntime), injects text via uinput. On macOS: CGEventTap input, CoreAudio capture, CoreML inference (93% ANE), CGEventPost injection.
61+
Push-to-talk voice dictation for Linux and macOS. Self-contained binary per platform. Supports multiple concurrent transcriptions — TCP server accepts multiple clients simultaneously, each getting an independent pipeline while sharing the single loaded model. Use `--trigger` to also enable local audio capture with push-to-talk alongside TCP. On Linux: grabs keyboards via evdev, intercepts CapsLock, captures audio via PipeWire, transcribes with Nemotron RNNT (via onnxruntime), injects text via uinput. On macOS: CGEventTap input, CoreAudio capture, CoreML inference (93% ANE), CGEventPost injection.
5962

6063
> **History:** Capsper originally used whisper.cpp for ASR with Silero/TEN-VAD for voice activity detection. It now uses NVIDIA's Nemotron Speech 600M model (FastConformer RNNT) which is incremental and doesn't need a separate VAD — PTT (push-to-talk) is the sole gate. The name "Capsper" is a nod to Casper the friendly ghost — ghostwriting via CapsLock.
6164
@@ -172,7 +175,7 @@ Do NOT manually download CI artifacts or stage releases by hand — the update s
172175
- Conversion scripts: [nemotron-speech-600m-coreml](https://github.com/danielbodart/nemotron-speech-600m-coreml) (CoreML), [nemotron-speech-600m-onnx](https://github.com/danielbodart/nemotron-speech-600m-onnx) (ONNX)
173176
- Audio format: 16kHz mono S16_LE PCM (32000 bytes/sec)
174177
- Default server port: 43007
175-
- CLI flags: `--audio-channel`, `--audio-target`, `--audio-gain`, `--audio-detect` (cross-platform names; `--pw-*` aliases kept for backwards compatibility)
178+
- CLI flags: `--audio-channel`, `--audio-target`, `--audio-gain`, `--audio-detect` (cross-platform names; `--pw-*` aliases kept for backwards compatibility). `--trigger` enables local audio capture + PTT alongside TCP. `--input` is deprecated.
176179
- Service management: `systemctl --user` on Linux, `launchctl bootstrap/bootout gui/$(id -u)` on macOS
177180
- Service files: `~/.config/systemd/user/capsper.service` (Linux), `~/Library/LaunchAgents/io.github.danielbodart.capsper.plist` (macOS)
178181
- Permissions: `input` group + udev rule on Linux; Accessibility + Microphone TCC on macOS

src/backend/coreml/helpers.m

Lines changed: 41 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,15 @@
1818
typedef struct {
1919
void *encoder; // MLModel* (retained)
2020
void *decoder; // MLModel* (retained)
21+
} CapsperCoreMLModels;
22+
23+
// Per-pipeline encoder cache state. Each pipeline gets its own caches
24+
// so multiple transcriptions can run concurrently on the shared model.
25+
typedef struct {
2126
void *cache_channel; // MLMultiArray* (retained)
2227
void *cache_time; // MLMultiArray* (retained)
2328
void *cache_len; // MLMultiArray* (retained)
24-
} CapsperCoreMLModels;
29+
} CapsperCoreMLCaches;
2530

2631
// ---------------------------------------------------------------------------
2732
// Helpers
@@ -230,39 +235,53 @@ static void copy_to_f32(MLMultiArray *src, float *dst, NSInteger count) {
230235
models->encoder = (void *)CFBridgingRetain(encoder);
231236
models->decoder = (void *)CFBridgingRetain(decoder);
232237

233-
// Initialize cache state (batch-first: [1, 24, 70, 1024], [1, 24, 1024, 8], [1])
234-
models->cache_channel = (void *)CFBridgingRetain(
235-
make_zeros(@[@1, @24, @70, @1024], MLMultiArrayDataTypeFloat32));
236-
models->cache_time = (void *)CFBridgingRetain(
237-
make_zeros(@[@1, @24, @1024, @8], MLMultiArrayDataTypeFloat32));
238-
models->cache_len = (void *)CFBridgingRetain(
239-
make_zeros(@[@1], MLMultiArrayDataTypeInt32));
240-
241238
NSLog(@"capsper_coreml: models loaded from %@", dir);
242239
return models;
243240
}
244241
}
245242

246-
/// Release all CoreML models and cache state.
243+
/// Release all CoreML models.
247244
void capsper_coreml_release(CapsperCoreMLModels *models) {
248245
if (!models) return;
249246
@autoreleasepool {
250247
if (models->encoder) CFBridgingRelease(models->encoder);
251248
if (models->decoder) CFBridgingRelease(models->decoder);
252-
if (models->cache_channel) CFBridgingRelease(models->cache_channel);
253-
if (models->cache_time) CFBridgingRelease(models->cache_time);
254-
if (models->cache_len) CFBridgingRelease(models->cache_len);
255249
free(models);
256250
}
257251
}
258252

253+
/// Create per-pipeline encoder cache state.
254+
CapsperCoreMLCaches *capsper_coreml_create_caches(void) {
255+
@autoreleasepool {
256+
CapsperCoreMLCaches *caches = (CapsperCoreMLCaches *)calloc(1, sizeof(CapsperCoreMLCaches));
257+
caches->cache_channel = (void *)CFBridgingRetain(
258+
make_zeros(@[@1, @24, @70, @1024], MLMultiArrayDataTypeFloat32));
259+
caches->cache_time = (void *)CFBridgingRetain(
260+
make_zeros(@[@1, @24, @1024, @8], MLMultiArrayDataTypeFloat32));
261+
caches->cache_len = (void *)CFBridgingRetain(
262+
make_zeros(@[@1], MLMultiArrayDataTypeInt32));
263+
return caches;
264+
}
265+
}
266+
267+
/// Release per-pipeline encoder cache state.
268+
void capsper_coreml_release_caches(CapsperCoreMLCaches *caches) {
269+
if (!caches) return;
270+
@autoreleasepool {
271+
if (caches->cache_channel) CFBridgingRelease(caches->cache_channel);
272+
if (caches->cache_time) CFBridgingRelease(caches->cache_time);
273+
if (caches->cache_len) CFBridgingRelease(caches->cache_len);
274+
free(caches);
275+
}
276+
}
277+
259278
/// Reset encoder cache state to zeros (call between utterances).
260-
void capsper_coreml_reset_state(CapsperCoreMLModels *models) {
261-
if (!models) return;
279+
void capsper_coreml_reset_state(CapsperCoreMLCaches *caches) {
280+
if (!caches) return;
262281
@autoreleasepool {
263-
MLMultiArray *ch = (__bridge MLMultiArray *)(models->cache_channel);
264-
MLMultiArray *t = (__bridge MLMultiArray *)(models->cache_time);
265-
MLMultiArray *l = (__bridge MLMultiArray *)(models->cache_len);
282+
MLMultiArray *ch = (__bridge MLMultiArray *)(caches->cache_channel);
283+
MLMultiArray *t = (__bridge MLMultiArray *)(caches->cache_time);
284+
MLMultiArray *l = (__bridge MLMultiArray *)(caches->cache_len);
266285
memset(ch.dataPointer, 0, ch.count * sizeof(float));
267286
memset(t.dataPointer, 0, t.count * sizeof(float));
268287
memset(l.dataPointer, 0, l.count * sizeof(int32_t));
@@ -282,15 +301,16 @@ void capsper_coreml_reset_state(CapsperCoreMLModels *models) {
282301
/// Returns 0 on success, -1 on error.
283302
int capsper_coreml_run_encoder(
284303
CapsperCoreMLModels *models,
304+
CapsperCoreMLCaches *caches,
285305
const float *mel_data,
286306
float *out_encoded,
287307
int32_t *out_encoded_len
288308
) {
289309
@autoreleasepool {
290310
MLModel *encoder = (__bridge MLModel *)(models->encoder);
291-
MLMultiArray *cache_ch = (__bridge MLMultiArray *)(models->cache_channel);
292-
MLMultiArray *cache_time = (__bridge MLMultiArray *)(models->cache_time);
293-
MLMultiArray *cache_len = (__bridge MLMultiArray *)(models->cache_len);
311+
MLMultiArray *cache_ch = (__bridge MLMultiArray *)(caches->cache_channel);
312+
MLMultiArray *cache_time = (__bridge MLMultiArray *)(caches->cache_time);
313+
MLMultiArray *cache_len = (__bridge MLMultiArray *)(caches->cache_len);
294314

295315
// Wrap mel input (zero-copy)
296316
MLMultiArray *mel = wrap_f32((float *)mel_data, @[@1, @128, @65], 1 * 128 * 65);

src/backend/coreml/pipeline.zig

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,10 @@ const N_MELS = mel_state_mod.N_MELS;
3131

3232
// CoreML C API (from coreml_helpers.m)
3333
pub const CapsperCoreMLModels = opaque {};
34+
pub const CapsperCoreMLCaches = opaque {};
3435
extern fn capsper_coreml_run_encoder(
3536
models: *CapsperCoreMLModels,
37+
caches: *CapsperCoreMLCaches,
3638
mel_data: [*]const f32,
3739
out_encoded: [*]f32,
3840
out_encoded_len: *i32,
@@ -47,7 +49,9 @@ extern fn capsper_coreml_run_decoder(
4749
out_state_h: [*]f32,
4850
out_state_c: [*]f32,
4951
) c_int;
50-
extern fn capsper_coreml_reset_state(models: *CapsperCoreMLModels) void;
52+
extern fn capsper_coreml_create_caches() ?*CapsperCoreMLCaches;
53+
extern fn capsper_coreml_release_caches(caches: *CapsperCoreMLCaches) void;
54+
extern fn capsper_coreml_reset_state(caches: *CapsperCoreMLCaches) void;
5155

5256
/// Process-lifetime config. CoreML models are shared across connections.
5357
pub const CoreMLConfig = struct {
@@ -69,6 +73,9 @@ pub const CoreMLPipeline = struct {
6973
// Pre-encode cache: last PRE_ENCODE_CACHE mel frames from previous chunk
7074
pre_cache: [N_MELS * PRE_ENCODE_CACHE]f32 = [_]f32{0} ** (N_MELS * PRE_ENCODE_CACHE),
7175

76+
// Per-pipeline encoder cache state (CoreML MLMultiArrays)
77+
caches: *CapsperCoreMLCaches,
78+
7279
// RNNT decoder state
7380
dec_state1: []f32,
7481
dec_state2: []f32,
@@ -84,6 +91,8 @@ pub const CoreMLPipeline = struct {
8491
pub fn init(allocator: std.mem.Allocator, config: CoreMLConfig, verbose: bool) !CoreMLPipeline {
8592
const dec_state_size = PRED_LAYERS * 1 * PRED_HIDDEN;
8693

94+
const caches = capsper_coreml_create_caches() orelse return error.CoreMLCacheInitFailed;
95+
8796
const dec_state1 = try allocator.alloc(f32, dec_state_size);
8897
errdefer allocator.free(dec_state1);
8998
@memset(dec_state1, 0);
@@ -97,6 +106,7 @@ pub const CoreMLPipeline = struct {
97106
.config = config,
98107
.verbose = verbose,
99108
.mel = NemoMelState.init(allocator, config.filterbank),
109+
.caches = caches,
100110
.dec_state1 = dec_state1,
101111
.dec_state2 = dec_state2,
102112
};
@@ -107,6 +117,7 @@ pub const CoreMLPipeline = struct {
107117
}
108118

109119
pub fn deinit(self: *CoreMLPipeline) void {
120+
capsper_coreml_release_caches(self.caches);
110121
self.allocator.free(self.dec_state1);
111122
self.allocator.free(self.dec_state2);
112123
self.emitted_text.deinit(self.allocator);
@@ -154,7 +165,7 @@ pub const CoreMLPipeline = struct {
154165
self.mel.reset();
155166
self.mel_frame_cursor = 0;
156167
@memset(&self.pre_cache, 0);
157-
capsper_coreml_reset_state(self.config.models);
168+
capsper_coreml_reset_state(self.caches);
158169
@memset(self.dec_state1, 0);
159170
@memset(self.dec_state2, 0);
160171
self.last_token = tokenizer.BLANK_ID;
@@ -241,6 +252,7 @@ pub const CoreMLPipeline = struct {
241252

242253
const status = capsper_coreml_run_encoder(
243254
self.config.models,
255+
self.caches,
244256
&mel_input,
245257
&enc_output,
246258
&enc_len,

src/main.zig

Lines changed: 12 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,8 @@ const Recorder = @import("shared/recorder.zig").Recorder;
2222
pub fn main() !void {
2323
var gpa = std.heap.GeneralPurposeAllocator(.{ .enable_memory_limit = true }){};
2424
defer _ = gpa.deinit();
25-
const allocator = gpa.allocator();
25+
var ts_allocator = std.heap.ThreadSafeAllocator{ .child_allocator = gpa.allocator() };
26+
const allocator = ts_allocator.allocator();
2627

2728
const args = try std.process.argsAlloc(allocator);
2829
defer std.process.argsFree(allocator, args);
@@ -72,17 +73,9 @@ pub fn main() !void {
7273
break :blk 43007;
7374
};
7475
} else if (std.mem.eql(u8, arg, "--input")) {
76+
// Deprecated: local mode is now enabled by --trigger. Accept and skip for backwards compatibility.
7577
i += 1;
76-
if (i < args.len) {
77-
if (std.mem.eql(u8, args[i], "tcp")) {
78-
input_mode = .tcp;
79-
} else if (std.mem.eql(u8, args[i], "local")) {
80-
input_mode = .local;
81-
} else {
82-
std.debug.print("Invalid --input value '{s}', expected 'tcp' or 'local'\n", .{args[i]});
83-
return;
84-
}
85-
}
78+
std.debug.print("Warning: --input is deprecated. Use --trigger to enable local mode alongside TCP.\n", .{});
8679
} else if (std.mem.eql(u8, arg, "--audio-target") or std.mem.eql(u8, arg, "--pw-target")) {
8780
i += 1;
8881
if (i < args.len) audio_target = args[i];
@@ -178,7 +171,7 @@ pub fn main() !void {
178171
return;
179172
}
180173

181-
// --trigger implies --input local (audio capture) and starts not-live (trigger key controls recording)
174+
// --trigger enables local audio capture alongside TCP (trigger key controls PTT recording)
182175
if (trigger_key != null) {
183176
input_mode = .local;
184177
}
@@ -356,8 +349,8 @@ pub fn main() !void {
356349
};
357350

358351
var server2 = Server.init(allocator, pipeline_factory, 0, .tcp, null, 0, verbose, false, null, drop_terms, null, 1.0, true);
359-
server_mod.is_live.store(true, .monotonic);
360-
server2.handleConnection(file.handle, 1, null) catch |err| {
352+
var always_live = std.atomic.Value(bool).init(true);
353+
server2.handleConnection(file.handle, 1, &always_live, null, null) catch |err| {
361354
std.debug.print("Stream error: {}\n", .{err});
362355
};
363356
file.close();
@@ -478,7 +471,7 @@ pub fn main() !void {
478471
fn printUsage() void {
479472
std.debug.print("Usage: capsper [--model PATH] [--port PORT]\n", .{});
480473
std.debug.print(" [--verbose|-v]\n", .{});
481-
std.debug.print(" [--input tcp|local] [--audio-target NODE] [--audio-channel CHANNEL]\n", .{});
474+
std.debug.print(" [--audio-target NODE] [--audio-channel CHANNEL]\n", .{});
482475
std.debug.print(" [--trigger KEY] [--trigger-passthrough] [--type-delay MICROSECONDS]\n", .{});
483476
std.debug.print(" [--drop-terms FILE]\n", .{});
484477
std.debug.print(" [--record-dir DIR [--record-keep N]]\n", .{});
@@ -487,4 +480,8 @@ fn printUsage() void {
487480
std.debug.print(" [--audio-detect [--detect-duration SECS]]\n", .{});
488481
std.debug.print(" [--warmup-file FILE] [--no-warmup]\n", .{});
489482
std.debug.print(" [--dry-run] [--version]\n", .{});
483+
std.debug.print("\n", .{});
484+
std.debug.print("TCP server is always active (default port 43007). Multiple clients can connect\n", .{});
485+
std.debug.print("simultaneously, each getting an independent transcription pipeline.\n", .{});
486+
std.debug.print("Use --trigger to also enable local audio capture with push-to-talk.\n", .{});
490487
}

0 commit comments

Comments
 (0)