Skip to content

Commit fc0daef

Browse files
authored
Make local STT a streaming runtime service (#11)
* Separate STT runtime service from stdio transport * Transcribe PCM chunks without temp WAV files * Emit local STT runtime event frames * Add streaming audio chunk command * Keep STT session configuration in runtime state * Buffer streaming STT audio in runtime sessions * Stream local STT audio from Electron * Emit final STT segment events on session flush * Forward STT runtime events to capture listeners * Persist live STT transcript segments * Wrap whisper STT behind backend interface * Queue streaming STT transcription jobs * Report streaming STT progress by audio time * Prepare STT response sink for async events * Make streaming STT jobs self-contained * Isolate streaming STT job execution * Separate STT backend from runtime state * Isolate STT request source * Introduce STT worker boundary * Route STT worker through owned commands * Run STT backend on a worker thread * Submit streaming STT jobs asynchronously * Drain streaming STT events through runtime service * Pump STT runtime events from stdio loop * Drop stale STT job results after cancel * Verify async STT job submission * Keep Metal STT backend on runtime thread * Add opt-in STT worker process boundary * Enable STT worker process by default * Warm local STT runtime on startup * Emit final events from STT worker results * Refresh STT warmup after model changes * Ignore STT events outside active session * Clamp PCM16 normalization range * Scan WAV chunks without header cap
1 parent 414ab8b commit fc0daef

2 files changed

Lines changed: 2585 additions & 98 deletions

File tree

apps/desktop/main.cjs

Lines changed: 292 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ const MAX_SECTION_INSTRUCTION_LENGTH = 1200;
5252
const AUTO_UPDATE_REPO = "qyinm/MirrorNote";
5353
const AUTO_UPDATE_INTERVAL = "30 minutes";
5454
const DEFAULT_AUDIO_DEVICE_ID = "default";
55+
const LOCAL_STT_STREAM_CHUNK_SECONDS = 5;
5556

5657
const LOCAL_STT_MODELS = [
5758
{
@@ -1210,6 +1211,9 @@ async function downloadLocalSTTModel(modelID, targetWebContents = null) {
12101211
await fs.rm(destinationPath, { force: true });
12111212
await fs.rename(temporaryPath, destinationPath);
12121213
await writeElectronSettings({ selectedLocalSTTModelID: model.id });
1214+
warmLocalSTTServer().catch((error) => {
1215+
console.error("Local STT warmup failed after model download:", error);
1216+
});
12131217
updateTrayMenu();
12141218
sendSTTModelDownloadProgress(targetWebContents, {
12151219
modelID: model.id,
@@ -2387,10 +2391,23 @@ async function retryTranscript(id) {
23872391
throw new Error("This note does not have a playable recording to transcribe.");
23882392
}
23892393

2390-
const segments = await transcribeRecordingFile(paths.recordingPath, transcriptionConfiguration);
2394+
const segments = await transcribeRecordingFile(paths.recordingPath, transcriptionConfiguration, {
2395+
onSegments: async (liveSegments) => {
2396+
await writeLiveTranscript(paths, liveSegments);
2397+
},
2398+
});
23912399
return saveTranscript(id, segments);
23922400
}
23932401

2402+
/**
 * Persists an in-progress transcript for a note and refreshes the segment
 * count stored in the note's metadata file.
 *
 * @param {{ transcriptPath: string, metadataPath: string }} paths - Locations
 *   of the transcript and metadata files for the note.
 * @param {Array<object>} segments - Raw segments; normalized before writing.
 * @returns {Promise<void>}
 */
async function writeLiveTranscript(paths, segments) {
  const liveSegments = normalizeTranscriptSegments(segments);
  const transcriptPayload = serializeTranscriptJSONL(liveSegments);
  await writeFileAtomic(paths.transcriptPath, transcriptPayload, "utf8");

  // Metadata may not exist yet; start from an empty record in that case.
  const existingMetadata = await readJSONIfExists(paths.metadataPath);
  const metadata = existingMetadata || {};
  metadata.transcriptSegmentCount = liveSegments.length;
  await writeJSONFileAtomic(paths.metadataPath, metadata);
}
2410+
23942411
async function promptForFileUpload(targetWindow, title, filters) {
23952412
const result = await dialog.showOpenDialog(targetWindow || undefined, {
23962413
title,
@@ -3154,10 +3171,10 @@ async function fetchAudioDeviceInventory() {
31543171
}
31553172
}
31563173

3157-
async function transcribeRecordingFile(recordingPath, transcriptionConfiguration) {
3174+
async function transcribeRecordingFile(recordingPath, transcriptionConfiguration, options = {}) {
31583175
const sttInputPath = await prepareLocalSTTAudioInput(recordingPath);
31593176
try {
3160-
return await transcribeLocalSTTAudioFile(sttInputPath, transcriptionConfiguration);
3177+
return await transcribeLocalSTTAudioFile(sttInputPath, transcriptionConfiguration, options);
31613178
} finally {
31623179
if (sttInputPath !== recordingPath) {
31633180
await fs.rm(path.dirname(sttInputPath), { recursive: true, force: true }).catch(() => {});
@@ -3205,7 +3222,231 @@ function runProcess(command, args) {
32053222
});
32063223
}
32073224

3208-
async function transcribeLocalSTTAudioFile(recordingPath, transcriptionConfiguration) {
3225+
/**
 * Transcribes a WAV recording by streaming it to the local STT server as a
 * single session: start_session, one push_audio_chunk per
 * LOCAL_STT_STREAM_CHUNK_SECONDS of audio, then finalize_session.
 *
 * Server events tagged with this session's ID are re-broadcast as capture
 * events, and segment events are accumulated. After every chunk (and after
 * finalize) newly accumulated segments are handed to `options.onSegments`.
 *
 * @param {string} recordingPath - Path of the WAV file to stream.
 * @param {object} transcriptionConfiguration - Sent verbatim with start_session.
 * @param {{ onSegments?: (segments: Array<object>) => (void | Promise<void>) }} [options]
 * @returns {Promise<Array<object>>} Normalized event-streamed segments, or the
 *   segments returned in command responses when no segment event arrived.
 * @throws Rethrows any failure after a best-effort cancel_session.
 */
async function transcribeLocalSTTAudioFile(recordingPath, transcriptionConfiguration, options = {}) {
  const sessionID = randomUUID();
  const streamedSegments = [];
  const fallbackSegments = [];
  let reportedSegmentCount = 0;

  // Every session command shares one envelope; extra fields are appended after
  // sessionID so the serialized key order stays the same as before.
  // Deliberately not `async`: a synchronous throw must surface synchronously.
  const sendSessionCommand = (command, extraFields = {}) => sendLocalSTTServerRequest({
    version: 4,
    id: randomUUID(),
    command,
    sessionID,
    ...extraFields,
  });

  // Segments returned directly in a command response are only used when no
  // segment events were streamed for this session.
  const collectFallbackSegments = (response) => {
    if (Array.isArray(response.segments)) {
      fallbackSegments.push(...response.segments);
    }
  };

  // NOTE(review): partial and final segment events are both accumulated here;
  // presumably normalizeTranscriptSegments reconciles them — confirm.
  const unsubscribe = localSTTServer.onEvent((event) => {
    if (event?.sessionID !== sessionID) {
      return;
    }
    const captureEvent = captureEventFromLocalSTTEvent(event);
    if (captureEvent) {
      broadcastCaptureEvent(captureEvent);
    }
    const isSegmentEvent = event?.eventType === "partial_segment" || event?.eventType === "final_segment";
    if (isSegmentEvent && event.segment) {
      streamedSegments.push(event.segment);
    }
  });

  const flushLiveSegments = async () => {
    const nothingNew = streamedSegments.length === reportedSegmentCount;
    if (typeof options.onSegments !== "function" || nothingNew) {
      return;
    }
    // Record the count before awaiting so events that arrive while the
    // listener runs are picked up by the next flush.
    reportedSegmentCount = streamedSegments.length;
    await options.onSegments(normalizeTranscriptSegments(streamedSegments));
  };

  try {
    await sendSessionCommand("start_session", { configuration: transcriptionConfiguration });

    for await (const chunk of readLocalSTTPcmChunks(recordingPath, LOCAL_STT_STREAM_CHUNK_SECONDS)) {
      const chunkResponse = await sendSessionCommand("push_audio_chunk", {
        pcmSamples: chunk.samples,
        sampleRate: chunk.sampleRate,
      });
      collectFallbackSegments(chunkResponse);
      await flushLiveSegments();
    }

    const finalResponse = await sendSessionCommand("finalize_session");
    collectFallbackSegments(finalResponse);
    await flushLiveSegments();

    const resultSegments = streamedSegments.length > 0 ? streamedSegments : fallbackSegments;
    return normalizeTranscriptSegments(resultSegments);
  } catch (error) {
    // Best-effort cleanup: a failed cancel must not mask the original error.
    await sendSessionCommand("cancel_session").catch(() => {});
    throw error;
  } finally {
    unsubscribe();
  }
}
3298+
3299+
/**
 * Translates a local STT runtime event into the capture-event shape that is
 * broadcast to capture listeners.
 *
 * Mapping:
 *   - "partial_segment" / "final_segment" with a segment -> "transcriptionSegment"
 *   - "progress"          -> "transcriptionProgress" (totalSeconds is taken
 *                            from the event's `receivedSeconds`)
 *   - "error"             -> "failed"
 *   - "session_finished"  -> "stateChanged" with state "finalizing"
 *
 * @param {object|null|undefined} event - Event object emitted by the STT server.
 * @returns {object|null} The capture event, or null when the input is not an
 *   object, has an unknown type, or is a segment event without a segment.
 */
function captureEventFromLocalSTTEvent(event) {
  if (!event || typeof event !== "object") {
    return null;
  }

  // Progress values come from another process; anything that does not coerce
  // to a finite number is reported as 0 instead of leaking NaN to listeners.
  const secondsOrZero = (value) => {
    const seconds = Number(value);
    return Number.isFinite(seconds) ? seconds : 0;
  };

  switch (event.eventType) {
    case "partial_segment":
    case "final_segment":
      if (!event.segment) {
        return null;
      }
      return {
        kind: "transcriptionSegment",
        transcriptionSegment: event.segment,
      };
    case "progress":
      return {
        kind: "transcriptionProgress",
        transcriptionProgress: {
          processedSeconds: secondsOrZero(event.progress?.processedSeconds),
          totalSeconds: secondsOrZero(event.progress?.receivedSeconds),
        },
      };
    case "error":
      return {
        kind: "failed",
        errorMessage: event.error?.message || "Local STT failed.",
      };
    case "session_finished":
      return {
        kind: "stateChanged",
        state: "finalizing",
        detail: "Local transcription finished.",
      };
    default:
      return null;
  }
}
3333+
3334+
/**
 * Async generator that reads a PCM16 WAV file and yields it as mono sample
 * chunks of roughly `chunkDurationSeconds` each.
 *
 * Only whole frames are consumed; a trailing partial frame is dropped. The
 * file handle is closed when iteration finishes, fails, or is abandoned.
 *
 * @param {string} recordingPath - Path of the WAV file to read.
 * @param {number} chunkDurationSeconds - Target audio duration per chunk.
 * @yields {{ sampleRate: number, samples: number[] }}
 */
async function* readLocalSTTPcmChunks(recordingPath, chunkDurationSeconds) {
  const handle = await fs.open(recordingPath, "r");
  try {
    const { sampleRate, channels, blockAlign, dataOffset, dataSize } = await readPcm16WavHeader(handle);
    // At least one frame per chunk, so a tiny duration still makes progress.
    const framesPerChunk = Math.max(1, Math.floor(sampleRate * chunkDurationSeconds));
    const scratch = Buffer.alloc(framesPerChunk * blockAlign);
    const wholeFrames = (byteCount) => byteCount - (byteCount % blockAlign);
    const dataEnd = dataOffset + dataSize;

    let offset = dataOffset;
    while (offset < dataEnd) {
      const wanted = wholeFrames(Math.min(scratch.length, dataEnd - offset));
      if (wanted <= 0) {
        return;
      }
      const { bytesRead } = await handle.read(scratch, 0, wanted, offset);
      if (bytesRead <= 0) {
        return;
      }
      // A short read may end mid-frame; only advance past complete frames.
      const usable = wholeFrames(bytesRead);
      if (usable <= 0) {
        return;
      }
      yield {
        sampleRate,
        samples: pcm16BufferToMonoFloat32(scratch.subarray(0, usable), channels),
      };
      offset += usable;
    }
  } finally {
    await handle.close();
  }
}
3369+
3370+
/**
 * Parses the RIFF/WAVE chunk list of an open file and locates its PCM16 audio.
 *
 * Chunks are walked from byte 12 until the first "data" chunk. The "fmt "
 * chunk must appear before it. A "data" chunk whose declared size runs past
 * the end of the file (truncated recording, or a streaming writer's
 * placeholder size) is clamped to the bytes actually present instead of being
 * rejected.
 *
 * @param {{ stat: Function, read: Function }} file - Open file handle exposing
 *   `stat()` and `read(buffer, offset, length, position)`.
 * @returns {Promise<{ sampleRate: number, channels: number, blockAlign: number,
 *   dataOffset: number, dataSize: number }>} `blockAlign` is derived as
 *   `channels * 2`; `dataOffset`/`dataSize` are in bytes.
 * @throws {Error} When the file is not RIFF/WAVE, the fmt chunk is malformed,
 *   or the audio is not PCM with 16 bits per sample.
 */
async function readPcm16WavHeader(file) {
  const { size: fileSize } = await file.stat();

  const riffHeader = Buffer.alloc(12);
  const riffRead = await file.read(riffHeader, 0, riffHeader.length, 0);
  const isRiffWave = riffRead.bytesRead === riffHeader.length
    && riffHeader.toString("ascii", 0, 4) === "RIFF"
    && riffHeader.toString("ascii", 8, 12) === "WAVE";
  if (!isRiffWave) {
    throw new Error("unsupported WAV format for local STT: expected RIFF/WAVE data");
  }

  let cursor = 12;
  let sampleRate = 0;
  let channels = 0;
  let bitsPerSample = 0;
  let audioFormat = 0;
  let dataOffset = 0;
  let dataSize = 0;
  const chunkHeader = Buffer.alloc(8);

  while (cursor + chunkHeader.length <= fileSize) {
    const { bytesRead } = await file.read(chunkHeader, 0, chunkHeader.length, cursor);
    if (bytesRead !== chunkHeader.length) {
      break;
    }
    const chunkID = chunkHeader.toString("ascii", 0, 4);
    const chunkSize = chunkHeader.readUInt32LE(4);
    cursor += chunkHeader.length;

    if (chunkID === "data") {
      // Trust the file length over the declared size so truncated or
      // placeholder-sized data chunks still yield the audio that exists.
      dataOffset = cursor;
      dataSize = Math.min(chunkSize, fileSize - cursor);
      break;
    }

    // Any other chunk must fit inside the file, otherwise the list is corrupt.
    if (cursor + chunkSize > fileSize) {
      break;
    }

    if (chunkID === "fmt ") {
      if (chunkSize < 16) {
        throw new Error("unsupported WAV format for local STT: invalid fmt chunk");
      }
      const fmtBuffer = Buffer.alloc(16);
      const fmtRead = await file.read(fmtBuffer, 0, fmtBuffer.length, cursor);
      if (fmtRead.bytesRead !== fmtBuffer.length) {
        throw new Error("unsupported WAV format for local STT: invalid fmt chunk");
      }
      audioFormat = fmtBuffer.readUInt16LE(0);
      channels = fmtBuffer.readUInt16LE(2);
      sampleRate = fmtBuffer.readUInt32LE(4);
      bitsPerSample = fmtBuffer.readUInt16LE(14);
    }

    // RIFF chunks are word-aligned: odd-sized chunks carry one pad byte.
    cursor += chunkSize + (chunkSize % 2);
  }

  const isPcm16 = audioFormat === 1 && bitsPerSample === 16;
  if (!isPcm16 || channels <= 0 || sampleRate <= 0 || dataOffset <= 0 || dataSize <= 0) {
    throw new Error("unsupported WAV format for local STT: only PCM16 WAV input is supported");
  }

  return {
    sampleRate,
    channels,
    blockAlign: channels * 2,
    dataOffset,
    dataSize,
  };
}
3433+
3434+
/**
 * Converts interleaved little-endian PCM16 audio into mono samples by
 * averaging the channels of each frame and scaling into [-1, 1).
 *
 * Despite the name, the result is a plain Array of numbers (not a
 * Float32Array). Trailing bytes that do not form a complete frame are ignored.
 *
 * @param {Buffer} buffer - Interleaved PCM16 sample data.
 * @param {number} channels - Number of interleaved channels (positive integer).
 * @returns {number[]} One averaged sample per frame.
 * @throws {RangeError} When `channels` is not a positive integer.
 */
function pcm16BufferToMonoFloat32(buffer, channels) {
  if (!Number.isInteger(channels) || channels <= 0) {
    throw new RangeError(`invalid channel count for PCM16 conversion: ${String(channels)}`);
  }

  const bytesPerSample = 2;
  const blockAlign = channels * bytesPerSample;
  const frameCount = Math.floor(buffer.length / blockAlign);
  // Summing the raw integers and dividing once is numerically identical to
  // normalizing each sample by 32768 and then averaging.
  const scale = 32768 * channels;

  const samples = new Array(frameCount);
  for (let frame = 0; frame < frameCount; frame += 1) {
    const frameOffset = frame * blockAlign;
    let sum = 0;
    for (let channel = 0; channel < channels; channel += 1) {
      sum += buffer.readInt16LE(frameOffset + channel * bytesPerSample);
    }
    samples[frame] = sum / scale;
  }
  return samples;
}
3448+
3449+
async function transcribeLocalSTTAudioFileLegacy(recordingPath, transcriptionConfiguration) {
32093450
const request = {
32103451
version: 4,
32113452
id: randomUUID(),
@@ -3228,6 +3469,7 @@ class LocalSTTServerBridge {
32283469
this.pending = new Map();
32293470
this.stdoutBuffer = Buffer.alloc(0);
32303471
this.stderrTail = [];
3472+
this.eventHandlers = new Set();
32313473
}
32323474

32333475
ensureStarted() {
@@ -3291,6 +3533,11 @@ class LocalSTTServerBridge {
32913533
return;
32923534
}
32933535

3536+
if (response.event) {
3537+
this.emitEvent(response);
3538+
continue;
3539+
}
3540+
32943541
const pending = this.pending.get(response.id);
32953542
if (!pending) {
32963543
continue;
@@ -3306,6 +3553,23 @@ class LocalSTTServerBridge {
33063553
}
33073554
}
33083555

3556+
emitEvent(response) {
3557+
for (const handler of this.eventHandlers) {
3558+
try {
3559+
handler(response.event, response);
3560+
} catch (error) {
3561+
console.error("Local STT event handler failed:", error);
3562+
}
3563+
}
3564+
}
3565+
3566+
onEvent(handler) {
3567+
this.eventHandlers.add(handler);
3568+
return () => {
3569+
this.eventHandlers.delete(handler);
3570+
};
3571+
}
3572+
33093573
request(request) {
33103574
this.ensureStarted();
33113575
return new Promise((resolve, reject) => {
@@ -3363,6 +3627,22 @@ function sendLocalSTTServerRequest(request) {
33633627
return localSTTServer.request(request);
33643628
}
33653629

3630+
/**
 * Asks the local STT server to prepare the currently selected model so the
 * first real transcription does not pay the load cost.
 *
 * Does nothing when MIRROR_NOTE_TEST_EXPORTS is "1" or when no model /
 * transcription configuration is selected.
 *
 * @returns {Promise<void>}
 * @throws Propagates any failure from resolving the configuration or from the
 *   prepare_model request; callers decide whether to log or surface it.
 */
async function warmLocalSTTServer() {
  if (process.env.MIRROR_NOTE_TEST_EXPORTS === "1") {
    return;
  }

  const { selectedModel, transcriptionConfiguration } = await selectedTranscriptionConfiguration();
  if (!selectedModel || !transcriptionConfiguration) {
    return;
  }

  await sendLocalSTTServerRequest({
    version: 4,
    // Warmup is triggered from several places (startup, settings update,
    // model download). A timestamp-based id can repeat within one
    // millisecond, so use a UUID like every other request while keeping the
    // recognizable prefix.
    id: `warm-stt-${randomUUID()}`,
    command: "prepare_model",
    configuration: transcriptionConfiguration,
  });
}
3645+
33663646
async function fetchCaptureCapabilities() {
33673647
const { transcriptionConfiguration } = await selectedTranscriptionConfiguration();
33683648
if (!transcriptionConfiguration) {
@@ -3690,7 +3970,11 @@ function registerIPCHandlers() {
36903970
ipcMain.handle("settings:get", () => getSettings());
36913971
ipcMain.handle("settings:update", async (_event, patch) => {
36923972
await writeElectronSettings(patch || {});
3693-
return getSettings();
3973+
const settings = await getSettings();
3974+
warmLocalSTTServer().catch((error) => {
3975+
console.error("Local STT warmup failed after settings update:", error);
3976+
});
3977+
return settings;
36943978
});
36953979
ipcMain.handle("settings:download-stt-model", (event, modelID) => downloadLocalSTTModel(modelID, event.sender));
36963980
ipcMain.handle("settings:cancel-stt-model-download", (_event, modelID) => cancelLocalSTTModelDownload(modelID));
@@ -3729,6 +4013,9 @@ function startElectronApp() {
37294013
createApplicationMenu();
37304014
createWindow();
37314015
createTray();
4016+
warmLocalSTTServer().catch((error) => {
4017+
console.error("Local STT warmup failed:", error);
4018+
});
37324019
});
37334020

37344021
app.on("activate", () => {

0 commit comments

Comments
 (0)