Skip to content

Commit a9c41c0

Browse files
fix(stt): track transcript timing provenance (#5285)
Preserve provider word timing metadata through batch persistence and transcript rendering, request OpenAI Whisper word timestamps, and keep transcript-only synthetic timings from driving seek clicks.
1 parent a6d224a commit a9c41c0

13 files changed

Lines changed: 488 additions & 143 deletions

File tree

apps/desktop/src/session/components/note-input/transcript/renderer/segment-hooks.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { useMemo, useRef } from "react";
22

33
import type { Segment } from "~/stt/live-segment";
4+
import { getTranscriptTimingSource } from "~/stt/timing";
45

56
export function useStableSegments(segments: Segment[]): Segment[] {
67
const cacheRef = useRef<Map<string, Segment>>(new Map());
@@ -54,7 +55,8 @@ function segmentsEqual(a: Segment, b: Segment) {
5455
aw.start_ms !== bw.start_ms ||
5556
aw.end_ms !== bw.end_ms ||
5657
aw.channel !== bw.channel ||
57-
aw.is_final !== bw.is_final
58+
aw.is_final !== bw.is_final ||
59+
getTranscriptTimingSource(aw) !== getTranscriptTimingSource(bw)
5860
) {
5961
return false;
6062
}

apps/desktop/src/session/components/note-input/transcript/renderer/transcript.tsx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import {
2323
defaultRenderLabelContext,
2424
SpeakerLabelManager,
2525
} from "~/stt/segment/shared";
26+
import { isTranscriptWordSeekable } from "~/stt/timing";
2627

2728
export function RenderTranscript({
2829
scrollElement,
@@ -105,7 +106,7 @@ const SegmentsList = memo(
105106

106107
const seekAndPlay = useCallback(
107108
(word: SegmentWord) => {
108-
if (audioExists) {
109+
if (audioExists && isTranscriptWordSeekable(word)) {
109110
seek((offsetMs + word.start_ms) / 1000);
110111
startPlayback();
111112
}

apps/desktop/src/session/components/note-input/transcript/renderer/word-span.tsx

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import type { HighlightSegment } from "./utils";
77
import { useSearch } from "~/session/components/note-input/search/context";
88
import { createHighlightSegments } from "~/session/components/note-input/search/matching";
99
import type { SegmentWord } from "~/stt/live-segment";
10+
import { isTranscriptWordSeekable } from "~/stt/timing";
1011

1112
interface WordSpanProps {
1213
word: SegmentWord;
@@ -29,18 +30,19 @@ export function WordSpan(props: WordSpanProps) {
2930
highlights.segments,
3031
highlights.isActive,
3132
);
33+
const canSeek = props.audioExists && isTranscriptWordSeekable(props.word);
3234
const className = useMemo(
3335
() =>
3436
cn([
35-
props.audioExists && "cursor-pointer hover:bg-neutral-200/60",
37+
canSeek && "cursor-pointer hover:bg-neutral-200/60",
3638
!props.word.is_final && ["opacity-60", "italic"],
3739
]),
38-
[props.audioExists, props.word.is_final],
40+
[canSeek, props.word.is_final],
3941
);
4042

4143
return (
4244
<span
43-
onClick={() => props.onClickWord(props.word)}
45+
onClick={() => canSeek && props.onClickWord(props.word)}
4446
className={className}
4547
data-word-id={props.word.id}
4648
>

apps/desktop/src/store/zustand/listener/batch.ts

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ import type { BatchPersistCallback } from "./transcript";
1010
import { transformWordEntries, type WordEntry } from "./utils";
1111

1212
import { type RuntimeSpeakerHint, type WordLike } from "~/stt/segment";
13+
import {
14+
createTranscriptTimingMetadata,
15+
getValidTimingSource,
16+
type TranscriptTimingSource,
17+
} from "~/stt/timing";
1318

1419
export type BatchPhase = "importing" | "transcribing";
1520
export type BatchTerminalReason = "failed" | "timed_out" | "stopped";
@@ -293,19 +298,26 @@ function transformBatch(
293298
return;
294299
}
295300

301+
const timingSource = getWordTimingSourceForBatchResponse(
302+
response,
303+
Boolean(alternative.words?.length),
304+
"synthetic_text",
305+
);
296306
const wordEntries = wordEntriesFromTranscript(
297307
alternative.words,
298308
alternative.transcript,
299309
{
300310
channel: channelIndex,
301311
durationSeconds: getBatchDurationSeconds(response),
312+
timingSource,
302313
},
303314
);
304315

305316
const [words, hints] = transformWordEntries(
306317
wordEntries,
307318
alternative.transcript,
308319
channelIndex,
320+
{ timingSource },
309321
);
310322

311323
hints.forEach((hint) => {
@@ -371,20 +383,27 @@ function mergeBatchPreview(
371383
return preview;
372384
}
373385

386+
const timingSource = getWordTimingSourceForBatchResponse(
387+
response,
388+
Boolean(alternative.words?.length),
389+
"provider_segment_interpolated",
390+
);
374391
const wordEntries = wordEntriesFromTranscript(
375392
alternative.words,
376393
alternative.transcript,
377394
{
378395
channel: channelIndex,
379396
startSeconds: response.start,
380397
durationSeconds: response.duration,
398+
timingSource,
381399
},
382400
);
383401

384402
const [incomingWords, incomingHints] = transformWordEntries(
385403
wordEntries,
386404
alternative.transcript,
387405
channelIndex,
406+
{ timingSource },
388407
);
389408
if (incomingWords.length === 0) {
390409
return preview;
@@ -472,14 +491,23 @@ function wordEntriesFromTranscript(
472491
channel,
473492
startSeconds = 0,
474493
durationSeconds,
494+
timingSource,
475495
}: {
476496
channel: number;
477497
startSeconds?: number;
478498
durationSeconds?: number;
499+
timingSource: TranscriptTimingSource;
479500
},
480501
): WordEntry[] {
481-
if (entries?.length || !transcript.trim()) {
482-
return entries ?? [];
502+
if (entries?.length) {
503+
return entries.map((entry) => ({
504+
...entry,
505+
metadata: createTranscriptTimingMetadata(timingSource, entry.metadata),
506+
}));
507+
}
508+
509+
if (!transcript.trim()) {
510+
return [];
483511
}
484512

485513
const tokens = transcript.trim().split(/\s+/).filter(Boolean);
@@ -501,9 +529,44 @@ function wordEntriesFromTranscript(
501529
end: startSeconds + ((index + 1) / tokens.length) * duration,
502530
channel,
503531
speaker: null,
532+
metadata: createTranscriptTimingMetadata(timingSource),
504533
}));
505534
}
506535

536+
/**
 * Chooses the timing source to record for the words of one batch response.
 *
 * @param response - Batch response; only its `metadata` field is inspected.
 * @param hasProviderWords - Whether the provider returned per-word entries.
 * @param fallbackWithoutWords - Source to report when there are no provider
 *   words; the caller decides which source fits its code path.
 * @returns `fallbackWithoutWords` when there are no provider words; otherwise
 *   the source declared in the response metadata, or `"provider_word"` when
 *   the metadata declares none.
 */
function getWordTimingSourceForBatchResponse(
  response: { metadata?: unknown },
  hasProviderWords: boolean,
  fallbackWithoutWords: TranscriptTimingSource,
): TranscriptTimingSource {
  // Response metadata is only consulted when provider words exist; without
  // them the caller-supplied fallback always wins.
  return hasProviderWords
    ? getBatchResponseTimingSource(response) || "provider_word"
    : fallbackWithoutWords;
}
552+
553+
/**
 * Reads an explicitly declared timing source from a batch response's metadata.
 *
 * Two shapes are recognised, in order of precedence:
 *   1. nested: `metadata.timing.source`
 *   2. flat:   `metadata.timing_source`
 *
 * The flat field is also consulted when a `timing` object is present but does
 * not carry a usable `source`, so a valid declaration is never shadowed by an
 * unrelated or malformed `timing` object.
 *
 * @param response - Object whose optional `metadata` is inspected; anything
 *   that is not a plain (non-array) object is treated as "no metadata".
 * @returns The declared timing source, or `undefined` when none is found.
 */
function getBatchResponseTimingSource(response: {
  metadata?: unknown;
}): TranscriptTimingSource | undefined {
  const metadata = response.metadata;
  if (!metadata || typeof metadata !== "object" || Array.isArray(metadata)) {
    return undefined;
  }

  const record = metadata as Record<string, unknown>;
  const timing = record.timing;
  if (timing && typeof timing === "object" && !Array.isArray(timing)) {
    // NOTE(review): assumes getValidTimingSource (from ~/stt/timing) yields
    // undefined for unrecognised values — confirm against its definition.
    const nestedSource = getValidTimingSource(
      (timing as Record<string, unknown>).source,
    );
    if (nestedSource) {
      return nestedSource;
    }
  }

  // Reached when there is no nested object, or it had no valid source.
  return getValidTimingSource(record.timing_source);
}
569+
507570
function getBatchDurationSeconds(response: BatchResponse): number | undefined {
508571
const metadata = response.metadata;
509572
if (!metadata || typeof metadata !== "object" || Array.isArray(metadata)) {

apps/desktop/src/store/zustand/listener/general.test.ts

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,11 @@ describe("General Listener Slice", () => {
136136
start_ms: 0,
137137
end_ms: 500,
138138
channel: 0,
139+
metadata: {
140+
timing: {
141+
source: "provider_word",
142+
},
143+
},
139144
},
140145
]);
141146

@@ -201,6 +206,11 @@ describe("General Listener Slice", () => {
201206
start_ms: 0,
202207
end_ms: 500,
203208
channel: 0,
209+
metadata: {
210+
timing: {
211+
source: "provider_word",
212+
},
213+
},
204214
},
205215
],
206216
[
@@ -227,7 +237,7 @@ describe("General Listener Slice", () => {
227237

228238
expect(
229239
handleBatchResponse(sessionId, {
230-
metadata: { duration: 2 },
240+
metadata: { duration: 2, timing_source: "provider_word" },
231241
results: {
232242
channels: [
233243
{
@@ -251,12 +261,22 @@ describe("General Listener Slice", () => {
251261
start_ms: 0,
252262
end_ms: 1000,
253263
channel: 0,
264+
metadata: {
265+
timing: {
266+
source: "synthetic_text",
267+
},
268+
},
254269
},
255270
{
256271
text: " world",
257272
start_ms: 1000,
258273
end_ms: 2000,
259274
channel: 0,
275+
metadata: {
276+
timing: {
277+
source: "synthetic_text",
278+
},
279+
},
260280
},
261281
],
262282
[],
@@ -312,12 +332,22 @@ describe("General Listener Slice", () => {
312332
start_ms: 4000,
313333
end_ms: 5000,
314334
channel: 1,
335+
metadata: {
336+
timing: {
337+
source: "provider_segment_interpolated",
338+
},
339+
},
315340
},
316341
{
317342
text: " world",
318343
start_ms: 5000,
319344
end_ms: 6000,
320345
channel: 1,
346+
metadata: {
347+
timing: {
348+
source: "provider_segment_interpolated",
349+
},
350+
},
321351
},
322352
],
323353
[],

apps/desktop/src/store/zustand/listener/utils.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
import type { RuntimeSpeakerHint, WordLike } from "~/stt/segment";
2+
import {
3+
createTranscriptTimingMetadata,
4+
type TranscriptTimingSource,
5+
type TranscriptWordMetadata,
6+
} from "~/stt/timing";
27

38
export function fixSpacingForWords(
49
words: string[],
@@ -36,12 +41,16 @@ export type WordEntry = {
3641
end: number;
3742
channel?: number;
3843
speaker?: number | null;
44+
metadata?: TranscriptWordMetadata | null;
3945
};
4046

4147
export function transformWordEntries(
4248
wordEntries: WordEntry[] | null | undefined,
4349
transcript: string,
4450
channel: number,
51+
options: {
52+
timingSource?: TranscriptTimingSource;
53+
} = {},
4554
): [WordLike[], RuntimeSpeakerHint[]] {
4655
const words: WordLike[] = [];
4756
const hints: RuntimeSpeakerHint[] = [];
@@ -61,6 +70,10 @@ export function transformWordEntries(
6170
start_ms: Math.round(word.start * 1000),
6271
end_ms: Math.round(word.end * 1000),
6372
channel: typeof word.channel === "number" ? word.channel : channel,
73+
metadata: createTranscriptTimingMetadata(
74+
options.timingSource ?? "provider_word",
75+
word.metadata,
76+
),
6477
});
6578

6679
if (typeof word.speaker === "number") {

apps/desktop/src/stt/live-segment.ts

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import type {
66
SegmentWord as BoundSegmentWord,
77
} from "@hypr/plugin-transcription";
88

9+
import type { TranscriptWordMetadata } from "~/stt/timing";
10+
911
export enum ChannelProfile {
1012
DirectMic = 0,
1113
RemoteParty = 1,
@@ -17,6 +19,7 @@ export type WordLike = {
1719
start_ms: number;
1820
end_ms: number;
1921
channel: ChannelProfile;
22+
metadata?: TranscriptWordMetadata | null;
2023
};
2124

2225
export type PartialWord = WordLike;
@@ -41,8 +44,18 @@ export type RenderLabelContext = {
4144
};
4245

4346
export type SegmentKey = BoundSegmentKey;
44-
export type SegmentWord = BoundSegmentWord;
45-
export type Segment = LiveTranscriptSegment | RenderedTranscriptSegment;
47+
/** A plugin-bound segment word, optionally carrying transcript word metadata. */
export type SegmentWord = BoundSegmentWord & {
  metadata?: TranscriptWordMetadata | null;
};

/** `T` with its `words` array retyped to the metadata-aware `SegmentWord`. */
type SegmentWithWordMetadata<T extends { words: BoundSegmentWord[] }> =
  Omit<T, "words"> & { words: SegmentWord[] };

/** A live or rendered transcript segment whose words may carry metadata. */
export type Segment =
  | SegmentWithWordMetadata<LiveTranscriptSegment>
  | SegmentWithWordMetadata<RenderedTranscriptSegment>;
4659
export type SegmentChannelProfile = BoundChannelProfile;
4760

4861
export class SpeakerLabelManager {

0 commit comments

Comments
 (0)