Skip to content

Commit 41f64f3

Browse files
committed
feat: deleting data from waveform after processing
1 parent fafc5b9 commit 41f64f3

File tree

1 file changed

+56
-22
lines changed

1 file changed

+56
-22
lines changed

src/controllers/SpeechToTextController.ts

Lines changed: 56 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@ export class SpeechToTextController {
3232
private streamWaveform: number[] = [];
3333
private isDecodingChunk = false;
3434
private numberOfDecodedChunks = 0;
35+
private numberOfDeletedChunks = 0;
36+
private numOfChunks = 0;
3537

3638
// User callbacks
3739
private decodedTranscribeCallback: (sequence: number[]) => void;
@@ -162,15 +164,26 @@ export class SpeechToTextController {
162164

163165
private chunkWaveform(waveform: number[]) {
164166
this.chunks = [];
165-
const numOfChunks = Math.ceil(waveform.length / this.windowSize);
166-
for (let i = 0; i < numOfChunks; i++) {
167-
let chunk = waveform.slice(
168-
Math.max(this.windowSize * i - this.overlapSeconds, 0),
169-
Math.min(
170-
this.windowSize * (i + 1) + this.overlapSeconds,
171-
waveform.length
172-
)
173-
);
167+
this.numOfChunks = Math.ceil(waveform.length / this.windowSize);
168+
for (let i = 0; i < this.numOfChunks; i++) {
169+
let chunk;
170+
if (i == 0 && this.numberOfDeletedChunks > 0) {
171+
chunk = waveform.slice(
172+
0,
173+
Math.min(
174+
this.windowSize * (i + 1) + 2 * this.overlapSeconds,
175+
waveform.length
176+
)
177+
);
178+
} else {
179+
chunk = waveform.slice(
180+
Math.max(this.windowSize * i - this.overlapSeconds, 0),
181+
Math.min(
182+
this.windowSize * (i + 1) + this.overlapSeconds,
183+
waveform.length
184+
)
185+
);
186+
}
174187
this.chunks.push(chunk);
175188
}
176189
}
@@ -376,21 +389,29 @@ export class SpeechToTextController {
376389
if (!this.isDecodingChunk && streamAction != 2) {
377390
this.isDecodingChunk = true;
378391
while (
379-
this.chunks.at(this.numberOfDecodedChunks)?.length ==
392+
this.chunks.at(-this.numOfChunks)?.length ==
380393
2 * this.overlapSeconds + this.windowSize ||
381394
(this.numberOfDecodedChunks == 0 &&
382-
this.chunks.at(this.numberOfDecodedChunks)?.length ==
395+
this.chunks.at(-this.numOfChunks)?.length ==
383396
this.windowSize + this.overlapSeconds)
384397
) {
385398
let seq = await this.decodeChunk(
386-
this.chunks.at(this.numberOfDecodedChunks)!,
399+
this.chunks.at(-this.numOfChunks)!,
387400
audioLanguage
388401
);
402+
const numSpecialTokens = (await this.getStartingTokenIds(audioLanguage))
403+
.length;
389404
// remove sos/eos token and 3 additional ones
390405
if (this.numberOfDecodedChunks == 0) {
391-
this.seqs = [seq.slice(0, -4)];
406+
this.seqs = [seq.slice(0, -(numSpecialTokens + NUM_TOKENS_TO_SLICE))];
392407
} else {
393-
this.seqs = [...this.seqs, seq.slice(4, -4)];
408+
this.seqs = [
409+
...this.seqs,
410+
seq.slice(
411+
numSpecialTokens + NUM_TOKENS_TO_SLICE,
412+
-(numSpecialTokens + NUM_TOKENS_TO_SLICE)
413+
),
414+
];
394415
this.prevSeq = this.handleOverlaps(this.seqs);
395416
}
396417
this.numberOfDecodedChunks++;
@@ -400,21 +421,33 @@ export class SpeechToTextController {
400421
}
401422
this.isDecodingChunk = false;
402423
}
403-
while (
404-
this.numberOfDecodedChunks < this.chunks.length &&
405-
streamAction == STREAMING_ACTION.STOP
406-
) {
407-
let seq = await this.decodeChunk(
408-
this.chunks.at(this.numberOfDecodedChunks)!
409-
);
424+
// remove data from waveform, which was processed and saved to this.seqs
425+
while (this.numOfChunks > 2) {
426+
if (this.numberOfDeletedChunks == 0) {
427+
this.streamWaveform = this.streamWaveform.slice(
428+
-(
429+
this.streamWaveform.length -
430+
(this.windowSize + this.overlapSeconds)
431+
)
432+
);
433+
} else {
434+
this.streamWaveform = this.streamWaveform.slice(
435+
-(this.streamWaveform.length - this.windowSize)
436+
);
437+
}
438+
this.numberOfDeletedChunks++;
439+
this.numOfChunks--;
440+
}
441+
while (this.numOfChunks > 0 && streamAction == STREAMING_ACTION.STOP) {
442+
let seq = await this.decodeChunk(this.chunks.at(-this.numOfChunks)!);
410443
if (this.numberOfDecodedChunks == 0) {
411444
this.sequence = seq;
412445
this.decodedTranscribeCallback(seq);
413446
this.isGeneratingCallback(false);
414447
break;
415448
}
416449
//last sequence processed
417-
if (this.numberOfDecodedChunks == this.chunks.length - 1) {
450+
if (this.numOfChunks == 1) {
418451
let finalSeq = [...this.sequence, ...seq];
419452
this.sequence = finalSeq;
420453
this.decodedTranscribeCallback(finalSeq);
@@ -424,6 +457,7 @@ export class SpeechToTextController {
424457
this.handleOverlaps(this.seqs);
425458
}
426459
this.numberOfDecodedChunks++;
460+
this.numOfChunks--;
427461
}
428462
const decodedText = await this.tokenIdsToText(this.sequence);
429463
return decodedText;

0 commit comments

Comments
 (0)