@@ -32,6 +32,8 @@ export class SpeechToTextController {
3232 private streamWaveform : number [ ] = [ ] ;
3333 private isDecodingChunk = false ;
3434 private numberOfDecodedChunks = 0 ;
35+ private numberOfDeletedChunks = 0 ;
36+ private numOfChunks = 0 ;
3537
3638 // User callbacks
3739 private decodedTranscribeCallback : ( sequence : number [ ] ) => void ;
@@ -162,15 +164,26 @@ export class SpeechToTextController {
162164
163165 private chunkWaveform ( waveform : number [ ] ) {
164166 this . chunks = [ ] ;
165- const numOfChunks = Math . ceil ( waveform . length / this . windowSize ) ;
166- for ( let i = 0 ; i < numOfChunks ; i ++ ) {
167- let chunk = waveform . slice (
168- Math . max ( this . windowSize * i - this . overlapSeconds , 0 ) ,
169- Math . min (
170- this . windowSize * ( i + 1 ) + this . overlapSeconds ,
171- waveform . length
172- )
173- ) ;
167+ this . numOfChunks = Math . ceil ( waveform . length / this . windowSize ) ;
168+ for ( let i = 0 ; i < this . numOfChunks ; i ++ ) {
169+ let chunk ;
170+ if ( i == 0 && this . numberOfDeletedChunks > 0 ) {
171+ chunk = waveform . slice (
172+ 0 ,
173+ Math . min (
174+ this . windowSize * ( i + 1 ) + 2 * this . overlapSeconds ,
175+ waveform . length
176+ )
177+ ) ;
178+ } else {
179+ chunk = waveform . slice (
180+ Math . max ( this . windowSize * i - this . overlapSeconds , 0 ) ,
181+ Math . min (
182+ this . windowSize * ( i + 1 ) + this . overlapSeconds ,
183+ waveform . length
184+ )
185+ ) ;
186+ }
174187 this . chunks . push ( chunk ) ;
175188 }
176189 }
@@ -376,21 +389,29 @@ export class SpeechToTextController {
376389 if ( ! this . isDecodingChunk && streamAction != 2 ) {
377390 this . isDecodingChunk = true ;
378391 while (
379- this . chunks . at ( this . numberOfDecodedChunks ) ?. length ==
392+ this . chunks . at ( - this . numOfChunks ) ?. length ==
380393 2 * this . overlapSeconds + this . windowSize ||
381394 ( this . numberOfDecodedChunks == 0 &&
382- this . chunks . at ( this . numberOfDecodedChunks ) ?. length ==
395+ this . chunks . at ( - this . numOfChunks ) ?. length ==
383396 this . windowSize + this . overlapSeconds )
384397 ) {
385398 let seq = await this . decodeChunk (
386- this . chunks . at ( this . numberOfDecodedChunks ) ! ,
399+ this . chunks . at ( - this . numOfChunks ) ! ,
387400 audioLanguage
388401 ) ;
402+ const numSpecialTokens = ( await this . getStartingTokenIds ( audioLanguage ) )
403+ . length ;
389404 // remove sos/eos token and 3 additional ones
390405 if ( this . numberOfDecodedChunks == 0 ) {
391- this . seqs = [ seq . slice ( 0 , - 4 ) ] ;
406+ this . seqs = [ seq . slice ( 0 , - ( numSpecialTokens + NUM_TOKENS_TO_SLICE ) ) ] ;
392407 } else {
393- this . seqs = [ ...this . seqs , seq . slice ( 4 , - 4 ) ] ;
408+ this . seqs = [
409+ ...this . seqs ,
410+ seq . slice (
411+ numSpecialTokens + NUM_TOKENS_TO_SLICE ,
412+ - ( numSpecialTokens + NUM_TOKENS_TO_SLICE )
413+ ) ,
414+ ] ;
394415 this . prevSeq = this . handleOverlaps ( this . seqs ) ;
395416 }
396417 this . numberOfDecodedChunks ++ ;
@@ -400,21 +421,33 @@ export class SpeechToTextController {
400421 }
401422 this . isDecodingChunk = false ;
402423 }
403- while (
404- this . numberOfDecodedChunks < this . chunks . length &&
405- streamAction == STREAMING_ACTION . STOP
406- ) {
407- let seq = await this . decodeChunk (
408- this . chunks . at ( this . numberOfDecodedChunks ) !
409- ) ;
424+ // remove data from waveform, which was processed and saved to this.seqs
425+ while ( this . numOfChunks > 2 ) {
426+ if ( this . numberOfDeletedChunks == 0 ) {
427+ this . streamWaveform = this . streamWaveform . slice (
428+ - (
429+ this . streamWaveform . length -
430+ ( this . windowSize + this . overlapSeconds )
431+ )
432+ ) ;
433+ } else {
434+ this . streamWaveform = this . streamWaveform . slice (
435+ - ( this . streamWaveform . length - this . windowSize )
436+ ) ;
437+ }
438+ this . numberOfDeletedChunks ++ ;
439+ this . numOfChunks -- ;
440+ }
441+ while ( this . numOfChunks > 0 && streamAction == STREAMING_ACTION . STOP ) {
442+ let seq = await this . decodeChunk ( this . chunks . at ( - this . numOfChunks ) ! ) ;
410443 if ( this . numberOfDecodedChunks == 0 ) {
411444 this . sequence = seq ;
412445 this . decodedTranscribeCallback ( seq ) ;
413446 this . isGeneratingCallback ( false ) ;
414447 break ;
415448 }
416449 //last sequence processed
417- if ( this . numberOfDecodedChunks == this . chunks . length - 1 ) {
450+ if ( this . numOfChunks == 1 ) {
418451 let finalSeq = [ ...this . sequence , ...seq ] ;
419452 this . sequence = finalSeq ;
420453 this . decodedTranscribeCallback ( finalSeq ) ;
@@ -424,6 +457,7 @@ export class SpeechToTextController {
424457 this . handleOverlaps ( this . seqs ) ;
425458 }
426459 this . numberOfDecodedChunks ++ ;
460+ this . numOfChunks -- ;
427461 }
428462 const decodedText = await this . tokenIdsToText ( this . sequence ) ;
429463 return decodedText ;
0 commit comments