@@ -524,16 +524,10 @@ async function initBaseTrack(
524524 */
525525function parseVTTToSyntheticTranscript ( vttText : string ) : Array < {
526526 transcript : string ;
527- originalCueText : string ;
528- startTime : number ;
529- endTime : number ;
530527 words : Array < { word : string ; start : number ; end : number } > ;
531528} > {
532529 const segments : Array < {
533530 transcript : string ;
534- originalCueText : string ;
535- startTime : number ;
536- endTime : number ;
537531 words : Array < { word : string ; start : number ; end : number } > ;
538532 } > = [ ] ;
539533
@@ -579,9 +573,6 @@ function parseVTTToSyntheticTranscript(vttText: string): Array<{
579573
580574 segments . push ( {
581575 transcript : text ,
582- originalCueText : text ,
583- startTime : startTime ,
584- endTime : endTime ,
585576 words : syntheticWords
586577 } ) ;
587578 }
@@ -612,6 +603,19 @@ function parseVTTTimestamp(timestamp: string): number {
612603 return 0 ;
613604}
614605
/**
 * Format seconds to VTT timestamp (HH:MM:SS.mmm)
 *
 * The value is rounded to whole milliseconds *before* it is split into
 * hours / minutes / seconds, so a rounding carry propagates upward
 * (e.g. 59.9996 becomes "00:01:00.000" rather than "00:00:60.000").
 *
 * @param seconds - Offset in seconds. Negative or non-finite values cannot be
 *   expressed as a timestamp and are formatted as zero.
 * @returns The timestamp with at least two hour digits and exactly three
 *   millisecond digits.
 */
function formatVTTTimestamp(seconds: number): string {
  // NaN, +/-Infinity and negative offsets have no valid representation.
  const safeSeconds = Number.isFinite(seconds) && seconds > 0 ? seconds : 0;

  // Round once, then use integer arithmetic only, so no field can overflow
  // its range (seconds/minutes stay in 0-59, milliseconds in 0-999).
  const totalMs = Math.round(safeSeconds * 1000);
  const ms = totalMs % 1000;
  const totalSecs = Math.floor(totalMs / 1000);
  const secs = totalSecs % 60;
  const totalMinutes = Math.floor(totalSecs / 60);
  const minutes = totalMinutes % 60;
  const hours = Math.floor(totalMinutes / 60);

  const pad = (value: number, width: number): string =>
    value.toString().padStart(width, '0');

  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(secs, 2)}.${pad(ms, 3)}`;
}
618+
615619/**
616620 * Generate captions from VTT (for translations with maxChars support)
617621 */
@@ -640,37 +644,67 @@ async function generateCaptionsFromVTT(params: {
640644 // Use the same caption generation logic as base subtitles
641645 const captions : CaptionData [ ] = [ ] ;
642646 const maxTimeGap = MAX_TIME_GAP_SECONDS ;
643- const addCaption = ( { startTime, endTime, text } : CaptionData ) => {
644- captions . push ( { startTime, endTime, text } ) ;
645- } ;
646647
647- // Process each segment with context
648- syntheticSegments . forEach ( ( segment , segmentIndex ) => {
648+ // First, generate all word groups from all segments
649+ const allWordGroups : Array < Array < TimedWord > > = [ ] ;
650+ syntheticSegments . forEach ( ( segment ) => {
649651 const words = segment . words ;
650652 const sentenceLength = segment . transcript . length ;
651653 const wordGroups = groupWordsByChars ( words , sentenceLength , maxChars , maxTimeGap ) ;
654+
655+ wordGroups . forEach ( group => {
656+ allWordGroups . push ( group ) ;
657+ } ) ;
658+ } ) ;
652659
653- // Get previous and next segment text for context only if time gap is less than maxTimeGap
660+ // Now process each word group with context from previous/next word groups
661+ allWordGroups . forEach ( ( currentGroup , groupIndex ) => {
662+ const currentText = currentGroup . map ( w => w . word ) . join ( ' ' ) ;
663+
664+ // Get previous word group text if available and within time gap
654665 let previousText : string | null = null ;
655- if ( segmentIndex > 0 ) {
656- const previousSegment = syntheticSegments [ segmentIndex - 1 ] ;
657- const timeGap = segment . startTime - previousSegment . endTime ;
666+ if ( groupIndex > 0 ) {
667+ const previousGroup = allWordGroups [ groupIndex - 1 ] ;
668+ const timeGap = currentGroup [ 0 ] . start - previousGroup [ previousGroup . length - 1 ] . end ;
658669 if ( timeGap < maxTimeGap ) {
659- previousText = previousSegment . originalCueText ;
670+ previousText = previousGroup . map ( w => w . word ) . join ( ' ' ) ;
660671 }
661672 }
662-
673+
674+ // Get next word group text if available and within time gap
663675 let nextText : string | null = null ;
664- if ( segmentIndex < syntheticSegments . length - 1 ) {
665- const nextSegment = syntheticSegments [ segmentIndex + 1 ] ;
666- const timeGap = nextSegment . startTime - segment . endTime ;
676+ if ( groupIndex < allWordGroups . length - 1 ) {
677+ const nextGroup = allWordGroups [ groupIndex + 1 ] ;
678+ const timeGap = nextGroup [ 0 ] . start - currentGroup [ currentGroup . length - 1 ] . end ;
667679 if ( timeGap < maxTimeGap ) {
668- nextText = nextSegment . originalCueText ;
680+ nextText = nextGroup . map ( w => w . word ) . join ( ' ' ) ;
669681 }
670682 }
671-
672- createRegularCaptions ( wordGroups , addCaption , previousText , nextText ) ;
683+
684+ // Build caption text with context
685+ let text = currentText ;
686+ if ( previousText || nextText ) {
687+ const contextLines : string [ ] = [ ] ;
688+ if ( previousText ) contextLines . push ( previousText ) ;
689+ contextLines . push ( currentText ) ;
690+ if ( nextText ) contextLines . push ( nextText ) ;
691+ text = contextLines . join ( '\n' ) ;
692+ }
693+
694+ captions . push ( {
695+ startTime : currentGroup [ 0 ] . start ,
696+ endTime : currentGroup [ currentGroup . length - 1 ] . end ,
697+ text
698+ } ) ;
673699 } ) ;
700+
701+ // Also log as VTT format for easier reading
702+ const vttOutput = captions . map ( ( caption , index ) => {
703+ const startTime = formatVTTTimestamp ( caption . startTime ) ;
704+ const endTime = formatVTTTimestamp ( caption . endTime ) ;
705+ return `${ index + 1 } \n${ startTime } --> ${ endTime } \n${ caption . text } ` ;
706+ } ) . join ( '\n\n' ) ;
707+ // console.log('Final captions as VTT:\nWEBVTT\n\n' + vttOutput);
674708
675709 return {
676710 captions,
0 commit comments