Skip to content

Commit b0a45b4

Browse files
committed
feat: enhance subtitle processing by removing unused properties and adding VTT timestamp formatting for captions
1 parent ae53b59 commit b0a45b4

1 file changed

Lines changed: 60 additions & 26 deletions

File tree

  • packages/video-player/javascript/modules/subtitles

packages/video-player/javascript/modules/subtitles/subtitles.ts

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -524,16 +524,10 @@ async function initBaseTrack(
524524
*/
525525
function parseVTTToSyntheticTranscript(vttText: string): Array<{
526526
transcript: string;
527-
originalCueText: string;
528-
startTime: number;
529-
endTime: number;
530527
words: Array<{ word: string; start: number; end: number }>;
531528
}> {
532529
const segments: Array<{
533530
transcript: string;
534-
originalCueText: string;
535-
startTime: number;
536-
endTime: number;
537531
words: Array<{ word: string; start: number; end: number }>;
538532
}> = [];
539533

@@ -579,9 +573,6 @@ function parseVTTToSyntheticTranscript(vttText: string): Array<{
579573

580574
segments.push({
581575
transcript: text,
582-
originalCueText: text,
583-
startTime: startTime,
584-
endTime: endTime,
585576
words: syntheticWords
586577
});
587578
}
@@ -612,6 +603,19 @@ function parseVTTTimestamp(timestamp: string): number {
612603
return 0;
613604
}
614605

606+
/**
 * Format a time offset in seconds as a WebVTT timestamp (HH:MM:SS.mmm).
 *
 * The value is rounded to whole milliseconds *before* being split into
 * hours/minutes/seconds so that rounding carries into the next field
 * (e.g. 59.9996 becomes "00:01:00.000" rather than "00:00:60.000").
 *
 * @param seconds - Offset in seconds. Non-finite or negative values are
 *   treated as 0, since a WebVTT timestamp cannot represent them.
 * @returns The timestamp string; hours are zero-padded to at least two
 *   digits and grow as needed for offsets of 100 hours or more.
 */
function formatVTTTimestamp(seconds: number): string {
  // Work in integer milliseconds to avoid per-field floating-point rounding.
  const totalMs =
    Number.isFinite(seconds) && seconds > 0 ? Math.round(seconds * 1000) : 0;

  const hours = Math.floor(totalMs / 3600000);
  const minutes = Math.floor((totalMs % 3600000) / 60000);
  const secs = Math.floor((totalMs % 60000) / 1000);
  const millis = totalMs % 1000;

  const pad = (value: number, width: number): string =>
    value.toString().padStart(width, '0');

  return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(secs, 2)}.${pad(millis, 3)}`;
}
618+
615619
/**
616620
* Generate captions from VTT (for translations with maxChars support)
617621
*/
@@ -640,37 +644,67 @@ async function generateCaptionsFromVTT(params: {
640644
// Use the same caption generation logic as base subtitles
641645
const captions: CaptionData[] = [];
642646
const maxTimeGap = MAX_TIME_GAP_SECONDS;
643-
const addCaption = ({ startTime, endTime, text }: CaptionData) => {
644-
captions.push({ startTime, endTime, text });
645-
};
646647

647-
// Process each segment with context
648-
syntheticSegments.forEach((segment, segmentIndex) => {
648+
// First, generate all word groups from all segments
649+
const allWordGroups: Array<Array<TimedWord>> = [];
650+
syntheticSegments.forEach((segment) => {
649651
const words = segment.words;
650652
const sentenceLength = segment.transcript.length;
651653
const wordGroups = groupWordsByChars(words, sentenceLength, maxChars, maxTimeGap);
654+
655+
wordGroups.forEach(group => {
656+
allWordGroups.push(group);
657+
});
658+
});
652659

653-
// Get previous and next segment text for context only if time gap is less than maxTimeGap
660+
// Now process each word group with context from previous/next word groups
661+
allWordGroups.forEach((currentGroup, groupIndex) => {
662+
const currentText = currentGroup.map(w => w.word).join(' ');
663+
664+
// Get previous word group text if available and within time gap
654665
let previousText: string | null = null;
655-
if (segmentIndex > 0) {
656-
const previousSegment = syntheticSegments[segmentIndex - 1];
657-
const timeGap = segment.startTime - previousSegment.endTime;
666+
if (groupIndex > 0) {
667+
const previousGroup = allWordGroups[groupIndex - 1];
668+
const timeGap = currentGroup[0].start - previousGroup[previousGroup.length - 1].end;
658669
if (timeGap < maxTimeGap) {
659-
previousText = previousSegment.originalCueText;
670+
previousText = previousGroup.map(w => w.word).join(' ');
660671
}
661672
}
662-
673+
674+
// Get next word group text if available and within time gap
663675
let nextText: string | null = null;
664-
if (segmentIndex < syntheticSegments.length - 1) {
665-
const nextSegment = syntheticSegments[segmentIndex + 1];
666-
const timeGap = nextSegment.startTime - segment.endTime;
676+
if (groupIndex < allWordGroups.length - 1) {
677+
const nextGroup = allWordGroups[groupIndex + 1];
678+
const timeGap = nextGroup[0].start - currentGroup[currentGroup.length - 1].end;
667679
if (timeGap < maxTimeGap) {
668-
nextText = nextSegment.originalCueText;
680+
nextText = nextGroup.map(w => w.word).join(' ');
669681
}
670682
}
671-
672-
createRegularCaptions(wordGroups, addCaption, previousText, nextText);
683+
684+
// Build caption text with context
685+
let text = currentText;
686+
if (previousText || nextText) {
687+
const contextLines: string[] = [];
688+
if (previousText) contextLines.push(previousText);
689+
contextLines.push(currentText);
690+
if (nextText) contextLines.push(nextText);
691+
text = contextLines.join('\n');
692+
}
693+
694+
captions.push({
695+
startTime: currentGroup[0].start,
696+
endTime: currentGroup[currentGroup.length - 1].end,
697+
text
698+
});
673699
});
700+
701+
// Also log as VTT format for easier reading
702+
const vttOutput = captions.map((caption, index) => {
703+
const startTime = formatVTTTimestamp(caption.startTime);
704+
const endTime = formatVTTTimestamp(caption.endTime);
705+
return `${index + 1}\n${startTime} --> ${endTime}\n${caption.text}`;
706+
}).join('\n\n');
707+
// console.log('Final captions as VTT:\nWEBVTT\n\n' + vttOutput);
674708

675709
return {
676710
captions,

0 commit comments

Comments
 (0)