@@ -15,6 +15,33 @@ export const MenuItemButton = primitiveWithClassname('button', [
1515 'block' ,
1616] ) ;
1717
/**
 * Collapses each run of adjacent paragraphs that share a speaker into a single paragraph.
 *
 * The document is mutated in place: the children (tokens) of the later paragraph are
 * appended to the earlier one and the later paragraph is removed from `doc.children`.
 * The surviving paragraph keeps all of its own properties.
 *
 * @param doc - the document whose `children` (paragraphs) are merged in place
 */
function mergeSameSpeakerParagraphs(doc: Document) {
  // Pass 1: remember every index whose paragraph has the same speaker as its successor.
  // Nothing is mutated yet, so all indices refer to the untouched paragraph list.
  const indicesToMerge: number[] = [];
  let position = 0;
  while (position + 1 < doc.children.length) {
    // Loose equality is intentional here: `null` and `undefined` speakers compare as equal.
    if (doc.children[position].speaker == doc.children[position + 1].speaker) {
      indicesToMerge.push(position);
    }
    position += 1;
  }

  // Pass 2: apply the merges front to back. Every earlier merge removed exactly one
  // paragraph, so the recorded index is shifted left by the number of merges done so far
  // (which is simply the position of the entry inside `indicesToMerge`).
  indicesToMerge.forEach((originalIndex, alreadyMerged) => {
    const target = originalIndex - alreadyMerged;
    // The tokens are deep-copied through a JSON round trip before being appended
    // (presumably because the backing document store does not allow re-inserting
    // objects that already live in the document — confirm against the editor implementation).
    const copiedTokens = JSON.parse(JSON.stringify(doc.children[target + 1].children));
    doc.children[target].children.push(...copiedTokens);
    doc.children.splice(target + 1, 1);
  });
}
35+
// Characters treated as sentence terminators.
const punctuations = ['.', '?', '!'];
// Substrings that contain a terminator character but must not count as a sentence end.
const non_punctuations = ['...'];

/**
 * Reports whether `text` marks the end of a sentence.
 *
 * @param text - the text to inspect
 * @returns `true` when `text` contains at least one entry of `punctuations` and none of
 *   the entries of `non_punctuations`; `false` otherwise
 */
function containsSentenceEnd(text: string) {
  // Any excluded substring (e.g. '...') vetoes the match, whatever else the text contains.
  for (const excluded of non_punctuations) {
    if (text.includes(excluded)) {
      return false;
    }
  }
  for (const terminator of punctuations) {
    if (text.includes(terminator)) {
      return true;
    }
  }
  return false;
}
44+
1845export function TextTools ( { editor } : { editor : EditorWithWebsocket } ) {
1946 return (
2047 < Popup
@@ -25,62 +52,19 @@ export function TextTools({ editor }: { editor: EditorWithWebsocket }) {
2552 >
2653 < MenuItemButton
2754 onClick = { ( ) => {
28- const mergePoints : number [ ] = [ ] ;
29- for ( let i = 0 ; i < editor . doc . children . length - 1 ; i ++ ) {
30- const paragraph = editor . doc . children [ i ] ;
31- const nextParagraph = editor . doc . children [ i + 1 ] ;
32- if ( paragraph . speaker == nextParagraph . speaker ) {
33- mergePoints . push ( i ) ;
34- }
35- }
36- editor . update ( ( doc : Document ) => {
37- let removed = 0 ;
38- mergePoints . forEach ( ( index ) => {
39- const i = index - removed ;
40- doc . children [ i ] . children . push (
41- ...JSON . parse ( JSON . stringify ( doc . children [ i + 1 ] . children ) ) ,
42- ) ;
43- doc . children . splice ( i + 1 , 1 ) ;
44- removed ++ ;
45- } ) ;
46- } ) ;
55+ editor . update ( mergeSameSpeakerParagraphs ) ;
4756 } }
4857 >
4958 Reflow to One Paragraph per Speaker
5059 </ MenuItemButton >
5160
5261 < MenuItemButton
5362 onClick = { ( ) => {
54- const punctuations = [ '.' , '?' , '!' ] ;
55- const non_punctuations = [ '...' ] ;
56- const contains_punctuation = ( text : string ) =>
57- punctuations . some ( ( punct ) => text . includes ( punct ) ) &&
58- ! non_punctuations . some ( ( np ) => text . includes ( np ) ) ;
59-
60- // stategy: we first merge everything that could possibly be merged...
61- const mergePoints : number [ ] = [ ] ;
62- for ( let i = 0 ; i < editor . doc . children . length - 1 ; i ++ ) {
63- const paragraph = editor . doc . children [ i ] ;
64- const nextParagraph = editor . doc . children [ i + 1 ] ;
65- if (
66- ! contains_punctuation ( paragraph . children [ paragraph . children . length - 1 ] . text ) &&
67- paragraph . speaker == nextParagraph . speaker
68- ) {
69- mergePoints . push ( i ) ;
70- }
71- }
7263 editor . update ( ( doc : Document ) => {
73- let removed = 0 ;
74- mergePoints . forEach ( ( index ) => {
75- const i = index - removed ;
76- doc . children [ i ] . children . push (
77- ...JSON . parse ( JSON . stringify ( doc . children [ i + 1 ] . children ) ) ,
78- ) ;
79- doc . children . splice ( i + 1 , 1 ) ;
80- removed ++ ;
81- } ) ;
64+ // strategy: we first merge everything that could possibly be merged...
65+ mergeSameSpeakerParagraphs ( doc ) ;
8266
83- // ...and only then break up
67+ // ...and only then break up on sentence boundaries
8468 const newChildren : Paragraph [ ] = [ ] ;
8569 doc . children . forEach ( ( paragraph ) => {
8670 let currentParagraph = {
@@ -89,7 +73,7 @@ export function TextTools({ editor }: { editor: EditorWithWebsocket }) {
8973 } ;
9074 paragraph . children . forEach ( ( token ) => {
9175 currentParagraph . children . push ( JSON . parse ( JSON . stringify ( token ) ) ) ;
92- if ( contains_punctuation ( token . text ) ) {
76+ if ( containsSentenceEnd ( token . text ) ) {
9377 newChildren . push ( currentParagraph ) ;
9478 currentParagraph = {
9579 ...paragraph ,
@@ -107,6 +91,92 @@ export function TextTools({ editor }: { editor: EditorWithWebsocket }) {
10791 >
10892 Reflow to One Paragraph per Sentence
10993 </ MenuItemButton >
94+
95+ < MenuItemButton
96+ onClick = { ( ) => {
97+ // this strategy tries to split paragraphs at sentence boundaries, but only if there is a pause between the sentences
98+ // or the paragraphs would become too long.
99+ const initial = 2 ;
100+ const decay = 0.95 ;
101+
102+ const getPause = ( i : number , paragraph : Paragraph ) => {
103+ const token = paragraph . children [ i ] ;
104+ const nextToken = paragraph . children [ i + 1 ] ;
105+ if ( nextToken ?. start !== undefined && token ?. end !== undefined ) {
106+ return nextToken . start - token . end ;
107+ }
108+ return 0 ;
109+ } ;
110+
111+ editor . update ( ( doc : Document ) => {
112+ mergeSameSpeakerParagraphs ( doc ) ;
113+ const newChildren : Paragraph [ ] = [ ] ;
114+ const addNewChild = ( paragraph : Paragraph ) => {
115+ // if the paragraph is very long and does not contain any sentence ends, we still want to break it up
116+ if ( paragraph . children . length <= 100 ) {
117+ newChildren . push ( paragraph ) ;
118+ } else {
119+ const silences = paragraph . children
120+ . map ( ( x , i ) => ( { ...x , pause : getPause ( i , paragraph ) } ) )
121+ . filter ( ( token ) => token . text . includes ( ',' ) )
122+ . map ( ( token ) => token . pause ) ;
123+ silences . sort ( ) ;
124+ const thresholdIndex = Math . floor ( paragraph . children . length / 100 ) ; // aim for paragraphs of max ~50 tokens
125+ const threshold = silences [ silences . length - 1 - thresholdIndex ] ;
126+ let currentParagraph = {
127+ ...paragraph ,
128+ children : [ ] as { text : string } [ ] ,
129+ } ;
130+ paragraph . children . forEach ( ( token , i ) => {
131+ currentParagraph . children . push ( JSON . parse ( JSON . stringify ( token ) ) ) ;
132+ if (
133+ getPause ( i , paragraph ) >= threshold &&
134+ token . text . includes ( ',' ) &&
135+ currentParagraph . children . length > 3
136+ ) {
137+ newChildren . push ( currentParagraph ) ;
138+ currentParagraph = {
139+ ...paragraph ,
140+ children : [ ] ,
141+ } ;
142+ }
143+ } ) ;
144+ if ( currentParagraph . children . length > 0 ) {
145+ newChildren . push ( currentParagraph ) ;
146+ }
147+ }
148+ } ;
149+ doc . children . forEach ( ( paragraph ) => {
150+ let minPauseBetweenSentences = initial ; // this gets reduced with every additional token
151+ let currentParagraph = {
152+ ...paragraph ,
153+ children : [ ] as { text : string } [ ] ,
154+ } ;
155+ paragraph . children . forEach ( ( token , i ) => {
156+ currentParagraph . children . push ( JSON . parse ( JSON . stringify ( token ) ) ) ;
157+ minPauseBetweenSentences *= decay ;
158+ if (
159+ getPause ( i , paragraph ) >= minPauseBetweenSentences &&
160+ containsSentenceEnd ( token . text )
161+ ) {
162+ addNewChild ( currentParagraph ) ;
163+ minPauseBetweenSentences = initial ;
164+ currentParagraph = {
165+ ...paragraph ,
166+ children : [ ] ,
167+ } ;
168+ }
169+ } ) ;
170+ if ( currentParagraph . children . length > 0 ) {
171+ addNewChild ( currentParagraph ) ;
172+ }
173+ } ) ;
174+ doc . children = newChildren ;
175+ } ) ;
176+ } }
177+ >
178+ Smart Reflow ✨
179+ </ MenuItemButton >
110180 </ Popup >
111181 ) ;
112182}
0 commit comments