more

albertz · albertz · commit bd55cb7eba90 · 2026-05-15T10:01:08.000+02:00
diff --git a/users/zeyer/experiments/exp2025_10_21_chunked_ctc.py b/users/zeyer/experiments/exp2025_10_21_chunked_ctc.py
@@ -168,6 +168,8 @@ def py():
 
     # V2.3: using ChunkedConformerEncoderV2, setting version=3.
     # First exp, try to reproduce the orig.
+    # train_time_hours: 168.9 (v1: 215.6)
+    # CTC-only: 9.45 (v1: 9.56)
     train(
         f"chunked-L{left_n * center_size}-C{center_size}-R{right_size}-v2.3-compat",
         {
@@ -186,7 +188,9 @@ def py():
     )
 
     # V2.3: using ChunkedConformerEncoderV2, setting version=3.
-    #   reduce chunk sizes, history, if the input is not long enough.
+    #   reduce chunk sizes, history, if the input is not long enough (adapt_chunk_history_for_short_seqs=True default)
+    # train_time_hours: 168.8 (v1: 215.6; adapt_chunk_history_...=False: 168.9)
+    # CTC-only: 9.46 (v1: 9.56; adapt_chunk_history_...=False: 9.45)
     train(
         f"chunked-L{left_n * center_size}-C{center_size}-R{right_size}-v2.3",
         {
@@ -203,9 +207,11 @@ def py():
         },
     )
 
-    # try grad checkpointing
+    # Try grad checkpointing (mem_chunks_grad_checkpointing=True).
     # (In terms of WER, should really be the same.
     # if in terms of speed this is better, and same for memory consumption, we could maybe just always enable it.)
+    # train_time_hours: 201.1 (v1: 215.6; ..._checkpointing=False: 168.8) (but requires less memory)
+    # CTC-only: 9.52 (..._checkpointing=False: 9.46)
     train(
         f"chunked-L{left_n * center_size}-C{center_size}-R{right_size}-v2.3-gdckpt",
         {

Original file line number	Diff line number	Diff line change
`@@ -168,6 +168,8 @@ def py():`
`168`	`168`
`169`	`169`	`# V2.3: using ChunkedConformerEncoderV2, setting version=3.`
`170`	`170`	`# First exp, try to reproduce the orig.`
	`171`	`+ # train_time_hours: 168.9 (v1: 215.6)`
	`172`	`+ # CTC-only: 9.45 (v1: 9.56)`
`171`	`173`	`train(`
`172`	`174`	`f"chunked-L{left_n * center_size}-C{center_size}-R{right_size}-v2.3-compat",`
`173`	`175`	`{`
`@@ -186,7 +188,9 @@ def py():`
`186`	`188`	`)`
`187`	`189`
`188`	`190`	`# V2.3: using ChunkedConformerEncoderV2, setting version=3.`
`189`		`- # reduce chunk sizes, history, if the input is not long enough.`
	`191`	`+ # reduce chunk sizes, history, if the input is not long enough (adapt_chunk_history_for_short_seqs=True default)`
	`192`	`+ # train_time_hours: 168.8 (v1: 215.6; adapt_chunk_history_...=False: 168.9)`
	`193`	`+ # CTC-only: 9.46 (v1: 9.56; adapt_chunk_history_...=False: 9.45)`
`190`	`194`	`train(`
`191`	`195`	`f"chunked-L{left_n * center_size}-C{center_size}-R{right_size}-v2.3",`
`192`	`196`	`{`
`@@ -203,9 +207,11 @@ def py():`
`203`	`207`	`},`
`204`	`208`	`)`
`205`	`209`
`206`		`- # try grad checkpointing`
	`210`	`+ # Try grad checkpointing (mem_chunks_grad_checkpointing=True).`
`207`	`211`	`# (In terms of WER, should really be the same.`
`208`	`212`	`# if in terms of speed this is better, and same for memory consumption, we could maybe just always enable it.)`
	`213`	`+ # train_time_hours: 201.1 (v1: 215.6; ..._checkpointing=False: 168.8) (but requires less memory)`
	`214`	`+ # CTC-only: 9.52 (..._checkpointing=False: 9.46)`
`209`	`215`	`train(`
`210`	`216`	`f"chunked-L{left_n * center_size}-C{center_size}-R{right_size}-v2.3-gdckpt",`
`211`	`217`	`{`