fix the CTC zipformer2 training (#1713)

KarelVesely84 · web-flow · commit 2d2470b36df6 · 2026-03-03T20:40:34.000+08:00
- too many supervision tokens
- change filtering rule to `if (T - 2) < len(tokens): return False`
- this prevents `inf` from appearing in the CTC loss value
diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py
@@ -1409,9 +1409,11 @@ def remove_short_and_long_utt(c: Cut):
         T = ((c.num_frames - 7) // 2 + 1) // 2
         tokens = sp.encode(c.supervisions[0].text, out_type=str)
 
-        if T < len(tokens):
+        # For CTC `(T - 2)  < len(tokens)` is needed. otherwise inf. in loss appears.
+        # For Transducer `T < len(tokens)` was okay.
+        if (T - 2) < len(tokens):
             logging.warning(
-                f"Exclude cut with ID {c.id} from training. "
+                f"Exclude cut with ID {c.id} from training (too many supervision tokens). "
                 f"Number of frames (before subsampling): {c.num_frames}. "
                 f"Number of frames (after subsampling): {T}. "
                 f"Text: {c.supervisions[0].text}. "