docs(perf_logger): note pad_to_multiple_of / cu_seq_lens_q collapse (#1561)

gagank1 · gagank1 · commit 29121fe3d6d8 · 2026-04-24T14:00:21.000-07:00
When the collator's pad_to_multiple_of option is set (FP8/FP4 alignment), cu_seq_lens_q is mutated in place to include an appended mock pad sequence and no cu_seq_lens_q_padded key is written — that key is reserved for TE's per-sequence CP padding. In that path the unpadded and padded MFU metrics collapse to the same value, with Σ(Lᵢ²) inflated by at most pad_to_multiple_of² in absolute terms — a relative error typically <10⁻⁵, below measurement noise. Documented as a known limitation in _attn_work_from_batch's docstring in all four MFU-tracking recipes (esm2, llama3, opengenome2_llama, codonfm), with a pointer to issue #1561 for the full analysis and proposed fixes. No behavior change. Signed-off-by: Gagan Kaushik <gkaushik@nvidia.com>
diff --git a/bionemo-recipes/recipes/codonfm_native_te/perf_logger.py b/bionemo-recipes/recipes/codonfm_native_te/perf_logger.py
@@ -132,6 +132,15 @@ def _attn_work_from_batch(
     CodonFM currently runs FSDP without CP (cp_size=1), but the formula stays correct
     if CP is added later.
     Int32 lens cast to int64 BEFORE squaring (overflow at L ≈ 46k otherwise).
+
+    NOTE: With the collator's ``pad_to_multiple_of`` option (FP8/FP4 alignment, inlined
+    in ``CodonTHDCollator.__call__`` in dataset.py), the cu_seq_lens_q tensor is mutated
+    in place to include one or more appended mock pad sequences and no
+    ``cu_seq_lens_q_padded`` key is written (that key is reserved for TE's per-sequence
+    CP padding). In that path the unpadded and padded metrics collapse, with Σ(Lᵢ²)
+    inflated by at most ``pad_to_multiple_of²`` (absolute) — a relative error typically
+    <10⁻⁵ and below measurement noise. Known limitation; see
+    https://github.com/NVIDIA/bionemo-framework/issues/1561.
     """
     if include_padding:
         cu = batch.get("cu_seq_lens_q_padded")
diff --git a/bionemo-recipes/recipes/esm2_native_te/perf_logger.py b/bionemo-recipes/recipes/esm2_native_te/perf_logger.py
@@ -129,6 +129,14 @@ def _attn_work_from_batch(
       * BSHD: uses full ``input_ids.shape``, scaled by ``cp_size²``.
 
     Int32 lens cast to int64 BEFORE squaring (overflow at L ≈ 46k otherwise).
+
+    NOTE: With the collator's ``pad_to_multiple_of`` option (FP8/FP4 alignment), the
+    cu_seq_lens_q tensor is mutated in place to include an appended mock pad sequence
+    and no ``cu_seq_lens_q_padded`` key is written (that key is reserved for TE's
+    per-sequence CP padding). In that path the unpadded and padded metrics collapse,
+    with Σ(Lᵢ²) inflated by at most ``pad_to_multiple_of²`` (absolute) — a relative error
+    typically <10⁻⁵ and below measurement noise. Known limitation; see
+    https://github.com/NVIDIA/bionemo-framework/issues/1561.
     """
     if include_padding:
         cu = batch.get("cu_seq_lens_q_padded")
diff --git a/bionemo-recipes/recipes/llama3_native_te/perf_logger.py b/bionemo-recipes/recipes/llama3_native_te/perf_logger.py
@@ -131,6 +131,14 @@ def _attn_work_from_batch(
         scaled by ``cp_size²``.
 
     Int32 lens cast to int64 BEFORE squaring (overflow at L ≈ 46k otherwise).
+
+    NOTE: With the collator's ``pad_to_multiple_of`` option (FP8/FP4 alignment), the
+    cu_seq_lens_q tensor is mutated in place to include an appended mock pad sequence
+    and no ``cu_seq_lens_q_padded`` key is written (that key is reserved for TE's
+    per-sequence CP padding). In that path the unpadded and padded metrics collapse,
+    with Σ(Lᵢ²) inflated by at most ``pad_to_multiple_of²`` (absolute) — a relative error
+    typically <10⁻⁵ and below measurement noise. Known limitation; see
+    https://github.com/NVIDIA/bionemo-framework/issues/1561.
     """
     if include_padding:
         cu = batch.get("cu_seq_lens_q_padded")
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py
@@ -136,6 +136,14 @@ def _attn_work_from_batch(
       * BSHD: uses full ``input_ids.shape``, scaled by ``cp_size²``.
 
     Int32 lens cast to int64 BEFORE squaring (overflow at L ≈ 46k otherwise).
+
+    NOTE: With the collator's ``pad_to_multiple_of`` option (FP8/FP4 alignment), the
+    cu_seq_lens_q tensor is mutated in place to include an appended mock pad sequence
+    and no ``cu_seq_lens_q_padded`` key is written (that key is reserved for TE's
+    per-sequence CP padding). In that path the unpadded and padded metrics collapse,
+    with Σ(Lᵢ²) inflated by at most ``pad_to_multiple_of²`` (absolute) — a relative error
+    typically <10⁻⁵ and below measurement noise. Known limitation; see
+    https://github.com/NVIDIA/bionemo-framework/issues/1561.
     """
     if include_padding:
         cu = batch.get("cu_seq_lens_q_padded")