MFU: use padded_vocab_size for mfu_padded_pct LM-head FLOPs

gagank1 · gagank1 · commit b979eed89dfa · 2026-04-23T11:59:11.000-07:00
For configs with padded_vocab_size set (e.g. ESM-2: 33→64 for FP8/tensor-core friendliness), the LM-head matmul physically runs at the padded width and the logits are sliced back afterward. Count the padded width in the hardware-view metrics (mfu_padded_pct, tflops_per_gpu_padded) while continuing to count the raw vocab_size in the useful-work metrics (mfu_pct, tflops_per_gpu). For configs without padded_vocab_size (llama3, opengenome2, codonfm), the padded and raw vocab widths are identical, so this change does not alter their reported metrics. Addresses review feedback from @trvachov on PR #1548. Signed-off-by: Gagan Kaushik <gkaushik@nvidia.com>
diff --git a/bionemo-recipes/recipes/codonfm_native_te/perf_logger.py b/bionemo-recipes/recipes/codonfm_native_te/perf_logger.py
@@ -66,7 +66,7 @@ def _detect_peak_tflops_bf16():
     return None, name
 
 
-def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
+def _compute_non_attn_per_token_flops(model_config_dict: dict, use_padded_vocab: bool = False) -> int:
     """Per-token FLOPs for everything EXCEPT the S² attention term.
 
     Q/K/V/O projections (GQA-aware) + MLP + LM head, 3x for fwd+bwd. Multiply by the
@@ -81,6 +81,10 @@ def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
     kv_dim = n_kv * head_dim
     ffn = model_config_dict["intermediate_size"]
     vocab = model_config_dict.get("vocab_size", 0)
+    if use_padded_vocab:
+        # LM-head matmul runs at padded width (e.g. ESM-2: vocab=33 → padded=64 for
+        # FP8/tensor-core friendliness); logits are sliced back post-matmul.
+        vocab = model_config_dict.get("padded_vocab_size") or vocab
     num_layers = model_config_dict["num_hidden_layers"]
     model_type = model_config_dict.get("model_type", "")
     num_mlp_proj = 3 if model_type in _GATED_MLP_MODEL_TYPES else 2
@@ -192,11 +196,15 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig, model_confi
         # reflects each rank's share under DP and sequence packing.
         self._log_mfu = bool(args.get("log_mfu", False)) and model_config_dict is not None
         self._non_attn_per_token_flops = 0
+        self._non_attn_per_token_flops_padded = 0
         self._attn_flop_coeff = 0
         self._cp_size = int(args.get("cp_size", 1))
         self._peak_tflops: float | None = None
         if self._log_mfu:
             self._non_attn_per_token_flops = _compute_non_attn_per_token_flops(model_config_dict)
+            self._non_attn_per_token_flops_padded = _compute_non_attn_per_token_flops(
+                model_config_dict, use_padded_vocab=True
+            )
             self._attn_flop_coeff = _compute_attn_flop_coeff(model_config_dict)
             self._peak_tflops, gpu_name = _detect_peak_tflops_bf16()
             if dist_config.local_rank == 0:
@@ -348,7 +356,7 @@ def log_step(
                     flops_unpadded = non_attn_unpadded + attn_flops_unpadded
                     tflops_unpadded = flops_unpadded / step_time / 1e12
 
-                    non_attn_padded = self._non_attn_per_token_flops * self.num_tokens
+                    non_attn_padded = self._non_attn_per_token_flops_padded * self.num_tokens
                     attn_flops_padded = (self._attn_flop_coeff * attn_padded) // self._cp_size
                     flops_padded = non_attn_padded + attn_flops_padded
                     tflops_padded = flops_padded / step_time / 1e12
diff --git a/bionemo-recipes/recipes/esm2_native_te/perf_logger.py b/bionemo-recipes/recipes/esm2_native_te/perf_logger.py
@@ -65,7 +65,7 @@ def _detect_peak_tflops_bf16():
     return None, name
 
 
-def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
+def _compute_non_attn_per_token_flops(model_config_dict: dict, use_padded_vocab: bool = False) -> int:
     """Per-token FLOPs for everything EXCEPT the S² attention term.
 
     Q/K/V/O projections (GQA-aware) + MLP + LM head, 3x for fwd+bwd. Multiply by the
@@ -80,6 +80,10 @@ def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
     kv_dim = n_kv * head_dim
     ffn = model_config_dict["intermediate_size"]
     vocab = model_config_dict.get("vocab_size", 0)
+    if use_padded_vocab:
+        # LM-head matmul runs at padded width (e.g. ESM-2: vocab=33 → padded=64 for
+        # FP8/tensor-core friendliness); logits are sliced back post-matmul.
+        vocab = model_config_dict.get("padded_vocab_size") or vocab
     num_layers = model_config_dict["num_hidden_layers"]
     model_type = model_config_dict.get("model_type", "")
     num_mlp_proj = 3 if model_type in _GATED_MLP_MODEL_TYPES else 2
@@ -195,11 +199,15 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig, model_confi
         # already reflects each rank's share under DP/CP and sequence packing.
         self._log_mfu = bool(args.get("log_mfu", False)) and model_config_dict is not None
         self._non_attn_per_token_flops = 0
+        self._non_attn_per_token_flops_padded = 0
         self._attn_flop_coeff = 0
         self._cp_size = int(args.get("cp_size", 1))
         self._peak_tflops: float | None = None
         if self._log_mfu:
             self._non_attn_per_token_flops = _compute_non_attn_per_token_flops(model_config_dict)
+            self._non_attn_per_token_flops_padded = _compute_non_attn_per_token_flops(
+                model_config_dict, use_padded_vocab=True
+            )
             self._attn_flop_coeff = _compute_attn_flop_coeff(model_config_dict)
             self._peak_tflops, gpu_name = _detect_peak_tflops_bf16()
             if dist_config.local_rank == 0:
@@ -357,7 +365,7 @@ def log_step(
                     flops_unpadded = non_attn_unpadded + attn_flops_unpadded
                     tflops_unpadded = flops_unpadded / step_time / 1e12
 
-                    non_attn_padded = self._non_attn_per_token_flops * self.num_tokens
+                    non_attn_padded = self._non_attn_per_token_flops_padded * self.num_tokens
                     attn_flops_padded = (self._attn_flop_coeff * attn_padded) // self._cp_size
                     flops_padded = non_attn_padded + attn_flops_padded
                     tflops_padded = flops_padded / step_time / 1e12
diff --git a/bionemo-recipes/recipes/llama3_native_te/perf_logger.py b/bionemo-recipes/recipes/llama3_native_te/perf_logger.py
@@ -63,7 +63,7 @@ def _detect_peak_tflops_bf16():
     return None, name
 
 
-def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
+def _compute_non_attn_per_token_flops(model_config_dict: dict, use_padded_vocab: bool = False) -> int:
     """Per-token FLOPs for everything EXCEPT the S² attention term.
 
     Q/K/V/O projections (GQA-aware) + MLP + LM head, 3x for fwd+bwd. Multiply by the
@@ -78,6 +78,10 @@ def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
     kv_dim = n_kv * head_dim
     ffn = model_config_dict["intermediate_size"]
     vocab = model_config_dict.get("vocab_size", 0)
+    if use_padded_vocab:
+        # LM-head matmul runs at padded width (e.g. ESM-2: vocab=33 → padded=64 for
+        # FP8/tensor-core friendliness); logits are sliced back post-matmul.
+        vocab = model_config_dict.get("padded_vocab_size") or vocab
     num_layers = model_config_dict["num_hidden_layers"]
     model_type = model_config_dict.get("model_type", "")
     num_mlp_proj = 3 if model_type in _GATED_MLP_MODEL_TYPES else 2
@@ -201,11 +205,15 @@ def __init__(
         # reflects each rank's share under DP/CP and sequence packing.
         self._log_mfu = bool(args.get("log_mfu", False)) and model_config_dict is not None
         self._non_attn_per_token_flops = 0
+        self._non_attn_per_token_flops_padded = 0
         self._attn_flop_coeff = 0
         self._cp_size = int(args.get("cp_size", 1))
         self._peak_tflops: float | None = None
         if self._log_mfu:
             self._non_attn_per_token_flops = _compute_non_attn_per_token_flops(model_config_dict)
+            self._non_attn_per_token_flops_padded = _compute_non_attn_per_token_flops(
+                model_config_dict, use_padded_vocab=True
+            )
             self._attn_flop_coeff = _compute_attn_flop_coeff(model_config_dict)
             self._peak_tflops, gpu_name = _detect_peak_tflops_bf16()
             if dist_config.local_rank == 0:
@@ -384,7 +392,7 @@ def log_step(
                     flops_unpadded = non_attn_unpadded + attn_flops_unpadded
                     tflops_unpadded = flops_unpadded / step_time / 1e12
 
-                    non_attn_padded = self._non_attn_per_token_flops * self.num_tokens
+                    non_attn_padded = self._non_attn_per_token_flops_padded * self.num_tokens
                     attn_flops_padded = (self._attn_flop_coeff * attn_padded) // self._cp_size
                     flops_padded = non_attn_padded + attn_flops_padded
                     tflops_padded = flops_padded / step_time / 1e12
diff --git a/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py b/bionemo-recipes/recipes/opengenome2_llama_native_te/perf_logger.py
@@ -71,7 +71,7 @@ def _detect_peak_tflops_bf16():
     return None, name
 
 
-def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
+def _compute_non_attn_per_token_flops(model_config_dict: dict, use_padded_vocab: bool = False) -> int:
     """Per-token FLOPs for everything EXCEPT the S² attention term.
 
     Q/K/V/O projections (GQA-aware) + MLP + LM head, 3x for fwd+bwd. Multiply by the
@@ -86,6 +86,10 @@ def _compute_non_attn_per_token_flops(model_config_dict: dict) -> int:
     kv_dim = n_kv * head_dim
     ffn = model_config_dict["intermediate_size"]
     vocab = model_config_dict.get("vocab_size", 0)
+    if use_padded_vocab:
+        # LM-head matmul runs at padded width (e.g. ESM-2: vocab=33 → padded=64 for
+        # FP8/tensor-core friendliness); logits are sliced back post-matmul.
+        vocab = model_config_dict.get("padded_vocab_size") or vocab
     num_layers = model_config_dict["num_hidden_layers"]
     model_type = model_config_dict.get("model_type", "")
     num_mlp_proj = 3 if model_type in _GATED_MLP_MODEL_TYPES else 2
@@ -197,11 +201,15 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig, model_confi
         # reflects each rank's share under DP/CP and sequence packing.
         self._log_mfu = bool(args.get("log_mfu", False)) and model_config_dict is not None
         self._non_attn_per_token_flops = 0
+        self._non_attn_per_token_flops_padded = 0
         self._attn_flop_coeff = 0
         self._cp_size = int(args.get("cp_size", 1))
         self._peak_tflops: float | None = None
         if self._log_mfu:
             self._non_attn_per_token_flops = _compute_non_attn_per_token_flops(model_config_dict)
+            self._non_attn_per_token_flops_padded = _compute_non_attn_per_token_flops(
+                model_config_dict, use_padded_vocab=True
+            )
             self._attn_flop_coeff = _compute_attn_flop_coeff(model_config_dict)
             self._peak_tflops, gpu_name = _detect_peak_tflops_bf16()
             if dist_config.local_rank == 0:
@@ -373,7 +381,7 @@ def log_step(
                     flops_unpadded = non_attn_unpadded + attn_flops_unpadded
                     tflops_unpadded = flops_unpadded / step_time / 1e12
 
-                    non_attn_padded = self._non_attn_per_token_flops * self.num_tokens
+                    non_attn_padded = self._non_attn_per_token_flops_padded * self.num_tokens
                     attn_flops_padded = (self._attn_flop_coeff * attn_padded) // self._cp_size
                     flops_padded = non_attn_padded + attn_flops_padded
                     tflops_padded = flops_padded / step_time / 1e12