Add comment explaining why the output heads always run in bf16 (#1431)

jomitchellnv · web-flow · commit f7bef3f20617 · 2026-01-22T12:28:50.000-07:00
diff --git a/bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py b/bionemo-recipes/models/esm2/src/esm/modeling_esm_te.py
@@ -484,6 +484,8 @@ def forward(self, features, **kwargs):
             features (torch.Tensor): The features.
             **kwargs: Additional arguments.
         """
+        # Keep the last layers of the network in higher precision to avoid numerical instability.
+        # Please see recipes/fp8_analysis/README.md for more details.
         with transformer_engine.pytorch.fp8_autocast(enabled=False):
             x = self.dense(features)
             x = torch.nn.functional.gelu(x)
diff --git a/bionemo-recipes/recipes/esm2_accelerate_te/example_8m_checkpoint/esm_nv.py b/bionemo-recipes/recipes/esm2_accelerate_te/example_8m_checkpoint/esm_nv.py
@@ -484,6 +484,8 @@ def forward(self, features, **kwargs):
             features (torch.Tensor): The features.
             **kwargs: Additional arguments.
         """
+        # Keep the last layers of the network in higher precision to avoid numerical instability.
+        # Please see recipes/fp8_analysis/README.md for more details.
         with transformer_engine.pytorch.fp8_autocast(enabled=False):
             x = self.dense(features)
             x = torch.nn.functional.gelu(x)
diff --git a/bionemo-recipes/recipes/esm2_native_te/example_8m_checkpoint/esm_nv.py b/bionemo-recipes/recipes/esm2_native_te/example_8m_checkpoint/esm_nv.py
@@ -484,6 +484,8 @@ def forward(self, features, **kwargs):
             features (torch.Tensor): The features.
             **kwargs: Additional arguments.
         """
+        # Keep the last layers of the network in higher precision to avoid numerical instability.
+        # Please see recipes/fp8_analysis/README.md for more details.
         with transformer_engine.pytorch.fp8_autocast(enabled=False):
             x = self.dense(features)
             x = torch.nn.functional.gelu(x)
diff --git a/bionemo-recipes/recipes/esm2_peft_te/example_8m_checkpoint/esm_nv.py b/bionemo-recipes/recipes/esm2_peft_te/example_8m_checkpoint/esm_nv.py
@@ -484,6 +484,8 @@ def forward(self, features, **kwargs):
             features (torch.Tensor): The features.
             **kwargs: Additional arguments.
         """
+        # Keep the last layers of the network in higher precision to avoid numerical instability.
+        # Please see recipes/fp8_analysis/README.md for more details.
         with transformer_engine.pytorch.fp8_autocast(enabled=False):
             x = self.dense(features)
             x = torch.nn.functional.gelu(x)