Remove torch.autocast and rely on the FSDP2 MixedPrecisionPolicy for BF16 casting

jomitchellnv · jomitchellnv · commit 213ef6edf9e6 · 2026-04-06T13:21:59.000-07:00
Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/eval_fsdp2.py b/bionemo-recipes/recipes/esm2_minifold_te/eval_fsdp2.py
@@ -179,8 +179,7 @@ def main(args: DictConfig) -> None:
         for batch in progress:
             batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
 
-            with torch.autocast("cuda", dtype=torch.bfloat16):
-                r_dict = model(batch, num_recycling=args.model.get("num_recycling", 0))
+            r_dict = model(batch, num_recycling=args.model.get("num_recycling", 0))
 
             # Distogram loss
             disto_loss = compute_distogram_loss(
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/miniformer_te.py b/bionemo-recipes/recipes/esm2_minifold_te/miniformer_te.py
@@ -145,13 +145,11 @@ def _gate_ctx():
         # Apply mask
         x = x * mask.unsqueeze(-1)
 
-        # Triangular multiplication (MUST stay in FP32)
-        device = torch.device("mps" if torch.backends.mps.is_available() else "cuda")
-        with torch.autocast(device.type, enabled=False):
-            a1, b1, a2, b2 = torch.chunk(x.float(), 4, dim=-1)
-            x1 = torch.einsum("bikd,bjkd->bijd", a1, b1)
-            x2 = torch.einsum("bkid,bkjd->bijd", a2, b2)
-            x = torch.cat([x1, x2], dim=-1).to(mask.dtype if mask.is_floating_point() else torch.float32)
+        # Triangular multiplication (in FP32 via explicit .float() cast)
+        a1, b1, a2, b2 = torch.chunk(x.float(), 4, dim=-1)
+        x1 = torch.einsum("bikd,bjkd->bijd", a1, b1)
+        x2 = torch.einsum("bkid,bkjd->bijd", a2, b2)
+        x = torch.cat([x1, x2], dim=-1).to(mask.dtype if mask.is_floating_point() else torch.float32)
 
         # Output gating: D/2 -> D
         x = te_layernorm_nd(self.output_norm, x)
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/structure_te.py b/bionemo-recipes/recipes/esm2_minifold_te/structure_te.py
@@ -323,32 +323,30 @@ def forward(self, s, z, aatype, mask):
         # Predict angles
         unnormalized_angles, angles = self.angle_resnet(s, s_initial)
 
-        # Predict positions
-        device = torch.device("mps" if torch.backends.mps.is_available() else "cuda")
-        with torch.autocast(device.type, enabled=False):
-            n, ca, c = te_linear_nd(self.bb_update, s.float()).chunk(3, dim=-1)
-            rigids = Rigid.make_transform_from_reference(n, ca, c, eps=1e-7)
-            scaled_rigids = rigids.scale_translation(self.trans_scale_factor)
-
-            all_frames_to_global = torsion_angles_to_frames(scaled_rigids, angles, aatype, self.default_frames)
-            pred_xyz = frames_and_literature_positions_to_atom14_pos(
-                all_frames_to_global,
-                aatype,
-                self.default_frames,
-                self.group_idx,
-                self.atom_mask,
-                self.lit_positions,
-            )
-            outputs.append(
-                {
-                    "angles": angles,
-                    "unnormalized_angles": unnormalized_angles,
-                    "frames": scaled_rigids.to_tensor_4x4(),
-                    "sidechain_frames": all_frames_to_global.to_tensor_4x4(),
-                    "positions": pred_xyz,
-                    "states": s,
-                }
-            )
+        # Predict positions (in FP32 via explicit .float() cast)
+        n, ca, c = te_linear_nd(self.bb_update, s.float()).chunk(3, dim=-1)
+        rigids = Rigid.make_transform_from_reference(n, ca, c, eps=1e-7)
+        scaled_rigids = rigids.scale_translation(self.trans_scale_factor)
+
+        all_frames_to_global = torsion_angles_to_frames(scaled_rigids, angles, aatype, self.default_frames)
+        pred_xyz = frames_and_literature_positions_to_atom14_pos(
+            all_frames_to_global,
+            aatype,
+            self.default_frames,
+            self.group_idx,
+            self.atom_mask,
+            self.lit_positions,
+        )
+        outputs.append(
+            {
+                "angles": angles,
+                "unnormalized_angles": unnormalized_angles,
+                "frames": scaled_rigids.to_tensor_4x4(),
+                "sidechain_frames": all_frames_to_global.to_tensor_4x4(),
+                "positions": pred_xyz,
+                "states": s,
+            }
+        )
 
         outputs = dict_multimap(torch.stack, outputs)
         outputs["single"] = s
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/train_fsdp2.py b/bionemo-recipes/recipes/esm2_minifold_te/train_fsdp2.py
@@ -317,9 +317,8 @@ def main(args: DictConfig) -> float | None:
         for batch in train_dataloader:
             batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
 
-            # Forward pass
-            with torch.autocast("cuda", dtype=torch.bfloat16):
-                r_dict = model(batch, num_recycling=args.model.get("num_recycling", 0))
+            # Forward pass (BF16 handled by FSDP2 MixedPrecisionPolicy)
+            r_dict = model(batch, num_recycling=args.model.get("num_recycling", 0))
 
             # Compute distogram loss
             disto_loss = compute_distogram_loss(