Sync modeling_mixtral_te.py fix to models/ source and register copy mapping

svc-bionemo · svc-bionemo · commit c590028749a7 · 2026-04-06T14:59:34.000-07:00
- Apply FSDP2 DTensor fix to bionemo-recipes/models/mixtral/modeling_mixtral_te.py (source)
- Add mixtral modeling file to check_copied_files SOURCE_TO_DESTINATION_MAP
- The recipe file now gets its copied-file banner via `check_copied_files --fix`

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
diff --git a/bionemo-recipes/models/mixtral/modeling_mixtral_te.py b/bionemo-recipes/models/mixtral/modeling_mixtral_te.py
@@ -279,9 +279,22 @@ def _restack_from_views(self) -> None:
         device = torch.cuda.current_device()
         for attr_name in ("experts_gate_up_weight", "experts_down_weight"):
             old_param = getattr(self, attr_name)
-            new_data = torch.empty_like(old_param, device=device)
-            torch.nn.init.normal_(new_data, mean=0.0, std=self.initializer_range)
-            setattr(self, attr_name, nn.Parameter(new_data))
+            if isinstance(old_param.data, DTensor):
+                # FSDP2 has sharded this param; materialize the local shard on CUDA
+                # and reconstruct the DTensor wrapper so FSDP2 can manage it.
+                local_data = old_param.data.to_local()
+                new_local = torch.empty(local_data.shape, dtype=local_data.dtype, device=device)
+                torch.nn.init.normal_(new_local, mean=0.0, std=self.initializer_range)
+                new_dtensor = DTensor.from_local(
+                    new_local,
+                    device_mesh=old_param.data.device_mesh,
+                    placements=old_param.data.placements,
+                )
+                setattr(self, attr_name, nn.Parameter(new_dtensor))
+            else:
+                new_data = torch.empty_like(old_param, device=device)
+                torch.nn.init.normal_(new_data, mean=0.0, std=self.initializer_range)
+                setattr(self, attr_name, nn.Parameter(new_data))
 
         # Re-sync views to point to the new stacked parameter
         self._sync_expert_views()
@@ -298,13 +311,15 @@ def _sync_expert_views(self) -> None:
         gate_up_w = self.experts_gate_up_weight
         if isinstance(gate_up_w, DTensor):
             gate_up_w = gate_up_w.to_local()
-        for i in range(self.num_local_experts):
+        num_local = gate_up_w.shape[0]
+        for i in range(num_local):
             object.__setattr__(self.experts_gate_up, f"weight{i}", gate_up_w[i])
 
         down_w = self.experts_down_weight
         if isinstance(down_w, DTensor):
             down_w = down_w.to_local()
-        for i in range(self.num_local_experts):
+        num_local_down = down_w.shape[0]
+        for i in range(num_local_down):
             object.__setattr__(self.experts_down, f"weight{i}", down_w[i])
 
     def set_ep_group(self, ep_group: dist.ProcessGroup, ep_mesh: DeviceMesh) -> None:
diff --git a/ci/scripts/check_copied_files.py b/ci/scripts/check_copied_files.py
@@ -205,6 +205,10 @@ def _compare_file_contents(source_file: Path, dest_file: Path, source_display: s
     "bionemo-recipes/models/codonfm/modeling_codonfm_te.py": [
         "bionemo-recipes/recipes/codonfm_native_te/modeling_codonfm_te.py",
     ],
+    # Mixtral TE model -> recipe sync
+    "bionemo-recipes/models/mixtral/modeling_mixtral_te.py": [
+        "bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py",
+    ],
     # Common test library - synced between models
     "bionemo-recipes/models/esm2/tests/common": [
         "bionemo-recipes/models/llama3/tests/common",