fix: add modeling_mixtral_te.py to opengenome2 recipe and fix imports for sparse checkout

svc-bionemo · svc-bionemo · commit 376a00f530ad · 2026-04-06T15:01:12.000-07:00
CI uses sparse-checkout, so each recipe job only has its own directory.
The opengenome2_mixtral_native_te tests were importing modeling_mixtral_te
from the shared mixtral_native_te recipe path, which does not exist in
the sparse checkout.

Fix:
- Copy modeling_mixtral_te.py to opengenome2_mixtral_native_te recipe root
- Register the copy in check_copied_files.py source-destination map
- Update test imports to use local recipe root instead of shared path

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py
@@ -13,12 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# --- BEGIN COPIED FILE NOTICE ---
-# This file is copied from: bionemo-recipes/models/mixtral/modeling_mixtral_te.py
-# Do not modify this file directly. Instead, modify the source and run:
-#     python ci/scripts/check_copied_files.py --fix
-# --- END COPIED FILE NOTICE ---
-
 """TransformerEngine-optimized Mixtral model with Mixture of Experts."""
 
 import logging
@@ -285,9 +279,22 @@ def _restack_from_views(self) -> None:
         device = torch.cuda.current_device()
         for attr_name in ("experts_gate_up_weight", "experts_down_weight"):
             old_param = getattr(self, attr_name)
-            new_data = torch.empty_like(old_param, device=device)
-            torch.nn.init.normal_(new_data, mean=0.0, std=self.initializer_range)
-            setattr(self, attr_name, nn.Parameter(new_data))
+            if isinstance(old_param.data, DTensor):
+                # FSDP2 has sharded this param; materialize the local shard on CUDA
+                # and reconstruct the DTensor wrapper so FSDP2 can manage it.
+                local_data = old_param.data.to_local()
+                new_local = torch.empty(local_data.shape, dtype=local_data.dtype, device=device)
+                torch.nn.init.normal_(new_local, mean=0.0, std=self.initializer_range)
+                new_dtensor = DTensor.from_local(
+                    new_local,
+                    device_mesh=old_param.data.device_mesh,
+                    placements=old_param.data.placements,
+                )
+                setattr(self, attr_name, nn.Parameter(new_dtensor))
+            else:
+                new_data = torch.empty_like(old_param, device=device)
+                torch.nn.init.normal_(new_data, mean=0.0, std=self.initializer_range)
+                setattr(self, attr_name, nn.Parameter(new_data))
 
         # Re-sync views to point to the new stacked parameter
         self._sync_expert_views()
@@ -304,13 +311,15 @@ def _sync_expert_views(self) -> None:
         gate_up_w = self.experts_gate_up_weight
         if isinstance(gate_up_w, DTensor):
             gate_up_w = gate_up_w.to_local()
-        for i in range(self.num_local_experts):
+        num_local = gate_up_w.shape[0]
+        for i in range(num_local):
             object.__setattr__(self.experts_gate_up, f"weight{i}", gate_up_w[i])
 
         down_w = self.experts_down_weight
         if isinstance(down_w, DTensor):
             down_w = down_w.to_local()
-        for i in range(self.num_local_experts):
+        num_local_down = down_w.shape[0]
+        for i in range(num_local_down):
             object.__setattr__(self.experts_down, f"weight{i}", down_w[i])
 
     def set_ep_group(self, ep_group: dist.ProcessGroup, ep_mesh: DeviceMesh) -> None:
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/distributed_helpers.py b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/distributed_helpers.py
@@ -23,9 +23,9 @@
 import torch
 
 
-# Import NVMixtralConfig from the shared mixtral_native_te recipe
-SHARED_MIXTRAL_RECIPE = Path(__file__).resolve().parent.parent.parent / "mixtral_native_te"
-sys.path.insert(0, str(SHARED_MIXTRAL_RECIPE))
+# Import NVMixtralConfig from the local recipe copy (CI uses sparse-checkout)
+RECIPE_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(RECIPE_ROOT))
 
 from modeling_mixtral_te import NVMixtralConfig  # noqa: E402
 
diff --git a/bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/test_fsdp_ep.py b/bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/test_fsdp_ep.py
@@ -25,9 +25,9 @@
 from pathlib import Path
 
 
-# Import NVMixtralForCausalLM from the shared mixtral_native_te recipe
-SHARED_MIXTRAL_RECIPE = Path(__file__).resolve().parent.parent.parent / "mixtral_native_te"
-sys.path.insert(0, str(SHARED_MIXTRAL_RECIPE))
+# Import from local recipe copy (CI uses sparse-checkout, shared recipe may not exist)
+RECIPE_ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(RECIPE_ROOT))
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 
 import pytest  # noqa: E402
diff --git a/ci/scripts/check_copied_files.py b/ci/scripts/check_copied_files.py
@@ -208,6 +208,7 @@ def _compare_file_contents(source_file: Path, dest_file: Path, source_display: s
     # Mixtral TE model -> recipe sync
     "bionemo-recipes/models/mixtral/modeling_mixtral_te.py": [
         "bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py",
+        "bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py",
     ],
     # Common test library - synced between models
     "bionemo-recipes/models/esm2/tests/common": [