Skip to content

Commit 376a00f

Browse files
committed
fix: add modeling_mixtral_te.py to opengenome2 recipe and fix imports for sparse checkout
CI uses sparse-checkout, so each recipe job only has its own directory. The opengenome2_mixtral_native_te tests were importing modeling_mixtral_te from the shared mixtral_native_te recipe path, which does not exist in the sparse checkout.

Fix:
- Copy modeling_mixtral_te.py to opengenome2_mixtral_native_te recipe root
- Register the copy in check_copied_files.py source-destination map
- Update test imports to use local recipe root instead of shared path

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
1 parent c590028 commit 376a00f

4 files changed

Lines changed: 27 additions & 17 deletions

File tree

bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,6 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
# --- BEGIN COPIED FILE NOTICE ---
17-
# This file is copied from: bionemo-recipes/models/mixtral/modeling_mixtral_te.py
18-
# Do not modify this file directly. Instead, modify the source and run:
19-
# python ci/scripts/check_copied_files.py --fix
20-
# --- END COPIED FILE NOTICE ---
21-
2216
"""TransformerEngine-optimized Mixtral model with Mixture of Experts."""
2317

2418
import logging
@@ -285,9 +279,22 @@ def _restack_from_views(self) -> None:
285279
device = torch.cuda.current_device()
286280
for attr_name in ("experts_gate_up_weight", "experts_down_weight"):
287281
old_param = getattr(self, attr_name)
288-
new_data = torch.empty_like(old_param, device=device)
289-
torch.nn.init.normal_(new_data, mean=0.0, std=self.initializer_range)
290-
setattr(self, attr_name, nn.Parameter(new_data))
282+
if isinstance(old_param.data, DTensor):
283+
# FSDP2 has sharded this param; materialize the local shard on CUDA
284+
# and reconstruct the DTensor wrapper so FSDP2 can manage it.
285+
local_data = old_param.data.to_local()
286+
new_local = torch.empty(local_data.shape, dtype=local_data.dtype, device=device)
287+
torch.nn.init.normal_(new_local, mean=0.0, std=self.initializer_range)
288+
new_dtensor = DTensor.from_local(
289+
new_local,
290+
device_mesh=old_param.data.device_mesh,
291+
placements=old_param.data.placements,
292+
)
293+
setattr(self, attr_name, nn.Parameter(new_dtensor))
294+
else:
295+
new_data = torch.empty_like(old_param, device=device)
296+
torch.nn.init.normal_(new_data, mean=0.0, std=self.initializer_range)
297+
setattr(self, attr_name, nn.Parameter(new_data))
291298

292299
# Re-sync views to point to the new stacked parameter
293300
self._sync_expert_views()
@@ -304,13 +311,15 @@ def _sync_expert_views(self) -> None:
304311
gate_up_w = self.experts_gate_up_weight
305312
if isinstance(gate_up_w, DTensor):
306313
gate_up_w = gate_up_w.to_local()
307-
for i in range(self.num_local_experts):
314+
num_local = gate_up_w.shape[0]
315+
for i in range(num_local):
308316
object.__setattr__(self.experts_gate_up, f"weight{i}", gate_up_w[i])
309317

310318
down_w = self.experts_down_weight
311319
if isinstance(down_w, DTensor):
312320
down_w = down_w.to_local()
313-
for i in range(self.num_local_experts):
321+
num_local_down = down_w.shape[0]
322+
for i in range(num_local_down):
314323
object.__setattr__(self.experts_down, f"weight{i}", down_w[i])
315324

316325
def set_ep_group(self, ep_group: dist.ProcessGroup, ep_mesh: DeviceMesh) -> None:

bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/distributed_helpers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
import torch
2424

2525

26-
# Import NVMixtralConfig from the shared mixtral_native_te recipe
27-
SHARED_MIXTRAL_RECIPE = Path(__file__).resolve().parent.parent.parent / "mixtral_native_te"
28-
sys.path.insert(0, str(SHARED_MIXTRAL_RECIPE))
26+
# Import NVMixtralConfig from the local recipe copy (CI uses sparse-checkout)
27+
RECIPE_ROOT = Path(__file__).resolve().parent.parent
28+
sys.path.insert(0, str(RECIPE_ROOT))
2929

3030
from modeling_mixtral_te import NVMixtralConfig # noqa: E402
3131

bionemo-recipes/recipes/opengenome2_mixtral_native_te/tests/test_fsdp_ep.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@
2525
from pathlib import Path
2626

2727

28-
# Import NVMixtralForCausalLM from the shared mixtral_native_te recipe
29-
SHARED_MIXTRAL_RECIPE = Path(__file__).resolve().parent.parent.parent / "mixtral_native_te"
30-
sys.path.insert(0, str(SHARED_MIXTRAL_RECIPE))
28+
# Import from local recipe copy (CI uses sparse-checkout, shared recipe may not exist)
29+
RECIPE_ROOT = Path(__file__).resolve().parent.parent
30+
sys.path.insert(0, str(RECIPE_ROOT))
3131
sys.path.insert(0, str(Path(__file__).resolve().parent))
3232

3333
import pytest # noqa: E402

ci/scripts/check_copied_files.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ def _compare_file_contents(source_file: Path, dest_file: Path, source_display: s
208208
# Mixtral TE model -> recipe sync
209209
"bionemo-recipes/models/mixtral/modeling_mixtral_te.py": [
210210
"bionemo-recipes/recipes/mixtral_native_te/modeling_mixtral_te.py",
211+
"bionemo-recipes/recipes/opengenome2_mixtral_native_te/modeling_mixtral_te.py",
211212
],
212213
# Common test library - synced between models
213214
"bionemo-recipes/models/esm2/tests/common": [

0 commit comments

Comments
 (0)