fix: fine-tuning (ft) logic

willmj · willmj · commit 07d38b0e3285 · 2025-04-11T11:10:20.000-04:00
Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_prepare.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_prepare.py
@@ -128,6 +128,10 @@ def prepare_scattermoe(
     # pylint: disable=import-outside-toplevel
     from .scattermoe import ScatterMoE
 
+    lora = False
+    if lora_config:
+        lora = True
+
     if disable_distributed and ep_degree > 1:
         raise ValueError(
             "expert sharding can not be deferred to top level sharding"
@@ -251,6 +255,7 @@ def prepare_scattermoe(
                 module_name,
                 router_name,
                 "|".join(expert_name),
+                lora_start=lora,
                 target_modules=lora_config.target_modules,
             )
 
diff --git a/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_state_dict.py b/plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_state_dict.py
@@ -88,6 +88,7 @@ def get_checkpoint_meta_from_sharded_safetensor(
     router_name: str = "gate",  # e.g., named "gate" within block_sparse_moe
     expert_name: str = "experts",  # e.g., named "experts" within block_sparse_moe
     expert_map: Dict = None,  # map -> [w1,w2,w3]
+    lora_start: bool = False, # if lora is detected in prepare_scattermoe.py
     lora_utils: bool = False,  # if lora is detected in checkpoint_utils.py
     target_modules: Dict = None,  # target modules from prepare_scattermoe.py
 ) -> Dict[str, List[Tuple]]:
@@ -176,12 +177,14 @@ def _insert(L: List, i: int, v):
             else:
                 _map[KEY_SCATTERMOE_ROUTER].append((k, stfile))
         elif m.group(1) in expert_name:
+            index = m.group(2)
+            index = 0 if index is None else int(index)
+            mod = None
+
+            # LoRA case
             if (
                 "input_linear" in target_modules and "output_linear" in target_modules
             ) or lora_utils:
-                index = m.group(2)
-                index = 0 if index is None else int(index)
-                mod = None
                 if not lora_utils:
                     for mod in expert_map.get(m.group(1), expert_map.get(m.group(3))):
                         _insert(_map[f"{mod}.weight"], index, (k, stfile))
@@ -190,7 +193,14 @@ def _insert(L: List, i: int, v):
                         _insert(_map[f"{mod}.lora_A"], index, (k, stfile))
                         _insert(_map[f"{mod}.lora_B"], index, (k, stfile))
 
-                assert mod is not None, f"cannot map '{rel_k}'"
+            # Fine-tuning case
+            elif not lora_utils and not lora_start:
+                for mod in expert_map.get(m.group(1), expert_map.get(m.group(3))):
+                    _insert(_map[f"{mod}.weight"], index, (k, stfile))
+            
+            assert mod is not None, f"cannot map '{rel_k}'"
+
+                
 
     if len(_map) == 0:
         raise ValueError(