From cac0b8cd93eeabfa5f2f892af9fca8aa1a5d1dd5 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 24 Mar 2025 13:39:56 -0400 Subject: [PATCH 01/32] save peft Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index f36fbf4c3..d346fabcb 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -14,6 +14,7 @@ # Standard from dataclasses import dataclass +from peft import PeftModel import os # Third Party @@ -113,9 +114,13 @@ def checkpoint(checkpoint_dir, save_dir): os.path.join(hf_converted_output_dir, TRAINING_ARGS_NAME), ) # Save model config files - self.trainer.model.config.save_pretrained( - hf_converted_output_dir - ) + if isinstance(self.trainer.model, PeftModel): + # Save PEFT adapter configuration + PeftModel.save_pretrained(hf_converted_output_dir) + else: + self.trainer.model.config.save_pretrained( + hf_converted_output_dir + ) except Exception as e: raise ValueError( From c5224296bcd98b71c8fb249477e88175a339f1ea Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 24 Mar 2025 14:03:04 -0400 Subject: [PATCH 02/32] fix: model Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index d346fabcb..f370b9805 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -116,7 +116,7 @@ def checkpoint(checkpoint_dir, save_dir): # Save model config files if isinstance(self.trainer.model, PeftModel): # Save PEFT adapter configuration - PeftModel.save_pretrained(hf_converted_output_dir) + PeftModel.save_pretrained(self.trainer.model, hf_converted_output_dir) else: 
self.trainer.model.config.save_pretrained( hf_converted_output_dir From 481dde627e26ba85d51361c2966ab135fb9ab327 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 1 Apr 2025 14:30:42 -0400 Subject: [PATCH 03/32] post process hf converted dir Signed-off-by: Will Johnson --- build/accelerate_launch.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index 6cbc7d252..f1472f534 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -145,6 +145,13 @@ def main(): post_process_vLLM_adapters_new_tokens( save_model_dir, save_model_dir, num_added_tokens ) + hf_converted_checkpoint = os.path.join(save_model_dir, "hf_converted_checkpoint") + if os.path.exists( + os.path.join(hf_converted_checkpoint, "adapter_model.safetensors") + ): + post_process_vLLM_adapters_new_tokens( + hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens + ) if ( os.path.exists(os.path.join(output_dir, "added_tokens_info.json")) From 397c9ba1d42c50bfe09b58afedfa04becf8474cc Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 10:26:55 -0400 Subject: [PATCH 04/32] fix: convert hf converted checkpoint Signed-off-by: Will Johnson --- build/accelerate_launch.py | 42 ++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index f1472f534..6dcb282dd 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -145,13 +145,16 @@ def main(): post_process_vLLM_adapters_new_tokens( save_model_dir, save_model_dir, num_added_tokens ) - hf_converted_checkpoint = os.path.join(save_model_dir, "hf_converted_checkpoint") - if os.path.exists( - os.path.join(hf_converted_checkpoint, "adapter_model.safetensors") - ): - post_process_vLLM_adapters_new_tokens( - hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens - ) + + hf_converted_checkpoint = os.path.join( + save_model_dir, 
"hf_converted_checkpoint" + ) + if os.path.exists( + os.path.join(hf_converted_checkpoint, "adapter_model.safetensors") + ): + post_process_vLLM_adapters_new_tokens( + hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens + ) if ( os.path.exists(os.path.join(output_dir, "added_tokens_info.json")) @@ -166,11 +169,28 @@ def main(): for _, dirs, _ in os.walk(output_dir, topdown=False): for name in dirs: if "checkpoint-" in name.lower(): - post_process_vLLM_adapters_new_tokens( - os.path.join(output_dir, name), - os.path.join(output_dir, name), - num_added_tokens, + checkpoint_dir = os.path.join(output_dir, name) + if os.path.exists( + os.path.join(checkpoint_dir, "adapter_model.safetensors") + ): + post_process_vLLM_adapters_new_tokens( + checkpoint_dir, + checkpoint_dir, + num_added_tokens, + ) + hf_converted_checkpoint = os.path.join( + checkpoint_dir, "hf_converted_checkpoint" ) + if os.path.exists( + os.path.join( + hf_converted_checkpoint, "adapter_model.safetensors" + ) + ): + post_process_vLLM_adapters_new_tokens( + hf_converted_checkpoint, + hf_converted_checkpoint, + num_added_tokens, + ) else: logging.warning( "Failed to post-process: file added_tokens_info.json not in path %s", From 79dec24d030c22b2a5bae14653b26345abc91223 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 13:13:33 -0400 Subject: [PATCH 05/32] lora config Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index f370b9805..04425d6fa 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -14,10 +14,10 @@ # Standard from dataclasses import dataclass -from peft import PeftModel import os # Third Party +from peft import LoraModel, PeftModel from transformers import ( Trainer, TrainerCallback, @@ -114,9 +114,10 @@ 
def checkpoint(checkpoint_dir, save_dir): os.path.join(hf_converted_output_dir, TRAINING_ARGS_NAME), ) # Save model config files - if isinstance(self.trainer.model, PeftModel): + if isinstance(self.trainer.model._fsdp_wrapped_module.base_model, LoraModel): # Save PEFT adapter configuration - PeftModel.save_pretrained(self.trainer.model, hf_converted_output_dir) + self.trainer.model._fsdp_wrapped_module.base_model.save_pretrained(hf_converted_output_dir) + else: self.trainer.model.config.save_pretrained( hf_converted_output_dir From 3103720afcfcbe10b0044b78b70d4f0b758488f5 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 16:20:04 -0400 Subject: [PATCH 06/32] save adapter config Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 04425d6fa..7ed501b7b 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -15,6 +15,7 @@ # Standard from dataclasses import dataclass import os +import json # Third Party from peft import LoraModel, PeftModel @@ -113,10 +114,18 @@ def checkpoint(checkpoint_dir, save_dir): args, os.path.join(hf_converted_output_dir, TRAINING_ARGS_NAME), ) - # Save model config files - if isinstance(self.trainer.model._fsdp_wrapped_module.base_model, LoraModel): - # Save PEFT adapter configuration - self.trainer.model._fsdp_wrapped_module.base_model.save_pretrained(hf_converted_output_dir) + + # Unwrap FSDP module + model = self.trainer.model + if hasattr(model, "module"): + model = model.module + + if model.peft_config: + lora_config = model.peft_config["default"] + config_dict = lora_config.to_dict() + config_dict['target_modules'] = sorted(list(config_dict['target_modules'])) + with open(os.path.join(hf_converted_output_dir,"adapter_config.json"), "w") as f: + 
json.dump(config_dict, f, indent=2) else: self.trainer.model.config.save_pretrained( From b61cbde6d1986318f197be91eda7ae17356b20e4 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 16:31:22 -0400 Subject: [PATCH 07/32] fmt + comments Signed-off-by: Will Johnson --- build/accelerate_launch.py | 3 +++ .../config/acceleration_configs/fast_moe.py | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index 6dcb282dd..bea6d032b 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -146,6 +146,7 @@ def main(): save_model_dir, save_model_dir, num_added_tokens ) + # In case of ScatterMoE LoRa hf_converted_checkpoint = os.path.join( save_model_dir, "hf_converted_checkpoint" ) @@ -178,6 +179,8 @@ def main(): checkpoint_dir, num_added_tokens, ) + + # In case of ScatterMoE LoRa hf_converted_checkpoint = os.path.join( checkpoint_dir, "hf_converted_checkpoint" ) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 7ed501b7b..7573dd7ff 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -14,11 +14,10 @@ # Standard from dataclasses import dataclass -import os import json +import os # Third Party -from peft import LoraModel, PeftModel from transformers import ( Trainer, TrainerCallback, @@ -123,14 +122,20 @@ def checkpoint(checkpoint_dir, save_dir): if model.peft_config: lora_config = model.peft_config["default"] config_dict = lora_config.to_dict() - config_dict['target_modules'] = sorted(list(config_dict['target_modules'])) - with open(os.path.join(hf_converted_output_dir,"adapter_config.json"), "w") as f: + config_dict["target_modules"] = sorted( + list(config_dict["target_modules"]) + ) + with open( + os.path.join( + hf_converted_output_dir, "adapter_config.json" + ), + "w", + encoding="utf-8" + ) as f: json.dump(config_dict, f, 
indent=2) else: - self.trainer.model.config.save_pretrained( - hf_converted_output_dir - ) + model.config.save_pretrained(hf_converted_output_dir) except Exception as e: raise ValueError( From c12be0ef2a69798573d47fc911f480942015eebc Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 8 Apr 2025 13:02:21 -0400 Subject: [PATCH 08/32] fix: add input linear and output linear to target modules Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 7573dd7ff..142d7655b 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -125,6 +125,8 @@ def checkpoint(checkpoint_dir, save_dir): config_dict["target_modules"] = sorted( list(config_dict["target_modules"]) ) + if "router" in config_dict["target_modules"]: + config_dict["target_modules"].append("input_linear, output_linear") with open( os.path.join( hf_converted_output_dir, "adapter_config.json" From 123c2d481ae77e0974ce2f2adf02ed1010445c87 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 8 Apr 2025 14:09:05 -0400 Subject: [PATCH 09/32] fix: extend instead of append Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 142d7655b..94507855c 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -126,7 +126,7 @@ def checkpoint(checkpoint_dir, save_dir): list(config_dict["target_modules"]) ) if "router" in config_dict["target_modules"]: - config_dict["target_modules"].append("input_linear, output_linear") + config_dict["target_modules"].extend(["input_linear", "output_linear"]) with open( os.path.join( hf_converted_output_dir, 
"adapter_config.json" From f68500b64f320cffb7883a767743f4f3e3837152 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 8 Apr 2025 15:40:33 -0400 Subject: [PATCH 10/32] fix: if hasattr peft config Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 94507855c..a2258d35c 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -119,7 +119,7 @@ def checkpoint(checkpoint_dir, save_dir): if hasattr(model, "module"): model = model.module - if model.peft_config: + if hasattr(model, "peft_config"): lora_config = model.peft_config["default"] config_dict = lora_config.to_dict() config_dict["target_modules"] = sorted( From 55ec4b505365161c491929e8383f670f6f01ddc6 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Wed, 9 Apr 2025 14:46:07 -0400 Subject: [PATCH 11/32] fix: remove unneeded target modules Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index a2258d35c..40d1e286a 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -125,8 +125,6 @@ def checkpoint(checkpoint_dir, save_dir): config_dict["target_modules"] = sorted( list(config_dict["target_modules"]) ) - if "router" in config_dict["target_modules"]: - config_dict["target_modules"].extend(["input_linear", "output_linear"]) with open( os.path.join( hf_converted_output_dir, "adapter_config.json" From 23623494c3750cb8e30a541ff5e765afd062c178 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Thu, 10 Apr 2025 09:34:05 -0400 Subject: [PATCH 12/32] lint + fmt Signed-off-by: Will Johnson --- .pylintrc | 2 +- 
tuning/config/acceleration_configs/fast_moe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pylintrc b/.pylintrc index 41f7e4e73..5e9f356b9 100644 --- a/.pylintrc +++ b/.pylintrc @@ -475,7 +475,7 @@ notes-rgx= [REFACTORING] # Maximum number of nested blocks for function / method body -max-nested-blocks=5 +max-nested-blocks=6 # Complete name of functions that never returns. When checking for # inconsistent-return-statements if a never returning function is called then diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 97eb214cd..37602daf1 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -139,7 +139,7 @@ def checkpoint(checkpoint_dir, save_dir): hf_converted_output_dir, "adapter_config.json" ), "w", - encoding="utf-8" + encoding="utf-8", ) as f: json.dump(config_dict, f, indent=2) From a848a9b45dcbed7baf4f58aa0fcd34ce924e6c00 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 11 Apr 2025 16:09:46 -0400 Subject: [PATCH 13/32] docs Signed-off-by: Will Johnson --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e503a7d63..109985383 100644 --- a/README.md +++ b/README.md @@ -902,6 +902,10 @@ Notes: - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows: - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP). - if False, Scatter MoE Kernels with complete replication of experts across ranks. + - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, experts should not be trained as `target_modules` for models being tuned with ScatterMoE. 
Users have control over which `target_modules` they wish to train: + - Passing `all-linear` to adapter layers will include the router, which is a linear layer, and all attn layers. This **will not** train the expert layers. + - To train only attention layers, specify target modules specifically (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`). + - To train expert layers, specify `input_linear` and `output_linear` in target modules along with `router` (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj", "router", "input_linear", "output_linear"]`). If you specify these layers, inference with vLLM/vanilla HF PEFT **is not possible**. - `world_size` must be divisible by the `ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
From 42c420c3028713899e5a03bde6f72903478869e6 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 14 Apr 2025 16:50:13 -0400 Subject: [PATCH 14/32] test: lora for scattermoe Signed-off-by: Will Johnson --- tests/test_sft_trainer.py | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index 664c67ad7..c4dc8a5ed 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1447,6 +1447,44 @@ def test_run_moe_ft_and_inference_ep1_kernels(dataset_path, ep_degree): ) +@pytest.mark.skipif( + not is_fms_accelerate_available(plugins="moe"), + reason="Only runs if fms-accelerate is installed along with accelerated-moe plugin", +) +@pytest.mark.parametrize( + "dataset_path", + [ + TWITTER_COMPLAINTS_DATA_JSONL, + ], +) +def test_run_moe_lora_and_inference(dataset_path): + """Check if we can finetune a moe model and check if hf checkpoint is created""" + with tempfile.TemporaryDirectory() as tempdir: + data_args = copy.deepcopy(DATA_ARGS) + data_args.training_data_path = dataset_path + model_args = copy.deepcopy(MODEL_ARGS) + model_args.model_name_or_path = "ibm-granite/granite-3.1-1b-a400m-base" + train_args = copy.deepcopy(TRAIN_ARGS) + train_args.output_dir = tempdir + lora_args = copy.deepcopy(PEFT_LORA_ARGS) + lora_args.r = 16 + lora_args.target_modules = ["q_proj", "v_proj", "o_proj", "k_proj"] # Router doesn't work with LoRA test inference + fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=False)) + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + fast_moe_config=fast_moe_config, + ) + _test_run_inference( + checkpoint_path=os.path.join( + _get_checkpoint_path(tempdir), "hf_converted_checkpoint" + ), + base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base" + ) + + @pytest.mark.skipif( not is_fms_accelerate_available(plugins="moe"), reason="Only runs if fms-accelerate is installed along with 
accelerated-moe plugin", @@ -1485,9 +1523,9 @@ def _test_run_causallm_ft(training_args, model_args, data_args, tempdir): _validate_training(tempdir) -def _test_run_inference(checkpoint_path): +def _test_run_inference(checkpoint_path, base_model_name_or_path=None): # Load the model - loaded_model = TunedCausalLM.load(checkpoint_path) + loaded_model = TunedCausalLM.load(checkpoint_path, base_model_name_or_path) # Run inference on the text output_inference = loaded_model.run( From e3e7525db94d3ebcf83a327bc0f3b91ef04b83c1 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 15 Apr 2025 09:29:54 -0400 Subject: [PATCH 15/32] fmt tests Signed-off-by: Will Johnson --- tests/test_sft_trainer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index c4dc8a5ed..e97e51383 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1468,7 +1468,12 @@ def test_run_moe_lora_and_inference(dataset_path): train_args.output_dir = tempdir lora_args = copy.deepcopy(PEFT_LORA_ARGS) lora_args.r = 16 - lora_args.target_modules = ["q_proj", "v_proj", "o_proj", "k_proj"] # Router doesn't work with LoRA test inference + lora_args.target_modules = [ + "q_proj", + "v_proj", + "o_proj", + "k_proj", + ] # Router doesn't work with LoRA test inference fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=False)) sft_trainer.train( model_args, @@ -1481,7 +1486,7 @@ def test_run_moe_lora_and_inference(dataset_path): checkpoint_path=os.path.join( _get_checkpoint_path(tempdir), "hf_converted_checkpoint" ), - base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base" + base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", ) From 844965959b86c31997e103ee397d8a112549953f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Wed, 16 Apr 2025 11:48:59 -0400 Subject: [PATCH 16/32] docs: notes on restrictions Signed-off-by: Will Johnson --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/README.md b/README.md index 109985383..51876ef6b 100644 --- a/README.md +++ b/README.md @@ -906,7 +906,9 @@ Notes: - Passing `all-linear` to adapter layers will include the router, which is a linear layer, and all attn layers. This **will not** train the expert layers. - To train only attention layers, specify target modules specifically (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`). - To train expert layers, specify `input_linear` and `output_linear` in target modules along with `router` (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj", "router", "input_linear", "output_linear"]`). If you specify these layers, inference with vLLM/vanilla HF PEFT **is not possible**. - - `world_size` must be divisible by the `ep_degree` + - When lora tuning with ScatterMoE, the values `--fast_moe 1` or `--fast_moe True` are not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>1`. + - When lora tuning with ScatterMoE, `--r` must be set to 16 or greater. + - `world_size` must be divisible by the `--ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
- The typical usecase for this script is to run: From 3c25265bfc0f66485f71fc578bf4c77132c07a36 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Wed, 16 Apr 2025 20:00:14 -0400 Subject: [PATCH 17/32] explitcitly don't support router layer Signed-off-by: Will Johnson --- README.md | 9 +++------ tuning/sft_trainer.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 51876ef6b..a26c963b9 100644 --- a/README.md +++ b/README.md @@ -902,12 +902,9 @@ Notes: - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows: - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP). - if False, Scatter MoE Kernels with complete replication of experts across ranks. - - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, experts should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: - - Passing `all-linear` to adapter layers will include the router, which is a linear layer, and all attn layers. This **will not** train the expert layers. - - To train only attention layers, specify target modules specifically (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`). - - To train expert layers, specify `input_linear` and `output_linear` in target modules along with `router` (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj", "router", "input_linear", "output_linear"]`). If you specify these layers, inference with vLLM/vanilla HF PEFT **is not possible**. - - When lora tuning with ScatterMoE, the values `--fast_moe 1` or `--fast_moe True` are not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>1`. - - When lora tuning with ScatterMoE, `--r` must be set to 16 or greater. 
+ - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: + - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. + - When lora tuning with ScatterMoE, the value `--fast_moe True` is not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>=1`. - `world_size` must be divisible by the `--ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 7b06846fe..2c61700fd 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -155,6 +155,23 @@ def train( "Trainer should not perform packing when using `--padding_free`" ) + if fast_moe_config is not None: + # Checking for unsupported modules with Scatter MoE for LoRA + restricted_modules = ["all-linear", "output_linear", "input_linear", "router"] + if ( + peft_config is not None + and hasattr(peft_config, "target_modules") + and any( + module in (peft_config.target_modules or []) + for module in restricted_modules + ) + ): + raise ValueError( + "`--fast_moe` with LoRA does not currently support `all-linear`, `router`, " + "`input_linear` or `output_linear` as target modules at this time. Please " + "explicitly specify target modules when using `--fast_moe` with LoRA." + ) + task_type = "CAUSAL_LM" additional_metrics = {} From da81f93f771bd8936fa5b3a028f49a9040778833 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Thu, 17 Apr 2025 21:34:39 -0400 Subject: [PATCH 18/32] docs: generalize Signed-off-by: Will Johnson --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a26c963b9..63f83b268 100644 --- a/README.md +++ b/README.md @@ -904,8 +904,8 @@ Notes: - if False, Scatter MoE Kernels with complete replication of experts across ranks. - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. 
- - When lora tuning with ScatterMoE, the value `--fast_moe True` is not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>=1`. - - `world_size` must be divisible by the `--ep_degree` + - FSDP must be used when lora tuning with `--fast_moe` + - `world_size` must be divisible by the `ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. - The typical usecase for this script is to run: From 1424efd2d8c8a68629eea87f3347c2a9848ec100 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:28:58 -0400 Subject: [PATCH 19/32] docs: update documentation Signed-off-by: Will Johnson --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 63f83b268..22d812288 100644 --- a/README.md +++ b/README.md @@ -902,9 +902,9 @@ Notes: - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows: - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP). - if False, Scatter MoE Kernels with complete replication of experts across ranks. - - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. 
Users have control over which `target_modules` they wish to train: - - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. - - FSDP must be used when lora tuning with `--fast_moe` + - FSDP must be used when lora tuning with `--fast_moe` + - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: + - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. - `world_size` must be divisible by the `ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
From b67ef0f59c071eee78ab9ecbfed2556199adcb09 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:30:54 -0400 Subject: [PATCH 20/32] fix: simplify accelerate launch post processing Signed-off-by: Will Johnson --- build/accelerate_launch.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index bea6d032b..43cf8dda0 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -170,7 +170,22 @@ def main(): for _, dirs, _ in os.walk(output_dir, topdown=False): for name in dirs: if "checkpoint-" in name.lower(): - checkpoint_dir = os.path.join(output_dir, name) + base_checkpoint_dir = os.path.join(output_dir, name) + hf_converted_checkpoint = os.path.join( + base_checkpoint_dir, "hf_converted_checkpoint" + ) + + # Use hf_converted_checkpoint if exists, otherwise use base_checkpoint_dir + checkpoint_dir = ( + hf_converted_checkpoint + if os.path.exists( + os.path.join( + hf_converted_checkpoint, "adapter_model.safetensors" + ) + ) + else base_checkpoint_dir + ) + if os.path.exists( os.path.join(checkpoint_dir, "adapter_model.safetensors") ): @@ -179,21 +194,6 @@ def main(): checkpoint_dir, num_added_tokens, ) - - # In case of ScatterMoE LoRa - hf_converted_checkpoint = os.path.join( - checkpoint_dir, "hf_converted_checkpoint" - ) - if os.path.exists( - os.path.join( - hf_converted_checkpoint, "adapter_model.safetensors" - ) - ): - post_process_vLLM_adapters_new_tokens( - hf_converted_checkpoint, - hf_converted_checkpoint, - num_added_tokens, - ) else: logging.warning( "Failed to post-process: file added_tokens_info.json not in path %s", From 6a32d320b2ee3e13fb7abe3e3f3d1e7f3bdfd483 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:31:56 -0400 Subject: [PATCH 21/32] tests: more target modules + ep_degree Signed-off-by: Will Johnson --- tests/test_sft_trainer.py | 58 +++++++++++++++++++++++---------------- 1 file 
changed, 35 insertions(+), 23 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index e97e51383..bca3b193f 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1452,12 +1452,18 @@ def test_run_moe_ft_and_inference_ep1_kernels(dataset_path, ep_degree): reason="Only runs if fms-accelerate is installed along with accelerated-moe plugin", ) @pytest.mark.parametrize( - "dataset_path", + "target_modules", [ - TWITTER_COMPLAINTS_DATA_JSONL, + "all-linear", + ["q_proj"], + ["q_proj", "k_proj"], + ["q_proj", "k_proj", "v_proj"], + ["q_proj", "k_proj", "v_proj", "o_proj"], ], ) -def test_run_moe_lora_and_inference(dataset_path): +@pytest.mark.parametrize("ep_degree", [True, False]) +@pytest.mark.parametrize("dataset_path", [TWITTER_COMPLAINTS_DATA_JSONL]) +def test_run_moe_lora_and_inference(dataset_path, target_modules, ep_degree): """Check if we can finetune a moe model and check if hf checkpoint is created""" with tempfile.TemporaryDirectory() as tempdir: data_args = copy.deepcopy(DATA_ARGS) @@ -1468,26 +1474,32 @@ def test_run_moe_lora_and_inference(dataset_path): train_args.output_dir = tempdir lora_args = copy.deepcopy(PEFT_LORA_ARGS) lora_args.r = 16 - lora_args.target_modules = [ - "q_proj", - "v_proj", - "o_proj", - "k_proj", - ] # Router doesn't work with LoRA test inference - fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=False)) - sft_trainer.train( - model_args, - data_args, - train_args, - lora_args, - fast_moe_config=fast_moe_config, - ) - _test_run_inference( - checkpoint_path=os.path.join( - _get_checkpoint_path(tempdir), "hf_converted_checkpoint" - ), - base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", - ) + lora_args.target_modules = target_modules + fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=ep_degree)) + + if target_modules == "all-linear": + with pytest.raises(ValueError): + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + 
fast_moe_config=fast_moe_config, + ) + else: + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + fast_moe_config=fast_moe_config, + ) + _test_run_inference( + checkpoint_path=os.path.join( + _get_checkpoint_path(tempdir), "hf_converted_checkpoint" + ), + base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", + ) @pytest.mark.skipif( From d2b6153c4fbd8c81a1cfe7df5ec426a5cff14b1f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:32:29 -0400 Subject: [PATCH 22/32] fix: only restrict all-linear, raise warning for other modules Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 2c61700fd..e7a2ca473 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -157,7 +157,8 @@ def train( if fast_moe_config is not None: # Checking for unsupported modules with Scatter MoE for LoRA - restricted_modules = ["all-linear", "output_linear", "input_linear", "router"] + # Only raise an error for `all-linear` + restricted_modules = ["all-linear"] if ( peft_config is not None and hasattr(peft_config, "target_modules") @@ -167,9 +168,25 @@ def train( ) ): raise ValueError( - "`--fast_moe` with LoRA does not currently support `all-linear`, `router`, " - "`input_linear` or `output_linear` as target modules at this time. Please " - "explicitly specify target modules when using `--fast_moe` with LoRA." + "`--fast_moe` with LoRA does not currently support `all-linear`, as " + "target modules at this time. Please explicitly specify target " + "modules when using `--fast_moe` with LoRA." 
+ ) + # If other common non-linear modules, raise warning + restrained_modules = ["input_linear", "output_linear", "router"] + if ( + peft_config is not None + and hasattr(peft_config, "target_modules") + and any( + module in (peft_config.target_modules or []) + for module in restrained_modules + ) + ): + logger.warning( + "Passing target modules that are part of the moe module can cause unexpected " + "behaviors and unsuccessful tuning while LoRA tuning with ScatterMoE. " + "For safe tuning, only pass linear modules such as those in the attn layer " + "(i.e. ['q_proj', 'v_proj', 'o_proj', 'k_proj'])" ) task_type = "CAUSAL_LM" From 765ec95222d1c475f987afe7dfc6ca0534688ae0 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:13:42 -0400 Subject: [PATCH 23/32] fix: augmentation test Signed-off-by: Will Johnson --- tests/acceleration/test_acceleration_framework.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 80a445304..3213c47d0 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -532,8 +532,8 @@ def test_framework_initialized_properly_moe(): ) # spy inside the train to ensure that the ilab plugin is called - assert spy["model_loader_calls"] == 1 - assert spy["augmentation_calls"] == 0 + assert spy["model_loader_calls"] == 0 + assert spy["augmentation_calls"] == 1 assert spy["get_ready_for_train_calls"] == 1 From b0dea82f34454057c31d272124961ff5bda8c18d Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:15:47 -0400 Subject: [PATCH 24/32] fix: raise error Signed-off-by: Will Johnson --- tests/acceleration/test_acceleration_framework.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 
3213c47d0..15635fc07 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -807,12 +807,13 @@ def test_error_raised_fast_moe_with_non_moe_model(): instantiate=False, ): with instantiate_model_patcher(): - sft_trainer.train( - model_args, - data_args, - train_args, - fast_moe_config=moe_config, - ) + with pytest.raises(ValueError): + sft_trainer.train( + model_args, + data_args, + train_args, + fast_moe_config=moe_config, + ) @pytest.mark.skipif( From 806b716ab0dc5cb19fcae42928fef76d9a2cdfb4 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:20:36 -0400 Subject: [PATCH 25/32] fix: raise error Signed-off-by: Will Johnson --- .../test_acceleration_framework.py | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 15635fc07..757d9fa00 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -776,44 +776,40 @@ def test_error_raised_fast_moe_with_non_moe_model(): """ Ensure error is thrown when `--fast_moe` is passed and model is not MoE """ - with pytest.raises( - AttributeError, - match="'LlamaConfig' object has no attribute 'num_local_experts'", - ): - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: - model_args = copy.deepcopy(MODEL_ARGS) - model_args.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" - model_args.torch_dtype = torch.bfloat16 - train_args = copy.deepcopy(TRAIN_ARGS) - train_args.output_dir = tempdir - train_args.save_strategy = "no" - train_args.bf16 = True - data_args = copy.deepcopy(DATA_ARGS) - data_args.training_data_path = TWITTER_COMPLAINTS_JSON_FORMAT - data_args.response_template = "\n\n### Label:" - data_args.dataset_text_field = "output" + model_args = copy.deepcopy(MODEL_ARGS) + 
model_args.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" + model_args.torch_dtype = torch.bfloat16 + train_args = copy.deepcopy(TRAIN_ARGS) + train_args.output_dir = tempdir + train_args.save_strategy = "no" + train_args.bf16 = True + data_args = copy.deepcopy(DATA_ARGS) + data_args.training_data_path = TWITTER_COMPLAINTS_JSON_FORMAT + data_args.response_template = "\n\n### Label:" + data_args.dataset_text_field = "output" - # initialize a config - moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=1)) + # initialize a config + moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=1)) - # 1. mock a plugin class - # 2. register the mocked plugins - # 3. call sft_trainer.train - with build_framework_and_maybe_instantiate( - [ - (["training.moe.scattermoe"], ScatterMoEAccelerationPlugin), - ], - instantiate=False, - ): - with instantiate_model_patcher(): - with pytest.raises(ValueError): - sft_trainer.train( - model_args, - data_args, - train_args, - fast_moe_config=moe_config, - ) + # 1. mock a plugin class + # 2. register the mocked plugins + # 3. call sft_trainer.train + with build_framework_and_maybe_instantiate( + [ + (["training.moe.scattermoe"], ScatterMoEAccelerationPlugin), + ], + instantiate=False, + ): + with instantiate_model_patcher(): + with pytest.raises((ValueError, AttributeError)): + sft_trainer.train( + model_args, + data_args, + train_args, + fast_moe_config=moe_config, + ) @pytest.mark.skipif( From 2567d30b5ead9cf6645bef77325e7fec1877ca2b Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:56:44 -0400 Subject: [PATCH 26/32] fix: make warning more general Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 146ca4af6..a4ea16b8c 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -186,17 +186,13 @@ def train( "modules when using `--fast_moe` with LoRA." 
) # If other common non-linear modules, raise warning - restrained_modules = ["input_linear", "output_linear", "router"] - if ( + elif ( peft_config is not None and hasattr(peft_config, "target_modules") - and any( - module in (peft_config.target_modules or []) - for module in restrained_modules - ) ): logger.warning( - "Passing target modules that are part of the moe module can cause unexpected " + "You are running lora with the ScatterMoE plugin, please note that " + "passing target modules that are part of the moe module can cause unexpected " "behaviors and unsuccessful tuning while LoRA tuning with ScatterMoE. " "For safe tuning, only pass linear modules such as those in the attn layer " "(i.e. ['q_proj', 'v_proj', 'o_proj', 'k_proj'])" From 70468db70c5802561b6c5053b8f2186b11a15fa8 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 14:26:10 -0400 Subject: [PATCH 27/32] turn off requires grad if using scattermoe with lora Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index a4ea16b8c..28a7b9670 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -186,10 +186,7 @@ def train( "modules when using `--fast_moe` with LoRA." 
) # If other common non-linear modules, raise warning - elif ( - peft_config is not None - and hasattr(peft_config, "target_modules") - ): + if peft_config is not None and hasattr(peft_config, "target_modules"): logger.warning( "You are running lora with the ScatterMoE plugin, please note that " "passing target modules that are part of the moe module can cause unexpected " @@ -390,6 +387,16 @@ def train( model, (peft_config,) = framework.augmentation( model, train_args, modifiable_args=(peft_config,) ) + # For LoRa ScatterMoE, if expert layers are included, disable grad + if peft_config is not None: + frozen_keywords = [ + "block_sparse_moe.w1.weight", + "block_sparse_moe.w2.weight", + "block_sparse_moe.w3.weight", + ] + for name, param in model.named_parameters(): + if any(key in name for key in frozen_keywords): + param.requires_grad = False # HACK - The SFT Trainer has internal validation which inspects the name of the class # being used for the HF training args; if it's a TrainingArguments class, which is From 5b826c8c61986e592bcff0680844430643b2347f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 14:48:14 -0400 Subject: [PATCH 28/32] fix: freeze scattermoe params Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 28a7b9670..58dd4b0a5 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -387,15 +387,10 @@ def train( model, (peft_config,) = framework.augmentation( model, train_args, modifiable_args=(peft_config,) ) - # For LoRa ScatterMoE, if expert layers are included, disable grad + # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: - frozen_keywords = [ - "block_sparse_moe.w1.weight", - "block_sparse_moe.w2.weight", - "block_sparse_moe.w3.weight", - ] for name, param in model.named_parameters(): - if any(key in name for key in frozen_keywords): + if "block_sparse_moe" in 
name: param.requires_grad = False # HACK - The SFT Trainer has internal validation which inspects the name of the class From af408f9822730e4df8c4943639767f6883793fa6 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 15:08:15 -0400 Subject: [PATCH 29/32] fix: safer freezing Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 58dd4b0a5..b24aef60e 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -67,12 +67,18 @@ USER_ERROR_EXIT_CODE, write_termination_log, ) +from tuning.utils.import_utils import is_fms_accelerate_available from tuning.utils.logging import set_log_level from tuning.utils.tokenizer_data_utils import ( get_special_tokens_dict, tokenizer_and_embedding_resize, ) +if is_fms_accelerate_available(plugins="moe"): + # Third Party + # pylint: disable=import-error + from fms_acceleration_moe.utils.scattermoe import ScatterMoE + def train( model_args: configs.ModelArguments, @@ -389,9 +395,10 @@ def train( ) # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: - for name, param in model.named_parameters(): - if "block_sparse_moe" in name: - param.requires_grad = False + for module in model.modules(): + if isinstance(module, ScatterMoE): + for param in module.parameters(): + param.requires_grad = False # HACK - The SFT Trainer has internal validation which inspects the name of the class # being used for the HF training args; if it's a TrainingArguments class, which is From d7c2d159545d56210e9234b147ae5981efadf9ff Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 15:32:36 -0400 Subject: [PATCH 30/32] just use string for class name Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 60c210ec7..3044a3d90 100644 --- 
a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -67,18 +67,12 @@ USER_ERROR_EXIT_CODE, write_termination_log, ) -from tuning.utils.import_utils import is_fms_accelerate_available from tuning.utils.logging import set_log_level from tuning.utils.tokenizer_data_utils import ( get_special_tokens_dict, tokenizer_and_embedding_resize, ) -if is_fms_accelerate_available(plugins="moe"): - # Third Party - # pylint: disable=import-error - from fms_acceleration_moe.utils.scattermoe import ScatterMoE - def train( model_args: configs.ModelArguments, @@ -403,7 +397,7 @@ def train( # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: for module in model.modules(): - if isinstance(module, ScatterMoE): + if module.__class__.__name__ == "ScatterMoE": for param in module.parameters(): param.requires_grad = False From 0f7796e18e6d01a03cb4f94c34d3389d82545dcc Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 15:33:46 -0400 Subject: [PATCH 31/32] comment Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 3044a3d90..6bcabafba 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -397,6 +397,7 @@ def train( # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: for module in model.modules(): + # Use string comparison to check if ScatterMoE module if module.__class__.__name__ == "ScatterMoE": for param in module.parameters(): param.requires_grad = False From 1759a2fed4086816c85321ef9720fbf83489163c Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 21 Apr 2025 09:31:42 -0400 Subject: [PATCH 32/32] add comment Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index bd9a02122..b51a723e8 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -387,7 +387,9 @@ def train( 
model, (peft_config,) = framework.augmentation( model, train_args, modifiable_args=(peft_config,) ) - # For LoRa ScatterMoE, disable grad for ScatterMoE + # HACK - For LoRA ScatterMoE, disable grad for ScatterMoE. + # In the future, requires_grad should be enabled for LoRA tuning + # with ScatterMoE and this code should be removed. if peft_config is not None: for module in model.modules(): # Use string comparison to check if ScatterMoE module