From cac0b8cd93eeabfa5f2f892af9fca8aa1a5d1dd5 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 24 Mar 2025 13:39:56 -0400 Subject: [PATCH 01/32] save peft Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index f36fbf4c3..d346fabcb 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -14,6 +14,7 @@ # Standard from dataclasses import dataclass +from peft import PeftModel import os # Third Party @@ -113,9 +114,13 @@ def checkpoint(checkpoint_dir, save_dir): os.path.join(hf_converted_output_dir, TRAINING_ARGS_NAME), ) # Save model config files - self.trainer.model.config.save_pretrained( - hf_converted_output_dir - ) + if isinstance(self.trainer.model, PeftModel): + # Save PEFT adapter configuration + PeftModel.save_pretrained(hf_converted_output_dir) + else: + self.trainer.model.config.save_pretrained( + hf_converted_output_dir + ) except Exception as e: raise ValueError( From c5224296bcd98b71c8fb249477e88175a339f1ea Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 24 Mar 2025 14:03:04 -0400 Subject: [PATCH 02/32] fix: model Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index d346fabcb..f370b9805 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -116,7 +116,7 @@ def checkpoint(checkpoint_dir, save_dir): # Save model config files if isinstance(self.trainer.model, PeftModel): # Save PEFT adapter configuration - PeftModel.save_pretrained(hf_converted_output_dir) + PeftModel.save_pretrained(self.trainer.model, hf_converted_output_dir) else: 
self.trainer.model.config.save_pretrained( hf_converted_output_dir From 481dde627e26ba85d51361c2966ab135fb9ab327 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 1 Apr 2025 14:30:42 -0400 Subject: [PATCH 03/32] post process hf converted dir Signed-off-by: Will Johnson --- build/accelerate_launch.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index 6cbc7d252..f1472f534 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -145,6 +145,13 @@ def main(): post_process_vLLM_adapters_new_tokens( save_model_dir, save_model_dir, num_added_tokens ) + hf_converted_checkpoint = os.path.join(save_model_dir, "hf_converted_checkpoint") + if os.path.exists( + os.path.join(hf_converted_checkpoint, "adapter_model.safetensors") + ): + post_process_vLLM_adapters_new_tokens( + hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens + ) if ( os.path.exists(os.path.join(output_dir, "added_tokens_info.json")) From 397c9ba1d42c50bfe09b58afedfa04becf8474cc Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 10:26:55 -0400 Subject: [PATCH 04/32] fix: convert hf converted checkpoint Signed-off-by: Will Johnson --- build/accelerate_launch.py | 42 ++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index f1472f534..6dcb282dd 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -145,13 +145,16 @@ def main(): post_process_vLLM_adapters_new_tokens( save_model_dir, save_model_dir, num_added_tokens ) - hf_converted_checkpoint = os.path.join(save_model_dir, "hf_converted_checkpoint") - if os.path.exists( - os.path.join(hf_converted_checkpoint, "adapter_model.safetensors") - ): - post_process_vLLM_adapters_new_tokens( - hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens - ) + + hf_converted_checkpoint = os.path.join( + save_model_dir, 
"hf_converted_checkpoint" + ) + if os.path.exists( + os.path.join(hf_converted_checkpoint, "adapter_model.safetensors") + ): + post_process_vLLM_adapters_new_tokens( + hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens + ) if ( os.path.exists(os.path.join(output_dir, "added_tokens_info.json")) @@ -166,11 +169,28 @@ def main(): for _, dirs, _ in os.walk(output_dir, topdown=False): for name in dirs: if "checkpoint-" in name.lower(): - post_process_vLLM_adapters_new_tokens( - os.path.join(output_dir, name), - os.path.join(output_dir, name), - num_added_tokens, + checkpoint_dir = os.path.join(output_dir, name) + if os.path.exists( + os.path.join(checkpoint_dir, "adapter_model.safetensors") + ): + post_process_vLLM_adapters_new_tokens( + checkpoint_dir, + checkpoint_dir, + num_added_tokens, + ) + hf_converted_checkpoint = os.path.join( + checkpoint_dir, "hf_converted_checkpoint" ) + if os.path.exists( + os.path.join( + hf_converted_checkpoint, "adapter_model.safetensors" + ) + ): + post_process_vLLM_adapters_new_tokens( + hf_converted_checkpoint, + hf_converted_checkpoint, + num_added_tokens, + ) else: logging.warning( "Failed to post-process: file added_tokens_info.json not in path %s", From 79dec24d030c22b2a5bae14653b26345abc91223 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 13:13:33 -0400 Subject: [PATCH 05/32] lora config Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index f370b9805..04425d6fa 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -14,10 +14,10 @@ # Standard from dataclasses import dataclass -from peft import PeftModel import os # Third Party +from peft import LoraModel, PeftModel from transformers import ( Trainer, TrainerCallback, @@ -114,9 +114,10 @@ 
def checkpoint(checkpoint_dir, save_dir): os.path.join(hf_converted_output_dir, TRAINING_ARGS_NAME), ) # Save model config files - if isinstance(self.trainer.model, PeftModel): + if isinstance(self.trainer.model._fsdp_wrapped_module.base_model, LoraModel): # Save PEFT adapter configuration - PeftModel.save_pretrained(self.trainer.model, hf_converted_output_dir) + self.trainer.model._fsdp_wrapped_module.base_model.save_pretrained(hf_converted_output_dir) + else: self.trainer.model.config.save_pretrained( hf_converted_output_dir From 3103720afcfcbe10b0044b78b70d4f0b758488f5 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 16:20:04 -0400 Subject: [PATCH 06/32] save adapter config Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 04425d6fa..7ed501b7b 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -15,6 +15,7 @@ # Standard from dataclasses import dataclass import os +import json # Third Party from peft import LoraModel, PeftModel @@ -113,10 +114,18 @@ def checkpoint(checkpoint_dir, save_dir): args, os.path.join(hf_converted_output_dir, TRAINING_ARGS_NAME), ) - # Save model config files - if isinstance(self.trainer.model._fsdp_wrapped_module.base_model, LoraModel): - # Save PEFT adapter configuration - self.trainer.model._fsdp_wrapped_module.base_model.save_pretrained(hf_converted_output_dir) + + # Unwrap FSDP module + model = self.trainer.model + if hasattr(model, "module"): + model = model.module + + if model.peft_config: + lora_config = model.peft_config["default"] + config_dict = lora_config.to_dict() + config_dict['target_modules'] = sorted(list(config_dict['target_modules'])) + with open(os.path.join(hf_converted_output_dir,"adapter_config.json"), "w") as f: + 
json.dump(config_dict, f, indent=2) else: self.trainer.model.config.save_pretrained( From b61cbde6d1986318f197be91eda7ae17356b20e4 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 7 Apr 2025 16:31:22 -0400 Subject: [PATCH 07/32] fmt + comments Signed-off-by: Will Johnson --- build/accelerate_launch.py | 3 +++ .../config/acceleration_configs/fast_moe.py | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index 6dcb282dd..bea6d032b 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -146,6 +146,7 @@ def main(): save_model_dir, save_model_dir, num_added_tokens ) + # In case of ScatterMoE LoRa hf_converted_checkpoint = os.path.join( save_model_dir, "hf_converted_checkpoint" ) @@ -178,6 +179,8 @@ def main(): checkpoint_dir, num_added_tokens, ) + + # In case of ScatterMoE LoRa hf_converted_checkpoint = os.path.join( checkpoint_dir, "hf_converted_checkpoint" ) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 7ed501b7b..7573dd7ff 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -14,11 +14,10 @@ # Standard from dataclasses import dataclass -import os import json +import os # Third Party -from peft import LoraModel, PeftModel from transformers import ( Trainer, TrainerCallback, @@ -123,14 +122,20 @@ def checkpoint(checkpoint_dir, save_dir): if model.peft_config: lora_config = model.peft_config["default"] config_dict = lora_config.to_dict() - config_dict['target_modules'] = sorted(list(config_dict['target_modules'])) - with open(os.path.join(hf_converted_output_dir,"adapter_config.json"), "w") as f: + config_dict["target_modules"] = sorted( + list(config_dict["target_modules"]) + ) + with open( + os.path.join( + hf_converted_output_dir, "adapter_config.json" + ), + "w", + encoding="utf-8" + ) as f: json.dump(config_dict, f, 
indent=2) else: - self.trainer.model.config.save_pretrained( - hf_converted_output_dir - ) + model.config.save_pretrained(hf_converted_output_dir) except Exception as e: raise ValueError( From c12be0ef2a69798573d47fc911f480942015eebc Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 8 Apr 2025 13:02:21 -0400 Subject: [PATCH 08/32] fix: add input linear and output linear to target modules Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 7573dd7ff..142d7655b 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -125,6 +125,8 @@ def checkpoint(checkpoint_dir, save_dir): config_dict["target_modules"] = sorted( list(config_dict["target_modules"]) ) + if "router" in config_dict["target_modules"]: + config_dict["target_modules"].append("input_linear, output_linear") with open( os.path.join( hf_converted_output_dir, "adapter_config.json" From 123c2d481ae77e0974ce2f2adf02ed1010445c87 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 8 Apr 2025 14:09:05 -0400 Subject: [PATCH 09/32] fix: extend instead of append Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 142d7655b..94507855c 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -126,7 +126,7 @@ def checkpoint(checkpoint_dir, save_dir): list(config_dict["target_modules"]) ) if "router" in config_dict["target_modules"]: - config_dict["target_modules"].append("input_linear, output_linear") + config_dict["target_modules"].extend(["input_linear", "output_linear"]) with open( os.path.join( hf_converted_output_dir, 
"adapter_config.json" From f68500b64f320cffb7883a767743f4f3e3837152 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 8 Apr 2025 15:40:33 -0400 Subject: [PATCH 10/32] fix: if hasattr peft config Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 94507855c..a2258d35c 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -119,7 +119,7 @@ def checkpoint(checkpoint_dir, save_dir): if hasattr(model, "module"): model = model.module - if model.peft_config: + if hasattr(model, "peft_config"): lora_config = model.peft_config["default"] config_dict = lora_config.to_dict() config_dict["target_modules"] = sorted( From 55ec4b505365161c491929e8383f670f6f01ddc6 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Wed, 9 Apr 2025 14:46:07 -0400 Subject: [PATCH 11/32] fix: remove unneeded target modules Signed-off-by: Will Johnson --- tuning/config/acceleration_configs/fast_moe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index a2258d35c..40d1e286a 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -125,8 +125,6 @@ def checkpoint(checkpoint_dir, save_dir): config_dict["target_modules"] = sorted( list(config_dict["target_modules"]) ) - if "router" in config_dict["target_modules"]: - config_dict["target_modules"].extend(["input_linear", "output_linear"]) with open( os.path.join( hf_converted_output_dir, "adapter_config.json" From 23623494c3750cb8e30a541ff5e765afd062c178 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Thu, 10 Apr 2025 09:34:05 -0400 Subject: [PATCH 12/32] lint + fmt Signed-off-by: Will Johnson --- .pylintrc | 2 +- 
tuning/config/acceleration_configs/fast_moe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.pylintrc b/.pylintrc index 41f7e4e73..5e9f356b9 100644 --- a/.pylintrc +++ b/.pylintrc @@ -475,7 +475,7 @@ notes-rgx= [REFACTORING] # Maximum number of nested blocks for function / method body -max-nested-blocks=5 +max-nested-blocks=6 # Complete name of functions that never returns. When checking for # inconsistent-return-statements if a never returning function is called then diff --git a/tuning/config/acceleration_configs/fast_moe.py b/tuning/config/acceleration_configs/fast_moe.py index 97eb214cd..37602daf1 100644 --- a/tuning/config/acceleration_configs/fast_moe.py +++ b/tuning/config/acceleration_configs/fast_moe.py @@ -139,7 +139,7 @@ def checkpoint(checkpoint_dir, save_dir): hf_converted_output_dir, "adapter_config.json" ), "w", - encoding="utf-8" + encoding="utf-8", ) as f: json.dump(config_dict, f, indent=2) From a848a9b45dcbed7baf4f58aa0fcd34ce924e6c00 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 11 Apr 2025 16:09:46 -0400 Subject: [PATCH 13/32] docs Signed-off-by: Will Johnson --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e503a7d63..109985383 100644 --- a/README.md +++ b/README.md @@ -902,6 +902,10 @@ Notes: - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows: - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP). - if False, Scatter MoE Kernels with complete replication of experts across ranks. + - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, experts should not be trained as `target_modules` for models being tuned with ScatterMoE. 
Users have control over which `target_modules` they wish to train: + - Passing `all-linear` to adapter layers will include the router, which is a linear layer, and all attn layers. This **will not** train the expert layers. + - To train only attention layers, specify target modules specifically (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`). + - To train expert layers, specify `input_linear` and `output_linear` in target modules along with `router` (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj", "router", "input_linear", "output_linear"]`). If you specify these layers, inference with vLLM/vanilla HF PEFT **is not possible**. - `world_size` must be divisible by the `ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
From 42c420c3028713899e5a03bde6f72903478869e6 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 14 Apr 2025 16:50:13 -0400 Subject: [PATCH 14/32] test: lora for scattermoe Signed-off-by: Will Johnson --- tests/test_sft_trainer.py | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index 664c67ad7..c4dc8a5ed 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1447,6 +1447,44 @@ def test_run_moe_ft_and_inference_ep1_kernels(dataset_path, ep_degree): ) +@pytest.mark.skipif( + not is_fms_accelerate_available(plugins="moe"), + reason="Only runs if fms-accelerate is installed along with accelerated-moe plugin", +) +@pytest.mark.parametrize( + "dataset_path", + [ + TWITTER_COMPLAINTS_DATA_JSONL, + ], +) +def test_run_moe_lora_and_inference(dataset_path): + """Check if we can finetune a moe model and check if hf checkpoint is created""" + with tempfile.TemporaryDirectory() as tempdir: + data_args = copy.deepcopy(DATA_ARGS) + data_args.training_data_path = dataset_path + model_args = copy.deepcopy(MODEL_ARGS) + model_args.model_name_or_path = "ibm-granite/granite-3.1-1b-a400m-base" + train_args = copy.deepcopy(TRAIN_ARGS) + train_args.output_dir = tempdir + lora_args = copy.deepcopy(PEFT_LORA_ARGS) + lora_args.r = 16 + lora_args.target_modules = ["q_proj", "v_proj", "o_proj", "k_proj"] # Router doesn't work with LoRA test inference + fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=False)) + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + fast_moe_config=fast_moe_config, + ) + _test_run_inference( + checkpoint_path=os.path.join( + _get_checkpoint_path(tempdir), "hf_converted_checkpoint" + ), + base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base" + ) + + @pytest.mark.skipif( not is_fms_accelerate_available(plugins="moe"), reason="Only runs if fms-accelerate is installed along with 
accelerated-moe plugin", @@ -1485,9 +1523,9 @@ def _test_run_causallm_ft(training_args, model_args, data_args, tempdir): _validate_training(tempdir) -def _test_run_inference(checkpoint_path): +def _test_run_inference(checkpoint_path, base_model_name_or_path=None): # Load the model - loaded_model = TunedCausalLM.load(checkpoint_path) + loaded_model = TunedCausalLM.load(checkpoint_path, base_model_name_or_path) # Run inference on the text output_inference = loaded_model.run( From e3e7525db94d3ebcf83a327bc0f3b91ef04b83c1 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 15 Apr 2025 09:29:54 -0400 Subject: [PATCH 15/32] fmt tests Signed-off-by: Will Johnson --- tests/test_sft_trainer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index c4dc8a5ed..e97e51383 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1468,7 +1468,12 @@ def test_run_moe_lora_and_inference(dataset_path): train_args.output_dir = tempdir lora_args = copy.deepcopy(PEFT_LORA_ARGS) lora_args.r = 16 - lora_args.target_modules = ["q_proj", "v_proj", "o_proj", "k_proj"] # Router doesn't work with LoRA test inference + lora_args.target_modules = [ + "q_proj", + "v_proj", + "o_proj", + "k_proj", + ] # Router doesn't work with LoRA test inference fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=False)) sft_trainer.train( model_args, @@ -1481,7 +1486,7 @@ def test_run_moe_lora_and_inference(dataset_path): checkpoint_path=os.path.join( _get_checkpoint_path(tempdir), "hf_converted_checkpoint" ), - base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base" + base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", ) From 844965959b86c31997e103ee397d8a112549953f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Wed, 16 Apr 2025 11:48:59 -0400 Subject: [PATCH 16/32] docs: notes on restrictions Signed-off-by: Will Johnson --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/README.md b/README.md index 109985383..51876ef6b 100644 --- a/README.md +++ b/README.md @@ -906,7 +906,9 @@ Notes: - Passing `all-linear` to adapter layers will include the router, which is a linear layer, and all attn layers. This **will not** train the expert layers. - To train only attention layers, specify target modules specifically (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`). - To train expert layers, specify `input_linear` and `output_linear` in target modules along with `router` (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj", "router", "input_linear", "output_linear"]`). If you specify these layers, inference with vLLM/vanilla HF PEFT **is not possible**. - - `world_size` must be divisible by the `ep_degree` + - When lora tuning with ScatterMoE, the values `--fast_moe 1` or `--fast_moe True` are not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>1`. + - When lora tuning with ScatterMoE, `--r` must be set to 16 or greater. + - `world_size` must be divisible by the `--ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
- The typical usecase for this script is to run: From 3c25265bfc0f66485f71fc578bf4c77132c07a36 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Wed, 16 Apr 2025 20:00:14 -0400 Subject: [PATCH 17/32] explitcitly don't support router layer Signed-off-by: Will Johnson --- README.md | 9 +++------ tuning/sft_trainer.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 51876ef6b..a26c963b9 100644 --- a/README.md +++ b/README.md @@ -902,12 +902,9 @@ Notes: - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows: - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP). - if False, Scatter MoE Kernels with complete replication of experts across ranks. - - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, experts should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: - - Passing `all-linear` to adapter layers will include the router, which is a linear layer, and all attn layers. This **will not** train the expert layers. - - To train only attention layers, specify target modules specifically (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`). - - To train expert layers, specify `input_linear` and `output_linear` in target modules along with `router` (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj", "router", "input_linear", "output_linear"]`). If you specify these layers, inference with vLLM/vanilla HF PEFT **is not possible**. - - When lora tuning with ScatterMoE, the values `--fast_moe 1` or `--fast_moe True` are not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>1`. - - When lora tuning with ScatterMoE, `--r` must be set to 16 or greater. 
+ - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: + - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. + - When lora tuning with ScatterMoE, the value `--fast_moe True` is not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>=1`. - `world_size` must be divisible by the `--ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 7b06846fe..2c61700fd 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -155,6 +155,23 @@ def train( "Trainer should not perform packing when using `--padding_free`" ) + if fast_moe_config is not None: + # Checking for unsupported modules with Scatter MoE for LoRA + restricted_modules = ["all-linear", "output_linear", "input_linear", "router"] + if ( + peft_config is not None + and hasattr(peft_config, "target_modules") + and any( + module in (peft_config.target_modules or []) + for module in restricted_modules + ) + ): + raise ValueError( + "`--fast_moe` with LoRA does not currently support `all-linear`, `router`, " + "`input_linear` or `output_linear` as target modules at this time. Please " + "explicitly specify target modules when using `--fast_moe` with LoRA." + ) + task_type = "CAUSAL_LM" additional_metrics = {} From da81f93f771bd8936fa5b3a028f49a9040778833 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Thu, 17 Apr 2025 21:34:39 -0400 Subject: [PATCH 18/32] docs: generalize Signed-off-by: Will Johnson --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a26c963b9..63f83b268 100644 --- a/README.md +++ b/README.md @@ -904,8 +904,8 @@ Notes: - if False, Scatter MoE Kernels with complete replication of experts across ranks. - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. 
- - When lora tuning with ScatterMoE, the value `--fast_moe True` is not expected to work, as FSDP must be enabled when lora tuning. Run either `--fast_moe False` or `--fast-moe x>=1`. - - `world_size` must be divisible by the `--ep_degree` + - FSDP must be used when lora tuning with `--fast_moe` + - `world_size` must be divisible by the `ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. - The typical usecase for this script is to run: From 1424efd2d8c8a68629eea87f3347c2a9848ec100 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:28:58 -0400 Subject: [PATCH 19/32] docs: update documentation Signed-off-by: Will Johnson --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 63f83b268..22d812288 100644 --- a/README.md +++ b/README.md @@ -902,9 +902,9 @@ Notes: - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows: - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP). - if False, Scatter MoE Kernels with complete replication of experts across ranks. - - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. 
Users have control over which `target_modules` they wish to train: - - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. - - FSDP must be used when lora tuning with `--fast_moe` + - FSDP must be used when lora tuning with `--fast_moe` + - lora tuning with ScatterMoE is supported, but because of inference restrictions on vLLM/vanilla PEFT, the expert layers and router linear layer should not be trained as `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train: + - At this time, only attention layers are trainable when using LoRA with scatterMoE. Until support for the router linear layer is added in, target modules must be specified explicitly (i.e `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`. - `world_size` must be divisible by the `ep_degree` - `number of experts` in the MoE module must be divisible by the `ep_degree` - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script. 
From b67ef0f59c071eee78ab9ecbfed2556199adcb09 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:30:54 -0400 Subject: [PATCH 20/32] fix: simplify accelerate launch post processing Signed-off-by: Will Johnson --- build/accelerate_launch.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py index bea6d032b..43cf8dda0 100644 --- a/build/accelerate_launch.py +++ b/build/accelerate_launch.py @@ -170,7 +170,22 @@ def main(): for _, dirs, _ in os.walk(output_dir, topdown=False): for name in dirs: if "checkpoint-" in name.lower(): - checkpoint_dir = os.path.join(output_dir, name) + base_checkpoint_dir = os.path.join(output_dir, name) + hf_converted_checkpoint = os.path.join( + base_checkpoint_dir, "hf_converted_checkpoint" + ) + + # Use hf_converted_checkpoint if exists, otherwise use base_checkpoint_dir + checkpoint_dir = ( + hf_converted_checkpoint + if os.path.exists( + os.path.join( + hf_converted_checkpoint, "adapter_model.safetensors" + ) + ) + else base_checkpoint_dir + ) + if os.path.exists( os.path.join(checkpoint_dir, "adapter_model.safetensors") ): @@ -179,21 +194,6 @@ def main(): checkpoint_dir, num_added_tokens, ) - - # In case of ScatterMoE LoRa - hf_converted_checkpoint = os.path.join( - checkpoint_dir, "hf_converted_checkpoint" - ) - if os.path.exists( - os.path.join( - hf_converted_checkpoint, "adapter_model.safetensors" - ) - ): - post_process_vLLM_adapters_new_tokens( - hf_converted_checkpoint, - hf_converted_checkpoint, - num_added_tokens, - ) else: logging.warning( "Failed to post-process: file added_tokens_info.json not in path %s", From 6a32d320b2ee3e13fb7abe3e3f3d1e7f3bdfd483 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:31:56 -0400 Subject: [PATCH 21/32] tests: more target modules + ep_degree Signed-off-by: Will Johnson --- tests/test_sft_trainer.py | 58 +++++++++++++++++++++++---------------- 1 file 
changed, 35 insertions(+), 23 deletions(-) diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py index e97e51383..bca3b193f 100644 --- a/tests/test_sft_trainer.py +++ b/tests/test_sft_trainer.py @@ -1452,12 +1452,18 @@ def test_run_moe_ft_and_inference_ep1_kernels(dataset_path, ep_degree): reason="Only runs if fms-accelerate is installed along with accelerated-moe plugin", ) @pytest.mark.parametrize( - "dataset_path", + "target_modules", [ - TWITTER_COMPLAINTS_DATA_JSONL, + "all-linear", + ["q_proj"], + ["q_proj", "k_proj"], + ["q_proj", "k_proj", "v_proj"], + ["q_proj", "k_proj", "v_proj", "o_proj"], ], ) -def test_run_moe_lora_and_inference(dataset_path): +@pytest.mark.parametrize("ep_degree", [True, False]) +@pytest.mark.parametrize("dataset_path", [TWITTER_COMPLAINTS_DATA_JSONL]) +def test_run_moe_lora_and_inference(dataset_path, target_modules, ep_degree): """Check if we can finetune a moe model and check if hf checkpoint is created""" with tempfile.TemporaryDirectory() as tempdir: data_args = copy.deepcopy(DATA_ARGS) @@ -1468,26 +1474,32 @@ def test_run_moe_lora_and_inference(dataset_path): train_args.output_dir = tempdir lora_args = copy.deepcopy(PEFT_LORA_ARGS) lora_args.r = 16 - lora_args.target_modules = [ - "q_proj", - "v_proj", - "o_proj", - "k_proj", - ] # Router doesn't work with LoRA test inference - fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=False)) - sft_trainer.train( - model_args, - data_args, - train_args, - lora_args, - fast_moe_config=fast_moe_config, - ) - _test_run_inference( - checkpoint_path=os.path.join( - _get_checkpoint_path(tempdir), "hf_converted_checkpoint" - ), - base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", - ) + lora_args.target_modules = target_modules + fast_moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=ep_degree)) + + if target_modules == "all-linear": + with pytest.raises(ValueError): + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + 
fast_moe_config=fast_moe_config, + ) + else: + sft_trainer.train( + model_args, + data_args, + train_args, + lora_args, + fast_moe_config=fast_moe_config, + ) + _test_run_inference( + checkpoint_path=os.path.join( + _get_checkpoint_path(tempdir), "hf_converted_checkpoint" + ), + base_model_name_or_path="ibm-granite/granite-3.1-1b-a400m-base", + ) @pytest.mark.skipif( From d2b6153c4fbd8c81a1cfe7df5ec426a5cff14b1f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 09:32:29 -0400 Subject: [PATCH 22/32] fix: only restrict all-linear, raise warning for other modules Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 2c61700fd..e7a2ca473 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -157,7 +157,8 @@ def train( if fast_moe_config is not None: # Checking for unsupported modules with Scatter MoE for LoRA - restricted_modules = ["all-linear", "output_linear", "input_linear", "router"] + # Only raise an error for `all-linear` + restricted_modules = ["all-linear"] if ( peft_config is not None and hasattr(peft_config, "target_modules") @@ -167,9 +168,25 @@ def train( ) ): raise ValueError( - "`--fast_moe` with LoRA does not currently support `all-linear`, `router`, " - "`input_linear` or `output_linear` as target modules at this time. Please " - "explicitly specify target modules when using `--fast_moe` with LoRA." + "`--fast_moe` with LoRA does not currently support `all-linear`, as " + "target modules at this time. Please explicitly specify target " + "modules when using `--fast_moe` with LoRA." 
+ ) + # If other common non-linear modules, raise warning + restrained_modules = ["input_linear", "output_linear", "router"] + if ( + peft_config is not None + and hasattr(peft_config, "target_modules") + and any( + module in (peft_config.target_modules or []) + for module in restrained_modules + ) + ): + logger.warning( + "Passing target modules that are part of the moe module can cause unexpected " + "behaviors and unsuccessful tuning while LoRA tuning with ScatterMoE. " + "For safe tuning, only pass linear modules such as those in the attn layer " + "(i.e. ['q_proj', 'v_proj', 'o_proj', 'k_proj'])" ) task_type = "CAUSAL_LM" From 765ec95222d1c475f987afe7dfc6ca0534688ae0 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:13:42 -0400 Subject: [PATCH 23/32] fix: augmentation test Signed-off-by: Will Johnson --- tests/acceleration/test_acceleration_framework.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 80a445304..3213c47d0 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -532,8 +532,8 @@ def test_framework_initialized_properly_moe(): ) # spy inside the train to ensure that the ilab plugin is called - assert spy["model_loader_calls"] == 1 - assert spy["augmentation_calls"] == 0 + assert spy["model_loader_calls"] == 0 + assert spy["augmentation_calls"] == 1 assert spy["get_ready_for_train_calls"] == 1 From b0dea82f34454057c31d272124961ff5bda8c18d Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:15:47 -0400 Subject: [PATCH 24/32] fix: raise error Signed-off-by: Will Johnson --- tests/acceleration/test_acceleration_framework.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 
3213c47d0..15635fc07 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -807,12 +807,13 @@ def test_error_raised_fast_moe_with_non_moe_model(): instantiate=False, ): with instantiate_model_patcher(): - sft_trainer.train( - model_args, - data_args, - train_args, - fast_moe_config=moe_config, - ) + with pytest.raises(ValueError): + sft_trainer.train( + model_args, + data_args, + train_args, + fast_moe_config=moe_config, + ) @pytest.mark.skipif( From 806b716ab0dc5cb19fcae42928fef76d9a2cdfb4 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:20:36 -0400 Subject: [PATCH 25/32] fix: raise error Signed-off-by: Will Johnson --- .../test_acceleration_framework.py | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/tests/acceleration/test_acceleration_framework.py b/tests/acceleration/test_acceleration_framework.py index 15635fc07..757d9fa00 100644 --- a/tests/acceleration/test_acceleration_framework.py +++ b/tests/acceleration/test_acceleration_framework.py @@ -776,44 +776,40 @@ def test_error_raised_fast_moe_with_non_moe_model(): """ Ensure error is thrown when `--fast_moe` is passed and model is not MoE """ - with pytest.raises( - AttributeError, - match="'LlamaConfig' object has no attribute 'num_local_experts'", - ): - with tempfile.TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: - model_args = copy.deepcopy(MODEL_ARGS) - model_args.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" - model_args.torch_dtype = torch.bfloat16 - train_args = copy.deepcopy(TRAIN_ARGS) - train_args.output_dir = tempdir - train_args.save_strategy = "no" - train_args.bf16 = True - data_args = copy.deepcopy(DATA_ARGS) - data_args.training_data_path = TWITTER_COMPLAINTS_JSON_FORMAT - data_args.response_template = "\n\n### Label:" - data_args.dataset_text_field = "output" + model_args = copy.deepcopy(MODEL_ARGS) + 
model_args.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" + model_args.torch_dtype = torch.bfloat16 + train_args = copy.deepcopy(TRAIN_ARGS) + train_args.output_dir = tempdir + train_args.save_strategy = "no" + train_args.bf16 = True + data_args = copy.deepcopy(DATA_ARGS) + data_args.training_data_path = TWITTER_COMPLAINTS_JSON_FORMAT + data_args.response_template = "\n\n### Label:" + data_args.dataset_text_field = "output" - # initialize a config - moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=1)) + # initialize a config + moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=1)) - # 1. mock a plugin class - # 2. register the mocked plugins - # 3. call sft_trainer.train - with build_framework_and_maybe_instantiate( - [ - (["training.moe.scattermoe"], ScatterMoEAccelerationPlugin), - ], - instantiate=False, - ): - with instantiate_model_patcher(): - with pytest.raises(ValueError): - sft_trainer.train( - model_args, - data_args, - train_args, - fast_moe_config=moe_config, - ) + # 1. mock a plugin class + # 2. register the mocked plugins + # 3. call sft_trainer.train + with build_framework_and_maybe_instantiate( + [ + (["training.moe.scattermoe"], ScatterMoEAccelerationPlugin), + ], + instantiate=False, + ): + with instantiate_model_patcher(): + with pytest.raises((ValueError, AttributeError)): + sft_trainer.train( + model_args, + data_args, + train_args, + fast_moe_config=moe_config, + ) @pytest.mark.skipif( From 2567d30b5ead9cf6645bef77325e7fec1877ca2b Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 10:56:44 -0400 Subject: [PATCH 26/32] fix: make warning more general Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 146ca4af6..a4ea16b8c 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -186,17 +186,13 @@ def train( "modules when using `--fast_moe` with LoRA." 
) # If other common non-linear modules, raise warning - restrained_modules = ["input_linear", "output_linear", "router"] - if ( + elif ( peft_config is not None and hasattr(peft_config, "target_modules") - and any( - module in (peft_config.target_modules or []) - for module in restrained_modules - ) ): logger.warning( - "Passing target modules that are part of the moe module can cause unexpected " + "You are running lora with the ScatterMoE plugin, please note that " + "passing target modules that are part of the moe module can cause unexpected " "behaviors and unsuccessful tuning while LoRA tuning with ScatterMoE. " "For safe tuning, only pass linear modules such as those in the attn layer " "(i.e. ['q_proj', 'v_proj', 'o_proj', 'k_proj'])" From 70468db70c5802561b6c5053b8f2186b11a15fa8 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 14:26:10 -0400 Subject: [PATCH 27/32] turn off requires grad if using scattermoe with lora Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index a4ea16b8c..28a7b9670 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -186,10 +186,7 @@ def train( "modules when using `--fast_moe` with LoRA." 
) # If other common non-linear modules, raise warning - elif ( - peft_config is not None - and hasattr(peft_config, "target_modules") - ): + if peft_config is not None and hasattr(peft_config, "target_modules"): logger.warning( "You are running lora with the ScatterMoE plugin, please note that " "passing target modules that are part of the moe module can cause unexpected " @@ -390,6 +387,16 @@ def train( model, (peft_config,) = framework.augmentation( model, train_args, modifiable_args=(peft_config,) ) + # For LoRa ScatterMoE, if expert layers are included, disable grad + if peft_config is not None: + frozen_keywords = [ + "block_sparse_moe.w1.weight", + "block_sparse_moe.w2.weight", + "block_sparse_moe.w3.weight", + ] + for name, param in model.named_parameters(): + if any(key in name for key in frozen_keywords): + param.requires_grad = False # HACK - The SFT Trainer has internal validation which inspects the name of the class # being used for the HF training args; if it's a TrainingArguments class, which is From 5b826c8c61986e592bcff0680844430643b2347f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 14:48:14 -0400 Subject: [PATCH 28/32] fix: freeze scattermoe params Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 28a7b9670..58dd4b0a5 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -387,15 +387,10 @@ def train( model, (peft_config,) = framework.augmentation( model, train_args, modifiable_args=(peft_config,) ) - # For LoRa ScatterMoE, if expert layers are included, disable grad + # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: - frozen_keywords = [ - "block_sparse_moe.w1.weight", - "block_sparse_moe.w2.weight", - "block_sparse_moe.w3.weight", - ] for name, param in model.named_parameters(): - if any(key in name for key in frozen_keywords): + if "block_sparse_moe" in 
name: param.requires_grad = False # HACK - The SFT Trainer has internal validation which inspects the name of the class From af408f9822730e4df8c4943639767f6883793fa6 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 15:08:15 -0400 Subject: [PATCH 29/32] fix: safer freezing Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 58dd4b0a5..b24aef60e 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -67,12 +67,18 @@ USER_ERROR_EXIT_CODE, write_termination_log, ) +from tuning.utils.import_utils import is_fms_accelerate_available from tuning.utils.logging import set_log_level from tuning.utils.tokenizer_data_utils import ( get_special_tokens_dict, tokenizer_and_embedding_resize, ) +if is_fms_accelerate_available(plugins="moe"): + # Third Party + # pylint: disable=import-error + from fms_acceleration_moe.utils.scattermoe import ScatterMoE + def train( model_args: configs.ModelArguments, @@ -389,9 +395,10 @@ def train( ) # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: - for name, param in model.named_parameters(): - if "block_sparse_moe" in name: - param.requires_grad = False + for module in model.modules(): + if isinstance(module, ScatterMoE): + for param in module.parameters(): + param.requires_grad = False # HACK - The SFT Trainer has internal validation which inspects the name of the class # being used for the HF training args; if it's a TrainingArguments class, which is From d7c2d159545d56210e9234b147ae5981efadf9ff Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 15:32:36 -0400 Subject: [PATCH 30/32] just use string for class name Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 60c210ec7..3044a3d90 100644 --- 
a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -67,18 +67,12 @@ USER_ERROR_EXIT_CODE, write_termination_log, ) -from tuning.utils.import_utils import is_fms_accelerate_available from tuning.utils.logging import set_log_level from tuning.utils.tokenizer_data_utils import ( get_special_tokens_dict, tokenizer_and_embedding_resize, ) -if is_fms_accelerate_available(plugins="moe"): - # Third Party - # pylint: disable=import-error - from fms_acceleration_moe.utils.scattermoe import ScatterMoE - def train( model_args: configs.ModelArguments, @@ -403,7 +397,7 @@ def train( # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: for module in model.modules(): - if isinstance(module, ScatterMoE): + if module.__class__.__name__ == "ScatterMoE": for param in module.parameters(): param.requires_grad = False From 0f7796e18e6d01a03cb4f94c34d3389d82545dcc Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 18 Apr 2025 15:33:46 -0400 Subject: [PATCH 31/32] comment Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index 3044a3d90..6bcabafba 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -397,6 +397,7 @@ def train( # For LoRa ScatterMoE, disable grad for ScatterMoE if peft_config is not None: for module in model.modules(): + # Use string comparison to check if ScatterMoE module if module.__class__.__name__ == "ScatterMoE": for param in module.parameters(): param.requires_grad = False From 1759a2fed4086816c85321ef9720fbf83489163c Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Mon, 21 Apr 2025 09:31:42 -0400 Subject: [PATCH 32/32] add comment Signed-off-by: Will Johnson --- tuning/sft_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py index bd9a02122..b51a723e8 100644 --- a/tuning/sft_trainer.py +++ b/tuning/sft_trainer.py @@ -387,7 +387,9 @@ def train( 
model, (peft_config,) = framework.augmentation( model, train_args, modifiable_args=(peft_config,) ) - # For LoRa ScatterMoE, disable grad for ScatterMoE + # HACK - For LoRA ScatterMoE, disable grad for ScatterMoE. + # In the future, requires_grad should be enabled for LoRA tuning + # with ScatterMoE and this code should be removed. if peft_config is not None: for module in model.modules(): # Use string comparison to check if ScatterMoE module