foundation-model-stack
diff --git a/‎.pylintrc‎
Lines changed: 1 addition & 1 deletion b/‎.pylintrc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 9 additions & 6 deletions b/‎README.md‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎build/Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎build/Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎build/accelerate_launch.py‎
Lines changed: 34 additions & 4 deletions b/‎build/accelerate_launch.py‎
Lines changed: 34 additions & 4 deletions
diff --git a/‎docs/offline-data-preprocessing.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/offline-data-preprocessing.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/acceleration/test_acceleration_framework.py‎
Lines changed: 27 additions & 30 deletions b/‎tests/acceleration/test_acceleration_framework.py‎
Lines changed: 27 additions & 30 deletions
diff --git a/‎tests/artifacts/testdata/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎tests/artifacts/testdata/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/artifacts/testdata/jsonl/image_dataset.jsonl‎
Lines changed: 2 additions & 0 deletions b/‎tests/artifacts/testdata/jsonl/image_dataset.jsonl‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎tests/artifacts/vision_models/__init__.py‎
Lines changed: 27 additions & 0 deletions b/‎tests/artifacts/vision_models/__init__.py‎
Lines changed: 27 additions & 0 deletions
@@ -475,7 +475,7 @@ notes-rgx=
 [REFACTORING]
 
 # Maximum number of nested blocks for function / method body
-max-nested-blocks=5
+max-nested-blocks=6
 
 # Complete name of functions that never returns. When checking for
 # inconsistent-return-statements if a never returning function is called then
 
@@ -855,6 +855,9 @@ Notes:
       - When a boolean is passed, the expert parallel degree defaults to 1 and further the behaviour would be as follows:
           - if True, it is Scatter MoE Kernels with experts sharded based on the top level sharding protocol (e.g. FSDP).
           - if False, Scatter MoE Kernels with complete replication of experts across ranks.
+    - FSDP must be used when LoRA tuning with `--fast_moe`.
+    - LoRA tuning with ScatterMoE is supported, but because of inference restrictions in vLLM/vanilla PEFT, the expert layers and the router linear layer should not be included in `target_modules` for models being tuned with ScatterMoE. Users have control over which `target_modules` they wish to train:
+        - At this time, only attention layers are trainable when using LoRA with ScatterMoE. Until support for the router linear layer is added, target modules must be specified explicitly (i.e., `target_modules: ["q_proj", "v_proj", "o_proj", "k_proj"]`) instead of passing `target_modules: ["all-linear"]`.
     - `world_size` must be divisible by the `ep_degree`
     - `number of experts` in the MoE module must be divisible by the `ep_degree`
     - Running fast moe modifies the state dict of the model, and must be post-processed which happens automatically and the converted checkpoint can be found at `hf_converted_checkpoint` folder within every saved checkpoint directory. Alternatively, we can perform similar option manually through [checkpoint utils](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py) script.
@@ -916,12 +919,12 @@ For information on supported dataset formats and how to tune a vision-language m
 
   ? May be supported, but not tested
 
-Model Name & Size  | Model Architecture | Full Finetuning |
--------------------- | ---------------- | --------------- |
-Llama 3.2-11B Vision  | MllamaForConditionalGeneration | ✅* |
-Llava 1.5-7B  | LlavaForConditionalGeneration | ✅* |
-Granite 3.1-2B Vision  | LlavaNextForConditionalGeneration | ✅* |
-Llava Mistral 1.6-7B  | LlavaNextForConditionalGeneration | ✅* |
+Model Name & Size  | Model Architecture | LoRA Tuning | Full Finetuning |
+-------------------- | ---------------- | --------------- | --------------- |
+Llama 3.2-11B Vision  | MllamaForConditionalGeneration | ✅* | ✅* | 
+Llava 1.5-7B  | LlavaForConditionalGeneration | ✅* | ✅* | 
+Granite 3.1-2B Vision  | LlavaNextForConditionalGeneration | ✅* | ✅* |
+Llava Mistral 1.6-7B  | LlavaNextForConditionalGeneration | ✅* | ✅* |
 
 (*) - Supported with `fms-hf-tuning` v2.8.0 or later.
 
 
@@ -149,7 +149,7 @@ RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user wheel && \
     python -m pip install --user "$(head bdist_name)" && \
     python -m pip install --user "$(head bdist_name)[flash-attn]" && \
-    python -m pip install --user "$(head bdist_name)[mamba]"
+    python -m pip install --user --no-build-isolation "$(head bdist_name)[mamba]"
 
 # fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
 # fms_acceleration_foak = Fused LoRA and triton kernels
 
@@ -146,6 +146,17 @@ def main():
                     save_model_dir, save_model_dir, num_added_tokens
                 )
 
+            # In the ScatterMoE LoRA case, also post-process the adapter
+            # saved under hf_converted_checkpoint (if one exists)
+            hf_converted_checkpoint = os.path.join(
+                save_model_dir, "hf_converted_checkpoint"
+            )
+            if os.path.exists(
+                os.path.join(hf_converted_checkpoint, "adapter_model.safetensors")
+            ):
+                post_process_vLLM_adapters_new_tokens(
+                    hf_converted_checkpoint, hf_converted_checkpoint, num_added_tokens
+                )
+
         if (
             os.path.exists(os.path.join(output_dir, "added_tokens_info.json"))
             and job_config.get("save_strategy") != "no"
@@ -159,11 +170,30 @@ def main():
             for _, dirs, _ in os.walk(output_dir, topdown=False):
                 for name in dirs:
                     if "checkpoint-" in name.lower():
-                        post_process_vLLM_adapters_new_tokens(
-                            os.path.join(output_dir, name),
-                            os.path.join(output_dir, name),
-                            num_added_tokens,
+                        base_checkpoint_dir = os.path.join(output_dir, name)
+                        hf_converted_checkpoint = os.path.join(
+                            base_checkpoint_dir, "hf_converted_checkpoint"
+                        )
+
+                        # Use hf_converted_checkpoint if it contains an adapter, otherwise use base_checkpoint_dir
+                        checkpoint_dir = (
+                            hf_converted_checkpoint
+                            if os.path.exists(
+                                os.path.join(
+                                    hf_converted_checkpoint, "adapter_model.safetensors"
+                                )
+                            )
+                            else base_checkpoint_dir
                         )
+
+                        if os.path.exists(
+                            os.path.join(checkpoint_dir, "adapter_model.safetensors")
+                        ):
+                            post_process_vLLM_adapters_new_tokens(
+                                checkpoint_dir,
+                                checkpoint_dir,
+                                num_added_tokens,
+                            )
         else:
             logging.warning(
                 "Failed to post-process: file added_tokens_info.json not in path %s",
 
@@ -37,6 +37,8 @@ python scripts/offline_data_processing.py \
 
 Additionally, once the offline data processing is complete, users can leverage the shards stored in `output_dir` for tuning by passing it through the `--training_data_path` flag or passing it via `data_paths` argument in data config yaml, provided they find the sharded datasets beneficial for training.
 
+**NOTE**: The offline data preprocessing script is not compatible with processing image datasets for vision models. 
+
 ## Example Usage
 ### Applying Chat Template
 
 
@@ -34,7 +34,7 @@ dependencies = [
 "sentencepiece>=0.1.99,<0.3",
 "tokenizers>=0.13.3,<1.0",
 "tqdm>=4.66.2,<5.0",
-"trl>=0.13,<0.17",
+"trl>=0.13,<0.18",
 "peft>=0.8.0,<0.14",
 "protobuf>=5.28.0,<6.0.0",
 "datasets>=2.15.0,<4.0",
 
@@ -532,8 +532,8 @@ def test_framework_initialized_properly_moe():
                 )
 
         # spy inside the train to ensure that the ilab plugin is called
-        assert spy["model_loader_calls"] == 1
-        assert spy["augmentation_calls"] == 0
+        assert spy["model_loader_calls"] == 0
+        assert spy["augmentation_calls"] == 1
         assert spy["get_ready_for_train_calls"] == 1
 
 
@@ -776,37 +776,34 @@ def test_error_raised_fast_moe_with_non_moe_model():
     """
     Ensure error is thrown when `--fast_moe` is passed and model is not MoE
     """
-    with pytest.raises(
-        AttributeError,
-        match="'LlamaConfig' object has no attribute 'num_local_experts'",
-    ):
-        with tempfile.TemporaryDirectory() as tempdir:
+    with tempfile.TemporaryDirectory() as tempdir:
 
-            model_args = copy.deepcopy(MODEL_ARGS)
-            model_args.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
-            model_args.torch_dtype = torch.bfloat16
-            train_args = copy.deepcopy(TRAIN_ARGS)
-            train_args.output_dir = tempdir
-            train_args.save_strategy = "no"
-            train_args.bf16 = True
-            data_args = copy.deepcopy(DATA_ARGS)
-            data_args.training_data_path = TWITTER_COMPLAINTS_JSON_FORMAT
-            data_args.response_template = "\n\n### Label:"
-            data_args.dataset_text_field = "output"
+        model_args = copy.deepcopy(MODEL_ARGS)
+        model_args.model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
+        model_args.torch_dtype = torch.bfloat16
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+        train_args.save_strategy = "no"
+        train_args.bf16 = True
+        data_args = copy.deepcopy(DATA_ARGS)
+        data_args.training_data_path = TWITTER_COMPLAINTS_JSON_FORMAT
+        data_args.response_template = "\n\n### Label:"
+        data_args.dataset_text_field = "output"
 
-            # initialize a config
-            moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=1))
+        # initialize a config
+        moe_config = FastMoeConfig(fast_moe=FastMoe(ep_degree=1))
 
-            # 1. mock a plugin class
-            # 2. register the mocked plugins
-            # 3. call sft_trainer.train
-            with build_framework_and_maybe_instantiate(
-                [
-                    (["training.moe.scattermoe"], ScatterMoEAccelerationPlugin),
-                ],
-                instantiate=False,
-            ):
-                with instantiate_model_patcher():
+        # 1. mock a plugin class
+        # 2. register the mocked plugins
+        # 3. call sft_trainer.train
+        with build_framework_and_maybe_instantiate(
+            [
+                (["training.moe.scattermoe"], ScatterMoEAccelerationPlugin),
+            ],
+            instantiate=False,
+        ):
+            with instantiate_model_patcher():
+                with pytest.raises((ValueError, AttributeError)):
                     sft_trainer.train(
                         model_args,
                         data_args,
 
@@ -74,6 +74,7 @@
 CHAT_DATA_MULTI_TURN_GRANITE_3_1B = os.path.join(
     JSONL_DATA_DIR, "multi_turn_chat_granite_instruct.jsonl"
 )
+IMAGE_DATASET = os.path.join(JSONL_DATA_DIR, "image_dataset.jsonl")
 EMPTY_DATA = os.path.join(JSON_DATA_DIR, "empty_data.json")
 MALFORMATTED_DATA = os.path.join(JSON_DATA_DIR, "malformatted_data.json")
 
 
@@ -0,0 +1,27 @@
+# Copyright The FMS HF Tuning Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpful saved vision models for unit tests.
+"""
+# Standard
+import os
+
+### Constants used for model path
+PREDEFINED_MODEL_PATH = os.path.join(os.path.dirname(__file__))
+TINY_LLAMA_VISION_MODEL_NAME = os.path.join(
+    PREDEFINED_MODEL_PATH, "tiny_llama_vision_model"
+)
+TINY_GRANITE_VISION_MODEL_NAME = os.path.join(
+    PREDEFINED_MODEL_PATH, "tiny_granite_vision_model"
+)
Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,7 @@`
`74`	`74`	`CHAT_DATA_MULTI_TURN_GRANITE_3_1B = os.path.join(`
`75`	`75`	`JSONL_DATA_DIR, "multi_turn_chat_granite_instruct.jsonl"`
`76`	`76`	`)`
	`77`	`+IMAGE_DATASET = os.path.join(JSONL_DATA_DIR, "image_dataset.jsonl")`
`77`	`78`	`EMPTY_DATA = os.path.join(JSON_DATA_DIR, "empty_data.json")`
`78`	`79`	`MALFORMATTED_DATA = os.path.join(JSON_DATA_DIR, "malformatted_data.json")`
`79`	`80`