
Commit ada737d

feat: MoE Kernels, EP, and Fast Kernels for Granite 4 Preview architecture
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
1 parent 1a804e4 commit ada737d

9 files changed: 217 additions & 10 deletions


plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ class ScatterMoEAccelerationPlugin(AccelerationPlugin):
         "GraniteMoeForCausalLM",
         "MixtralForCausalLM",
         "GraniteMoeSharedForCausalLM",
+        "GraniteMoeHybridForCausalLM",
     ]

     def __init__(self, configurations: Dict[str, Dict]):
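
The change above simply adds the Granite 4 hybrid architecture to the plugin's list of supported model architectures. As a rough illustration of how such an allow-list is typically consulted (the names SUPPORTED_ARCHS and is_supported below are illustrative stand-ins, not the plugin's actual attributes):

# Hypothetical sketch: checking a model against an architecture allow-list.
# SUPPORTED_ARCHS / is_supported are illustrative names, not the plugin's API.
from transformers import AutoConfig

SUPPORTED_ARCHS = [
    "GraniteMoeForCausalLM",
    "MixtralForCausalLM",
    "GraniteMoeSharedForCausalLM",
    "GraniteMoeHybridForCausalLM",  # newly enabled by this commit
]

def is_supported(model_name_or_path: str) -> bool:
    # HF model configs record their model class names under `architectures`
    config = AutoConfig.from_pretrained(model_name_or_path)
    return any(arch in SUPPORTED_ARCHS for arch in (config.architectures or []))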

plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_constants.py

Lines changed: 7 additions & 0 deletions
@@ -83,6 +83,13 @@
         SCATTERMOE_SPEC_HAS_GATE,
         False,
     ),
+    "GraniteMoeHybridForCausalLM": (
+        "GraniteMoeHybridMoE",
+        "router",
+        "input_linear|output_linear|input_linear",
+        SCATTERMOE_SPEC_HAS_GATE,
+        False,
+    ),
 }

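Judging from the sibling entries in this constants table, the five tuple fields name the MoE block class to target, its router (gating) submodule, the expert linear submodules joined by "|", whether the spec has a gate, and a trailing boolean flag. A minimal sketch of unpacking the new entry; the field meanings here are inferred from the neighbouring entries, not documented API:

# Assumed reading of the spec tuple registered above; field names are
# inferred from sibling entries and are not the library's documented API.
SCATTERMOE_SPEC_HAS_GATE = True  # placeholder for the real constant

spec = (
    "GraniteMoeHybridMoE",                      # MoE module class to target
    "router",                                   # router / gating submodule name
    "input_linear|output_linear|input_linear",  # expert linear submodule names
    SCATTERMOE_SPEC_HAS_GATE,                   # spec has a gate
    False,                                      # trailing flag, per sibling entries
)

moe_cls_name, router_name, expert_linears, has_gate, flag = spec
print(expert_linears.split("|"))
# ['input_linear', 'output_linear', 'input_linear']
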
plugins/accelerated-peft/requirements.txt

Lines changed: 3 additions & 4 deletions
@@ -5,10 +5,9 @@
 accelerate >= 0.29

 # bitsandbytes for the BNB plugin
-# - lower bound is because bnb is missing quant_state
-# - upper bound is because of segmentation faults
-# see https://github.com/foundation-model-stack/fms-acceleration/issues/17
-bitsandbytes >=0.41,<=0.43.3
+# exact version 0.45.1 is needed for the torch upgrade to 2.6
+
+bitsandbytes == 0.45.1

 # Used to manage the thread limit in functions for converting old
 # GPTQ models to new GPTQ model format that support symmetrical=False

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,7 @@ def register_foak_model_patch_rules(
         granite,
         granitemoe,
         granitemoeshared,
+        granitemoehybrid,
         llama,
         mistral,
         mixtral,
@@ -56,6 +57,7 @@ def register_foak_model_patch_rules(
         *granite.get_mp_rules(base_type, config),
         *granitemoe.get_mp_rules(base_type),
         *granitemoeshared.get_mp_rules(base_type),
+        *granitemoehybrid.get_mp_rules(base_type),
         *llama.get_mp_rules(base_type, config),
         *mistral.get_mp_rules(base_type, config),
         *mixtral.get_mp_rules(base_type),
@@ -94,6 +96,7 @@ class FastKernelsAccelerationPlugin(AccelerationPlugin):
         "LlamaForCausalLM",
         "MistralForCausalLM",
         "GraniteMoeSharedForCausalLM",
+        "GraniteMoeHybridForCausalLM",
     ]

     def __init__(self, configurations: Dict[str, Dict]):
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+# Copyright The FMS HF Tuning Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Standard
+from functools import partial
+
+# Third Party
+from fms_acceleration.model_patcher import (
+    ModelPatcherRule,
+    ModelPatcherTrigger,
+    combine_functions,
+    combine_triggers,
+)
+
+# Local
+from ..kernels.unsloth.cross_entropy_loss import (
+    FastCrossEntropyLoss,
+    replace_custom_loss_when_triggered,
+)
+from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
+from ..kernels.unsloth.rope_embedding import fast_rope_embedding
+from .utils import (
+    KEY_O,
+    KEY_QKV,
+    build_lora_fused_ops,
+    get_transformers_version,
+    trigger_fused_ops,
+)
+
+
+def get_mp_rules(base_type: str):
+    """
+    Function to access all patch rules in this module.
+    If it is a forward_builder rule with `base_type` in
+    its forward builder argument, wrap the forward_builder
+    function as a partial function with the base_type argument
+    """
+    try:
+        # Third Party
+        from transformers.models.granitemoehybrid.modeling_granitemoehybrid import (  # pylint: disable=import-outside-toplevel
+            GraniteMoeHybridAttention,
+            GraniteMoeHybridForCausalLM,
+            GraniteMoeHybridRMSNorm,
+        )
+    except ImportError:
+        return []
+
+    return [
+        # TODO: have a generic version of this rule
+        # - do regex on RMSNorm class name
+        # - check on the tensors required for fast_rms_layernorm
+        ModelPatcherRule(
+            rule_id="granitemoehybrid-rms",
+            trigger=ModelPatcherTrigger(check=GraniteMoeHybridRMSNorm),
+            forward=fast_rms_layernorm,
+        ),
+        # TODO: have a generic version of this rule
+        # - do regex on Attention class name
+        # - have a set of qkv / o module names and check on that
+        ModelPatcherRule(
+            rule_id="granitemoehybrid-qkvo",
+            trigger=combine_triggers(
+                ModelPatcherTrigger(
+                    check=partial(
+                        trigger_fused_ops,
+                        attn_cls=GraniteMoeHybridAttention,
+                        submodule_names=["q_proj", "k_proj", "v_proj"],
+                    )
+                ),
+                ModelPatcherTrigger(
+                    check=partial(
+                        trigger_fused_ops,
+                        attn_cls=GraniteMoeHybridAttention,
+                        submodule_names=["o_proj"],
+                    )
+                ),
+                logic="OR",
+            ),
+            forward_builder=combine_functions(
+                partial(
+                    build_lora_fused_ops,
+                    submodule_names=["q_proj", "k_proj", "v_proj"],
+                    fused_op=KEY_QKV,
+                    base_type=base_type,
+                ),
+                partial(
+                    build_lora_fused_ops,
+                    submodule_names=["o_proj"],
+                    fused_op=KEY_O,
+                    base_type=base_type,
+                ),
+                logic="APPEND",
+            ),
+        ),
+        *[
+            (
+                ModelPatcherRule(
+                    rule_id="granitemoehybrid-custom-loss",
+                    trigger=ModelPatcherTrigger(
+                        check=replace_custom_loss_when_triggered(
+                            GraniteMoeHybridForCausalLM,
+                            custom_loss_type="granite-custom-loss",
+                        )
+                    ),
+                )
+                if get_transformers_version() >= "4.46"
+                else ModelPatcherRule(
+                    rule_id="granitemoehybrid-cross-ent",
+                    import_and_maybe_reload=(
+                        "torch.nn.CrossEntropyLoss",
+                        FastCrossEntropyLoss,
+                        "transformers.models.granitemoehybrid.modeling_granitemoehybrid",
+                    ),
+                )
+            )
+        ],
+        # TODO: have a generic version of this rule
+        # - get the module name
+        # - check if "apply_rotary_pos_emb" exists
+        # - patch
+        ModelPatcherRule(
+            rule_id="granitemoehybrid-rope",
+            import_and_maybe_reload=(
+                "transformers.models.granitemoehybrid.\
+modeling_granitemoehybrid.apply_rotary_pos_emb",
+                fast_rope_embedding,
+                None,
+            ),
+        ),
+    ]
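
One structural detail worth noting in the new module: the rule list splices in exactly one of two mutually exclusive loss rules depending on the installed transformers version, via a *[x if cond else y] unpacking. A minimal standalone sketch of that idiom, with stand-in strings in place of the real ModelPatcherRule objects:

# Stand-in sketch of the version-gated rule selection used in get_mp_rules;
# plain strings replace the real ModelPatcherRule instances.
def pick_rules(transformers_version: str) -> list:
    return [
        "rms-rule",
        "qkvo-rule",
        *[
            "custom-loss-rule"            # transformers >= 4.46: patch the loss fn
            if transformers_version >= "4.46"
            else "cross-ent-reload-rule"  # older: reload module with fast cross-entropy
        ],
        "rope-rule",
    ]

assert pick_rules("4.48")[2] == "custom-loss-rule"
assert pick_rules("4.45")[2] == "cross-ent-reload-rule"
# NOTE: plain string comparison happens to work for these values; the real
# code presumably compares parsed versions via get_transformers_version().
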
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate data packing/padding in training
+  training:
+
+    # attention module configurations
+    # e.g. padding-free modifications to attention layer
+    attention:
+
+      # this controls the configurations for padding-free computation of flash attention
+      padding_free:
+        method: huggingface
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding.
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the devices will be replicated for every ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 8

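To make the ep_degree constraints in the config comments concrete: with 8 GPUs and ep_degree: 8, all devices form a single expert-parallel group, and a model with, say, 64 experts (an illustrative count, not any specific Granite model's) places 64 / 8 = 8 experts on each device. A small sketch of the divisibility checks:

# Worked example of the ep_degree constraints described in the config above
# (the expert count is illustrative; actual Granite models vary).
def experts_per_device(num_experts: int, world_size: int, ep_degree: int) -> int:
    # devices are replicated in groups of ep_degree ...
    if world_size % ep_degree != 0:
        raise ValueError("ep_degree must divide world_size")
    # ... and the experts are sharded within each group
    if num_experts % ep_degree != 0:
        raise ValueError("ep_degree must divide the number of experts")
    return num_experts // ep_degree

print(experts_per_device(num_experts=64, world_size=8, ep_degree=8))  # -> 8
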
scripts/benchmarks/compare_with_reference.py

Lines changed: 5 additions & 4 deletions
@@ -10,15 +10,16 @@

 # default columns to compare
 DEFAULT_PLOT_COLUMNS = [
-    "mem_torch_mem_alloc_in_bytes",
-    "mem_peak_torch_mem_alloc_in_bytes",
+    # "mem_torch_mem_alloc_in_bytes",
+    # "mem_peak_torch_mem_alloc_in_bytes",
+    'mem_nvidia_mem_reserved',
     "train_loss",
     "train_tokens_per_second",
 ]
 # Used as combined identifier of experiment
 DEFAULT_INDICES = [
     "framework_config",
-    "peft_method",
+    # "peft_method",
     "model_name_or_path",
     "num_gpus",
     "per_device_train_batch_size",
@@ -29,7 +30,7 @@
     "train_runtime",
     "train_steps_per_second",
     "train_samples_per_second",
-    "mem_nvidia_mem_reserved",
+    # "mem_nvidia_mem_reserved",
 ]

 DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv"

scripts/benchmarks/scenarios-moe.yaml

Lines changed: 2 additions & 1 deletion
@@ -59,6 +59,7 @@ scenarios:
     model_name_or_path:
       - 'ibm-granite/granite-3.0-3b-a800m-instruct'
       - 'ibm-research/moe-7b-1b-active-shared-experts'
+      - 'ibm-granite/granite-4.0-tiny-preview'


   - name: accelerated-moe-full-mixtral
@@ -77,4 +78,4 @@ scenarios:
     packing: False
     adam_epsilon: 1e-8
     model_name_or_path:
-    - 'mistralai/Mixtral-8x7B-Instruct-v0.1'
\ No newline at end of file
+    - 'mistralai/Mixtral-8x7B-Instruct-v0.1'

tox.ini

Lines changed: 4 additions & 1 deletion
@@ -34,6 +34,9 @@ commands =
     # some models need this for tokenizers
     pip install protobuf

+    # for mamba based models
+    pip install --no-build-isolation mamba_ssm[causal-conv1d]>=2.0.0
+
     # install the plugins for test
     # NOTE: when there are more plugins install here
     python -m fms_acceleration.cli install -e {toxinidir}/plugins/accelerated-peft
@@ -42,7 +45,7 @@ commands =
     python -m fms_acceleration.cli install -e {toxinidir}/plugins/accelerated-moe

     # install the flash attn at the last
-    pip install flash-attn
+    pip install flash-attn --no-build-isolation

     # run the benchmark script
     bash scripts/run_benchmarks.sh {posargs:"1 2" "4 8" benchmark_outputs}
