debug

kmehant · kmehant · commit e1e8c18ff4ec · 2025-05-15T15:22:15.000+05:30
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
diff --git a/sample-configurations/moe-scattermoe-granite-ep8-padding-free-foak-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep8-padding-free-foak-sample-configuration.yaml
@@ -0,0 +1,51 @@
+# FMS Acceleration Plugin Configuration. 
+#
+# Each stanza incorporates various configurations for 
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate data packing/padding in training
+  training:
+
+    # attention module configurations
+    # e.g. padding-free modifications to attention layer
+    attention:
+
+      # this controls the configurations for padding-free computation of flash attention
+      padding_free:
+        method: huggingface
+    fused_ops_and_kernels:
+
+      # If placed under the training stanza, specifying
+      # base_layer and fused_lora is a misnomer
+      # - they belong under peft.quantized.
+      # However, if they are specified here, they will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding. 
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the layout will be replicated for every group of ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 8
diff --git a/scripts/benchmarks/scenarios-moe.yaml b/scripts/benchmarks/scenarios-moe.yaml
@@ -48,6 +48,23 @@ scenarios:
             # - moe-scattermoe-granite-ep2-padding-free-foak
             # - moe-scattermoe-granite-ep4-padding-free
             # - moe-scattermoe-granite-ep4-padding-free-foak
+        arguments:
+            learning_rate: 5e-5
+            torch_dtype: bfloat16
+            gradient_accumulation_steps: null
+            per_device_train_batch_size: 8
+            logging_steps: 1
+            packing: False
+            adam_epsilon: 1e-8
+            model_name_or_path: 
+                - 'ibm-granite/granite-3.0-3b-a800m-instruct'
+                - 'ibm-research/moe-7b-1b-active-shared-experts'
+
+    -   name: accelerated-moe-granite4
+        framework_config:
+            - moe-scattermoe-granite-ep8
+            - moe-scattermoe-granite-ep8-foak
+            - moe-scattermoe-granite-ep8-padding-free-foak
         arguments:
             learning_rate: 5e-5
             torch_dtype: bfloat16
@@ -58,8 +75,6 @@ scenarios:
             adam_epsilon: 1e-8
             model_name_or_path: 
                 - 'ibm-granite/granite-4.0-tiny-preview'
-                # - 'ibm-granite/granite-3.0-3b-a800m-instruct'
-                # - 'ibm-research/moe-7b-1b-active-shared-experts'
 
 
     -   name: accelerated-moe-full-mixtral