Skip to content

Commit e1e8c18

Browse files
committed
debug
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
1 parent a7eb4f6 commit e1e8c18

2 files changed

Lines changed: 68 additions & 2 deletions

File tree

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# FMS Acceleration Plugin Configuration.
2+
#
3+
# Each stanza incorporates various configurations for
4+
# different fine-tuning / training tasks.
5+
plugins:
6+
# Configurations to accelerate data packing/padding in training
7+
training:
8+
9+
# attention module configurations
10+
# e.g. padding-free modifications to attention layer
11+
attention:
12+
13+
# this controls the configurations for padding-free computation of flash attention
14+
padding_free:
15+
method: huggingface
16+
fused_ops_and_kernels:
17+
18+
# if placed under the training stanza, then specifying
19+
# base_layer and fused_lora will be a misnomer
20+
# - this should be in peft.quantized
21+
# However, if it is specified, it will still
22+
# be read. This is useful in use cases where
23+
# the yaml is system generated and not shown
24+
# to a user.
25+
26+
# activate various unsloth optimizations
27+
# there are two versions of the plugin
28+
# - the FastKernel version supports individual kernels
29+
# - the FastQuantized version is all-or-nothing
30+
31+
# fast loss triton kernels
32+
fast_loss: true
33+
34+
# fast RMS norm triton kernels
35+
fast_rms_layernorm: true
36+
37+
# fast RoPE embedding triton kernels
38+
fast_rope_embeddings: true
39+
moe:
40+
41+
# expert-parallel for MoE
42+
scattermoe:
43+
44+
# The level of expert parallel sharding.
45+
# - 1 means no sharding
46+
# - if > 1, please ensure that this divides the world_size. This is because
47+
# the devices are organized into replica groups of ep_degree devices each, and
48+
# the experts will be sharded within each group.
49+
# - if > 1, also ensure that it divides the number of experts, as each device
50+
# will then have num_of_experts / ep_degree experts.
51+
ep_degree: 8

scripts/benchmarks/scenarios-moe.yaml

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,23 @@ scenarios:
4848
# - moe-scattermoe-granite-ep2-padding-free-foak
4949
# - moe-scattermoe-granite-ep4-padding-free
5050
# - moe-scattermoe-granite-ep4-padding-free-foak
51+
arguments:
52+
learning_rate: 5e-5
53+
torch_dtype: bfloat16
54+
gradient_accumulation_steps: null
55+
per_device_train_batch_size: 8
56+
logging_steps: 1
57+
packing: False
58+
adam_epsilon: 1e-8
59+
model_name_or_path:
60+
- 'ibm-granite/granite-3.0-3b-a800m-instruct'
61+
- 'ibm-research/moe-7b-1b-active-shared-experts'
62+
63+
- name: accelerated-moe-granite4
64+
framework_config:
65+
- moe-scattermoe-granite-ep8
66+
- moe-scattermoe-granite-ep8-foak
67+
- moe-scattermoe-granite-ep8-padding-free-foak
5168
arguments:
5269
learning_rate: 5e-5
5370
torch_dtype: bfloat16
@@ -58,8 +75,6 @@ scenarios:
5875
adam_epsilon: 1e-8
5976
model_name_or_path:
6077
- 'ibm-granite/granite-4.0-tiny-preview'
61-
# - 'ibm-granite/granite-3.0-3b-a800m-instruct'
62-
# - 'ibm-research/moe-7b-1b-active-shared-experts'
6378

6479

6580
- name: accelerated-moe-full-mixtral

0 commit comments

Comments
 (0)