3838scenarios :
3939 - name : accelerated-moe-full
4040 framework_config :
41- - # without acceleration
42- - moe-scattermoe-granite-ep1
43- - moe-scattermoe-granite-ep2
44- # - moe-scattermoe-granite-ep4
45- # - moe-scattermoe-granite-ep1-padding-free
46- # - moe-scattermoe-granite-ep1-padding-free-foak
47- # - moe-scattermoe-granite-ep2-padding-free
48- # - moe-scattermoe-granite-ep2-padding-free-foak
49- # - moe-scattermoe-granite-ep4-padding-free
50- # - moe-scattermoe-granite-ep4-padding-free-foak
51- arguments :
52- learning_rate : 5e-5
53- torch_dtype : bfloat16
54- gradient_accumulation_steps : null
55- per_device_train_batch_size : 8
56- logging_steps : 1
57- packing : False
58- adam_epsilon : 1e-8
59- model_name_or_path :
60- - ' ibm-granite/granite-3.0-3b-a800m-instruct'
61- - ' ibm-research/moe-7b-1b-active-shared-experts'
62-
63- - name : accelerated-moe-granite-4
64- framework_config :
6541 - # without acceleration
6642 - moe-scattermoe-granite-ep1
6743 - moe-scattermoe-granite-ep2
@@ -81,6 +57,8 @@ scenarios:
8157 packing : False
8258 adam_epsilon : 1e-8
8359 model_name_or_path :
60+ - ' ibm-granite/granite-3.0-3b-a800m-instruct'
61+ - ' ibm-research/moe-7b-1b-active-shared-experts'
8462 - ' ibm-granite/granite-4.0-tiny-preview'
8563
8664
@@ -100,4 +78,4 @@ scenarios:
10078 packing : False
10179 adam_epsilon : 1e-8
10280 model_name_or_path :
103- - ' mistralai/Mixtral-8x7B-Instruct-v0.1'
81+ - ' mistralai/Mixtral-8x7B-Instruct-v0.1'
0 commit comments