# This file holds a list of scenarios that may be run.
# - To limit the run to a subset of scenarios, use the --run-only-scenarios flag
#   (see the example invocation below).
# - Each scenario will be run against a particular acceleration framework
#   configuration if the framework_config: key is specified.
# - The arguments key holds the arguments to be passed to sft_trainer.
#   * Each argument takes a single value, except model_name_or_path, which can
#     take multiple values.
# - Anything that is critical for the scenario MUST be specified here
#   and not in the defaults, e.g. fp16.
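#
# For example, to run only the first scenario below (the driver script path is
# illustrative and assumed; only the --run-only-scenarios flag itself is taken
# from the notes above):
#   python scripts/benchmarks/benchmark.py --run-only-scenarios accelerated-moe-full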
# This stanza will be used in future to replace the custom processing functions in data_processing.py
# data_processing:
#   dataset_name: yahma/alpaca-cleaned
#   chat_template: |
#     {%- for message in messages %}
#     {% if message['input'] != '' %}
#     Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
#     {% else %}
#     Below is an instruction that describes a task. Write a response that appropriately completes the request.
#     {% endif %}
#     ### Instruction:
#     {{ message['instruction'] }}
#     {% if message['input'] != '' %}
#     ### Input:
#     {{ message['input'] }}
#     {% endif %}
#     ### Response:
#     {{ message['output'] + eos_token }}
#     {% endfor %}
#   tokenize: True
scenarios:
  - name: accelerated-moe-full
    framework_config:
      - # without acceleration
      - moe-scattermoe-granite-ep1
      - moe-scattermoe-granite-ep2
      - moe-scattermoe-granite-ep4
      - moe-scattermoe-granite-ep1-padding-free
      - moe-scattermoe-granite-ep1-padding-free-foak
      - moe-scattermoe-granite-ep2-padding-free
      - moe-scattermoe-granite-ep2-padding-free-foak
      - moe-scattermoe-granite-ep4-padding-free
      - moe-scattermoe-granite-ep4-padding-free-foak
    arguments:
      learning_rate: 5e-5
      torch_dtype: bfloat16
      gradient_accumulation_steps: null
      per_device_train_batch_size: 8
      logging_steps: 1
      packing: False
      adam_epsilon: 1e-8
      model_name_or_path:
        - 'ibm-granite/granite-3.0-3b-a800m-instruct'
        - 'ibm-research/moe-7b-1b-active-shared-experts'
        - 'ibm-granite/granite-4.0-tiny-preview'
  - name: accelerated-moe-full-mixtral
    framework_config:
      - # without acceleration
      - moe-scattermoe-granite-ep8
      - moe-scattermoe-granite-ep8-foak
    slow: True
    arguments:
      learning_rate: 5e-5
      torch_dtype: bfloat16
      accelerator_config: scripts/benchmarks/accelerator-config.json
      gradient_accumulation_steps: null
      per_device_train_batch_size: 1
      logging_steps: 1
      packing: False
      adam_epsilon: 1e-8
      model_name_or_path:
        - 'mistralai/Mixtral-8x7B-Instruct-v0.1'