This script tests to ensure that `accelerate` performs at the same level as raw `MS-AMP`.

This particular script verifies this for DeepSpeed training.

NOTE: MS-AMP does *not* support ZeRO-3.
"""
20- from unittest .mock import patch
2122
22- from msamp import deepspeed
23+ # import msamp. deepspeed as msamp_deepspeed
2324import evaluate
2425import torch
25- # import transformer_engine.common.recipe as te_recipe
26- # import transformer_engine.pytorch as te
2726from fp8_utils import evaluate_model , get_training_utilities
28- # from transformer_engine.common.recipe import DelayedScaling
27+ from msamp import deepspeed as msamp_deepspeed
2928
3029from accelerate import Accelerator , DeepSpeedPlugin
3130from accelerate .state import AcceleratorState
32- from accelerate .utils import FP8RecipeKwargs , set_seed
31+ from accelerate .utils import set_seed
3332
3433
3534MODEL_NAME = "bert-base-cased"
3635METRIC = evaluate .load ("glue" , "mrpc" )
3736
3837
3938def train_baseline (zero_stage : int = 1 , opt_level : str = "O1" ):
40- # This forces transformers to think Zero-3 Init should be used
41- with patch ("transformers.integrations.deepspeed.is_deepspeed_zero3_enabled" ) as mock :
42- mock .return_value = zero_stage == 3
4339 set_seed (42 )
44-
4540 accelerator = Accelerator ()
4641 model , optimizer , train_dataloader , eval_dataloader , lr_scheduler = get_training_utilities (
4742 MODEL_NAME , accelerator = accelerator
@@ -57,7 +52,6 @@ def train_baseline(zero_stage: int = 1, opt_level: str = "O1"):
5752 "stage" : zero_stage ,
5853 "offload_optimizer" : {"device" : "none" , "nvme_path" : None },
5954 "offload_param" : {"device" : "none" , "nvme_path" : None },
60- "stage3_gather_16bit_weights_on_model_save" : False ,
6155 },
6256 "gradient_clipping" : 1.0 ,
6357 "steps_per_print" : np .inf ,
@@ -67,15 +61,14 @@ def train_baseline(zero_stage: int = 1, opt_level: str = "O1"):
6761 "msamp" : {
6862 "enabled" : True ,
6963 "opt_level" : opt_level ,
70- }
64+ },
7165 }
72-
7366 (
7467 model ,
7568 optimizer ,
7669 _ ,
7770 _ ,
78- ) = deepspeed .initialize (
71+ ) = msamp_deepspeed .initialize (
7972 model = model ,
8073 optimizer = optimizer ,
8174 config_params = config ,
@@ -107,18 +100,14 @@ def train_baseline(zero_stage: int = 1, opt_level: str = "O1"):
107100 return base_model_results , trained_model_results
108101
109102
110- def train_integration (zero_stage : int = 1 ):
103+ def train_integration (zero_stage : int = 1 , opt_level : str = "O1" ):
111104 set_seed (42 )
112- FP8_RECIPE_KWARGS = {"fp8_format" : "HYBRID" , "amax_history_len" : 32 , "amax_compute_algo" : "max" }
113- kwargs_handlers = [FP8RecipeKwargs (backend = "TE" , ** FP8_RECIPE_KWARGS )]
114- AcceleratorState ()._reset_state (True )
115105 deepspeed_plugin = DeepSpeedPlugin (
116106 zero_stage = zero_stage ,
117- zero3_init_flag = zero_stage == 3 ,
118- )
119- accelerator = Accelerator (
120- mixed_precision = "fp8" , kwargs_handlers = kwargs_handlers , deepspeed_plugin = deepspeed_plugin
107+ enable_msamp = True ,
108+ msamp_opt_level = opt_level ,
121109 )
110+ accelerator = Accelerator (mixed_precision = "fp8" , deepspeed_plugin = deepspeed_plugin )
122111 accelerator .state .deepspeed_plugin .deepspeed_config ["train_micro_batch_size_per_gpu" ] = 16
123112
124113 model , optimizer , train_dataloader , eval_dataloader , lr_scheduler = get_training_utilities (
@@ -128,13 +117,9 @@ def train_integration(zero_stage: int = 1):
128117 model , optimizer , lr_scheduler = accelerator .prepare (model , optimizer , lr_scheduler )
129118 base_model_results = evaluate_model (model , eval_dataloader , METRIC , accelerator = accelerator )
130119 model .train ()
131- model_outputs = []
132- data = []
133120 for _ in range (2 ):
134121 for batch in train_dataloader :
135122 outputs = model (** batch )
136- data .append (batch .to ("cpu" ))
137- model_outputs .append (outputs .logits .to ("cpu" ))
138123 loss = outputs .loss
139124 accelerator .backward (loss )
140125 optimizer .step ()
@@ -151,32 +136,26 @@ def train_integration(zero_stage: int = 1):
151136 trained_model_results ["f1" ] > base_model_results ["f1" ]
152137 ), f'F1 score should be higher for the trained model: { trained_model_results ["f1" ]} > { base_model_results ["f1" ]} '
153138
139+ AcceleratorState ()._reset_state (True )
154140 return base_model_results , trained_model_results
155141
156142
if __name__ == "__main__":
    # Compare raw MS-AMP against the accelerate integration for every
    # supported configuration. ZeRO stage 3 is left out of the sweep; the
    # module docstring notes that MS-AMP does not support ZeRO-3.
    for zero_stage in [1, 2]:
        for opt_level in ["O1", "O2", "O3"]:
            baseline_not_trained, baseline_trained = train_baseline(zero_stage, opt_level)
            accelerator_not_trained, accelerator_trained = train_integration(zero_stage, opt_level)

            # Each pair is (baseline results, accelerate results); the
            # untrained pair is checked first, then the trained pair.
            result_pairs = (
                (baseline_not_trained, accelerator_not_trained),
                (baseline_trained, accelerator_trained),
            )
            # Metric key in the results mapping -> label used in the failure message.
            metrics = (("accuracy", "Accuracy"), ("f1", "F1 score"))

            for baseline_results, accelerator_results in result_pairs:
                for key, label in metrics:
                    # Exact equality is intentional: both paths are compared
                    # value-for-value, not within a tolerance.
                    assert baseline_results[key] == accelerator_results[key], (
                        f"ZERO stage {zero_stage}, opt_level={opt_level}:\n"
                        f"{label} should be the same for the baseline and accelerator: "
                        f"{baseline_results[key]} == {accelerator_results[key]}"
                    )

    torch.distributed.destroy_process_group()
0 commit comments