diff --git a/launcher_scripts/conf/evaluation/peft_mixtral/squad.yaml b/launcher_scripts/conf/evaluation/peft_mixtral/squad.yaml index a87f690f0..c1455c438 100755 --- a/launcher_scripts/conf/evaluation/peft_mixtral/squad.yaml +++ b/launcher_scripts/conf/evaluation/peft_mixtral/squad.yaml @@ -28,8 +28,8 @@ exp_manager: model: seed: 1234 - tensor_model_parallel_size: 8 - pipeline_model_parallel_size: 1 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 8 global_batch_size: 32 micro_batch_size: 4 @@ -39,7 +39,7 @@ model: sync_batch_comm: False megatron_amp_02: False - sequence_parallel: True + sequence_parallel: False activations_checkpoint_granularity: null activations_checkpoint_method: null diff --git a/launcher_scripts/conf/evaluation/peft_mixtral/squad_8x22b.yaml b/launcher_scripts/conf/evaluation/peft_mixtral/squad_8x22b.yaml index ac485de74..ddb02dbd8 100755 --- a/launcher_scripts/conf/evaluation/peft_mixtral/squad_8x22b.yaml +++ b/launcher_scripts/conf/evaluation/peft_mixtral/squad_8x22b.yaml @@ -28,8 +28,8 @@ exp_manager: model: seed: 1234 - tensor_model_parallel_size: 8 - pipeline_model_parallel_size: 1 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 8 global_batch_size: 32 micro_batch_size: 4 @@ -39,7 +39,7 @@ model: sync_batch_comm: False megatron_amp_02: False - sequence_parallel: True + sequence_parallel: False activations_checkpoint_granularity: null activations_checkpoint_method: null diff --git a/launcher_scripts/conf/peft/mixtral/squad.yaml b/launcher_scripts/conf/peft/mixtral/squad.yaml index c9692a4d7..216980be6 100644 --- a/launcher_scripts/conf/peft/mixtral/squad.yaml +++ b/launcher_scripts/conf/peft/mixtral/squad.yaml @@ -55,8 +55,8 @@ exp_manager: model: seed: 1234 - tensor_model_parallel_size: 8 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 8 # inter-layer model parallelism tp_comm_overlap_disable_qkv: True global_batch_size: 128 @@ -71,7 +71,7 @@ model: ## Sequence Parallelism # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. - sequence_parallel: True + sequence_parallel: False ## Activation Checkpoint activations_checkpoint_granularity: null # 'selective' or 'full' diff --git a/launcher_scripts/conf/peft/mixtral/squad_8x22b.yaml b/launcher_scripts/conf/peft/mixtral/squad_8x22b.yaml index 2ad3d010f..9708fb78a 100644 --- a/launcher_scripts/conf/peft/mixtral/squad_8x22b.yaml +++ b/launcher_scripts/conf/peft/mixtral/squad_8x22b.yaml @@ -55,8 +55,8 @@ exp_manager: model: seed: 1234 - tensor_model_parallel_size: 8 # intra-layer model parallelism - pipeline_model_parallel_size: 1 # inter-layer model parallelism + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 8 # inter-layer model parallelism tp_comm_overlap_disable_qkv: True global_batch_size: 128