Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
384 changes: 384 additions & 0 deletions config_files/training/config_lorem_ipsum_long_moe_ep_fsdp2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,384 @@
# Shared values; the component entries in this file read them through
# ${settings...} interpolations.
settings:
  # Both values are supplied through the `modalities_env:` interpolation.
  experiment_id: ${modalities_env:experiment_id}
  config_file_path: ${modalities_env:config_file_path}

  # Key names handed to the collator, datasets, model and loss entries.
  referencing_keys:
    sample_key: input_ids
    target_key: target_ids
    prediction_key: logits

  # Rank and world-size values, read through the `cuda_env:` interpolation.
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
    global_rank: ${cuda_env:RANK}
    world_size: ${cuda_env:WORLD_SIZE}

  paths:
    checkpoint_saving_path: data/checkpoints
    train_dataset_path: ./data/lorem_ipsum_long.pbin
    test_dataset_path: ./data/lorem_ipsum.pbin
    experiments_root_path: ${modalities_env:experiments_root_path}

  # Logging, checkpointing and evaluation cadence, each counted in steps.
  intervals:
    training_log_interval_in_steps: 1
    checkpointing_interval_in_steps: 32
    evaluation_interval_in_steps: 32

  # All four consistency checks are switched off in this config.
  consistency_enforcement:
    enforce_tokens_per_step_consistency: false
    enforce_last_step_logged: false
    enforce_last_step_evaluated: false
    enforce_last_step_checkpointed: false

  # Micro-batch size, sequence length and gradient accumulation that the
  # dataset, dataloader and model entries interpolate.
  step_profile:
    gradient_accumulation_steps: 1
    local_train_micro_batch_size: 1
    sequence_length: 256
    # Points at the top-level `dp_degree` entry instead of a literal number.
    dp_degree:
      instance_key: dp_degree
      pass_type: BY_REFERENCE

  # Neither target is hard-coded: each is declared as a `number_conversion`
  # component whose inputs come from the paths and step profile above.
  training_target:
    num_target_tokens:
      component_key: number_conversion
      variant_key: num_tokens_from_packed_mem_map_dataset_continuous
      config:
        dataset_path: ${settings.paths.train_dataset_path}
        sequence_length: ${settings.step_profile.sequence_length}
        dp_degree:
          instance_key: dp_degree
          pass_type: BY_REFERENCE
        local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
        gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}
    num_target_steps:
      component_key: number_conversion
      variant_key: num_steps_from_num_tokens
      config:
        dp_degree:
          instance_key: dp_degree
          pass_type: BY_REFERENCE
        local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size}
        # Feeds the token target declared just above into the step conversion.
        global_num_tokens: ${settings.training_target.num_target_tokens}
        sequence_length: ${settings.step_profile.sequence_length}
        gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps}

  # Progress counters all start at zero, with last_step at -1
  # (presumably a fresh, non-resumed run — confirm against the trainer).
  training_progress:
    global_num_seen_tokens: 0
    num_seen_steps: 0
    num_seen_samples: 0
    last_step: -1

# Collator shared by the train and test dataloaders; its sample/target key
# names come from settings.referencing_keys.
collate_fn:
  component_key: collate_fn
  variant_key: gpt_2_llm_collator
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    target_key: ${settings.referencing_keys.target_key}

# Training dataset, read from settings.paths.train_dataset_path with the
# sequence length taken from the step profile.
train_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: ${settings.paths.train_dataset_path}
    sequence_length: ${settings.step_profile.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}

# Training dataloader: wraps `train_dataset` with a batch sampler that in turn
# wraps a resumable distributed sampler, and collates with `collate_fn`.
train_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    num_workers: 2
    pin_memory: true
    # progress_subscriber interpolates this tag.
    dataloader_tag: train
    dataset:
      instance_key: train_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        batch_size: ${settings.step_profile.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: resumable_distributed_sampler
          config:
            dataset:
              instance_key: train_dataset
              pass_type: BY_REFERENCE
            # One replica per process: rank and replica count are the global
            # rank and world size from settings.cuda_env.
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: true
            seed: 42
            drop_last: true
            # Taken from the training-progress counters (0 in this config).
            skip_num_global_samples: ${settings.training_progress.num_seen_samples}
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE

# Evaluation dataset: same variant and sequence length as `train_dataset`,
# but read from settings.paths.test_dataset_path.
test_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: ${settings.paths.test_dataset_path}
    sequence_length: ${settings.step_profile.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}

# Evaluation dataloader over `test_dataset`; unlike the training loader it uses
# the plain `distributed_sampler` variant with shuffling turned off.
test_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    num_workers: 2
    pin_memory: true
    dataloader_tag: test
    dataset:
      instance_key: test_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        # Reuses the training micro-batch size for evaluation.
        batch_size: ${settings.step_profile.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: distributed_sampler
          config:
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: false
            drop_last: true
            dataset:
              instance_key: test_dataset
              pass_type: BY_REFERENCE
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE

# List of evaluation dataloaders; the single element points at the
# `test_dataloader` entry.
eval_dataloaders:
  - instance_key: test_dataloader
    pass_type: BY_REFERENCE

# Checkpoint saving, split into a strategy component and an execution
# component.
checkpoint_saving:
  component_key: checkpoint_saving
  variant_key: default
  config:
    checkpoint_saving_strategy:
      component_key: checkpoint_saving_strategy
      variant_key: save_k_most_recent_checkpoints_strategy
      config:
        # NOTE(review): -1 presumably means "keep every checkpoint" — confirm
        # against the strategy implementation.
        k: -1
    checkpoint_saving_execution:
      component_key: checkpoint_saving_execution
      variant_key: dcp
      config:
        checkpoint_path: ${settings.paths.checkpoint_saving_path}
        global_rank: ${settings.cuda_env.global_rank}
        experiment_id: ${settings.experiment_id}

# Loss component (`moe_cross_entropy` variant). Besides the target/prediction
# key names it receives a reference to the `model_raw` entry.
loss_fn:
  component_key: loss
  variant_key: moe_cross_entropy
  config:
    target_key: ${settings.referencing_keys.target_key}
    # `model_raw` interpolates this value via ${loss_fn.config.prediction_key}.
    prediction_key: ${settings.referencing_keys.prediction_key}
    model:
      instance_key: model_raw
      pass_type: BY_REFERENCE

# Device mesh shared by the wrapping, optimizer, clipping and MFU entries.
device_mesh:
  component_key: device_mesh
  variant_key: default
  config:
    device_type: cuda
    data_parallel_replicate_degree: 1
    # NOTE(review): -1 presumably lets the shard degree be derived from the
    # remaining world size — confirm against the device_mesh component.
    data_parallel_shard_degree: -1
    expert_parallel_degree: 4
    world_size: ${settings.cuda_env.world_size}

# Number derived from `device_mesh` over the listed parallelism methods;
# settings.step_profile and settings.training_target reference this entry.
dp_degree:
  component_key: number_conversion
  variant_key: parallel_degree
  config:
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    parallelism_methods:
      - dp_shard
      - dp_replicate

# Bundles references to the initialized model, the optimizer and the LR
# scheduler entries of this file.
app_state:
  component_key: app_state
  variant_key: raw
  config:
    model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE
    lr_scheduler:
      instance_key: lr_scheduler
      pass_type: BY_REFERENCE

# Pairs the `fsdp_model` entry with a `model_initialization` component
# (`composed` variant).
# NOTE(review): this view of the file carries no indentation, so the nesting of
# the keys below is not visible here — confirm against the original file before
# re-indenting.
initialized_model:
component_key: model
variant_key: model_initialized
config:
model:
instance_key: fsdp_model
pass_type: BY_REFERENCE
model_initializer:
component_key: model_initialization
variant_key: composed
config:
model_type: gpt2
weight_init_type: scaled
mean: 0.0
std: 0.02
# Interpolated from the `model_raw` entry's config (num_layers: 2 there).
num_layers: ${model_raw.config.num_layers}
# NOTE(review): cannot tell from this view whether this key belongs to the
# initializer's config or to the enclosing model config — verify.
multi_device_generator_policy: error

# First wrapping stage: `model_raw` wrapped by the `ep_wrapped` variant on the
# shared device mesh. `ac_model` consumes the result.
ep_model:
  component_key: model
  variant_key: ep_wrapped
  config:
    model:
      instance_key: model_raw
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    mixed_precision_settings:
      param_dtype: BF_16
      reduce_dtype: BF_16
    block_names:
      - TransformerBlock

# Second wrapping stage: activation checkpointing applied on top of `ep_model`.
# `fsdp_model` consumes the result.
ac_model:
  component_key: model
  variant_key: activation_checkpointed
  config:
    model:
      instance_key: ep_model
      pass_type: BY_REFERENCE
    ac_variant: full_activation_checkpointing
    # NOTE(review): presumably the attribute path of the layer container inside
    # the model — confirm against the MoE model definition.
    layers_fqn: layers
    ac_fun_params:
      ac_freq: 1

# Third wrapping stage: `ac_model` wrapped by the `fsdp2_wrapped` variant on the
# shared device mesh. `initialized_model` consumes the result.
fsdp_model:
  component_key: model
  variant_key: fsdp2_wrapped
  config:
    model:
      instance_key: ac_model
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    # Same dtypes as the mixed-precision settings of `ep_model`.
    mixed_precision_settings:
      param_dtype: BF_16
      reduce_dtype: BF_16
    reshard_after_forward: true
    block_names:
      - TransformerBlock

# Unwrapped MoE model. The wrapping chain in this file is
# model_raw -> ep_model -> ac_model -> fsdp_model -> initialized_model.
model_raw:
  component_key: model
  variant_key: moe
  config:
    # Both key names come straight from settings.referencing_keys. The
    # prediction key resolves to the same value the loss entry uses, without
    # making the model config depend on the loss config.
    sample_key: ${settings.referencing_keys.sample_key}
    prediction_key: ${settings.referencing_keys.prediction_key}
    vocab_size: 50304
    max_seq_len: ${settings.step_profile.sequence_length}

    # Dense transformer dimensions. `num_layers` and `d_model` are interpolated
    # elsewhere in this file via ${model_raw.config...}.
    d_model: 128
    n_heads: 8
    n_kv_heads: 4
    num_layers: 2
    d_ff: 128
    attn_dropout: 0.0
    ffn_dropout: 0.0
    tie_embeddings: false
    # Written with an explicit mantissa dot so that YAML 1.1 loaders type it as
    # a float rather than a string.
    norm_eps: 1.0e-6
    rope_base: 1000000.0

    # Mixture-of-experts settings: 8 experts, top-2 selection.
    moe_num_experts: 8
    moe_top_k: 2
    moe_d_ff: 128
    moe_capacity_factor: 1.25
    moe_min_capacity: 4
    moe_overflow_policy: residual
    moe_router_noise_std: 0.0
    moe_router_temperature: 1.0
    moe_router_dropout: 0.0
    moe_aux_loss_coef: 0.001
    moe_z_loss_coef: 0.0

# Learning-rate scheduler (`onecycle_lr` variant) attached to the `optimizer`
# entry; its length is the step target computed under settings.training_target.
lr_scheduler:
  component_key: scheduler
  variant_key: onecycle_lr
  config:
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE
    # Written with an explicit mantissa dot so that YAML 1.1 loaders type it as
    # a float rather than a string.
    max_lr: 6.0e-4
    div_factor: 10
    final_div_factor: 1
    total_steps: ${settings.training_target.num_target_steps}
    pct_start: 0.01
    anneal_strategy: cos
    # Taken from the training-progress counters (-1 in this config).
    last_epoch: ${settings.training_progress.last_step}

# Optimizer (`ep_adam_w` variant) over the initialized model; it also receives
# the shared device mesh.
optimizer:
  component_key: optimizer
  variant_key: ep_adam_w
  config:
    lr: 0.0001
    betas: [0.9, 0.95]
    # Floats are written with an explicit mantissa dot (or as plain decimals)
    # so that YAML 1.1 loaders type them as floats rather than strings.
    eps: 1.0e-8
    weight_decay: 0.1
    weight_decay_groups_excluded:
      - embedding
      - layernorm
    wrapped_model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE

# Gradient clipper (`ep` variant) for the initialized model, using the shared
# device mesh.
gradient_clipper:
  component_key: gradient_clipper
  variant_key: ep
  config:
    wrapped_model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    norm_type: P2_NORM
    max_norm: 1.0
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE

# Progress display (`rich` variant). Step counts come from settings, the train
# tag from the train dataloader entry, and the eval loaders by reference.
progress_subscriber:
  component_key: progress_subscriber
  variant_key: rich
  config:
    global_rank: ${settings.cuda_env.global_rank}
    num_seen_steps: ${settings.training_progress.num_seen_steps}
    num_target_steps: ${settings.training_target.num_target_steps}
    train_dataloader_tag: ${train_dataloader.config.dataloader_tag}
    eval_dataloaders:
      instance_key: eval_dataloaders
      pass_type: BY_REFERENCE

# Results subscriber (`wandb` variant), configured with mode OFFLINE and the
# storage directory `wandb_storage`.
evaluation_subscriber:
  component_key: results_subscriber
  variant_key: wandb
  config:
    global_rank: ${settings.cuda_env.global_rank}
    project: modalities_dcp_tests
    mode: OFFLINE
    experiment_id: ${settings.experiment_id}
    directory: wandb_storage
    config_file_path: ${settings.config_file_path}

# MFU calculator (`gpt2` variant). Layer count and embedding width are
# interpolated from `model_raw`; sequence length and world size from settings.
# NOTE(review): the model is the `moe` variant while this calculator is the
# `gpt2` variant — confirm that its estimate accounts for the expert layers.
mfu_calculator:
  component_key: mfu_calculator
  variant_key: gpt2
  config:
    n_layer: ${model_raw.config.num_layers}
    sequence_length: ${settings.step_profile.sequence_length}
    n_embd: ${model_raw.config.d_model}
    world_size: ${settings.cuda_env.world_size}
    wrapped_model:
      instance_key: initialized_model
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
Loading
Loading