
Commit ede150e

rrutmann and Copilot committed
chore: Change order of model initialization
Co-authored-by: Copilot <copilot@github.com>
1 parent 7541df2 commit ede150e

2 files changed

Lines changed: 193 additions & 3 deletions

tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass_defer_init.yaml

Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,186 @@
settings:
  experiment_id: ${modalities_env:experiment_id}
  config_file_path: ${modalities_env:config_file_path}
  referencing_keys:
    sample_key: input_ids
    target_key: target_ids
    prediction_key: logits
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
    global_rank: ${cuda_env:RANK}
    world_size: ${cuda_env:WORLD_SIZE}
  step_profile:
    gradient_accumulation_steps: 1
    local_train_micro_batch_size: 4
    sequence_length: 256

loss_fn:
  component_key: loss
  variant_key: clm_cross_entropy_loss
  config:
    target_key: ${settings.referencing_keys.target_key}
    prediction_key: ${settings.referencing_keys.prediction_key}

device_mesh:
  component_key: device_mesh
  variant_key: default
  config:
    device_type: cuda
    data_parallel_replicate_degree: 1
    pipeline_parallel_degree: 2
    data_parallel_shard_degree: -1
    world_size: ${settings.cuda_env.world_size}

initialized_model:
  component_key: model
  variant_key: model_initialized
  config:
    model:
      component_key: pipeline
      variant_key: selector
      config:
        pipeline:
          instance_key: scheduled_pipeline
          pass_type: BY_REFERENCE
        selection_type: MODEL_PART
    model_initializer:
      component_key: model_initialization
      variant_key: composed
      config:
        model_type: gpt2
        weight_init_type: scaled
        mean: 0.0
        std: 0.02
        seed: 42
        num_layers: ${model_raw.config.n_layer}
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE

scheduled_pipeline:
  component_key: pipeline
  variant_key: scheduled
  config:
    loss_fn:
      instance_key: loss_fn
      pass_type: BY_REFERENCE
    pp_schedule_name: Interleaved1F1B
    batch_size: ${settings.step_profile.local_train_micro_batch_size}
    microbatch_size: 2
    pp_degree: ${device_mesh.config.pipeline_parallel_degree}
    pipeline:
      component_key: pipeline
      variant_key: builder
      config:
        pp_stage:
          component_key: pipeline
          variant_key: selector
          config:
            pipeline:
              instance_key: staged_pipeline
              pass_type: BY_REFERENCE
            selection_type: PP_STAGE
        model_part:
          instance_key: fsdp_model
          pass_type: BY_REFERENCE

fsdp_model:
  component_key: model
  variant_key: fsdp2_wrapped
  config:
    model:
      instance_key: gpt2_tp_model
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    mixed_precision_settings:
      param_dtype: BF_16
      reduce_dtype: BF_16
    block_names: [GPT2Block]

gpt2_tp_model:
  component_key: model
  variant_key: gpt2_tp
  config:
    model:
      instance_key: model_part
      pass_type: BY_REFERENCE
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE

model_part:
  component_key: pipeline
  variant_key: selector
  config:
    pipeline:
      instance_key: staged_pipeline
      pass_type: BY_REFERENCE
    selection_type: MODEL_PART

staged_pipeline:
  component_key: pipeline
  variant_key: staged
  config:
    whole_model:
      instance_key: model_raw
      pass_type: BY_REFERENCE
    stages_generator:
      component_key: stages_generator
      variant_key: gpt2_stages_generator
      config:
        num_model_layers: ${model_raw.config.n_layer}
        input_layer_equivalence: 1
        output_layer_equivalence: 1
    device_mesh:
      instance_key: device_mesh
      pass_type: BY_REFERENCE
    local_rank: ${settings.cuda_env.local_rank}
    pp_schedule_name: ${scheduled_pipeline.config.pp_schedule_name}
    num_layers_per_stage: 4

model_raw:
  component_key: model
  variant_key: gpt2
  config:
    use_meta_device: true
    use_weight_tying: false
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: NOPE
    sequence_length: ${settings.step_profile.sequence_length}
    prediction_key: ${loss_fn.config.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: 6
    n_head_q: 8
    n_head_kv: 4
    ffn_hidden: 128
    n_embd: 128
    dropout: 0.0
    bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    attention_config:
      qkv_transforms:
        - type_hint: RotaryTransform
          config:
            n_embd: ${model_raw.config.n_embd}
            n_head: ${model_raw.config.n_head_q} # it has to be head_q here
            seq_length_dim: -2
            base_freq: 10000
    attention_implementation: manual
    activation_type: swiglu
    attention_norm_config:
      norm_type: layer_norm
      config:
        normalized_shape: ${model_raw.config.n_embd}
        eps: 1e-5
    ffn_norm_config:
      norm_type: layer_norm
      config:
        normalized_shape: ${model_raw.config.n_embd}
        eps: 1e-5
    lm_head_norm_config:
      norm_type: layer_norm
      config:
        normalized_shape: ${model_raw.config.n_embd}
        eps: 1e-5
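Note on the config above: model_raw is built on the meta device (use_meta_device: true), and initialized_model runs the composed weight initializer on the model parts selected from the already scheduled pipeline, i.e. weights are materialized and initialized per pipeline stage rather than for the whole model up front. A minimal, generic PyTorch sketch of that pattern (illustration only, not the modalities implementation; the toy module and sizes are made up):

import torch
import torch.nn as nn

# 1) Build the full model on the meta device: parameter metadata only, no memory is allocated.
with torch.device("meta"):
    whole_model = nn.Sequential(*[nn.Linear(128, 128) for _ in range(6)])

# 2) Keep only the blocks belonging to this pipeline stage (first 3 of 6 here).
stage = nn.Sequential(*list(whole_model.children())[:3])

# 3) Materialize the stage on a real device and only then initialize weights,
#    so every rank touches only the parameters it actually owns.
stage = stage.to_empty(device="cpu")
torch.manual_seed(42)  # mirrors `seed: 42` in the config above
for module in stage.modules():
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=0.02)
        nn.init.zeros_(module.bias)

print(all(p.device.type == "cpu" for p in stage.parameters()))  # True

In the config, the pipeline selector and staged/scheduled pipeline components play the role of step 2, and the composed model initializer plays the role of step 3.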

tests/fsdp2_parallelization/test_parallel_seed_initialization.py

Lines changed: 7 additions & 3 deletions
@@ -68,7 +68,7 @@ def _seed_distribution_impl_wrapper(self, process_id: int, world_size: int, tmp_
     def _seed_distribution_impl(self, world_size: int, tmp_path: Path):
         # initialize components
         class ComponentsInstantiationModel(BaseModel):
-            fsdp_model: PydanticFSDP2ModuleType | list[PydanticFSDP2ModuleType]
+            initialized_model: PydanticFSDP2ModuleType | list[PydanticFSDP2ModuleType]
             device_mesh: PydanticDeviceMeshIFType

         config_file_path = self._get_tmp_sharding_config_path(dp_degree=2, tp_degree=2, pp_degree=2, tmp_path=tmp_path)
@@ -78,7 +78,10 @@ class ComponentsInstantiationModel(BaseModel):
             main_obj.build_components(components_model_type=ComponentsInstantiationModel),
         )
         model = cast(
-            Any, components.fsdp_model[0] if isinstance(components.fsdp_model, list) else components.fsdp_model
+            Any,
+            components.initialized_model[0]
+            if isinstance(components.initialized_model, list)
+            else components.initialized_model,
         )
         device_mesh = components.device_mesh
         # for each pp stage get first transformer block's MLP weight parameter shards and full tensor
@@ -148,7 +151,8 @@ def _get_tmp_sharding_config_path(self, dp_degree: int, tp_degree: int, pp_degre
         temp_file_path = tmp_path / "pp_tp_sharding_config.yaml"
         working_dir = Path(os.path.dirname(__file__))
         config_file_path = (
-            working_dir / "pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml"
+            working_dir
+            / "pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass_defer_init.yaml"
         )

         with open(config_file_path, "r") as file:
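The helper _get_tmp_sharding_config_path shown in the last hunk reads the base config and presumably writes a temporary copy (pp_tp_sharding_config.yaml) with the requested parallel degrees patched in. A rough sketch of that idea, assuming PyYAML and a hypothetical helper name, not the repository's actual implementation:

from pathlib import Path

import yaml


def make_tmp_sharding_config(base_config_path: Path, tmp_path: Path, dp_degree: int, pp_degree: int) -> Path:
    # Load the committed base config (e.g. the defer_init YAML added in this commit).
    with open(base_config_path, "r") as file:
        config = yaml.safe_load(file)

    # Patch only fields that exist in the config shown above.
    mesh_config = config["device_mesh"]["config"]
    mesh_config["data_parallel_shard_degree"] = dp_degree
    mesh_config["pipeline_parallel_degree"] = pp_degree

    # Write the patched copy next to the test's tmp_path and hand back its location.
    temp_file_path = tmp_path / "pp_tp_sharding_config.yaml"
    with open(temp_file_path, "w") as file:
        yaml.safe_dump(config, file)
    return temp_file_path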
