@@ -2409,75 +2409,75 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
24092409
24102410 # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
24112411 # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
2412- if self .using_pipeline_parallelism and self .mesh_axes and self .mesh_axes [0 ] == "stage" :
2413- self .ici_parallelism = [
2414- self .ici_diloco_parallelism ,
2415- self .ici_pipeline_parallelism ,
2416- self .ici_data_parallelism ,
2417- self .ici_fsdp_parallelism ,
2418- self .ici_fsdp_transpose_parallelism ,
2419- self .ici_sequence_parallelism ,
2420- self .ici_context_parallelism ,
2421- self .ici_context_autoregressive_parallelism ,
2422- self .ici_tensor_parallelism ,
2423- self .ici_tensor_transpose_parallelism ,
2424- self .ici_tensor_sequence_parallelism ,
2425- self .ici_expert_parallelism ,
2426- self .ici_autoregressive_parallelism ,
2427- ]
2428- self .dcn_parallelism = [
2429- self .dcn_diloco_parallelism ,
2430- self .dcn_pipeline_parallelism ,
2431- self .dcn_data_parallelism ,
2432- self .dcn_fsdp_parallelism ,
2433- self .dcn_fsdp_transpose_parallelism ,
2434- self .dcn_sequence_parallelism ,
2435- self .dcn_context_parallelism ,
2436- self .dcn_context_autoregressive_parallelism ,
2437- self .dcn_tensor_parallelism ,
2438- self .dcn_tensor_transpose_parallelism ,
2439- self .dcn_tensor_sequence_parallelism ,
2440- self .dcn_expert_parallelism ,
2441- self .dcn_autoregressive_parallelism ,
2442- ]
2443- else :
2444- ici_map = {
2445- "diloco" : self .ici_diloco_parallelism ,
2446- "data" : self .ici_data_parallelism ,
2447- "stage" : self .ici_pipeline_parallelism ,
2448- "fsdp" : self .ici_fsdp_parallelism ,
2449- "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2450- "sequence" : self .ici_sequence_parallelism ,
2451- "context" : self .ici_context_parallelism ,
2452- "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2453- "tensor" : self .ici_tensor_parallelism ,
2454- "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2455- "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2456- "model" : self .ici_tensor_parallelism ,
2457- "expert" : self .ici_expert_parallelism ,
2458- "autoregressive" : self .ici_autoregressive_parallelism ,
2459- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2460- }
2461- self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2462-
2463- dcn_map = {
2464- "diloco" : self .dcn_diloco_parallelism ,
2465- "data" : self .dcn_data_parallelism ,
2466- "stage" : self .dcn_pipeline_parallelism ,
2467- "fsdp" : self .dcn_fsdp_parallelism ,
2468- "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2469- "sequence" : self .dcn_sequence_parallelism ,
2470- "context" : self .dcn_context_parallelism ,
2471- "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2472- "tensor" : self .dcn_tensor_parallelism ,
2473- "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2474- "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2475- "model" : self .dcn_tensor_parallelism ,
2476- "expert" : self .dcn_expert_parallelism ,
2477- "autoregressive" : self .dcn_autoregressive_parallelism ,
2478- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2479- }
2480- self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
2412+ # if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
2413+ # self.ici_parallelism = [
2414+ # self.ici_diloco_parallelism,
2415+ # self.ici_pipeline_parallelism,
2416+ # self.ici_data_parallelism,
2417+ # self.ici_fsdp_parallelism,
2418+ # self.ici_fsdp_transpose_parallelism,
2419+ # self.ici_sequence_parallelism,
2420+ # self.ici_context_parallelism,
2421+ # self.ici_context_autoregressive_parallelism,
2422+ # self.ici_tensor_parallelism,
2423+ # self.ici_tensor_transpose_parallelism,
2424+ # self.ici_tensor_sequence_parallelism,
2425+ # self.ici_expert_parallelism,
2426+ # self.ici_autoregressive_parallelism,
2427+ # ]
2428+ # self.dcn_parallelism = [
2429+ # self.dcn_diloco_parallelism,
2430+ # self.dcn_pipeline_parallelism,
2431+ # self.dcn_data_parallelism,
2432+ # self.dcn_fsdp_parallelism,
2433+ # self.dcn_fsdp_transpose_parallelism,
2434+ # self.dcn_sequence_parallelism,
2435+ # self.dcn_context_parallelism,
2436+ # self.dcn_context_autoregressive_parallelism,
2437+ # self.dcn_tensor_parallelism,
2438+ # self.dcn_tensor_transpose_parallelism,
2439+ # self.dcn_tensor_sequence_parallelism,
2440+ # self.dcn_expert_parallelism,
2441+ # self.dcn_autoregressive_parallelism,
2442+ # ]
2443+ # else:
2444+ ici_map = {
2445+ "diloco" : self .ici_diloco_parallelism ,
2446+ "data" : self .ici_data_parallelism ,
2447+ "stage" : self .ici_pipeline_parallelism ,
2448+ "fsdp" : self .ici_fsdp_parallelism ,
2449+ "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2450+ "sequence" : self .ici_sequence_parallelism ,
2451+ "context" : self .ici_context_parallelism ,
2452+ "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2453+ "tensor" : self .ici_tensor_parallelism ,
2454+ "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2455+ "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2456+ "model" : self .ici_tensor_parallelism ,
2457+ "expert" : self .ici_expert_parallelism ,
2458+ "autoregressive" : self .ici_autoregressive_parallelism ,
2459+ "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2460+ }
2461+ self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2462+
2463+ dcn_map = {
2464+ "diloco" : self .dcn_diloco_parallelism ,
2465+ "data" : self .dcn_data_parallelism ,
2466+ "stage" : self .dcn_pipeline_parallelism ,
2467+ "fsdp" : self .dcn_fsdp_parallelism ,
2468+ "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2469+ "sequence" : self .dcn_sequence_parallelism ,
2470+ "context" : self .dcn_context_parallelism ,
2471+ "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2472+ "tensor" : self .dcn_tensor_parallelism ,
2473+ "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2474+ "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2475+ "model" : self .dcn_tensor_parallelism ,
2476+ "expert" : self .dcn_expert_parallelism ,
2477+ "autoregressive" : self .dcn_autoregressive_parallelism ,
2478+ "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2479+ }
2480+ self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
24812481
24822482 # Diloco params
24832483 self .num_diloco_replicas = int (self .ici_diloco_parallelism * self .dcn_diloco_parallelism )
0 commit comments