@@ -2502,75 +2502,75 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25022502
25032503 # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
25042504 # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
2505- if self .using_pipeline_parallelism and self .mesh_axes and self .mesh_axes [0 ] == "stage" :
2506- self .ici_parallelism = [
2507- self .ici_diloco_parallelism ,
2508- self .ici_pipeline_parallelism ,
2509- self .ici_data_parallelism ,
2510- self .ici_fsdp_parallelism ,
2511- self .ici_fsdp_transpose_parallelism ,
2512- self .ici_sequence_parallelism ,
2513- self .ici_context_parallelism ,
2514- self .ici_context_autoregressive_parallelism ,
2515- self .ici_tensor_parallelism ,
2516- self .ici_tensor_transpose_parallelism ,
2517- self .ici_tensor_sequence_parallelism ,
2518- self .ici_expert_parallelism ,
2519- self .ici_autoregressive_parallelism ,
2520- ]
2521- self .dcn_parallelism = [
2522- self .dcn_diloco_parallelism ,
2523- self .dcn_pipeline_parallelism ,
2524- self .dcn_data_parallelism ,
2525- self .dcn_fsdp_parallelism ,
2526- self .dcn_fsdp_transpose_parallelism ,
2527- self .dcn_sequence_parallelism ,
2528- self .dcn_context_parallelism ,
2529- self .dcn_context_autoregressive_parallelism ,
2530- self .dcn_tensor_parallelism ,
2531- self .dcn_tensor_transpose_parallelism ,
2532- self .dcn_tensor_sequence_parallelism ,
2533- self .dcn_expert_parallelism ,
2534- self .dcn_autoregressive_parallelism ,
2535- ]
2536- else :
2537- ici_map = {
2538- "diloco" : self .ici_diloco_parallelism ,
2539- "data" : self .ici_data_parallelism ,
2540- "stage" : self .ici_pipeline_parallelism ,
2541- "fsdp" : self .ici_fsdp_parallelism ,
2542- "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2543- "sequence" : self .ici_sequence_parallelism ,
2544- "context" : self .ici_context_parallelism ,
2545- "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2546- "tensor" : self .ici_tensor_parallelism ,
2547- "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2548- "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2549- "model" : self .ici_tensor_parallelism ,
2550- "expert" : self .ici_expert_parallelism ,
2551- "autoregressive" : self .ici_autoregressive_parallelism ,
2552- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2553- }
2554- self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2555-
2556- dcn_map = {
2557- "diloco" : self .dcn_diloco_parallelism ,
2558- "data" : self .dcn_data_parallelism ,
2559- "stage" : self .dcn_pipeline_parallelism ,
2560- "fsdp" : self .dcn_fsdp_parallelism ,
2561- "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2562- "sequence" : self .dcn_sequence_parallelism ,
2563- "context" : self .dcn_context_parallelism ,
2564- "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2565- "tensor" : self .dcn_tensor_parallelism ,
2566- "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2567- "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2568- "model" : self .dcn_tensor_parallelism ,
2569- "expert" : self .dcn_expert_parallelism ,
2570- "autoregressive" : self .dcn_autoregressive_parallelism ,
2571- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2572- }
2573- self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
2505+ # if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
2506+ # self.ici_parallelism = [
2507+ # self.ici_diloco_parallelism,
2508+ # self.ici_pipeline_parallelism,
2509+ # self.ici_data_parallelism,
2510+ # self.ici_fsdp_parallelism,
2511+ # self.ici_fsdp_transpose_parallelism,
2512+ # self.ici_sequence_parallelism,
2513+ # self.ici_context_parallelism,
2514+ # self.ici_context_autoregressive_parallelism,
2515+ # self.ici_tensor_parallelism,
2516+ # self.ici_tensor_transpose_parallelism,
2517+ # self.ici_tensor_sequence_parallelism,
2518+ # self.ici_expert_parallelism,
2519+ # self.ici_autoregressive_parallelism,
2520+ # ]
2521+ # self.dcn_parallelism = [
2522+ # self.dcn_diloco_parallelism,
2523+ # self.dcn_pipeline_parallelism,
2524+ # self.dcn_data_parallelism,
2525+ # self.dcn_fsdp_parallelism,
2526+ # self.dcn_fsdp_transpose_parallelism,
2527+ # self.dcn_sequence_parallelism,
2528+ # self.dcn_context_parallelism,
2529+ # self.dcn_context_autoregressive_parallelism,
2530+ # self.dcn_tensor_parallelism,
2531+ # self.dcn_tensor_transpose_parallelism,
2532+ # self.dcn_tensor_sequence_parallelism,
2533+ # self.dcn_expert_parallelism,
2534+ # self.dcn_autoregressive_parallelism,
2535+ # ]
2536+ # else:
2537+ ici_map = {
2538+ "diloco" : self .ici_diloco_parallelism ,
2539+ "data" : self .ici_data_parallelism ,
2540+ "stage" : self .ici_pipeline_parallelism ,
2541+ "fsdp" : self .ici_fsdp_parallelism ,
2542+ "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2543+ "sequence" : self .ici_sequence_parallelism ,
2544+ "context" : self .ici_context_parallelism ,
2545+ "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2546+ "tensor" : self .ici_tensor_parallelism ,
2547+ "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2548+ "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2549+ "model" : self .ici_tensor_parallelism ,
2550+ "expert" : self .ici_expert_parallelism ,
2551+ "autoregressive" : self .ici_autoregressive_parallelism ,
2552+ "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2553+ }
2554+ self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2555+
2556+ dcn_map = {
2557+ "diloco" : self .dcn_diloco_parallelism ,
2558+ "data" : self .dcn_data_parallelism ,
2559+ "stage" : self .dcn_pipeline_parallelism ,
2560+ "fsdp" : self .dcn_fsdp_parallelism ,
2561+ "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2562+ "sequence" : self .dcn_sequence_parallelism ,
2563+ "context" : self .dcn_context_parallelism ,
2564+ "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2565+ "tensor" : self .dcn_tensor_parallelism ,
2566+ "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2567+ "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2568+ "model" : self .dcn_tensor_parallelism ,
2569+ "expert" : self .dcn_expert_parallelism ,
2570+ "autoregressive" : self .dcn_autoregressive_parallelism ,
2571+ "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2572+ }
2573+ self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
25742574
25752575 # Diloco params
25762576 self .num_diloco_replicas = int (self .ici_diloco_parallelism * self .dcn_diloco_parallelism )
0 commit comments