@@ -2403,75 +2403,75 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
24032403
24042404 # I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
24052405 # Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
2406- if self .using_pipeline_parallelism and self .mesh_axes and self .mesh_axes [0 ] == "stage" :
2407- self .ici_parallelism = [
2408- self .ici_diloco_parallelism ,
2409- self .ici_pipeline_parallelism ,
2410- self .ici_data_parallelism ,
2411- self .ici_fsdp_parallelism ,
2412- self .ici_fsdp_transpose_parallelism ,
2413- self .ici_sequence_parallelism ,
2414- self .ici_context_parallelism ,
2415- self .ici_context_autoregressive_parallelism ,
2416- self .ici_tensor_parallelism ,
2417- self .ici_tensor_transpose_parallelism ,
2418- self .ici_tensor_sequence_parallelism ,
2419- self .ici_expert_parallelism ,
2420- self .ici_autoregressive_parallelism ,
2421- ]
2422- self .dcn_parallelism = [
2423- self .dcn_diloco_parallelism ,
2424- self .dcn_pipeline_parallelism ,
2425- self .dcn_data_parallelism ,
2426- self .dcn_fsdp_parallelism ,
2427- self .dcn_fsdp_transpose_parallelism ,
2428- self .dcn_sequence_parallelism ,
2429- self .dcn_context_parallelism ,
2430- self .dcn_context_autoregressive_parallelism ,
2431- self .dcn_tensor_parallelism ,
2432- self .dcn_tensor_transpose_parallelism ,
2433- self .dcn_tensor_sequence_parallelism ,
2434- self .dcn_expert_parallelism ,
2435- self .dcn_autoregressive_parallelism ,
2436- ]
2437- else :
2438- ici_map = {
2439- "diloco" : self .ici_diloco_parallelism ,
2440- "data" : self .ici_data_parallelism ,
2441- "stage" : self .ici_pipeline_parallelism ,
2442- "fsdp" : self .ici_fsdp_parallelism ,
2443- "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2444- "sequence" : self .ici_sequence_parallelism ,
2445- "context" : self .ici_context_parallelism ,
2446- "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2447- "tensor" : self .ici_tensor_parallelism ,
2448- "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2449- "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2450- "model" : self .ici_tensor_parallelism ,
2451- "expert" : self .ici_expert_parallelism ,
2452- "autoregressive" : self .ici_autoregressive_parallelism ,
2453- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2454- }
2455- self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2456-
2457- dcn_map = {
2458- "diloco" : self .dcn_diloco_parallelism ,
2459- "data" : self .dcn_data_parallelism ,
2460- "stage" : self .dcn_pipeline_parallelism ,
2461- "fsdp" : self .dcn_fsdp_parallelism ,
2462- "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2463- "sequence" : self .dcn_sequence_parallelism ,
2464- "context" : self .dcn_context_parallelism ,
2465- "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2466- "tensor" : self .dcn_tensor_parallelism ,
2467- "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2468- "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2469- "model" : self .dcn_tensor_parallelism ,
2470- "expert" : self .dcn_expert_parallelism ,
2471- "autoregressive" : self .dcn_autoregressive_parallelism ,
2472- "attn_dp" : 1 , # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2473- }
2474- self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
2406+ # if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
2407+ # self.ici_parallelism = [
2408+ # self.ici_diloco_parallelism,
2409+ # self.ici_pipeline_parallelism,
2410+ # self.ici_data_parallelism,
2411+ # self.ici_fsdp_parallelism,
2412+ # self.ici_fsdp_transpose_parallelism,
2413+ # self.ici_sequence_parallelism,
2414+ # self.ici_context_parallelism,
2415+ # self.ici_context_autoregressive_parallelism,
2416+ # self.ici_tensor_parallelism,
2417+ # self.ici_tensor_transpose_parallelism,
2418+ # self.ici_tensor_sequence_parallelism,
2419+ # self.ici_expert_parallelism,
2420+ # self.ici_autoregressive_parallelism,
2421+ # ]
2422+ # self.dcn_parallelism = [
2423+ # self.dcn_diloco_parallelism,
2424+ # self.dcn_pipeline_parallelism,
2425+ # self.dcn_data_parallelism,
2426+ # self.dcn_fsdp_parallelism,
2427+ # self.dcn_fsdp_transpose_parallelism,
2428+ # self.dcn_sequence_parallelism,
2429+ # self.dcn_context_parallelism,
2430+ # self.dcn_context_autoregressive_parallelism,
2431+ # self.dcn_tensor_parallelism,
2432+ # self.dcn_tensor_transpose_parallelism,
2433+ # self.dcn_tensor_sequence_parallelism,
2434+ # self.dcn_expert_parallelism,
2435+ # self.dcn_autoregressive_parallelism,
2436+ # ]
2437+ # else:
2438+ ici_map = {
2439+ "diloco" : self .ici_diloco_parallelism ,
2440+ "data" : self .ici_data_parallelism ,
2441+ "stage" : self .ici_pipeline_parallelism ,
2442+ "fsdp" : self .ici_fsdp_parallelism ,
2443+ "fsdp_transpose" : self .ici_fsdp_transpose_parallelism ,
2444+ "sequence" : self .ici_sequence_parallelism ,
2445+ "context" : self .ici_context_parallelism ,
2446+ "context_autoregressive" : self .ici_context_autoregressive_parallelism ,
2447+ "tensor" : self .ici_tensor_parallelism ,
2448+ "tensor_transpose" : self .ici_tensor_transpose_parallelism ,
2449+ "tensor_sequence" : self .ici_tensor_sequence_parallelism ,
2450+ "model" : self .ici_tensor_parallelism ,
2451+ "expert" : self .ici_expert_parallelism ,
2452+ "autoregressive" : self .ici_autoregressive_parallelism ,
2453+ "attn_dp" : 1 , # initialized to 1; vLLM auto-calculates this value based on TP and num_kv_heads
2454+ }
2455+ self .ici_parallelism = [ici_map [axis ] for axis in self .mesh_axes ]
2456+
2457+ dcn_map = {
2458+ "diloco" : self .dcn_diloco_parallelism ,
2459+ "data" : self .dcn_data_parallelism ,
2460+ "stage" : self .dcn_pipeline_parallelism ,
2461+ "fsdp" : self .dcn_fsdp_parallelism ,
2462+ "fsdp_transpose" : self .dcn_fsdp_transpose_parallelism ,
2463+ "sequence" : self .dcn_sequence_parallelism ,
2464+ "context" : self .dcn_context_parallelism ,
2465+ "context_autoregressive" : self .dcn_context_autoregressive_parallelism ,
2466+ "tensor" : self .dcn_tensor_parallelism ,
2467+ "tensor_transpose" : self .dcn_tensor_transpose_parallelism ,
2468+ "tensor_sequence" : self .dcn_tensor_sequence_parallelism ,
2469+ "model" : self .dcn_tensor_parallelism ,
2470+ "expert" : self .dcn_expert_parallelism ,
2471+ "autoregressive" : self .dcn_autoregressive_parallelism ,
2472+ "attn_dp" : 1 , # initialized to 1; vLLM auto-calculates this value based on TP and num_kv_heads
2473+ }
2474+ self .dcn_parallelism = [dcn_map [axis ] for axis in self .mesh_axes ]
24752475
24762476 # Diloco params
24772477 self .num_diloco_replicas = int (self .ici_diloco_parallelism * self .dcn_diloco_parallelism )
0 commit comments