@@ -455,8 +455,7 @@ custom_mesh_and_rule: "" # replace default mesh and logical rule by specifying y
455455mesh_axes : ['diloco', 'data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'context_autoregressive', 'tensor', 'tensor_transpose', 'tensor_sequence', 'expert', 'autoregressive']
456456logical_axis_rules : [
457457 ['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
458- ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
459- ['activation_batch_no_exp_moe', ['data', 'fsdp', 'fsdp_transpose']],
458+ ['activation_batch_moe', ['data', 'fsdp', 'fsdp_transpose']],
460459 ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert']],
461460 ['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
462461 ['activation_heads', ['tensor', 'tensor_transpose', 'sequence','tensor_sequence','autoregressive']],
@@ -477,6 +476,7 @@ logical_axis_rules: [
477476 ['activation_embed', ['tensor', 'tensor_transpose']],
478477 ['activation_embed_moe', ['tensor', 'tensor_transpose']],
479478 ['activation_mlp', ['tensor', 'tensor_transpose', 'tensor_sequence']],
479+ ['activation_mlp_moe', ['tensor', 'tensor_transpose', 'tensor_sequence']],
480480 ['activation_kv', ['tensor', 'tensor_transpose', 'tensor_sequence']],
481481 ['activation_prefill_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
482482 ['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose']],
@@ -490,6 +490,7 @@ logical_axis_rules: [
490490 ['decode_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
491491 ['decode_length', ['sequence']],
492492 ['mlp', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
493+ ['mlp_moe', ['fsdp_transpose', 'tensor', 'tensor_sequence', 'autoregressive']],
493494 ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']],
494495 ['vocab', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
495496 ['heads', ['tensor', 'tensor_transpose', 'tensor_sequence', 'autoregressive']],
@@ -499,18 +500,10 @@ logical_axis_rules: [
499500 ['embed', ['fsdp', 'sequence', 'tensor_transpose', 'context' , 'expert']],
500501 ['embed', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
501502 ['embed', ['fsdp', 'sequence', 'context', 'expert']],
502- ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
503- ['embed_no_exp', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
504- ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
505- ['embed_no_exp', ['fsdp', 'sequence', 'context']],
506- ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context', 'expert']],
507- ['embed_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context' , 'expert']],
508- ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
509- ['embed_moe', ['fsdp', 'sequence', 'context', 'expert']],
510- ['embed_no_exp_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
511- ['embed_no_exp_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
512- ['embed_no_exp_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
513- ['embed_no_exp_moe', ['fsdp', 'sequence', 'context']],
503+ ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context']],
504+ ['embed_moe', ['fsdp', 'sequence', 'tensor_transpose', 'context']],
505+ ['embed_moe', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
506+ ['embed_moe', ['fsdp', 'sequence', 'context']],
514507 ['embed_tensor_transpose', ['tensor_transpose']],
515508 ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
516509 ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
0 commit comments